Python tokenization.WordpieceTokenizer() Examples
The following are 30
code examples of tokenization.WordpieceTokenizer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
tokenization
, or try the search function
.
Example #1
Source File: tokenization_test.py From training with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #2
Source File: tokenization_test.py From SIGIR19-BERT-IR with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #3
Source File: tokenization_test.py From curriculum with GNU General Public License v3.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #4
Source File: tokenization_test.py From delft with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #5
Source File: tokenization_test.py From bert-qa with MIT License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #6
Source File: tokenization_test.py From KBQA-BERT with MIT License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #7
Source File: tokenization_test.py From BERT_STS-B with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #8
Source File: tokenization_test.py From BERT-sentiment--classification with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #9
Source File: tokenization_test.py From training with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #10
Source File: tokenization_test.py From uai-sdk with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #11
Source File: tokenization_test.py From adapter-bert with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #12
Source File: tokenization_test.py From pynlp with MIT License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #13
Source File: tokenization_test.py From pynlp with MIT License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #14
Source File: tokenization_test.py From dl4marco-bert with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #15
Source File: tokenization_test.py From BERT-for-Sequence-Labeling-and-Text-Classification with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #16
Source File: tokenization_test.py From MedicalRelationExtraction with MIT License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #17
Source File: tokenization_test.py From coref with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #18
Source File: tokenization_test.py From bert_serving with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #19
Source File: tokenization_test.py From Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #20
Source File: tokenization_test.py From BERT with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #21
Source File: tokenization_test.py From models with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #22
Source File: tokenization_test.py From models with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #23
Source File: tokenization_test.py From models with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #24
Source File: tokenization_test.py From text_bert_cnn with MIT License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #25
Source File: tokenization_test.py From DeepCT with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #26
Source File: tokenization_test.py From BERT-Classification-Tutorial with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #27
Source File: tokenization_test.py From BERT-Chinese-Annotation with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #28
Source File: tokenization_test.py From bert-multi-gpu with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #29
Source File: tokenization_test.py From gobbli with Apache License 2.0 | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
Example #30
Source File: tokenization_test.py From nlp_research with MIT License | 6 votes |
def test_wordpiece_tokenizer(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) self.assertAllEqual(tokenizer.tokenize(""), []) self.assertAllEqual( tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) self.assertAllEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])