Python bert.tokenization.BasicTokenizer() Examples
The following are 14 code examples of bert.tokenization.BasicTokenizer(), collected from open-source projects. You can go to the original project or source file via the reference above each example. You may also want to check out the other available functions and classes of the bert.tokenization module.
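Before turning to the project examples, here is a minimal standalone sketch of typical BasicTokenizer usage. It assumes the bert.tokenization module is importable (for example via the bert-tensorflow package); the inputs and expected outputs mirror the test cases shown below.

# Minimal sketch, assuming bert.tokenization is installed and importable.
from bert import tokenization

# Lower-casing tokenizer: lower-cases and strips accents before splitting
# on whitespace and punctuation.
lower_tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
print(lower_tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "))
# -> ["hello", "!", "how", "are", "you", "?"]

# Cased tokenizer: keeps the original casing of each token.
cased_tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
print(cased_tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "))
# -> ["HeLLo", "!", "how", "Are", "yoU", "?"]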
Example #1
Source File: evidence_corpus.py From XQA with MIT License | 5 votes |
def __init__(self):
    self._tokenizer = BasicTokenizer(do_lower_case=False)
Example #2
Source File: tokenization_test.py From Bert-TextClassification with MIT License | 5 votes |
def test_chinese(self):
    tokenizer = tokenization.BasicTokenizer()
    self.assertAllEqual(
        tokenizer.tokenize(u"ah\u535A\u63A8zz"),
        [u"ah", u"\u535A", u"\u63A8", u"zz"])
Example #3
Source File: tokenization_test.py From Bert-TextClassification with MIT License | 5 votes |
def test_basic_tokenizer_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    self.assertAllEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["hello", "!", "how", "are", "you", "?"])
    self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
Example #4
Source File: tokenization_test.py From Bert-TextClassification with MIT License | 5 votes |
def test_basic_tokenizer_no_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    self.assertAllEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["HeLLo", "!", "how", "Are", "yoU", "?"])
Example #5
Source File: tokenization_test.py From QGforQA with MIT License | 5 votes |
def test_chinese(self):
    tokenizer = tokenization.BasicTokenizer()
    self.assertAllEqual(
        tokenizer.tokenize(u"ah\u535A\u63A8zz"),
        [u"ah", u"\u535A", u"\u63A8", u"zz"])
Example #6
Source File: tokenization_test.py From QGforQA with MIT License | 5 votes |
def test_basic_tokenizer_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    self.assertAllEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["hello", "!", "how", "are", "you", "?"])
    self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
Example #7
Source File: tokenization_test.py From QGforQA with MIT License | 5 votes |
def test_basic_tokenizer_no_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    self.assertAllEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["HeLLo", "!", "how", "Are", "yoU", "?"])
Example #8
Source File: build_span_corpus.py From RE3QA with Apache License 2.0 | 5 votes |
def build_wiki_corpus(n_processes):
    build_dataset("wiki",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
                      train=join(TRIVIA_QA, "qa", "wikipedia-train.json"),
                      test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(),
                  n_processes)
Example #9
Source File: build_span_corpus.py From RE3QA with Apache License 2.0 | 5 votes |
def build_web_corpus(n_processes):
    build_dataset("web",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "web-dev.json"),
                      train=join(TRIVIA_QA, "qa", "web-train.json"),
                      test=join(TRIVIA_QA, "qa", "web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(),
                  n_processes)
Example #10
Source File: build_span_corpus.py From RE3QA with Apache License 2.0 | 5 votes |
def build_unfiltered_corpus(n_processes):
    build_dataset("unfiltered",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
                      train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
                      test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(),
                  n_processes)
Example #11
Source File: build_span_corpus.py From RE3QA with Apache License 2.0 | 5 votes |
def build_wiki_sample_corpus(n_processes):
    build_dataset("wiki-sample",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
                      train=join(TRIVIA_QA, "qa", "wikipedia-train.json"),
                      test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(),
                  n_processes,
                  sample=20)
Example #12
Source File: build_span_corpus.py From RE3QA with Apache License 2.0 | 5 votes |
def build_unfiltered_sample_corpus(n_processes):
    build_dataset("unfiltered-sample",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
                      train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
                      test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(),
                  n_processes,
                  sample=20)
Example #13
Source File: evidence_corpus.py From RE3QA with Apache License 2.0 | 5 votes |
def main(): parse = argparse.ArgumentParser("Pre-tokenize the TriviaQA evidence corpus") parse.add_argument("-o", "--output_dir", type=str, default=join("data", "triviaqa", "evidence")) parse.add_argument("-s", "--source", type=str, default=join(TRIVIA_QA, "evidence")) # This is slow, using more processes is recommended parse.add_argument("-n", "--n_processes", type=int, default=1, help="Number of processes to use") parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph") parse.add_argument("--wiki_only", action="store_true") args = parse.parse_args() tokenizer = tokenization.BasicTokenizer(do_lower_case=True) splitter = MergeParagraphs(args.max_tokens) build_tokenized_corpus(args.source, tokenizer, splitter, args.output_dir, n_processes=args.n_processes, wiki_only=args.wiki_only)
Example #14
Source File: convert_squad_open.py From RE3QA with Apache License 2.0 | 4 votes |
def main(): parse = argparse.ArgumentParser("Pre-tokenize the SQuAD open dev file") parse.add_argument("--input_file", type=str, default=join("data", "squad", "squad_dev_open.pkl")) # This is slow, using more processes is recommended parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph") parse.add_argument("--n_to_select", type=int, default=30, help="Number of paragraphs to retrieve") parse.add_argument("--sort_passage", type=bool, default=True, help="Sort passage according to order") parse.add_argument("--debug", type=bool, default=False, help="Whether to run in debug mode") args = parse.parse_args() dev_examples = pickle.load(open(args.input_file, 'rb')) tokenizer = tokenization.BasicTokenizer(do_lower_case=True) splitter = MergeParagraphs(args.max_tokens) tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=stop_words) detector = FastNormalizedAnswerDetector() ir_count, total_doc_length, pruned_doc_length = 0, 0, 0 out = [] for example_ix, example in tqdm(enumerate(dev_examples), total=len(dev_examples)): paras = [x for x in example.doc_text.split("\n") if len(x) > 0] paragraphs = [tokenizer.tokenize(x) for x in paras] merged_paragraphs = splitter.merge(paragraphs) scores = rank(tfidf, [example.question_text], [" ".join(x) for x in merged_paragraphs]) para_scores = scores[0] para_ranks = np.argsort(para_scores) selection = [i for i in para_ranks[:args.n_to_select]] if args.sort_passage: selection = np.sort(selection) doc_tokens = [] for idx in selection: current_para = merged_paragraphs[idx] doc_tokens += current_para tokenized_answers = [tokenizer.tokenize(x) for x in example.answer_texts] detector.set_question(tokenized_answers) if len(detector.any_found(doc_tokens)) > 0: ir_count += 1 total_doc_length += sum(len(para) for para in merged_paragraphs) pruned_doc_length += len(doc_tokens) out.append(DocumentAndQuestion(example_ix, example.qas_id, example.question_text, doc_tokens, '', 0, 0, True)) if args.debug and example_ix > 5: break print("Recall of answer existence in documents: {:.3f}".format(ir_count / len(out))) print("Average length of documents: {:.3f}".format(total_doc_length / len(out))) print("Average pruned length of documents: {:.3f}".format(pruned_doc_length / len(out))) output_file = join("data", "squad", "eval_open_{}paras_examples.pkl".format(args.n_to_select)) pickle.dump(out, open(output_file, 'wb'))