Python bert.tokenization.BasicTokenizer() Examples

The following are 14 code examples of bert.tokenization.BasicTokenizer(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module bert.tokenization, or try the search function.
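For orientation, here is a minimal usage sketch, assuming the BERT repository's bert.tokenization module is importable on your Python path; the expected output matches the behavior exercised in the test examples below.

from bert import tokenization

# BasicTokenizer performs whitespace cleanup, punctuation splitting and,
# with do_lower_case=True, lower-casing and accent stripping.
# (CJK characters are split into individual tokens, as the Chinese tests below show.)
tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

print(tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "))
# ['hello', '!', 'how', 'are', 'you', '?']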
Example #1
Source File: evidence_corpus.py    From XQA with MIT License
def __init__(self):
        self._tokenizer = BasicTokenizer(do_lower_case=False) 
Example #2
Source File: tokenization_test.py    From Bert-TextClassification with MIT License
def test_chinese(self):
        tokenizer = tokenization.BasicTokenizer()

        self.assertAllEqual(
            tokenizer.tokenize(u"ah\u535A\u63A8zz"),
            [u"ah", u"\u535A", u"\u63A8", u"zz"]) 
Example #3
Source File: tokenization_test.py    From Bert-TextClassification with MIT License
def test_basic_tokenizer_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

        self.assertAllEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["hello", "!", "how", "are", "you", "?"])
        self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 
Example #4
Source File: tokenization_test.py    From Bert-TextClassification with MIT License
def test_basic_tokenizer_no_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

        self.assertAllEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["HeLLo", "!", "how", "Are", "yoU", "?"]) 
Example #5
Source File: tokenization_test.py    From QGforQA with MIT License
def test_chinese(self):
    tokenizer = tokenization.BasicTokenizer()

    self.assertAllEqual(
        tokenizer.tokenize(u"ah\u535A\u63A8zz"),
        [u"ah", u"\u535A", u"\u63A8", u"zz"]) 
Example #6
Source File: tokenization_test.py    From QGforQA with MIT License
def test_basic_tokenizer_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

    self.assertAllEqual(
        tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
        ["hello", "!", "how", "are", "you", "?"])
    self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 
Example #7
Source File: tokenization_test.py    From QGforQA with MIT License
def test_basic_tokenizer_no_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

    self.assertAllEqual(
        tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
        ["HeLLo", "!", "how", "Are", "yoU", "?"]) 
Example #8
Source File: build_span_corpus.py    From RE3QA with Apache License 2.0
def build_wiki_corpus(n_processes):
    build_dataset("wiki", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
                      train=join(TRIVIA_QA, "qa", "wikipedia-train.json"),
                      test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes) 
Example #9
Source File: build_span_corpus.py    From RE3QA with Apache License 2.0
def build_web_corpus(n_processes):
    build_dataset("web", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "web-dev.json"),
                      train=join(TRIVIA_QA, "qa", "web-train.json"),
                      test=join(TRIVIA_QA, "qa", "web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes) 
Example #10
Source File: build_span_corpus.py    From RE3QA with Apache License 2.0
def build_unfiltered_corpus(n_processes):
    build_dataset("unfiltered", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
                      train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
                      test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes) 
Example #11
Source File: build_span_corpus.py    From RE3QA with Apache License 2.0
def build_wiki_sample_corpus(n_processes):
    build_dataset("wiki-sample", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
                      train=join(TRIVIA_QA, "qa", "wikipedia-train.json"),
                      test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes, sample=20) 
Example #12
Source File: build_span_corpus.py    From RE3QA with Apache License 2.0
def build_unfiltered_sample_corpus(n_processes):
    build_dataset("unfiltered-sample", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
                      train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
                      test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes, sample=20) 
Example #13
Source File: evidence_corpus.py    From RE3QA with Apache License 2.0
def main():
    parse = argparse.ArgumentParser("Pre-tokenize the TriviaQA evidence corpus")
    parse.add_argument("-o", "--output_dir", type=str, default=join("data", "triviaqa", "evidence"))
    parse.add_argument("-s", "--source", type=str, default=join(TRIVIA_QA, "evidence"))
    # This is slow, using more processes is recommended
    parse.add_argument("-n", "--n_processes", type=int, default=1, help="Number of processes to use")
    parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph")
    parse.add_argument("--wiki_only", action="store_true")
    args = parse.parse_args()

    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    splitter = MergeParagraphs(args.max_tokens)
    build_tokenized_corpus(args.source, tokenizer, splitter, args.output_dir,
                           n_processes=args.n_processes, wiki_only=args.wiki_only) 
Example #14
Source File: convert_squad_open.py    From RE3QA with Apache License 2.0
def main():
    parse = argparse.ArgumentParser("Pre-tokenize the SQuAD open dev file")
    parse.add_argument("--input_file", type=str, default=join("data", "squad", "squad_dev_open.pkl"))
    # This is slow, using more processes is recommended
    parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph")
    parse.add_argument("--n_to_select", type=int, default=30, help="Number of paragraphs to retrieve")
    parse.add_argument("--sort_passage", type=bool, default=True, help="Sort passage according to order")
    parse.add_argument("--debug", type=bool, default=False, help="Whether to run in debug mode")
    args = parse.parse_args()

    dev_examples = pickle.load(open(args.input_file, 'rb'))

    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    splitter = MergeParagraphs(args.max_tokens)
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=stop_words)
    detector = FastNormalizedAnswerDetector()

    ir_count, total_doc_length, pruned_doc_length = 0, 0, 0
    out = []
    for example_ix, example in tqdm(enumerate(dev_examples), total=len(dev_examples)):
        paras = [x for x in example.doc_text.split("\n") if len(x) > 0]
        paragraphs = [tokenizer.tokenize(x) for x in paras]
        merged_paragraphs = splitter.merge(paragraphs)

        scores = rank(tfidf, [example.question_text], [" ".join(x) for x in merged_paragraphs])
        para_scores = scores[0]
        para_ranks = np.argsort(para_scores)
        selection = [i for i in para_ranks[:args.n_to_select]]

        if args.sort_passage:
            selection = np.sort(selection)

        doc_tokens = []
        for idx in selection:
            current_para = merged_paragraphs[idx]
            doc_tokens += current_para

        tokenized_answers = [tokenizer.tokenize(x) for x in example.answer_texts]
        detector.set_question(tokenized_answers)
        if len(detector.any_found(doc_tokens)) > 0:
            ir_count += 1

        total_doc_length += sum(len(para) for para in merged_paragraphs)
        pruned_doc_length += len(doc_tokens)

        out.append(DocumentAndQuestion(example_ix, example.qas_id, example.question_text, doc_tokens,
                                       '', 0, 0, True))
        if args.debug and example_ix > 5:
            break
    print("Recall of answer existence in documents: {:.3f}".format(ir_count / len(out)))
    print("Average length of documents: {:.3f}".format(total_doc_length / len(out)))
    print("Average pruned length of documents: {:.3f}".format(pruned_doc_length / len(out)))
    output_file = join("data", "squad", "eval_open_{}paras_examples.pkl".format(args.n_to_select))
    pickle.dump(out, open(output_file, 'wb'))