Python allennlp.data.tokenizers.WordTokenizer() Examples

The following are 30 code examples of allennlp.data.tokenizers.WordTokenizer(), collected from open-source projects. The source file, project, and license are noted above each example.
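Before the project examples, here is a minimal usage sketch. It assumes an allennlp 0.x install (WordTokenizer and the word-splitter classes were removed in allennlp 1.0 in favour of SpacyTokenizer); the sample sentence is arbitrary. It shows the default tokenizer, the Token objects it returns, and the word_splitter override that several of the readers below pass in for speed.

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# Default construction: spaCy-backed word splitting, no start/end tokens.
tokenizer = WordTokenizer()
tokens = tokenizer.tokenize("Blue Ivy Carter was born at Lenox Hill Hospital in New York.")

# Each element is a Token carrying the surface string and its character offset,
# which is what Example #1 uses to map character spans to token spans.
print([t.text for t in tokens])
print([(t.idx, t.idx + len(t.text)) for t in tokens])

# A common variation in the readers below: a simpler, rule-based splitter
# that is a bit faster than the spaCy default.
fast_tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())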
Example #1
Source File: util_test.py    From magnitude with MIT License
def test_char_span_to_token_span_handles_easy_cases(self):
        # These are _inclusive_ spans, on both sides.
        tokenizer = WordTokenizer()
        passage = u"On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy " +\
            u"Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four " +\
            u"nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her " +\
            u"first performances since giving birth to Blue Ivy."
        tokens = tokenizer.tokenize(passage)
        offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
        # "January 7, 2012"
        token_span = util.char_span_to_token_span(offsets, (3, 18))[0]
        assert token_span == (1, 4)
        # "Lenox Hill Hospital"
        token_span = util.char_span_to_token_span(offsets, (91, 110))[0]
        assert token_span == (22, 24)
        # "Lenox Hill Hospital in New York."
        token_span = util.char_span_to_token_span(offsets, (91, 123))[0]
        assert token_span == (22, 28) 
Example #2
Source File: reader.py    From fever-naacl-2018 with Apache License 2.0
def __init__(self,
                 db: FeverDocDB,
                 sentence_level = False,
                 wiki_tokenizer: Tokenizer = None,
                 claim_tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 filtering: str = None) -> None:
        self._sentence_level = sentence_level
        self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
        self._claim_tokenizer = claim_tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

        self.db = db

        self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()), FEVERLabelSchema(), filtering=filtering)
        self.reader = JSONLineReader() 
Example #3
Source File: ir_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length: int = -1,
                 max_query_length: int = -1,
                 min_doc_length: int = -1,
                 min_query_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # a little faster with word_splitter=SimpleWordSplitter(), useful for multi-core preprocessing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text="@@PADDING@@", text_id=0)
Example #4
Source File: language_modeling.py    From magnitude with MIT License
def __init__(self,
                 tokens_per_instance = None,
                 tokenizer = None,
                 token_indexers = None,
                 lazy = False):
        super(LanguageModelingReader, self).__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {u"tokens": SingleIdTokenIndexer()}
        self._tokens_per_instance = tokens_per_instance

        # No matter how you want to represent the input, we'll always represent the output as a
        # single token id.  This code lets you learn a language model that concatenates word
        # embeddings with character-level encoders, in order to predict the word token that comes
        # next.
        self._output_indexer = None
        for name, indexer in list(self._token_indexers.items()):
            if isinstance(indexer, SingleIdTokenIndexer):
                self._output_indexer = {name: indexer}
                break
        else:
            self._output_indexer = {u"tokens": SingleIdTokenIndexer()}

Example #5
Source File: semisupervised_text_classification_json.py    From vampire with Apache License 2.0
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 max_sequence_length: int = None,
                 ignore_labels: bool = False,
                 sample: int = None,
                 skip_label_indexing: bool = False,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy,
                         token_indexers=token_indexers,
                         tokenizer=tokenizer,
                         max_sequence_length=max_sequence_length,
                         skip_label_indexing=skip_label_indexing)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sample = sample
        self._max_sequence_length = max_sequence_length
        self._ignore_labels = ignore_labels
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter() 
Example #6
Source File: citation_data_reader_scicite_aux.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()} 
Example #7
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 # use_lexicon_features: bool = False,
                 # use_sparse_lexicon_features: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()} 
Example #8
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35,
                 classification_mode=False
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance
        self._classification_mode = classification_mode 
Example #9
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance 
Example #10
Source File: ir_triple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length: int = -1,
                 max_query_length: int = -1,
                 min_doc_length: int = -1,
                 min_query_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # a little faster with word_splitter=SimpleWordSplitter(), useful for multi-core preprocessing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text="@@PADDING@@", text_id=0)
Example #11
Source File: nlvr.py    From magnitude with MIT License
def __init__(self,
                 lazy = False,
                 tokenizer = None,
                 sentence_token_indexers = None,
                 nonterminal_indexers = None,
                 terminal_indexers = None,
                 output_agendas = True):
        super(NlvrDatasetReader, self).__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sentence_token_indexers = sentence_token_indexers or {u"tokens": SingleIdTokenIndexer()}
        self._nonterminal_indexers = nonterminal_indexers or {u"tokens":
                                                              SingleIdTokenIndexer(u"rule_labels")}
        self._terminal_indexers = terminal_indexers or {u"tokens": SingleIdTokenIndexer(u"rule_labels")}
        self._output_agendas = output_agendas

Example #12
Source File: citation_data_reader_scicite.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 use_lexicon_features: bool = False,
                 use_sparse_lexicon_features: bool = False,
                 multilabel: bool = False,
                 with_elmo: bool = False,
                 reader_format: str = 'flat') -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        if with_elmo:
            # self._token_indexers = {"tokens": SingleIdTokenIndexer()}
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}

        self.use_lexicon_features = use_lexicon_features
        self.use_sparse_lexicon_features = use_sparse_lexicon_features
        if self.use_lexicon_features or self.use_sparse_lexicon_features:
            self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}
        self.multilabel = multilabel
        self.reader_format = reader_format 
Example #13
Source File: drop_utils.py    From MTMSN with Apache License 2.0
def __init__(self,
                 debug: bool = False,
                 tokenizer: Tokenizer = None,
                 include_more_numbers: bool = False,
                 skip_when_all_empty: List[str] = None,
                 max_number_of_answer: int = 8,
                 max_number_count: int = 10,
                 logger = None) -> None:
        super().__init__()
        self.debug = debug
        self._tokenizer = tokenizer or WordTokenizer()
        self.include_more_numbers = include_more_numbers
        self.max_number_of_answer = max_number_of_answer
        self.max_number_count = max_number_count
        self.skip_when_all_empty = skip_when_all_empty if skip_when_all_empty is not None else []
        for item in self.skip_when_all_empty:
            assert item in ["passage_span", "question_span", "addition_subtraction", "counting", "negation"], \
                f"Unsupported skip type: {item}"
        self.logger = logger 
Example #14
Source File: citation_data_reader_aclarc.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 use_lexicon_features: bool = False,
                 use_sparse_lexicon_features: bool = False,
                 with_elmo: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.use_lexicon_features = use_lexicon_features
        self.use_sparse_lexicon_features = use_sparse_lexicon_features
        if self.use_lexicon_features or self.use_sparse_lexicon_features:
            self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS} 
Example #15
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()} 
Example #16
Source File: ir_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 lowercase: bool = True,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._source_tokenizer = source_tokenizer or WordTokenizer()  # optionally: word_splitter=SimpleWordSplitter()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
        self._target_token_indexers = target_token_indexers or self._source_token_indexers
        self._source_add_start_token = source_add_start_token 
Example #17
Source File: squad_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #18
Source File: coca_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #19
Source File: clean_coqa_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #20
Source File: single_correct_mcq_entailment.py    From multee with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 premise_max_tokens: int = 200,  # do dataset statistics
                 hypothesis_max_tokens: int = 200,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._premise_max_tokens = premise_max_tokens
        self._hypothesis_max_tokens = hypothesis_max_tokens
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #21
Source File: ir_triple_loader.py    From sigir19-neural-ir with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length: int = -1,
                 max_query_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # a little faster with word_splitter=SimpleWordSplitter(), useful for multi-core preprocessing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length 
Example #22
Source File: ir_single_sequence_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_length: int = -1,
                 min_seq_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # a little faster with word_splitter=SimpleWordSplitter(), useful for multi-core preprocessing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self.max_seq_length = max_seq_length
        self.min_seq_length = min_seq_length

        self.padding_value = Token(text="@@PADDING@@", text_id=0)
Example #23
Source File: dataset_reader.py    From swagaf with MIT License
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 use_only_gold_examples: bool = False) -> None:
        super().__init__(lazy=False)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.use_only_gold_examples = use_only_gold_examples 
Example #24
Source File: dataset_reader.py    From swagaf with MIT License
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 use_only_gold_examples: bool = False) -> None:
        super().__init__(lazy=False)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.use_only_gold_examples = use_only_gold_examples 
Example #25
Source File: dataset_reader.py    From swagaf with MIT License
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 use_only_gold_examples: bool = False,
                 only_end: bool = False) -> None:
        super().__init__(lazy=False)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.use_only_gold_examples = use_only_gold_examples
        self.only_end = only_end 
Example #26
Source File: test_readers.py    From summarus with Apache License 2.0
def test_cnn_dailymail_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
        dataset = reader.read(TEST_URLS_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Example #27
Source File: test_readers.py    From summarus with Apache License 2.0
def test_ria_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = RIAReader(tokenizer)
        dataset = reader.read(RIA_EXAMPLE_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Example #28
Source File: test_readers.py    From summarus with Apache License 2.0
def test_ria_copy_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = RIAReader(tokenizer, separate_namespaces=True, save_copy_fields=True)
        dataset = reader.read(RIA_EXAMPLE_FILE)
        vocabulary = Vocabulary.from_instances(dataset)

        for sample in dataset:
            sample.index_fields(vocabulary)
            self.assertIsNotNone(sample.fields["source_tokens"])
            self.assertIsNotNone(sample.fields["target_tokens"])
            self.assertIsNotNone(sample.fields["metadata"].metadata)
            self.assertIsNotNone(sample.fields["source_token_ids"].array)
            self.assertIsNotNone(sample.fields["target_token_ids"].array)
            self.assertIsNotNone(sample.fields["source_to_target"]._mapping_array)
            self.assertIsNotNone(sample.fields["source_to_target"]._target_namespace) 
Example #29
Source File: dataset.py    From R-net with MIT License
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 max_passage_len=400,
                 truncate_train_only=True) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_passage_len = max_passage_len
        self.truncate_train_only = truncate_train_only 
Example #30
Source File: sent_sim_data.py    From glyce with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = 1000) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(word_splitter=ChineseSimpleWordSplitter())
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_tokens = max_tokens