Python allennlp.data.tokenizers.Tokenizer() Examples

The following are 30 code examples of allennlp.data.tokenizers.Tokenizer(), collected from open-source projects. Each example lists its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the allennlp.data.tokenizers module.
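Most of the examples below follow the same pattern: a dataset reader (or, in Example #1, a world object) accepts an optional Tokenizer and falls back to a default such as SpacyTokenizer or WordTokenizer when None is passed. The minimal sketch below shows the Tokenizer interface these examples rely on; it assumes allennlp >= 1.0, where SpacyTokenizer is available (older releases use WordTokenizer instead).

from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer()
tokens = tokenizer.tokenize("Show me flights from Denver to Boston.")
# tokenize() returns a list of allennlp Token objects
print([token.text for token in tokens])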
Example #1
Source File: atis_world.py    From allennlp-semparse with Apache License 2.0
def __init__(self, utterances: List[str], tokenizer: Tokenizer = None) -> None:
        if AtisWorld.sql_table_context is None:
            AtisWorld.sql_table_context = AtisSqlTableContext(
                atis_tables.ALL_TABLES, atis_tables.TABLES_WITH_STRINGS, AtisWorld.database_file
            )
        self.utterances: List[str] = utterances
        self.tokenizer = tokenizer if tokenizer else SpacyTokenizer()
        self.tokenized_utterances = [
            self.tokenizer.tokenize(utterance) for utterance in self.utterances
        ]
        self.dates = self._get_dates()
        self.linked_entities = self._get_linked_entities()

        entities, linking_scores = self._flatten_entities()
        # This has shape (num_entities, num_utterance_tokens).
        self.linking_scores: numpy.ndarray = linking_scores
        self.entities: List[str] = entities
        self.grammar: Grammar = self._update_grammar()
        self.valid_actions = initialize_valid_actions(self.grammar, KEYWORDS) 
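A hedged usage sketch for Example #1, assuming the ATIS database file and SQL table context are already configured; the utterance string is illustrative only.

world = AtisWorld(utterances=["show me flights from denver to boston"])
# One token list per utterance, produced by the default SpacyTokenizer
print(world.tokenized_utterances[0])
# linking_scores has shape (num_entities, num_utterance_tokens)
print(world.linking_scores.shape)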
Example #2
Source File: citation_data_reader_scicite_aux.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 # use_lexicon_features: bool = False,
                 # use_sparse_lexicon_features: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        # if self.use_lexicon_features or self.use_sparse_lexicon_features:
        #     self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS} 
Example #3
Source File: ir_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 min_doc_length:int = -1,
                 min_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # alternative (a bit faster, useful for multi-core processing): word_splitter=SimpleWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text = "@@PADDING@@",text_id=0) 
Example #4
Source File: citation_data_reader_scicite_aux.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()} 
Example #5
Source File: ir_triple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 min_doc_length:int = -1,
                 min_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # alternative (a bit faster, useful for multi-core processing): word_splitter=SimpleWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text = "@@PADDING@@",text_id=0) 
Example #6
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35,
                 classification_mode=False
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance
        self._classification_mode = classification_mode 
Example #7
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance 
Example #8
Source File: bert_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 min_doc_length:int = -1,
                 min_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer
        self._token_indexers = token_indexers
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text = "[PAD]",text_id=0)
        self.sep_value = Token(text = "[SEP]") 
Example #9
Source File: bert_triple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 min_doc_length:int = -1,
                 min_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer
        self._token_indexers = token_indexers
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text = "[PAD]",text_id=0)
        self.sep_value = Token(text = "[SEP]") 
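Examples #8 and #9 leave both tokenizer and token_indexers as None, so the caller must supply BERT-compatible components. Below is a hedged sketch of such a call; the reader class name BertTripleLoader is an assumption (the excerpt shows only the constructor), and PretrainedTransformerTokenizer / PretrainedTransformerIndexer require a reasonably recent allennlp release.

from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer

reader = BertTripleLoader(
    tokenizer=PretrainedTransformerTokenizer("bert-base-uncased"),
    token_indexers={"tokens": PretrainedTransformerIndexer("bert-base-uncased")},
    max_doc_length=200,
    max_query_length=30,
)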
Example #10
Source File: nlvr.py    From allennlp-semparse with Apache License 2.0
def __init__(
        self,
        lazy: bool = False,
        tokenizer: Tokenizer = None,
        sentence_token_indexers: Dict[str, TokenIndexer] = None,
        nonterminal_indexers: Dict[str, TokenIndexer] = None,
        terminal_indexers: Dict[str, TokenIndexer] = None,
        output_agendas: bool = True,
    ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._sentence_token_indexers = sentence_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._nonterminal_indexers = nonterminal_indexers or {
            "tokens": SingleIdTokenIndexer("rule_labels")
        }
        self._terminal_indexers = terminal_indexers or {
            "tokens": SingleIdTokenIndexer("rule_labels")
        }
        self._output_agendas = output_agendas 
Example #11
Source File: drop_utils.py    From MTMSN with Apache License 2.0
def __init__(self,
                 debug: bool = False,
                 tokenizer: Tokenizer = None,
                 include_more_numbers: bool = False,
                 skip_when_all_empty: List[str] = None,
                 max_number_of_answer: int = 8,
                 max_number_count: int = 10,
                 logger = None) -> None:
        super().__init__()
        self.debug = debug
        self._tokenizer = tokenizer or WordTokenizer()
        self.include_more_numbers = include_more_numbers
        self.max_number_of_answer = max_number_of_answer
        self.max_number_count = max_number_count
        self.skip_when_all_empty = skip_when_all_empty if skip_when_all_empty is not None else []
        for item in self.skip_when_all_empty:
            assert item in ["passage_span", "question_span", "addition_subtraction", "counting", "negation"], \
                f"Unsupported skip type: {item}"
        self.logger = logger 
Example #12
Source File: summarization_sentence_tagger_reader.py    From summarus with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer,
                 language: str,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 max_sentences_count: int = 100,
                 sentence_max_tokens: int = 100,
                 lowercase: bool = True,
                 lazy: bool = True) -> None:
        super().__init__(lazy=lazy)

        self._tokenizer = tokenizer
        self._lowercase = lowercase
        self._language = language
        self._max_sentences_count = max_sentences_count
        self._sentence_max_tokens = sentence_max_tokens
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()} 
Example #13
Source File: nl2bash.py    From nlp-models with MIT License
def __init__(
        self,
        target_namespace: str,
        source_tokenizer: Tokenizer = None,
        target_tokenizer: Tokenizer = None,
        source_token_indexers: Dict[str, TokenIndexer] = None,
        target_token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        source_tokenizer = source_tokenizer or NL2BashWordSplitter()
        target_tokenizer = target_tokenizer or source_tokenizer
        super().__init__(
            target_namespace,
            source_tokenizer=source_tokenizer,
            target_tokenizer=target_tokenizer,
            source_token_indexers=source_token_indexers,
            target_token_indexers=target_token_indexers,
            lazy=lazy,
        ) 
Example #14
Source File: copynet.py    From nlp-models with MIT License
def __init__(
        self,
        target_namespace: str,
        source_tokenizer: Tokenizer = None,
        target_tokenizer: Tokenizer = None,
        source_token_indexers: Dict[str, TokenIndexer] = None,
        target_token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy)
        self._target_namespace = target_namespace
        self._source_tokenizer = source_tokenizer or SpacyTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._target_token_indexers = (
            target_token_indexers or self._source_token_indexers
        )
        warnings.warn(
            "The 'copynet' dataset reader has been deprecated in favor of the "
            "'copynet_seq2seq' dataset reader (now part of the AllenNLP library).",
            DeprecationWarning,
        ) 
Example #15
Source File: text_classification_json.py    From allennlp with Apache License 2.0
def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter() 
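Example #15 is AllenNLP's own text-classification reader. A short usage sketch follows; the class name TextClassificationJsonReader and the data path are assumptions, since the excerpt shows only the constructor.

from allennlp.data.dataset_readers import TextClassificationJsonReader
from allennlp.data.tokenizers import CharacterTokenizer

# Swap the default SpacyTokenizer for a character-level tokenizer
reader = TextClassificationJsonReader(tokenizer=CharacterTokenizer())
instances = list(reader.read("train.jsonl"))  # hypothetical path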
Example #16
Source File: semisupervised_text_classification_json.py    From vampire with Apache License 2.0
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 max_sequence_length: int = None,
                 ignore_labels: bool = False,
                 sample: int = None,
                 skip_label_indexing: bool = False,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy,
                         token_indexers=token_indexers,
                         tokenizer=tokenizer,
                         max_sequence_length=max_sequence_length,
                         skip_label_indexing=skip_label_indexing)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sample = sample
        self._max_sequence_length = max_sequence_length
        self._ignore_labels = ignore_labels
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        # _segment_sentences is set by the parent TextClassificationJsonReader constructor (defaults to False)
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter() 
Example #17
Source File: coca_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #18
Source File: squad_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #19
Source File: ir_triple_loader.py    From sigir19-neural-ir with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # alternative (a bit faster, useful for multi-core processing): word_splitter=SimpleWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length 
Example #20
Source File: ir_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 lowercase: bool = True,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._source_tokenizer = source_tokenizer or WordTokenizer() #word_splitter=SimpleWordSplitter()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
        self._target_token_indexers = target_token_indexers or self._source_token_indexers
        self._source_add_start_token = source_add_start_token 
Example #21
Source File: sent_sim_data.py    From glyce with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = 1000) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(word_splitter=ChineseSimpleWordSplitter())
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_tokens = max_tokens 
Example #22
Source File: cls_data.py    From glyce with Apache License 2.0
def __init__(self,
                 max_sentence_length: int,
                 tokenizer: Tokenizer,
                 max_instance: int,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 ) -> None:
        # Note: both branches pass lazy=False to the parent constructor.
        if max_instance > 100000:
            super().__init__(False)
        else:
            super().__init__(False)
        self._tokenizer = tokenizer
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_sentence_length = max_sentence_length
        self.trimmed_count = 0
        self.max_instance = max_instance 
Example #23
Source File: clean_coqa_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #24
Source File: single_correct_mcq_entailment.py    From multee with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 premise_max_tokens: int = 200, # do dataset statistics
                 hypothesis_max_tokens: int = 200,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._premise_max_tokens = premise_max_tokens
        self._hypothesis_max_tokens = hypothesis_max_tokens
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #25
Source File: entailment_pair.py    From multee with Apache License 2.0
def __init__(self,
                 max_tokens: int = 200,
                 max_tuples: int = 300,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)

        self._max_tokens = max_tokens
        self._max_tuples = max_tuples
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #26
Source File: multiple_correct_mcq_entailment.py    From multee with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 premise_max_tokens: int = 65,
                 hypothesis_max_tokens: int = 65,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._premise_max_tokens = premise_max_tokens
        self._hypothesis_max_tokens = hypothesis_max_tokens
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #27
Source File: classification_dataset_reader.py    From scibert with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} 
Example #28
Source File: ir_tuple_loader.py    From sigir19-neural-ir with Apache License 2.0
def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 lowercase: bool = True,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._source_tokenizer = source_tokenizer or WordTokenizer() #word_splitter=SimpleWordSplitter()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
        self._target_token_indexers = target_token_indexers or self._source_token_indexers
        self._source_add_start_token = source_add_start_token 
Example #29
Source File: ir_labeled_tuple_loader.py    From sigir19-neural-ir with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()  # alternative (a bit faster, useful for multi-core processing): word_splitter=SimpleWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length 
Example #30
Source File: atis.py    From allennlp-semparse with Apache License 2.0
def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        keep_if_unparseable: bool = False,
        lazy: bool = False,
        tokenizer: Tokenizer = None,
        database_file: str = None,
        num_turns_to_concatenate: int = 1,
    ) -> None:
        super().__init__(lazy)
        self._keep_if_unparseable = keep_if_unparseable
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._database_file = database_file
        self._num_turns_to_concatenate = num_turns_to_concatenate