Python allennlp.data.tokenizers.Tokenizer() Examples
The following are 30 code examples of allennlp.data.tokenizers.Tokenizer(). Each example lists its source file and the project it comes from, so you can follow it back to the original repository. You may also want to check out the other functions and classes available in the allennlp.data.tokenizers module.
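Before diving into the examples, it helps to see the Tokenizer interface they all build on: a Tokenizer turns a string into a list of Token objects via tokenize(). The sketch below uses SpacyTokenizer from recent AllenNLP releases; many of the older examples use WordTokenizer, which newer releases replaced with SpacyTokenizer but which exposed the same tokenize() method.

from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer()
tokens = tokenizer.tokenize("Show me flights from Denver to Boston.")
# Each element is a Token carrying the text plus optional metadata (lemma, POS tag, ...).
print([t.text for t in tokens])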
Example #1
Source File: atis_world.py From allennlp-semparse with Apache License 2.0

def __init__(self, utterances: List[str], tokenizer: Tokenizer = None) -> None:
    if AtisWorld.sql_table_context is None:
        AtisWorld.sql_table_context = AtisSqlTableContext(
            atis_tables.ALL_TABLES, atis_tables.TABLES_WITH_STRINGS, AtisWorld.database_file
        )
    self.utterances: List[str] = utterances
    self.tokenizer = tokenizer if tokenizer else SpacyTokenizer()
    self.tokenized_utterances = [
        self.tokenizer.tokenize(utterance) for utterance in self.utterances
    ]
    self.dates = self._get_dates()
    self.linked_entities = self._get_linked_entities()

    entities, linking_scores = self._flatten_entities()
    # This has shape (num_entities, num_utterance_tokens).
    self.linking_scores: numpy.ndarray = linking_scores
    self.entities: List[str] = entities
    self.grammar: Grammar = self._update_grammar()
    self.valid_actions = initialize_valid_actions(self.grammar, KEYWORDS)
Example #2
Source File: citation_data_reader_scicite_aux.py From scicite with Apache License 2.0

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             # use_lexicon_features: bool = False,
             # use_sparse_lexicon_features: bool = False
             ) -> None:
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    # if self.use_lexicon_features or self.use_sparse_lexicon_features:
    #     self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}
Example #3
Source File: ir_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
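The reader keeps a padding Token with text_id=0. As a hedged illustration (an assumption about how the loader likely uses it, not code from the project), padding a tokenized query out to a fixed length might look like this:

from allennlp.data.tokenizers import Token

padding_value = Token(text="@@PADDING@@", text_id=0)
max_query_length = 30  # hypothetical value
query_tokens = [Token("neural"), Token("ranking"), Token("models")]
# Pad so every query in a batch has the same length; text_id=0 keeps the pad
# token mapped to vocabulary index 0.
query_tokens += [padding_value] * (max_query_length - len(query_tokens))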
Example #4
Source File: citation_data_reader_scicite_aux.py From scicite with Apache License 2.0

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
Example #5
Source File: ir_triple_loader.py From transformer-kernel-ranking with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
Example #6
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35,
             classification_mode=False
             ) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
    self._classification_mode = classification_mode
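The start_tokens/end_tokens arguments wrap every tokenized review with sequence-boundary markers. A minimal sketch of that behavior, written against the newer SpacyTokenizer API (the example itself uses the older WordTokenizer):

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer(start_tokens=[START_SYMBOL], end_tokens=[END_SYMBOL])
tokens = tokenizer.tokenize("the movie was great")
# The token list now starts with @start@ and ends with @end@, so the reader can
# mark sequence boundaries before chunking text into fixed-size windows.
print([t.text for t in tokens])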
Example #7
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35
             ) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
Example #8
Source File: bert_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer
    self._token_indexers = token_indexers
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="[PAD]", text_id=0)
    self.sep_value = Token(text="[SEP]")
Example #9
Source File: bert_triple_loader.py From transformer-kernel-ranking with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer
    self._token_indexers = token_indexers
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="[PAD]", text_id=0)
    self.sep_value = Token(text="[SEP]")
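Unlike the other readers, these two BERT loaders provide no default tokenizer or indexer, so both must be injected; the "[PAD]" and "[SEP]" Token values only make sense if the injected tokenizer uses a BERT-style vocabulary. A hedged sketch of the kind of tokenizer one might pass in (an assumption, not the project's actual configuration, which is written against an older AllenNLP release); the matching indexer would typically be a pretrained-transformer indexer for the same model:

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
tokens = tokenizer.tokenize("what is the capital of colorado")
# Wordpiece tokens, including the special [CLS] and [SEP] markers.
print([t.text for t in tokens])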
Example #10
Source File: nlvr.py From allennlp-semparse with Apache License 2.0

def __init__(
    self,
    lazy: bool = False,
    tokenizer: Tokenizer = None,
    sentence_token_indexers: Dict[str, TokenIndexer] = None,
    nonterminal_indexers: Dict[str, TokenIndexer] = None,
    terminal_indexers: Dict[str, TokenIndexer] = None,
    output_agendas: bool = True,
) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._sentence_token_indexers = sentence_token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    self._nonterminal_indexers = nonterminal_indexers or {
        "tokens": SingleIdTokenIndexer("rule_labels")
    }
    self._terminal_indexers = terminal_indexers or {
        "tokens": SingleIdTokenIndexer("rule_labels")
    }
    self._output_agendas = output_agendas
Example #11
Source File: drop_utils.py From MTMSN with Apache License 2.0

def __init__(self,
             debug: bool = False,
             tokenizer: Tokenizer = None,
             include_more_numbers: bool = False,
             skip_when_all_empty: List[str] = None,
             max_number_of_answer: int = 8,
             max_number_count: int = 10,
             logger=None) -> None:
    super().__init__()
    self.debug = debug
    self._tokenizer = tokenizer or WordTokenizer()
    self.include_more_numbers = include_more_numbers
    self.max_number_of_answer = max_number_of_answer
    self.max_number_count = max_number_count
    self.skip_when_all_empty = skip_when_all_empty if skip_when_all_empty is not None else []
    for item in self.skip_when_all_empty:
        assert item in ["passage_span", "question_span", "addition_subtraction", "counting", "negation"], \
            f"Unsupported skip type: {item}"
    self.logger = logger
Example #12
Source File: summarization_sentence_tagger_reader.py From summarus with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer,
             language: str,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             max_sentences_count: int = 100,
             sentence_max_tokens: int = 100,
             lowercase: bool = True,
             lazy: bool = True) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = tokenizer
    self._lowercase = lowercase
    self._language = language
    self._max_sentences_count = max_sentences_count
    self._sentence_max_tokens = sentence_max_tokens
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
Example #13
Source File: nl2bash.py From nlp-models with MIT License

def __init__(
    self,
    target_namespace: str,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    target_token_indexers: Dict[str, TokenIndexer] = None,
    lazy: bool = False,
) -> None:
    source_tokenizer = source_tokenizer or NL2BashWordSplitter()
    target_tokenizer = target_tokenizer or source_tokenizer
    super().__init__(
        target_namespace,
        source_tokenizer=source_tokenizer,
        target_tokenizer=target_tokenizer,
        source_token_indexers=source_token_indexers,
        target_token_indexers=target_token_indexers,
        lazy=lazy,
    )
Example #14
Source File: copynet.py From nlp-models with MIT License

def __init__(
    self,
    target_namespace: str,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    target_token_indexers: Dict[str, TokenIndexer] = None,
    lazy: bool = False,
) -> None:
    super().__init__(lazy)
    self._target_namespace = target_namespace
    self._source_tokenizer = source_tokenizer or SpacyTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    self._target_token_indexers = (
        target_token_indexers or self._source_token_indexers
    )
    warnings.warn(
        "The 'copynet' dataset reader has been deprecated in favor of the "
        "'copynet_seq2seq' dataset reader (now part of the AllenNLP library).",
        DeprecationWarning,
    )
Example #15
Source File: text_classification_json.py From allennlp with Apache License 2.0

def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tokenizer: Tokenizer = None,
    segment_sentences: bool = False,
    max_sequence_length: int = None,
    skip_label_indexing: bool = False,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._segment_sentences = segment_sentences
    self._max_sequence_length = max_sequence_length
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
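A hedged usage sketch for this AllenNLP reader (the file path is a placeholder): each input line is expected to be a JSON object with a "text" key and, optionally, a "label" key.

from allennlp.data.dataset_readers import TextClassificationJsonReader

reader = TextClassificationJsonReader(segment_sentences=False, max_sequence_length=400)
for instance in reader.read("data/train.jsonl"):  # placeholder path
    print(instance.fields["tokens"])     # a TextField of tokenized text
    print(instance.fields.get("label"))  # a LabelField, if the line carried a label
    break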
Example #16
Source File: semisupervised_text_classification_json.py From vampire with Apache License 2.0

def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None,
             max_sequence_length: int = None,
             ignore_labels: bool = False,
             sample: int = None,
             skip_label_indexing: bool = False,
             lazy: bool = False) -> None:
    super().__init__(lazy=lazy,
                     token_indexers=token_indexers,
                     tokenizer=tokenizer,
                     max_sequence_length=max_sequence_length,
                     skip_label_indexing=skip_label_indexing)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sample = sample
    self._max_sequence_length = max_sequence_length
    self._ignore_labels = ignore_labels
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
Example #17
Source File: coca_reader.py From SLQA with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #18
Source File: squad_reader.py From SLQA with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #19
Source File: ir_triple_loader.py From sigir19-neural-ir with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
Example #20
Source File: ir_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0

def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             lowercase: bool = True,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._source_tokenizer = source_tokenizer or WordTokenizer()  # word_splitter=SimpleWordSplitter()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
Example #21
Source File: sent_sim_data.py From glyce with Apache License 2.0

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = 1000) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(word_splitter=ChineseSimpleWordSplitter())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.max_tokens = max_tokens
Example #22
Source File: cls_data.py From glyce with Apache License 2.0

def __init__(self,
             max_sentence_length: int,
             tokenizer: Tokenizer,
             max_instance: int,
             token_indexers: Dict[str, TokenIndexer] = None,
             ) -> None:
    if max_instance > 100000:
        super().__init__(False)
    else:
        super().__init__(False)
    self._tokenizer = tokenizer
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.max_sentence_length = max_sentence_length
    self.trimmed_count = 0
    self.max_instance = max_instance
Example #23
Source File: clean_coqa_reader.py From SLQA with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #24
Source File: single_correct_mcq_entailment.py From multee with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             premise_max_tokens: int = 200,  # do dataset statistics
             hypothesis_max_tokens: int = 200,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._premise_max_tokens = premise_max_tokens
    self._hypothesis_max_tokens = hypothesis_max_tokens
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #25
Source File: entailment_pair.py From multee with Apache License 2.0

def __init__(self,
             max_tokens: int = 200,
             max_tuples: int = 300,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._max_tokens = max_tokens
    self._max_tuples = max_tuples
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #26
Source File: multiple_correct_mcq_entailment.py From multee with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             premise_max_tokens: int = 65,
             hypothesis_max_tokens: int = 65,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._premise_max_tokens = premise_max_tokens
    self._hypothesis_max_tokens = hypothesis_max_tokens
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #27
Source File: classification_dataset_reader.py From scibert with Apache License 2.0

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             ) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
Example #28
Source File: ir_tuple_loader.py From sigir19-neural-ir with Apache License 2.0

def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             lowercase: bool = True,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._source_tokenizer = source_tokenizer or WordTokenizer()  # word_splitter=SimpleWordSplitter()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
Example #29
Source File: ir_labeled_tuple_loader.py From sigir19-neural-ir with Apache License 2.0

def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
Example #30
Source File: atis.py From allennlp-semparse with Apache License 2.0

def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    keep_if_unparseable: bool = False,
    lazy: bool = False,
    tokenizer: Tokenizer = None,
    database_file: str = None,
    num_turns_to_concatenate: int = 1,
) -> None:
    super().__init__(lazy)
    self._keep_if_unparseable = keep_if_unparseable
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._database_file = database_file
    self._num_turns_to_concatenate = num_turns_to_concatenate
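A recurring theme across all 30 examples is that the tokenizer is an optional constructor argument with a sensible default, which lets it be swapped in from a JSON training configuration rather than hard-coded. A minimal sketch of that mechanism (the registered name "spacy" is the one used in current AllenNLP releases; older releases registered WordTokenizer under a different name):

from allennlp.common import Params
from allennlp.data.tokenizers import Tokenizer

# Build a tokenizer from configuration, much as a dataset reader's
# "tokenizer" block in a training config is interpreted.
tokenizer = Tokenizer.from_params(Params({"type": "spacy"}))
print([t.text for t in tokenizer.tokenize("show me the cheapest fare")])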