Python allennlp.data.tokenizers.WordTokenizer() Examples
The following are 30 code examples of allennlp.data.tokenizers.WordTokenizer(), drawn from open-source projects and ordered by votes. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.tokenizers.
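All of these examples target the pre-1.0 AllenNLP API, in which WordTokenizer wraps a word splitter (a spaCy-based one by default) and returns a list of Token objects. As a quick orientation before the examples, here is a minimal, hedged usage sketch; the sample sentence is purely illustrative:

from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer()  # uses a spaCy-based word splitter by default
tokens = tokenizer.tokenize("On January 7, 2012, Beyoncé gave birth.")
print([t.text for t in tokens])           # token strings
print([(t.idx, t.text) for t in tokens])  # character offsets, as used in Example #1 below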
Example #1
Source File: util_test.py From magnitude with MIT License | 6 votes |
def test_char_span_to_token_span_handles_easy_cases(self):
    # These are _inclusive_ spans, on both sides.
    tokenizer = WordTokenizer()
    passage = u"On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy " +\
              u"Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four " +\
              u"nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her " +\
              u"first performances since giving birth to Blue Ivy."
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
    # "January 7, 2012"
    token_span = util.char_span_to_token_span(offsets, (3, 18))[0]
    assert token_span == (1, 4)
    # "Lenox Hill Hospital"
    token_span = util.char_span_to_token_span(offsets, (91, 110))[0]
    assert token_span == (22, 24)
    # "Lenox Hill Hospital in New York."
    token_span = util.char_span_to_token_span(offsets, (91, 123))[0]
    assert token_span == (22, 28)
Example #2
Source File: reader.py From fever-naacl-2018 with Apache License 2.0 | 6 votes |
def __init__(self,
             db: FeverDocDB,
             sentence_level=False,
             wiki_tokenizer: Tokenizer = None,
             claim_tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             filtering: str = None) -> None:
    self._sentence_level = sentence_level
    self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
    self._claim_tokenizer = claim_tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    self.db = db

    self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()), FEVERLabelSchema(), filtering=filtering)
    self.reader = JSONLineReader()
Example #3
Source File: ir_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0 | 6 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
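The inline comment above refers to swapping in SimpleWordSplitter, a rule-based splitter that avoids loading a spaCy model. A small sketch of that variant, assuming the same pre-1.0 AllenNLP API (the sample text is illustrative):

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# Rule-based splitting: cheaper to construct and friendlier to multiprocessing than spaCy.
fast_tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
print([t.text for t in fast_tokenizer.tokenize("Queries and documents are split without spaCy.")])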
Example #4
Source File: language_modeling.py From magnitude with MIT License | 6 votes |
def __init__(self, tokens_per_instance=None, tokenizer=None, token_indexers=None, lazy=False):
    super(LanguageModelingReader, self).__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {u"tokens": SingleIdTokenIndexer()}
    self._tokens_per_instance = tokens_per_instance

    # No matter how you want to represent the input, we'll always represent the output as a
    # single token id. This code lets you learn a language model that concatenates word
    # embeddings with character-level encoders, in order to predict the word token that comes
    # next.
    self._output_indexer = None
    for name, indexer in list(self._token_indexers.items()):
        if isinstance(indexer, SingleIdTokenIndexer):
            self._output_indexer = {name: indexer}
            break
    else:
        self._output_indexer = {u"tokens": SingleIdTokenIndexer()}

#overrides
Example #5
Source File: semisupervised_text_classification_json.py From vampire with Apache License 2.0 | 6 votes |
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None,
             max_sequence_length: int = None,
             ignore_labels: bool = False,
             sample: int = None,
             skip_label_indexing: bool = False,
             lazy: bool = False) -> None:
    super().__init__(lazy=lazy,
                     token_indexers=token_indexers,
                     tokenizer=tokenizer,
                     max_sequence_length=max_sequence_length,
                     skip_label_indexing=skip_label_indexing)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sample = sample
    self._max_sequence_length = max_sequence_length
    self._ignore_labels = ignore_labels
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
Example #6
Source File: citation_data_reader_scicite_aux.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
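When with_elmo is set, the reader keys the same tokens under two indexers. A sketch of how such a dual-indexer dict is typically consumed downstream, assuming the pre-1.0 AllenNLP API (the field contents are illustrative):

from allennlp.data.fields import TextField
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                  "tokens": SingleIdTokenIndexer()}
tokens = WordTokenizer().tokenize("This citation supports the claim.")
# The same tokens are indexed twice: once as ELMo character ids, once as single word ids.
text_field = TextField(tokens, token_indexers)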
Example #7
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             # use_lexicon_features: bool = False,
             # use_sparse_lexicon_features: bool = False
             ) -> None:
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
Example #8
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35,
             classification_mode=False
             ) -> None:
    super().__init__(lazy)

    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
    self._classification_mode = classification_mode
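This reader relies on WordTokenizer adding sentinel tokens around every tokenized sequence. A short sketch of that behaviour, assuming the pre-1.0 AllenNLP API:

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer(start_tokens=[START_SYMBOL], end_tokens=[END_SYMBOL])
tokens = tokenizer.tokenize("the movie was great")
# The splitter's output is wrapped with the start and end sentinels.
assert tokens[0].text == START_SYMBOL and tokens[-1].text == END_SYMBOL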
Example #9
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35
             ) -> None:
    super().__init__(lazy)

    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
Example #10
Source File: ir_triple_loader.py From transformer-kernel-ranking with Apache License 2.0 | 6 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
Example #11
Source File: nlvr.py From magnitude with MIT License | 6 votes |
def __init__(self,
             lazy=False,
             tokenizer=None,
             sentence_token_indexers=None,
             nonterminal_indexers=None,
             terminal_indexers=None,
             output_agendas=True):
    super(NlvrDatasetReader, self).__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sentence_token_indexers = sentence_token_indexers or {u"tokens": SingleIdTokenIndexer()}
    self._nonterminal_indexers = nonterminal_indexers or {u"tokens": SingleIdTokenIndexer(u"rule_labels")}
    self._terminal_indexers = terminal_indexers or {u"tokens": SingleIdTokenIndexer(u"rule_labels")}
    self._output_agendas = output_agendas

#overrides
Example #12
Source File: citation_data_reader_scicite.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             use_lexicon_features: bool = False,
             use_sparse_lexicon_features: bool = False,
             multilabel: bool = False,
             with_elmo: bool = False,
             reader_format: str = 'flat') -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    if with_elmo:
        # self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    self.use_lexicon_features = use_lexicon_features
    self.use_sparse_lexicon_features = use_sparse_lexicon_features
    if self.use_lexicon_features or self.use_sparse_lexicon_features:
        self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}
    self.multilabel = multilabel
    self.reader_format = reader_format
Example #13
Source File: drop_utils.py From MTMSN with Apache License 2.0 | 6 votes |
def __init__(self,
             debug: bool = False,
             tokenizer: Tokenizer = None,
             include_more_numbers: bool = False,
             skip_when_all_empty: List[str] = None,
             max_number_of_answer: int = 8,
             max_number_count: int = 10,
             logger=None) -> None:
    super().__init__()
    self.debug = debug
    self._tokenizer = tokenizer or WordTokenizer()
    self.include_more_numbers = include_more_numbers
    self.max_number_of_answer = max_number_of_answer
    self.max_number_count = max_number_count
    self.skip_when_all_empty = skip_when_all_empty if skip_when_all_empty is not None else []
    for item in self.skip_when_all_empty:
        assert item in ["passage_span", "question_span", "addition_subtraction", "counting", "negation"], \
            f"Unsupported skip type: {item}"
    self.logger = logger
Example #14
Source File: citation_data_reader_aclarc.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             use_lexicon_features: bool = False,
             use_sparse_lexicon_features: bool = False,
             with_elmo: bool = False
             ) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    self.use_lexicon_features = use_lexicon_features
    self.use_sparse_lexicon_features = use_sparse_lexicon_features
    if self.use_lexicon_features or self.use_sparse_lexicon_features:
        self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}
Example #15
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0 | 5 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
Example #16
Source File: ir_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0 | 5 votes |
def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             lowercase: bool = True,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._source_tokenizer = source_tokenizer or WordTokenizer()  # word_splitter=SimpleWordSplitter()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=lowercase)}
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
Example #17
Source File: squad_reader.py From SLQA with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #18
Source File: coca_reader.py From SLQA with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #19
Source File: clean_coqa_reader.py From SLQA with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #20
Source File: single_correct_mcq_entailment.py From multee with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             premise_max_tokens: int = 200,  # do dataset statistics
             hypothesis_max_tokens: int = 200,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._premise_max_tokens = premise_max_tokens
    self._hypothesis_max_tokens = hypothesis_max_tokens
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #21
Source File: ir_triple_loader.py From sigir19-neural-ir with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
Example #22
Source File: ir_single_sequence_loader.py From transformer-kernel-ranking with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_seq_length: int = -1,
             min_seq_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self.max_seq_length = max_seq_length
    self.min_seq_length = min_seq_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
Example #23
Source File: dataset_reader.py From swagaf with MIT License | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             use_only_gold_examples: bool = False) -> None:
    super().__init__(lazy=False)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.use_only_gold_examples = use_only_gold_examples
Example #24
Source File: dataset_reader.py From swagaf with MIT License | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             use_only_gold_examples: bool = False) -> None:
    super().__init__(lazy=False)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.use_only_gold_examples = use_only_gold_examples
Example #25
Source File: dataset_reader.py From swagaf with MIT License | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             use_only_gold_examples: bool = False,
             only_end: bool = False) -> None:
    super().__init__(lazy=False)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.use_only_gold_examples = use_only_gold_examples
    self.only_end = only_end
Example #26
Source File: test_readers.py From summarus with Apache License 2.0 | 5 votes |
def test_cnn_dailymail_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
    dataset = reader.read(TEST_URLS_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example #27
Source File: test_readers.py From summarus with Apache License 2.0 | 5 votes |
def test_ria_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example #28
Source File: test_readers.py From summarus with Apache License 2.0 | 5 votes |
def test_ria_copy_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer, separate_namespaces=True, save_copy_fields=True)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    vocabulary = Vocabulary.from_instances(dataset)

    for sample in dataset:
        sample.index_fields(vocabulary)
        self.assertIsNotNone(sample.fields["source_tokens"])
        self.assertIsNotNone(sample.fields["target_tokens"])
        self.assertIsNotNone(sample.fields["metadata"].metadata)
        self.assertIsNotNone(sample.fields["source_token_ids"].array)
        self.assertIsNotNone(sample.fields["target_token_ids"].array)
        self.assertIsNotNone(sample.fields["source_to_target"]._mapping_array)
        self.assertIsNotNone(sample.fields["source_to_target"]._target_namespace)
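For readers unfamiliar with the indexing step exercised by this test, here is a self-contained miniature of the same pattern (tokenize, wrap the tokens in instances, build a vocabulary, then index), assuming the pre-1.0 AllenNLP API; the field name and texts are illustrative:

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
indexers = {"tokens": SingleIdTokenIndexer()}
instances = [Instance({"source_tokens": TextField(tokenizer.tokenize(text), indexers)})
             for text in ("first headline", "second headline")]
vocabulary = Vocabulary.from_instances(instances)
for instance in instances:
    instance.index_fields(vocabulary)  # converts tokens to ids against the vocabulary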
Example #29
Source File: dataset.py From R-net with MIT License | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             max_passage_len=400,
             truncate_train_only=True) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.max_passage_len = max_passage_len
    self.truncate_train_only = truncate_train_only
Example #30
Source File: sent_sim_data.py From glyce with Apache License 2.0 | 5 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = 1000) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(word_splitter=ChineseSimpleWordSplitter())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.max_tokens = max_tokens