Python allennlp.data.token_indexers.SingleIdTokenIndexer() Examples
The following are 30 code examples of allennlp.data.token_indexers.SingleIdTokenIndexer().
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module allennlp.data.token_indexers.
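Before the examples, here is a minimal, self-contained sketch of the workflow most of these snippets share: wrap tokens in a TextField, index them with a SingleIdTokenIndexer against a Vocabulary, and convert the field to tensors. The toy sentence, the vocabulary contents, and the "tokens" namespace are illustrative assumptions rather than code taken from any one project below.

# Minimal sketch (assumed setup, not from a specific example below).
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

# Build a tiny vocabulary in the "tokens" namespace.
vocab = Vocabulary()
for word in ["this", "is", "a", "sentence"]:
    vocab.add_token_to_namespace(word, namespace="tokens")

# Map each token to a single vocabulary id via SingleIdTokenIndexer.
field = TextField(
    [Token(t) for t in ["this", "is", "a", "sentence"]],
    token_indexers={"tokens": SingleIdTokenIndexer(namespace="tokens")},
)
field.index(vocab)
tensor_dict = field.as_tensor(field.get_padding_lengths())
print(tensor_dict["tokens"]["tokens"])  # LongTensor of token ids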
Example #1
Source File: text_classification_json.py From allennlp with Apache License 2.0
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tokenizer: Tokenizer = None,
    segment_sentences: bool = False,
    max_sequence_length: int = None,
    skip_label_indexing: bool = False,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._segment_sentences = segment_sentences
    self._max_sequence_length = max_sequence_length
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
Example #2
Source File: vocabulary_test.py From allennlp with Apache License 2.0
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField(
        [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
    )
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, `directory` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)
Example #3
Source File: conll2003.py From allennlp with Apache License 2.0
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tag_label: str = "ner",
    feature_labels: Sequence[str] = (),
    coding_scheme: str = "IOB1",
    label_namespace: str = "labels",
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if tag_label is not None and tag_label not in self._VALID_LABELS:
        raise ConfigurationError("unknown tag label type: {}".format(tag_label))
    for label in feature_labels:
        if label not in self._VALID_LABELS:
            raise ConfigurationError("unknown feature label type: {}".format(label))
    if coding_scheme not in ("IOB1", "BIOUL"):
        raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))
    self.tag_label = tag_label
    self.feature_labels = set(feature_labels)
    self.coding_scheme = coding_scheme
    self.label_namespace = label_namespace
    self._original_coding_scheme = "IOB1"
Example #4
Source File: fever_reader_with_wn.py From combine-FEVER-NSMN with MIT License
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             example_filter=None,
             wn_p_dict=None,
             wn_feature_list=wn_persistent_api.default_fn_list,
             max_l=None) -> None:
    super().__init__(lazy=lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer(namespace='tokens')}
    self._example_filter = example_filter
    self.wn_p_dict = wn_p_dict
    if wn_p_dict is None:
        raise ValueError("Need to specify WN feature dict for FEVER Reader.")
    self.wn_feature_list = wn_feature_list
    self.wn_feature_size = len(self.wn_feature_list) * 3
    self.max_l = max_l
Example #5
Source File: sampler_test.py From allennlp with Apache License 2.0
def setup_method(self):
    super().setup_method()
    self.token_indexers = {"tokens": SingleIdTokenIndexer()}
    self.vocab = Vocabulary()
    self.this_index = self.vocab.add_token_to_namespace("this")
    self.is_index = self.vocab.add_token_to_namespace("is")
    self.a_index = self.vocab.add_token_to_namespace("a")
    self.sentence_index = self.vocab.add_token_to_namespace("sentence")
    self.another_index = self.vocab.add_token_to_namespace("another")
    self.yet_index = self.vocab.add_token_to_namespace("yet")
    self.very_index = self.vocab.add_token_to_namespace("very")
    self.long_index = self.vocab.add_token_to_namespace("long")
    instances = [
        self.create_instance(["this", "is", "a", "sentence"]),
        self.create_instance(["this", "is", "another", "sentence"]),
        self.create_instance(["yet", "another", "sentence"]),
        self.create_instance(
            ["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]
        ),
        self.create_instance(["sentence"]),
    ]
    self.instances = instances
    self.lazy_instances = LazyIterable(instances)
Example #6
Source File: reader.py From fever-naacl-2018 with Apache License 2.0
def __init__(self,
             db: FeverDocDB,
             sentence_level=False,
             wiki_tokenizer: Tokenizer = None,
             claim_tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             filtering: str = None) -> None:
    self._sentence_level = sentence_level
    self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
    self._claim_tokenizer = claim_tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    self.db = db

    self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()), FEVERLabelSchema(), filtering=filtering)
    self.reader = JSONLineReader()
Example #7
Source File: single_id_token_indexer_test.py From allennlp with Apache License 2.0
def test_count_vocab_items_with_non_default_feature_name(self):
    tokenizer = SpacyTokenizer(parse=True)
    tokens = tokenizer.tokenize("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = SingleIdTokenIndexer(
        namespace="dep_labels", feature_name="dep_", default_value="NONE"
    )
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)

    assert counter["dep_labels"] == {
        "ROOT": 1,
        "nsubj": 1,
        "det": 1,
        "NONE": 2,
        "attr": 1,
        "punct": 1,
    }
Example #8
Source File: datareader.py From NLP_Toolkit with Apache License 2.0
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             delimeters: dict = SEQ_DELIMETERS,
             skip_correct: bool = False,
             skip_complex: int = 0,
             lazy: bool = False,
             max_len: int = None,
             test_mode: bool = False,
             tag_strategy: str = "keep_one",
             tn_prob: float = 0,
             tp_prob: float = 0,
             broken_dot_strategy: str = "keep") -> None:
    super().__init__(lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._delimeters = delimeters
    self._max_len = max_len
    self._skip_correct = skip_correct
    self._skip_complex = skip_complex
    self._tag_strategy = tag_strategy
    self._broken_dot_strategy = broken_dot_strategy
    self._test_mode = test_mode
    self._tn_prob = tn_prob
    self._tp_prob = tp_prob
Example #9
Source File: text_field_test.py From allennlp with Apache License 2.0
def test_token_padding_lengths_are_computed_correctly(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
            "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
            "characters": TokenCharactersIndexer(
                "characters", min_padding_length=1, token_min_padding_length=3
            ),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 3,
        "words___tokens": 3,
        "characters___token_characters": 3,
        "characters___num_token_characters": 8,
    }
    tensors = field.as_tensor(padding_lengths)
    assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
    assert tensors["words"]["tokens"].tolist()[-1] == 0
    assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8
Example #10
Source File: text_field_test.py From allennlp with Apache License 2.0
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace="words")
    capital_a_index = vocab.add_token_to_namespace("A", namespace="words")
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace="characters")
    s_index = vocab.add_token_to_namespace("s", namespace="characters")
    e_index = vocab.add_token_to_namespace("e", namespace="characters")
    n_index = vocab.add_token_to_namespace("n", namespace="characters")
    t_index = vocab.add_token_to_namespace("t", namespace="characters")
    c_index = vocab.add_token_to_namespace("c", namespace="characters")

    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"words": SingleIdTokenIndexer(namespace="words")},
    )
    field.index(vocab)
    assert field._indexed_tokens["words"]["tokens"] == [capital_a_index, sentence_index]

    field1 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1)},
    )
    field1.index(vocab)
    assert field1._indexed_tokens["characters"]["token_characters"] == [
        [capital_a_char_index],
        [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
    ]

    field2 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "words": SingleIdTokenIndexer(namespace="words"),
            "characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1),
        },
    )
    field2.index(vocab)
    assert field2._indexed_tokens["words"]["tokens"] == [capital_a_index, sentence_index]
    assert field2._indexed_tokens["characters"]["token_characters"] == [
        [capital_a_char_index],
        [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
    ]
Example #11
Source File: text_field_test.py From allennlp with Apache License 2.0
def test_as_tensor_handles_words(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])
    )
Example #12
Source File: text_field_test.py From allennlp with Apache License 2.0
def test_padding_lengths_are_computed_correctly(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {"words___tokens": 5}

    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={
            "characters": TokenCharactersIndexer("characters", min_padding_length=1)
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "characters___token_characters": 5,
        "characters___num_token_characters": 8,
    }

    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
            "words": SingleIdTokenIndexer("words"),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "characters___token_characters": 5,
        "characters___num_token_characters": 8,
        "words___tokens": 5,
    }
Example #13
Source File: text_field_test.py From allennlp with Apache License 2.0
def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    with pytest.raises(ConfigurationError):
        field.get_padding_lengths()
Example #14
Source File: dataset_reader.py From ConvLab with MIT License
def __init__(self,
             context_size: int = 0,
             agent: str = None,
             random_context_size: bool = True,
             token_delimiter: str = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._context_size = context_size
    self._agent = agent
    self._random_context_size = random_context_size
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._token_delimiter = token_delimiter
Example #15
Source File: index_field_test.py From allennlp with Apache License 2.0
def setup_method(self):
    super().setup_method()
    self.text = TextField(
        [Token(t) for t in ["here", "is", "a", "sentence", "."]],
        {"words": SingleIdTokenIndexer("words")},
    )
Example #16
Source File: span_field_test.py From allennlp with Apache License 2.0
def setup_method(self):
    super().setup_method()
    self.indexers = {"words": SingleIdTokenIndexer("words")}
    self.text = TextField(
        [Token(t) for t in ["here", "is", "a", "sentence", "for", "spans", "."]], self.indexers
    )
Example #17
Source File: lazy_dataset_reader_test.py From allennlp with Apache License 2.0
def setup_method(self):
    super().setup_method()
    token_indexer = {"tokens": SingleIdTokenIndexer()}
    field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]], token_indexer)
    field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]], token_indexer
    )
    field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]], token_indexer)
    field4 = TextField([Token(t) for t in ["this", "is", "short"]], token_indexer)
    self.instances = [
        Instance({"text1": field1, "text2": field2}),
        Instance({"text1": field3, "text2": field4}),
    ]
Example #18
Source File: ebmnlp.py From scibert with Apache License 2.0
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             label_namespace: str = "labels") -> None:
    super().__init__()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.label_namespace = label_namespace
Example #19
Source File: text_field_test.py From allennlp with Apache License 2.0
def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
    field = TextField(
        [Token(t) for t in ["a", "sentence", "."]],
        token_indexers={
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    padding_lengths["words___tokens"] = 5
    padding_lengths["characters___token_characters"] = 5
    padding_lengths["characters___num_token_characters"] = 10
    tensor_dict = field.as_tensor(padding_lengths)

    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"].detach().cpu().numpy(), numpy.array([1, 2, 1, 0, 0])
    )
    numpy.testing.assert_array_almost_equal(
        tensor_dict["characters"]["token_characters"].detach().cpu().numpy(),
        numpy.array(
            [
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
    )
Example #20
Source File: text_field_test.py From allennlp with Apache License 2.0
def test_printing_doesnt_crash(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"words": SingleIdTokenIndexer(namespace="words")},
    )
    print(field)
Example #21
Source File: text_field_test.py From allennlp with Apache License 2.0
def test_token_indexer_returns_dict(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(),
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 2,
        "words___tokens": 2,
        "characters___token_characters": 2,
        "characters___num_token_characters": 8,
    }
    padding_lengths["field_with_dict___token_ids"] = 7
    padding_lengths["field_with_dict___additional_key"] = 3
    padding_lengths["words___tokens"] = 4
    padding_lengths["characters___token_characters"] = 4
    tensors = field.as_tensor(padding_lengths)
    assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
    assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
    assert list(tensors["words"]["tokens"].shape) == [4]
    assert list(tensors["characters"]["token_characters"].shape) == [4, 8]
Example #22
Source File: sequence_label_field_test.py From allennlp with Apache License 2.0
def setup_method(self):
    super().setup_method()
    self.text = TextField(
        [Token(t) for t in ["here", "are", "some", "words", "."]],
        {"words": SingleIdTokenIndexer("words")},
    )
Example #23
Source File: entailment_tuple_reader.py From scitail with Apache License 2.0
def __init__(self,
             max_tokens: int,
             max_tuples: int,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    self._max_tokens = max_tokens
    self._max_tuples = max_tuples
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #24
Source File: universal_dependencies.py From udify with MIT License
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None, lazy: bool = False) -> None:
    super().__init__(lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #25
Source File: sigmorphon_2019_task_2.py From udify with MIT License
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None, lazy: bool = False) -> None:
    super().__init__(lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.label_to_dimension = {}
    for dimension, labels in unimorph_schema.items():
        for label in labels:
            self.label_to_dimension[label] = dimension
Example #26
Source File: mesim_wn_simi_v1_2.py From combine-FEVER-NSMN with MIT License
def __init__(self, model_path):
    # Prepare Data
    lazy = False
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=420)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    # Build Model
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    # device_num = -1 if device.type == 'cpu' else 0
    device = torch.device("cpu")
    device_num = -1 if device.type == 'cpu' else 0

    biterator = BasicIterator(batch_size=16)
    biterator.index_with(vocab)

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 450 + dev_fever_data_reader.wn_feature_size),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300,
                  max_l=400)

    model.display()
    model.to(device)
    model.load_state_dict(torch.load(model_path))

    self.model = model
    self.dev_fever_data_reader = dev_fever_data_reader
    self.device_num = device_num
    self.biterator = biterator
Example #27
Source File: nsmn_sent_wise_v1_1.py From combine-FEVER-NSMN with MIT License
def __init__(self, model_path):
    # Prepare Data
    lazy = False
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=420)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    # Build Model
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    # device_num = -1 if device.type == 'cpu' else 0
    device = torch.device("cpu")
    device_num = -1 if device.type == 'cpu' else 0

    biterator = BasicIterator(batch_size=16)
    biterator.index_with(vocab)

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 450 + dev_fever_data_reader.wn_feature_size),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300,
                  max_l=400)

    model.display()
    model.to(device)
    model.load_state_dict(torch.load(model_path))

    self.model = model
    self.dev_fever_data_reader = dev_fever_data_reader
    self.device_num = device_num
    self.biterator = biterator
Example #28
Source File: fever_reader_with_wn_simi.py From combine-FEVER-NSMN with MIT License
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             example_filter=None,
             wn_p_dict=None,
             wn_feature_list=wn_persistent_api.default_fn_list,
             max_l=None,
             num_encoding=True,
             shuffle_sentences=False,
             ablation=None) -> None:  # {'rm_wn': True, 'rm_simi': True}
    super().__init__(lazy=lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer(namespace='tokens')}
    self._example_filter = example_filter
    self.wn_p_dict = wn_p_dict
    if wn_p_dict is None:
        print("Need to specify WN feature dict for FEVER Reader. Or if we don't need that.")
    self.wn_feature_list = wn_feature_list
    num_encoding_dim = 5 if num_encoding else 0
    self.wn_feature_size = len(self.wn_feature_list) * 3 + num_encoding_dim + 1 if wn_p_dict is not None else 0
    self.max_l = max_l
    self.shuffle_sentences = shuffle_sentences
    self.ablation = ablation

    if self.ablation is not None and self.ablation['rm_wn']:
        self.wn_feature_size -= (len(self.wn_feature_list) * 3 + num_encoding_dim)
    elif self.ablation is not None and self.ablation['rm_simi']:
        self.wn_feature_size -= 1
Example #29
Source File: fever_sselection_reader.py From combine-FEVER-NSMN with MIT License
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             example_filter=None,
             max_l=None) -> None:
    super().__init__(lazy=lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer(namespace='tokens')}
    self._example_filter = example_filter
    self.max_l = max_l
Example #30
Source File: fever_reader.py From combine-FEVER-NSMN with MIT License
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             example_filter=None,
             max_l=None) -> None:
    super().__init__(lazy=lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer(namespace='tokens')}
    self._example_filter = example_filter
    self.max_l = max_l