Python allennlp.data.token_indexers.ELMoTokenCharactersIndexer() Examples
The following are 30 code examples of allennlp.data.token_indexers.ELMoTokenCharactersIndexer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module allennlp.data.token_indexers, or try the search function.
Example #1
Source File: citation_data_reader_scicite_aux.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    """Set up the reader's tokenizer and token indexers.

    Args:
        lazy: Forwarded unchanged to the parent constructor.
        tokenizer: Tokenizer to use; a ``WordTokenizer`` is created when
            this is ``None`` (or otherwise falsy).
        token_indexers: Accepted for interface compatibility only; it is
            not read. The indexers are derived from ``with_elmo``.
        clean_citation: Stored on the instance as ``_clean_citation``.
        with_elmo: When true, an ELMo character indexer is registered
            under ``"elmo"`` next to the single-id ``"tokens"`` indexer;
            otherwise only ``"tokens"`` is registered.
    """
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()
    # An earlier assignment built from `token_indexers` was always
    # overwritten by the branch below, so it was dropped as dead code.
    # TODO(review): confirm whether caller-supplied indexers should be
    # honoured instead of being ignored.
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
Example #2
Source File: citation_data_reader_aclarc.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             use_lexicon_features: bool = False,
             use_sparse_lexicon_features: bool = False,
             with_elmo: bool = False
             ) -> None:
    """Configure the tokenizer, token indexers and optional lexicons.

    ``token_indexers`` is accepted but never read: the indexers depend
    only on ``with_elmo``. ``self.lexicons`` is set only when at least one
    of the two lexicon flags is true.
    """
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()

    if not with_elmo:
        indexers = {"tokens": SingleIdTokenIndexer()}
    else:
        indexers = {"elmo": ELMoTokenCharactersIndexer(),
                    "tokens": SingleIdTokenIndexer()}
    self._token_indexers = indexers

    self.use_lexicon_features = use_lexicon_features
    self.use_sparse_lexicon_features = use_sparse_lexicon_features
    wants_lexicons = self.use_lexicon_features or self.use_sparse_lexicon_features
    if wants_lexicons:
        # Concept lexicons win on key collisions, as in a `{**a, **b}` merge.
        combined = dict(ALL_ACTION_LEXICONS)
        combined.update(ALL_CONCEPT_LEXICONS)
        self.lexicons = combined
Example #3
Source File: citation_data_reader_scicite.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             use_lexicon_features: bool = False,
             use_sparse_lexicon_features: bool = False,
             multilabel: bool = False,
             with_elmo: bool = False,
             reader_format: str = 'flat') -> None:
    """Configure the tokenizer, token indexers, lexicons and reader options.

    ``self.lexicons`` is set only when at least one of the two lexicon
    flags is true. ``multilabel`` and ``reader_format`` are stored as-is.
    """
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()

    if not with_elmo:
        indexers = {"tokens": SingleIdTokenIndexer()}
    else:
        indexers = {"elmo": ELMoTokenCharactersIndexer(),
                    "tokens": SingleIdTokenIndexer()}
    self._token_indexers = indexers

    self.use_lexicon_features = use_lexicon_features
    self.use_sparse_lexicon_features = use_sparse_lexicon_features
    wants_lexicons = self.use_lexicon_features or self.use_sparse_lexicon_features
    if wants_lexicons:
        # Concept lexicons win on key collisions, as in a `{**a, **b}` merge.
        combined = dict(ALL_ACTION_LEXICONS)
        combined.update(ALL_CONCEPT_LEXICONS)
        self.lexicons = combined

    self.multilabel = multilabel
    self.reader_format = reader_format
Example #4
Source File: fever_reader.py From combine-FEVER-NSMN with MIT License | 6 votes |
def fever_build_vocab(d_list, unk_token_num=None) -> ExVocabulary:
    """Build an ``ExVocabulary`` from the instances read out of ``d_list``.

    Args:
        d_list: Passed directly to ``BasicReader.read``.
        unk_token_num: Forwarded to ``ExVocabulary.from_instances``;
            defaults to ``{'tokens': 2600}`` when ``None``.

    Returns:
        The vocabulary built from the read instances. Its ``tokens`` size
        and the type of its token-to-index mapping are printed on the way.
    """
    unk_token_num = {'tokens': 2600} if unk_token_num is None else unk_token_num

    raw_indexer = SingleIdTokenIndexer(namespace='tokens')  # This is the raw tokens
    elmo_indexer = ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    reader = BasicReader(token_indexers={'tokens': raw_indexer, 'elmo_chars': elmo_indexer})

    vocabulary = ExVocabulary.from_instances(reader.read(d_list), unk_token_num=unk_token_num)

    print(vocabulary.get_vocab_size('tokens'))  # 122827
    print(type(vocabulary.get_token_to_index_vocabulary('tokens')))
    return vocabulary
Example #5
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    """Configure the citation-cleaning flag, tokenizer and token indexers.

    ``token_indexers`` is accepted but never read: the indexers depend
    only on ``with_elmo``.
    """
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()

    if not with_elmo:
        indexers = {"tokens": SingleIdTokenIndexer()}
    else:
        indexers = {"elmo": ELMoTokenCharactersIndexer(),
                    "tokens": SingleIdTokenIndexer()}
    self._token_indexers = indexers
Example #6
Source File: mesim_wn_simi_v1_2.py From combine-FEVER-NSMN with MIT License | 5 votes |
def __init__(self, model_path):
    """Build the reader, iterator and a CPU model restored from ``model_path``.

    Args:
        model_path: Path of the saved ``state_dict`` to load into ``Model``.

    Sets ``self.model``, ``self.dev_fever_data_reader``, ``self.device_num``
    and ``self.biterator``.
    """
    # Prepare Data
    lazy = False
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy,
                                         wn_p_dict=p_dict, max_l=420)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    # Build Model
    # This object always runs on CPU, so device_num is always -1 here.
    device = torch.device("cpu")
    device_num = -1 if device.type == 'cpu' else 0

    biterator = BasicIterator(batch_size=16)
    biterator.index_with(vocab)

    wn_size = dev_fever_data_reader.wn_feature_size
    model = Model(rnn_size_in=(1024 + 300 + wn_size,
                               1024 + 450 + wn_size),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300, max_l=400)

    model.display()
    model.to(device)
    # map_location keeps a checkpoint that was saved from a GPU loadable
    # on a host without CUDA; the model itself stays on CPU.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))

    self.model = model
    self.dev_fever_data_reader = dev_fever_data_reader
    self.device_num = device_num
    self.biterator = biterator
Example #7
Source File: nsmn_sent_wise_v1_1.py From combine-FEVER-NSMN with MIT License | 5 votes |
def __init__(self, model_path):
    """Build the reader, iterator and a CPU model restored from ``model_path``.

    Args:
        model_path: Path of the saved ``state_dict`` to load into ``Model``.

    Sets ``self.model``, ``self.dev_fever_data_reader``, ``self.device_num``
    and ``self.biterator``.
    """
    # Prepare Data
    lazy = False
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy,
                                         wn_p_dict=p_dict, max_l=420)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    # Build Model
    # This object always runs on CPU, so device_num is always -1 here.
    device = torch.device("cpu")
    device_num = -1 if device.type == 'cpu' else 0

    biterator = BasicIterator(batch_size=16)
    biterator.index_with(vocab)

    wn_size = dev_fever_data_reader.wn_feature_size
    model = Model(rnn_size_in=(1024 + 300 + wn_size,
                               1024 + 450 + wn_size),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300, max_l=400)

    model.display()
    model.to(device)
    # map_location keeps a checkpoint that was saved from a GPU loadable
    # on a host without CUDA; the model itself stays on CPU.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))

    self.model = model
    self.dev_fever_data_reader = dev_fever_data_reader
    self.device_num = device_num
    self.biterator = biterator
Example #8
Source File: elmo_indexer_test.py From magnitude with MIT License | 5 votes |
def test_bos_to_char_ids(self):
    """Check the character ids produced for the ``<S>`` token."""
    key = u"test-elmo"
    # Three leading ids followed by the filler id 261, 50 ids in total.
    expected_indices = [259, 257, 260] + [261] * 47

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(u'<S>')], Vocabulary(), key)

    assert indices == {key: [expected_indices]}
Example #9
Source File: elmo_indexer_test.py From magnitude with MIT License | 5 votes |
def test_eos_to_char_ids(self):
    """Check the character ids produced for the ``</S>`` token."""
    key = u"test-eos"
    # Three leading ids followed by the filler id 261, 50 ids in total.
    expected_indices = [259, 258, 260] + [261] * 47

    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(u'</S>')], Vocabulary(), key)

    assert indices == {key: [expected_indices]}
Example #10
Source File: elmo_indexer_test.py From magnitude with MIT License | 5 votes |
def test_unicode_to_char_ids(self):
    """Check the character ids produced for a token with a non-ASCII char.

    The token is U+0100 followed by ``t``. It is written as a unicode
    escape rather than ``unichr(256) + u't'`` because ``unichr`` only
    exists as a builtin on Python 2; the literal is equal on 2 and 3.
    """
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(u'\u0100t')], Vocabulary(), u"test-unicode")
    # Five leading ids followed by the filler id 261, 50 ids in total.
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == {u"test-unicode": [expected_indices]}
Example #11
Source File: elmo_indexer_test.py From magnitude with MIT License | 5 votes |
def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    """Check that padding two indexed tokens to length 3 adds an all-zero row."""
    key = u"test-elmo"
    indexer = ELMoTokenCharactersIndexer()

    sentence = [Token(u'Second'), Token(u'.')]
    indexed = indexer.tokens_to_indices(sentence, Vocabulary(), key)[key]
    padded_tokens = indexer.pad_token_sequence({key: indexed},
                                               desired_num_tokens={key: 3},
                                               padding_lengths={})

    # Every row is 50 ids wide; 261 fills the tail of a real token row and
    # the padding row is entirely zeros.
    expected_padded_tokens = [
        [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42,
        [259, 47, 260] + [261] * 47,
        [0] * 50,
    ]

    assert padded_tokens[key] == expected_padded_tokens
Example #12
Source File: elmo_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_elmo_empty_token_list(self):
    """Check the empty token list and how it shows up in a padded batch."""
    # Basic test
    elmo_indexer = ELMoTokenCharactersIndexer()
    assert {"elmo_tokens": []} == elmo_indexer.get_empty_token_list()

    # Real world test
    indexers = {"elmo": elmo_indexer}

    first_tokens = TextField([Token("Apple")], indexers)
    first_targets = ListField([TextField([Token("Apple")], indexers)])
    second_tokens = TextField([Token("Screen"), Token("device")], indexers)
    second_targets = ListField([
        TextField([Token("Screen")], indexers),
        TextField([Token("Device")], indexers),
    ])

    batch = Batch([
        Instance({"tokens": first_tokens, "targets": first_targets}),
        Instance({"tokens": second_tokens, "targets": second_targets}),
    ])
    batch.index_instances(Vocabulary())
    target_indices = batch.as_tensor_dict()["targets"]["elmo"]["elmo_tokens"]

    # The TextField that is empty should have been created using the
    # `get_empty_token_list` and then padded with zeros.
    np.testing.assert_array_equal(np.zeros((1, 50)), target_indices[0][1].numpy())

    # Every real target must differ from the all-zero row.
    for position in ((0, 0), (1, 0), (1, 1)):
        non_empty_target = target_indices[position[0]][position[1]]
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(np.zeros((1, 50)), non_empty_target)
Example #13
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0 | 5 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    """Configure the citation-cleaning flag, tokenizer and token indexers.

    ``token_indexers`` is accepted but never read: the indexers depend
    only on ``with_elmo``.
    """
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()

    if not with_elmo:
        indexers = {"tokens": SingleIdTokenIndexer()}
    else:
        indexers = {"elmo": ELMoTokenCharactersIndexer(),
                    "tokens": SingleIdTokenIndexer()}
    self._token_indexers = indexers
Example #14
Source File: vcr.py From r2c with MIT License | 4 votes |
def __init__(self, split, mode, only_use_relevant_dets=True, add_image_as_a_box=True, embs_to_load='bert_da',
             conditioned_answer_choice=0):
    """
    Load the annotations for one split and prepare indexers, vocabulary and the COCO object table.

    :param split: train, val, or test
    :param mode: answer or rationale
    :param only_use_relevant_dets: True, if we will only use the detections mentioned in the question and answer.
                                   False, if we should use all detections.
    :param add_image_as_a_box: True to add the image in as an additional 'detection'. It'll go first in the list
                               of objects.
    :param embs_to_load: Which precomputed embeddings to load.
    :param conditioned_answer_choice: If you're in test mode, the answer labels aren't provided, which could be
                                      a problem for the QA->R task. Pass in 'conditioned_answer_choice=i'
                                      to always condition on the i-th answer.
    :raises ValueError: if ``split`` or ``mode`` is not one of the accepted values.
    """
    # Validate before touching the filesystem so that a bad argument is
    # reported as such rather than as a missing annotations file.
    if split not in ('test', 'train', 'val'):
        raise ValueError("split must be in test, train, or val. Supplied {}".format(split))
    if mode not in ('answer', 'rationale'):
        raise ValueError("mode must be answer or rationale. Supplied {}".format(mode))

    self.split = split
    self.mode = mode
    self.only_use_relevant_dets = only_use_relevant_dets
    print("Only relevant dets" if only_use_relevant_dets else "Using all detections", flush=True)

    self.add_image_as_a_box = add_image_as_a_box
    self.conditioned_answer_choice = conditioned_answer_choice

    # One JSON document per line.
    with open(os.path.join(VCR_ANNOTS_DIR, '{}.jsonl'.format(split)), 'r') as f:
        self.items = [json.loads(s) for s in f]

    self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
    self.vocab = Vocabulary()

    with open(os.path.join(os.path.dirname(VCR_ANNOTS_DIR), 'dataloaders', 'cocoontology.json'), 'r') as f:
        coco = json.load(f)
    # Keys are sorted numerically; index 0 is reserved for '__background__'.
    self.coco_objects = ['__background__'] + [x['name'] for _, x in sorted(coco.items(), key=lambda x: int(x[0]))]
    self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}

    self.embs_to_load = embs_to_load
    self.h5fn = os.path.join(VCR_ANNOTS_DIR, f'{self.embs_to_load}_{self.mode}_{self.split}.h5')
    print("Loading embeddings from {}".format(self.h5fn), flush=True)
Example #15
Source File: mesim_wn_simi_v1_3.py From combine-FEVER-NSMN with MIT License | 4 votes |
def hidden_eval_fever():
    """Evaluate a saved model on the FEVER dev set and print the score.

    Reads the upstream sentence-retrieval output, runs ``hidden_eval`` in a
    single unshuffled pass, deletes the ``'label'`` key from every result
    item and prints ``c_scorer.fever_score`` against the dev file. The
    checkpoint and upstream paths are hard-coded.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    # The model is still on CPU here, so load the checkpoint onto CPU too;
    # this also keeps a GPU-saved checkpoint loadable on a CPU-only host.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode))
Example #16
Source File: simple_nnmodel_refactor.py From combine-FEVER-NSMN with MIT License | 4 votes |
def pipeline_first_sent_selection_list(org_t_file, upstream_in_file, model_save_path, top_k):
    """Run a saved sentence-selection model over upstream candidates.

    Args:
        org_t_file: Forwarded to ``get_full_list_from_list_d``.
        upstream_in_file: Forwarded to ``get_full_list_from_list_d``.
        model_save_path: Path of the saved ``state_dict`` to load.
        top_k: Forwarded to ``get_full_list_from_list_d``.

    Returns:
        The result of ``hidden_eval`` over a single, unshuffled pass.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = model_save_path
    print("Model From:", SAVE_PATH)

    dev_upstream_file = upstream_in_file

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list_from_list_d(org_t_file, dev_upstream_file, pred=True, top_k=top_k)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    # NOTE(review): the result of this call is discarded; kept in case it
    # is relied on for a side effect — confirm and remove if not.
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # The model is still on CPU here, so load the checkpoint onto CPU too;
    # this also keeps a GPU-saved checkpoint loadable on a CPU-only host.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list
Example #17
Source File: simple_nnmodel_refactor.py From combine-FEVER-NSMN with MIT License | 4 votes |
def pipeline_first_sent_selection(org_t_file, upstream_in_file, model_save_path, top_k):
    """Run a saved sentence-selection model over upstream candidates.

    Args:
        org_t_file: Forwarded to ``get_full_list``.
        upstream_in_file: Forwarded to ``get_full_list``.
        model_save_path: Path of the saved ``state_dict`` to load.
        top_k: Forwarded to ``get_full_list``.

    Returns:
        The result of ``hidden_eval`` over a single, unshuffled pass.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = model_save_path
    print("Model From:", SAVE_PATH)

    dev_upstream_file = upstream_in_file

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list(org_t_file, dev_upstream_file, pred=True, top_k=top_k)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    # NOTE(review): the result of this call is discarded; kept in case it
    # is relied on for a side effect — confirm and remove if not.
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # The model is still on CPU here, so load the checkpoint onto CPU too;
    # this also keeps a GPU-saved checkpoint loadable on a CPU-only host.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list
Example #18
Source File: faster_nnmodel.py From combine-FEVER-NSMN with MIT License | 4 votes |
def eval_fever():
    """Evaluate a saved model on the two MNLI dev files and print the results.

    Only examples whose ``cs_score`` is at least 0.7 are read. For each dev
    file, ``eval_model`` is run over one unshuffled pass and the returned
    score and loss are printed. The checkpoint path is hard-coded.
    """
    # Alternative checkpoints:
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/06-07-21:58:06_esim_elmo/i(60900)_epoch(4)_um_dev(80.03458096013019)_m_dev(79.174732552216)_seed(12)"
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:42:34_esim_elmo_cs_score_filtering_0.7/i(1300)_epoch(4)_um_dev(32.55695687550855)_m_dev(32.42995415180846)_seed(12)"
    save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:40:01_esim_elmo_linear_amr_cs_score_filtering_0.5/i(5900)_epoch(3)_um_dev(39.73759153783564)_m_dev(40.18339276617422)_seed(12)"

    batch_size = 32

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    csnli_dataset_reader = CNLIReader(token_indexers=token_indexers,
                                      example_filter=lambda x: float(x['cs_score']) >= 0.7)

    mnli_m_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_mdev.jsonl.cs"
    mnli_um_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_umdev.jsonl.cs"

    mnli_m_dev_instances = csnli_dataset_reader.read(mnli_m_dev_data_path)
    mnli_um_dev_instances = csnli_dataset_reader.read(mnli_um_dev_data_path)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300)

    # The model is still on CPU here, so load the checkpoint onto CPU too;
    # this also keeps a GPU-saved checkpoint loadable on a CPU-only host.
    model.load_state_dict(torch.load(save_path, map_location="cpu"))
    model.display()
    model.to(device)

    criterion = nn.CrossEntropyLoss()

    eval_iter = biterator(mnli_m_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    m_dev_score, m_dev_loss = eval_model(model, eval_iter, criterion)

    eval_iter = biterator(mnli_um_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    um_dev_score, um_dev_loss = eval_model(model, eval_iter, criterion)

    print(f"Dev(M):{m_dev_score}/{m_dev_loss}")
    print(f"Dev(UM):{um_dev_score}/{um_dev_loss}")
Example #19
Source File: simple_nnmodel.py From combine-FEVER-NSMN with MIT License | 4 votes |
def get_score_multihop(t_data_file, additional_file, model_path, item_key='prioritized_docids_aside', top_k=6):
    """Score additional sentences with a saved sentence-selection model.

    Args:
        t_data_file: Forwarded to ``get_additional_list``.
        additional_file: Forwarded to ``get_additional_list``.
        model_path: Path of the saved ``state_dict`` to load.
        item_key: Forwarded to ``get_additional_list``.
        top_k: Forwarded to ``get_additional_list``.

    Returns:
        The result of ``hidden_eval`` over a single, unshuffled pass.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = model_path
    print("Model From:", SAVE_PATH)

    additional_sentence_list = get_additional_list(t_data_file, additional_file, item_key=item_key, top_k=top_k)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    print("Additional Dev size:", len(additional_sentence_list))
    dev_instances = dev_fever_data_reader.read(additional_sentence_list)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    # NOTE(review): the result of this call is discarded; kept in case it
    # is relied on for a side effect — confirm and remove if not.
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # The model is still on CPU here, so load the checkpoint onto CPU too;
    # this also keeps a GPU-saved checkpoint loadable on a CPU-only host.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    # Batches must be produced on the same device the model was moved to;
    # device_num was computed for this but previously never passed on.
    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    additional_sentence_list = hidden_eval(model, eval_iter, additional_sentence_list)

    return additional_sentence_list
Example #20
Source File: simple_nnmodel.py From combine-FEVER-NSMN with MIT License | 4 votes |
def pipeline_first_sent_selection_list(org_t_file, upstream_in_file, model_save_path, top_k):
    """Run a saved sentence-selection model over upstream candidates.

    Args:
        org_t_file: Forwarded to ``get_full_list_from_list_d``.
        upstream_in_file: Forwarded to ``get_full_list_from_list_d``.
        model_save_path: Path of the saved ``state_dict`` to load.
        top_k: Forwarded to ``get_full_list_from_list_d``.

    Returns:
        The result of ``hidden_eval`` over a single, unshuffled pass.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = model_save_path
    print("Model From:", SAVE_PATH)

    dev_upstream_file = upstream_in_file

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list_from_list_d(org_t_file, dev_upstream_file, pred=True, top_k=top_k)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    # NOTE(review): the result of this call is discarded; kept in case it
    # is relied on for a side effect — confirm and remove if not.
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # The model is still on CPU here, so load the checkpoint onto CPU too;
    # this also keeps a GPU-saved checkpoint loadable on a CPU-only host.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list
Example #21
Source File: simple_nnmodel.py From combine-FEVER-NSMN with MIT License | 4 votes |
def pipeline_first_sent_selection(org_t_file, upstream_in_file, model_save_path, top_k):
    """Run a saved sentence-selection model over upstream candidates.

    Args:
        org_t_file: Forwarded to ``get_full_list``.
        upstream_in_file: Forwarded to ``get_full_list``.
        model_save_path: Path of the saved ``state_dict`` to load.
        top_k: Forwarded to ``get_full_list``.

    Returns:
        The result of ``hidden_eval`` over a single, unshuffled pass.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = model_save_path
    print("Model From:", SAVE_PATH)

    dev_upstream_file = upstream_in_file

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list(org_t_file, dev_upstream_file, pred=True, top_k=top_k)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    # NOTE(review): the result of this call is discarded; kept in case it
    # is relied on for a side effect — confirm and remove if not.
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # The model is still on CPU here, so load the checkpoint onto CPU too;
    # this also keeps a GPU-saved checkpoint loadable on a CPU-only host.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    # Batches must be produced on the same device the model was moved to;
    # device_num was computed for this but previously never passed on.
    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list
Example #22
Source File: simple_nnmodel.py From combine-FEVER-NSMN with MIT License | 4 votes |
def eval_fever():
    """Evaluate a saved model on the two MNLI dev files and print the results.

    Only examples whose ``cs_score`` is at least 0.7 are read. For each dev
    file, ``eval_model`` is run over one unshuffled pass and the returned
    score and loss are printed. The checkpoint path is hard-coded.
    """
    # Alternative checkpoints:
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/06-07-21:58:06_esim_elmo/i(60900)_epoch(4)_um_dev(80.03458096013019)_m_dev(79.174732552216)_seed(12)"
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:42:34_esim_elmo_cs_score_filtering_0.7/i(1300)_epoch(4)_um_dev(32.55695687550855)_m_dev(32.42995415180846)_seed(12)"
    save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:40:01_esim_elmo_linear_amr_cs_score_filtering_0.5/i(5900)_epoch(3)_um_dev(39.73759153783564)_m_dev(40.18339276617422)_seed(12)"

    batch_size = 32

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    csnli_dataset_reader = CNLIReader(token_indexers=token_indexers,
                                      example_filter=lambda x: float(x['cs_score']) >= 0.7)

    mnli_m_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_mdev.jsonl.cs"
    mnli_um_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_umdev.jsonl.cs"

    mnli_m_dev_instances = csnli_dataset_reader.read(mnli_m_dev_data_path)
    mnli_um_dev_instances = csnli_dataset_reader.read(mnli_um_dev_data_path)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300)

    # The model is still on CPU here, so load the checkpoint onto CPU too;
    # this also keeps a GPU-saved checkpoint loadable on a CPU-only host.
    model.load_state_dict(torch.load(save_path, map_location="cpu"))
    model.display()
    model.to(device)

    criterion = nn.CrossEntropyLoss()

    eval_iter = biterator(mnli_m_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    m_dev_score, m_dev_loss = eval_model(model, eval_iter, criterion)

    eval_iter = biterator(mnli_um_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    um_dev_score, um_dev_loss = eval_model(model, eval_iter, criterion)

    print(f"Dev(M):{m_dev_score}/{m_dev_loss}")
    print(f"Dev(UM):{um_dev_score}/{um_dev_loss}")
Example #23
Source File: nn_doc_model.py From combine-FEVER-NSMN with MIT License | 4 votes |
def get_score_multihop(t_data_file, additional_file, model_path, item_key='prioritized_docids_aside', top_k=6):
    """Score the additional sentence candidates with a saved selection model.

    Args:
        t_data_file: Passed through to ``get_additional_list``.
        additional_file: Passed through to ``get_additional_list``.
        model_path: Location of the saved model state dict.
        item_key: Passed through to ``get_additional_list``.
        top_k: Passed through to ``get_additional_list``.

    Returns:
        Whatever ``hidden_eval`` returns for the additional sentence list
        (presumably the same items annotated with model outputs -- confirm
        against ``hidden_eval``).
    """
    batch_size = 64
    lazy = True

    print("Model From:", model_path)

    additional_sentence_list = get_additional_list(
        t_data_file, additional_file, item_key=item_key, top_k=top_k
    )

    # Prepare data: one indexer for the raw tokens, one for the ELMo characters.
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    print("Additional Dev size:", len(additional_sentence_list))
    dev_instances = dev_fever_data_reader.read(additional_sentence_list)

    # Load vocabulary and pre-trained embedding weights from the cache.
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important: the label namespace is populated in this fixed order
    # before 'hidden' is re-indexed (presumably to match the indices used when
    # the model was trained -- confirm).
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Return value is unused; the call is kept exactly as in the original.
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build model. The iterator takes an integer device id (-1 on CPU).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
        max_l=300,
        num_of_class=2,
    )

    # map_location remaps the stored tensors onto the device chosen above, so a
    # checkpoint written from a GPU can still be loaded on a CPU-only host.
    model.load_state_dict(torch.load(model_path, map_location=device))

    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    additional_sentence_list = hidden_eval(model, eval_iter, additional_sentence_list)

    return additional_sentence_list
Example #24
Source File: nn_doc_model.py From combine-FEVER-NSMN with MIT License | 4 votes |
def pipeline_first_sent_selection(org_t_file, upstream_in_file, model_save_path):
    """Run a saved sentence-selection model over the upstream candidates.

    Args:
        org_t_file: Passed through to ``get_full_list``.
        upstream_in_file: Upstream file, passed through to ``get_full_list``.
        model_save_path: Location of the saved model state dict.

    Returns:
        Whatever ``hidden_eval`` returns for the complete upstream list
        (presumably the same items annotated with model outputs -- confirm
        against ``hidden_eval``).
    """
    batch_size = 128
    lazy = True

    print("Model From:", model_save_path)

    # Prepare data: one indexer for the raw tokens, one for the ELMo characters.
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list(org_t_file, upstream_in_file, pred=True)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load vocabulary and pre-trained embedding weights from the cache.
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important: the label namespace is populated in this fixed order
    # before 'hidden' is re-indexed (presumably to match the indices used when
    # the model was trained -- confirm).
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Return value is unused; the call is kept exactly as in the original.
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build model. The iterator takes an integer device id (-1 on CPU).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
        max_l=300,
        num_of_class=2,
    )

    # map_location remaps the stored tensors onto the device chosen above, so a
    # checkpoint written from a GPU can still be loaded on a CPU-only host.
    model.load_state_dict(torch.load(model_save_path, map_location=device))

    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list
Example #25
Source File: nn_doc_model.py From combine-FEVER-NSMN with MIT License | 4 votes |
def pipeline_first_sent_selection(org_t_file, upstream_in_file, model_save_path):
    """Run a saved sentence-selection model over the upstream candidates.

    Args:
        org_t_file: Passed through to ``get_full_list``.
        upstream_in_file: Upstream file, passed through to ``get_full_list``.
        model_save_path: Location of the saved model state dict.

    Returns:
        Whatever ``hidden_eval`` returns for the complete upstream list
        (presumably the same items annotated with model outputs -- confirm
        against ``hidden_eval``).
    """
    batch_size = 128
    lazy = True

    print("Model From:", model_save_path)

    # Prepare data: one indexer for the raw tokens, one for the ELMo characters.
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list(org_t_file, upstream_in_file, pred=True)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load vocabulary and pre-trained embedding weights from the cache.
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important: the label namespace is populated in this fixed order
    # before 'hidden' is re-indexed (presumably to match the indices used when
    # the model was trained -- confirm).
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Return value is unused; the call is kept exactly as in the original.
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build model. The iterator takes an integer device id (-1 on CPU).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
        max_l=300,
        num_of_class=2,
    )

    # map_location remaps the stored tensors onto the device chosen above, so a
    # checkpoint written from a GPU can still be loaded on a CPU-only host.
    model.load_state_dict(torch.load(model_save_path, map_location=device))

    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list
Example #26
Source File: esim.py From combine-FEVER-NSMN with MIT License | 4 votes |
def eval_fever():
    """Evaluate a saved checkpoint on the two MNLI dev files and print the results.

    The function builds the token indexers and the dataset reader, reads both
    dev files, restores the cached vocabulary together with the
    ``'glove.840B.300d'`` weights, loads the model state stored at
    ``save_path`` and runs ``eval_model`` once per dev file.

    Nothing is returned; score and loss are printed as ``Dev(M)`` and
    ``Dev(UM)`` lines.
    """
    # Alternative checkpoints, kept for reference:
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/06-07-21:58:06_esim_elmo/i(60900)_epoch(4)_um_dev(80.03458096013019)_m_dev(79.174732552216)_seed(12)"
    save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:40:01_esim_elmo_linear_amr_cs_score_filtering_0.5/i(5900)_epoch(3)_um_dev(39.73759153783564)_m_dev(40.18339276617422)_seed(12)"
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:42:34_esim_elmo_cs_score_filtering_0.7/i(1300)_epoch(4)_um_dev(32.55695687550855)_m_dev(32.42995415180846)_seed(12)"

    batch_size = 32

    # Prepare data: one indexer for the raw tokens, one for the ELMo characters.
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    # NOTE(review): the filter threshold is 0.7 while the active checkpoint name
    # says "filtering_0.5" -- confirm this mismatch is intended.
    csnli_dataset_reader = CNLIReader(
        token_indexers=token_indexers,
        example_filter=lambda x: float(x['cs_score']) >= 0.7,
    )

    mnli_m_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_mdev.jsonl.cs"
    mnli_um_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_umdev.jsonl.cs"

    mnli_m_dev_instances = csnli_dataset_reader.read(mnli_m_dev_data_path)
    mnli_um_dev_instances = csnli_dataset_reader.read(mnli_um_dev_data_path)

    # Load vocabulary and pre-trained embedding weights from the cache.
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli")
    # Project-specific helper; presumably pins the 'hidden' label to index -2.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build model. The iterator takes an integer device id (-1 on CPU).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
    )

    # map_location remaps the stored tensors onto the device chosen above, so a
    # checkpoint written from a GPU can still be loaded on a CPU-only host.
    model.load_state_dict(torch.load(save_path, map_location=device))

    model.display()
    model.to(device)

    # Evaluate on both dev files with the same loss function.
    criterion = nn.CrossEntropyLoss()

    eval_iter = biterator(mnli_m_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    m_dev_score, m_dev_loss = eval_model(model, eval_iter, criterion)

    eval_iter = biterator(mnli_um_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    um_dev_score, um_dev_loss = eval_model(model, eval_iter, criterion)

    print(f"Dev(M):{m_dev_score}/{m_dev_loss}")
    print(f"Dev(UM):{um_dev_score}/{um_dev_loss}")
Example #27
Source File: elmo_indexer_test.py From allennlp with Apache License 2.0 | 4 votes |
def test_bos_to_char_ids(self):
    """Indexing the ``<S>`` token yields the expected row of 50 character ids."""
    # Expected row: 259, 257, 260, then 261 repeated to a total width of 50.
    expected_row = [259, 257, 260] + [261] * 47
    sentence = [Token("<S>")]
    indices = ELMoTokenCharactersIndexer().tokens_to_indices(sentence, Vocabulary())
    assert indices == {"elmo_tokens": [expected_row]}
Example #28
Source File: elmo_indexer_test.py From allennlp with Apache License 2.0 | 4 votes |
def test_eos_to_char_ids(self):
    """Indexing the ``</S>`` token yields the expected row of 50 character ids."""
    # Expected row: 259, 258, 260, then 261 repeated to a total width of 50.
    expected_row = [259, 258, 260] + [261] * 47
    sentence = [Token("</S>")]
    indices = ELMoTokenCharactersIndexer().tokens_to_indices(sentence, Vocabulary())
    assert indices == {"elmo_tokens": [expected_row]}
Example #29
Source File: elmo_indexer_test.py From allennlp with Apache License 2.0 | 4 votes |
def test_unicode_to_char_ids(self):
    """A token containing a non-ASCII character (code point 256) is indexed as expected."""
    # Expected row: 259, the three ids 197, 129, 117 for the token text, 260,
    # then 261 repeated to a total width of 50.
    expected_row = [259, 197, 129, 117, 260] + [261] * 45
    sentence = [Token(chr(256) + "t")]
    indices = ELMoTokenCharactersIndexer().tokens_to_indices(sentence, Vocabulary())
    assert indices == {"elmo_tokens": [expected_row]}
Example #30
Source File: elmo_indexer_test.py From allennlp with Apache License 2.0 | 4 votes |
def test_elmo_indexer_with_additional_tokens(self):
    """A token registered through ``tokens_to_add`` is indexed to the expected row."""
    # Expected row: 259, 2, 260, then 261 repeated to a total width of 50.
    expected_indices = [[259, 2, 260] + [261] * 47]
    indexer = ELMoTokenCharactersIndexer(tokens_to_add={"<first>": 1})
    indices = indexer.tokens_to_indices([Token("<first>")], Vocabulary())
    assert indices["elmo_tokens"] == expected_indices