Python Examples of allennlp.data.token_indexers.ELMoTokenCharactersIndexer

Source File: citation_data_reader_scicite_aux.py From scicite with Apache License 2.0

6 votes

def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 ) -> None:
        """Configure the reader's tokenizer and token indexers.

        Args:
            lazy: Forwarded unchanged to the parent class initializer.
            tokenizer: Tokenizer to use; a ``WordTokenizer`` is created when
                this argument is falsy.
            token_indexers: Accepted for interface compatibility only. The
                indexer set is always derived from ``with_elmo``, so a value
                passed here has no effect.
            clean_citation: Stored as ``self._clean_citation``.
            with_elmo: When true, an ELMo character indexer is registered
                under ``"elmo"`` next to the single-id ``"tokens"`` indexer.
        """
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()
        # NOTE: ``token_indexers`` is deliberately not consulted here; assigning
        # it first would be a dead store because ``with_elmo`` alone decides
        # which indexers end up on the reader.
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}

Source File: citation_data_reader_aclarc.py From scicite with Apache License 2.0

6 votes

def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 use_lexicon_features: bool = False,
                 use_sparse_lexicon_features: bool = False,
                 with_elmo: bool = False
                 ) -> None:
        """Set up tokenizer, token indexers and optional lexicon features.

        Args:
            lazy: Forwarded unchanged to the parent class initializer.
            tokenizer: Tokenizer to use; defaults to a new ``WordTokenizer``
                when falsy.
            token_indexers: Not read by this initializer; the indexers are
                chosen from ``with_elmo`` alone.
            use_lexicon_features: Stored on the instance; enables loading of
                ``self.lexicons``.
            use_sparse_lexicon_features: Stored on the instance; also enables
                loading of ``self.lexicons``.
            with_elmo: Adds an ``"elmo"`` character indexer when true.
        """
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()

        if not with_elmo:
            indexers = {"tokens": SingleIdTokenIndexer()}
        else:
            indexers = {"elmo": ELMoTokenCharactersIndexer(),
                        "tokens": SingleIdTokenIndexer()}
        self._token_indexers = indexers

        self.use_lexicon_features = use_lexicon_features
        self.use_sparse_lexicon_features = use_sparse_lexicon_features
        # ``self.lexicons`` only exists when at least one lexicon flag is set.
        lexicons_needed = self.use_lexicon_features or self.use_sparse_lexicon_features
        if lexicons_needed:
            self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}

Source File: citation_data_reader_scicite.py From scicite with Apache License 2.0

6 votes

def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 use_lexicon_features: bool=False,
                 use_sparse_lexicon_features: bool = False,
                 multilabel: bool = False,
                 with_elmo: bool = False,
                 reader_format: str = 'flat') -> None:
        """Store reader options and build the tokenizer and token indexers.

        Args:
            lazy: Forwarded unchanged to the parent class initializer.
            tokenizer: Tokenizer to use; defaults to a new ``WordTokenizer``
                when falsy.
            use_lexicon_features: Stored on the instance; enables loading of
                ``self.lexicons``.
            use_sparse_lexicon_features: Stored on the instance; also enables
                loading of ``self.lexicons``.
            multilabel: Stored as ``self.multilabel``.
            with_elmo: Adds an ``"elmo"`` character indexer when true.
            reader_format: Stored as ``self.reader_format``.
        """
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()

        if not with_elmo:
            indexers = {"tokens": SingleIdTokenIndexer()}
        else:
            indexers = {"elmo": ELMoTokenCharactersIndexer(),
                        "tokens": SingleIdTokenIndexer()}
        self._token_indexers = indexers

        self.use_lexicon_features = use_lexicon_features
        self.use_sparse_lexicon_features = use_sparse_lexicon_features
        # ``self.lexicons`` only exists when at least one lexicon flag is set.
        lexicons_needed = self.use_lexicon_features or self.use_sparse_lexicon_features
        if lexicons_needed:
            self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}

        self.multilabel = multilabel
        self.reader_format = reader_format

Source File: fever_reader.py From combine-FEVER-NSMN with MIT License

6 votes

def fever_build_vocab(d_list, unk_token_num=None) -> ExVocabulary:
    """Build an ``ExVocabulary`` from the instances read out of ``d_list``.

    :param d_list: Input handed directly to ``BasicReader.read``.
    :param unk_token_num: Forwarded to ``ExVocabulary.from_instances``;
        defaults to ``{'tokens': 2600}`` when ``None``.
    :return: The vocabulary built from the read instances. The size of the
        ``'tokens'`` namespace and the type of its token-to-index mapping are
        printed as a side effect.
    """
    unk_token_num = {'tokens': 2600} if unk_token_num is None else unk_token_num

    # Raw token ids plus ELMo character ids for every token.
    indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }
    reader = BasicReader(token_indexers=indexers)
    read_instances = reader.read(d_list)

    vocabulary = ExVocabulary.from_instances(read_instances, unk_token_num=unk_token_num)

    print(vocabulary.get_vocab_size('tokens'))  # 122827
    print(type(vocabulary.get_token_to_index_vocabulary('tokens')))

    return vocabulary

Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0

6 votes

def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 ) -> None:
        """Set up the tokenizer and token indexers for this reader.

        Args:
            lazy: Forwarded unchanged to the parent class initializer.
            tokenizer: Tokenizer to use; defaults to a new ``WordTokenizer``
                when falsy.
            token_indexers: Not read by this initializer; the indexers are
                chosen from ``with_elmo`` alone.
            clean_citation: Stored as ``self._clean_citation``.
            with_elmo: Adds an ``"elmo"`` character indexer when true.
        """
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()

        if not with_elmo:
            chosen_indexers = {"tokens": SingleIdTokenIndexer()}
        else:
            chosen_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                               "tokens": SingleIdTokenIndexer()}
        self._token_indexers = chosen_indexers

Source File: mesim_wn_simi_v1_2.py From combine-FEVER-NSMN with MIT License

5 votes

def __init__(self, model_path):
        """Build a CPU inference pipeline and restore weights from ``model_path``.

        Sets ``self.model``, ``self.dev_fever_data_reader``, ``self.device_num``
        and ``self.biterator``.

        :param model_path: Location of the serialized state dict handed to
            ``torch.load``.
        """
        # Prepare Data
        lazy = False
        token_indexers = {
            'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
            'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
        }

        p_dict = wn_persistent_api.persistence_load()

        dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=420)

        vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
        vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

        # Build Model
        # Inference is pinned to the CPU here, so device_num always ends up as -1.
        device = torch.device("cpu")
        device_num = -1 if device.type == 'cpu' else 0

        biterator = BasicIterator(batch_size=16)
        biterator.index_with(vocab)

        model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                                   1024 + 450 + dev_fever_data_reader.wn_feature_size),
                      rnn_size_out=(450, 450),
                      weight=weight_dict['glove.840B.300d'],
                      vocab_size=vocab.get_vocab_size('tokens'),
                      mlp_d=900,
                      embedding_dim=300, max_l=400)

        model.display()
        model.to(device)
        # map_location remaps the checkpoint's storages onto the CPU. Without it
        # a checkpoint written from a GPU cannot be deserialized on a machine
        # that has no CUDA device, even though this class only runs on the CPU.
        model.load_state_dict(torch.load(model_path, map_location=device))

        self.model = model
        self.dev_fever_data_reader = dev_fever_data_reader
        self.device_num = device_num
        self.biterator = biterator

Source File: nsmn_sent_wise_v1_1.py From combine-FEVER-NSMN with MIT License

5 votes

def __init__(self, model_path):
        """Build a CPU inference pipeline and restore weights from ``model_path``.

        Sets ``self.model``, ``self.dev_fever_data_reader``, ``self.device_num``
        and ``self.biterator``.

        :param model_path: Location of the serialized state dict handed to
            ``torch.load``.
        """
        # Prepare Data
        lazy = False
        token_indexers = {
            'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
            'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
        }

        p_dict = wn_persistent_api.persistence_load()

        dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=420)

        vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
        vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

        # Build Model
        # Inference is pinned to the CPU here, so device_num always ends up as -1.
        device = torch.device("cpu")
        device_num = -1 if device.type == 'cpu' else 0

        biterator = BasicIterator(batch_size=16)
        biterator.index_with(vocab)

        model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                                   1024 + 450 + dev_fever_data_reader.wn_feature_size),
                      rnn_size_out=(450, 450),
                      weight=weight_dict['glove.840B.300d'],
                      vocab_size=vocab.get_vocab_size('tokens'),
                      mlp_d=900,
                      embedding_dim=300, max_l=400)

        model.display()
        model.to(device)
        # map_location remaps the checkpoint's storages onto the CPU. Without it
        # a checkpoint written from a GPU cannot be deserialized on a machine
        # that has no CUDA device, even though this class only runs on the CPU.
        model.load_state_dict(torch.load(model_path, map_location=device))

        self.model = model
        self.dev_fever_data_reader = dev_fever_data_reader
        self.device_num = device_num
        self.biterator = biterator

Source File: elmo_indexer_test.py From magnitude with MIT License

5 votes

def test_bos_to_char_ids(self):
        """The ``<S>`` token must index to the fixed 50-id character row."""
        key = u"test-elmo"
        # 50 ids in total: 259, 257, 260 followed by 47 copies of 261.
        expected_indices = [259, 257, 260] + [261] * 47
        actual = ELMoTokenCharactersIndexer().tokens_to_indices(
            [Token(u'<S>')], Vocabulary(), key)
        assert actual == {key: [expected_indices]}

Source File: elmo_indexer_test.py From magnitude with MIT License

5 votes

def test_eos_to_char_ids(self):
        """The ``</S>`` token must index to the fixed 50-id character row."""
        key = u"test-eos"
        # 50 ids in total: 259, 258, 260 followed by 47 copies of 261.
        expected_indices = [259, 258, 260] + [261] * 47
        actual = ELMoTokenCharactersIndexer().tokens_to_indices(
            [Token(u'</S>')], Vocabulary(), key)
        assert actual == {key: [expected_indices]}

Source File: elmo_indexer_test.py From magnitude with MIT License

5 votes

def test_unicode_to_char_ids(self):
        """A token containing code point 256 followed by ``t`` indexes as expected."""
        indexer = ELMoTokenCharactersIndexer()
        # ``unichr`` only exists on Python 2; the escape literal denotes the
        # same code point (256) and is valid on both Python 2 and Python 3.
        token_text = u'\u0100' + u't'
        indices = indexer.tokens_to_indices([Token(token_text)], Vocabulary(), u"test-unicode")
        expected_indices = [259, 197, 129, 117, 260, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261]
        assert indices == {u"test-unicode": [expected_indices]}

Source File: elmo_indexer_test.py From magnitude with MIT License

5 votes

def test_elmo_as_array_produces_token_sequence(self): # pylint: disable=invalid-name
        """Padding two indexed tokens to three yields one extra all-zero row."""
        key = u'test-elmo'
        elmo_indexer = ELMoTokenCharactersIndexer()
        two_tokens = [Token(u'Second'), Token(u'.')]
        char_ids = elmo_indexer.tokens_to_indices(two_tokens, Vocabulary(), u"test-elmo")[u"test-elmo"]
        padded_tokens = elmo_indexer.pad_token_sequence({key: char_ids},
                                                        desired_num_tokens={key: 3},
                                                        padding_lengths={})

        # Every row holds 50 ids; the third row is the padding token added to
        # reach the requested length of 3.
        row_second = [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42
        row_period = [259, 47, 260] + [261] * 47
        row_padding = [0] * 50
        expected_padded_tokens = [row_second, row_period, row_padding]

        assert padded_tokens[key] == expected_padded_tokens

Source File: elmo_indexer_test.py From allennlp with Apache License 2.0

5 votes

def test_elmo_empty_token_list(self):
        """Empty target fields become all-zero rows; real targets do not."""
        # Basic test
        elmo_indexer = ELMoTokenCharactersIndexer()
        assert {"elmo_tokens": []} == elmo_indexer.get_empty_token_list()

        # Real world test
        indexers = {"elmo": elmo_indexer}

        def make_field(word):
            # One single-token TextField sharing the ELMo indexer mapping.
            return TextField([Token(word)], indexers)

        first_instance = Instance({
            "tokens": make_field("Apple"),
            "targets": ListField([make_field("Apple")]),
        })
        second_instance = Instance({
            "tokens": TextField([Token("Screen"), Token("device")], indexers),
            "targets": ListField([make_field("Screen"), make_field("Device")]),
        })

        batch = Batch([first_instance, second_instance])
        batch.index_instances(Vocabulary())
        target_ids = batch.as_tensor_dict()["targets"]["elmo"]["elmo_tokens"]

        # The TextField that is empty should have been created using the
        # `get_empty_token_list` and then padded with zeros.
        np.testing.assert_array_equal(np.zeros((1, 50)), target_ids[0][1].numpy())

        # Each real target must differ from the all-zero row.
        for row, col in ((0, 0), (1, 0), (1, 1)):
            populated_target = target_ids[row][col]
            with pytest.raises(AssertionError):
                np.testing.assert_array_equal(np.zeros((1, 50)), populated_target)

Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0

5 votes

def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 ) -> None:
        """Initialize the reader's tokenizer and token indexers.

        Args:
            lazy: Forwarded unchanged to the parent class initializer.
            tokenizer: Tokenizer to use; defaults to a new ``WordTokenizer``
                when falsy.
            token_indexers: Not read by this initializer; the indexers are
                chosen from ``with_elmo`` alone.
            clean_citation: Stored as ``self._clean_citation``.
            with_elmo: Adds an ``"elmo"`` character indexer when true.
        """
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()

        if not with_elmo:
            selected = {"tokens": SingleIdTokenIndexer()}
        else:
            selected = {"elmo": ELMoTokenCharactersIndexer(),
                        "tokens": SingleIdTokenIndexer()}
        self._token_indexers = selected

Source File: vcr.py From r2c with MIT License

4 votes

def __init__(self, split, mode, only_use_relevant_dets=True, add_image_as_a_box=True, embs_to_load='bert_da',
                 conditioned_answer_choice=0):
        """
        Load the annotations for one split and record the dataset options.

        :param split: train, val, or test
        :param mode: answer or rationale
        :param only_use_relevant_dets: True, if we will only use the detections mentioned in the question and answer.
                                       False, if we should use all detections.
        :param add_image_as_a_box:     True to add the image in as an additional 'detection'. It'll go first in the list
                                       of objects.
        :param embs_to_load: Which precomputed embeddings to load.
        :param conditioned_answer_choice: If you're in test mode, the answer labels aren't provided, which could be
                                          a problem for the QA->R task. Pass in 'conditioned_answer_choice=i'
                                          to always condition on the i-th answer.
        :raises ValueError: if ``split`` or ``mode`` is not one of the accepted values.
        """
        # Validate before touching the filesystem so that a bad argument is
        # reported as such instead of as a failure to open '<split>.jsonl'.
        if split not in ('test', 'train', 'val'):
            raise ValueError("split must be one of test, train, or val. Supplied {}".format(split))

        if mode not in ('answer', 'rationale'):
            raise ValueError("mode must be answer or rationale. Supplied {}".format(mode))

        self.split = split
        self.mode = mode
        self.only_use_relevant_dets = only_use_relevant_dets
        print("Only relevant dets" if only_use_relevant_dets else "Using all detections", flush=True)

        self.add_image_as_a_box = add_image_as_a_box
        self.conditioned_answer_choice = conditioned_answer_choice

        # One JSON document per line.
        with open(os.path.join(VCR_ANNOTS_DIR, '{}.jsonl'.format(split)), 'r') as f:
            self.items = [json.loads(s) for s in f]

        self.token_indexers = {'elmo': ELMoTokenCharactersIndexer()}
        self.vocab = Vocabulary()

        with open(os.path.join(os.path.dirname(VCR_ANNOTS_DIR), 'dataloaders', 'cocoontology.json'), 'r') as f:
            coco = json.load(f)
        # Object names ordered by their integer key, with '__background__' at index 0.
        self.coco_objects = ['__background__'] + [x['name'] for k, x in sorted(coco.items(), key=lambda x: int(x[0]))]
        self.coco_obj_to_ind = {o: i for i, o in enumerate(self.coco_objects)}

        self.embs_to_load = embs_to_load
        self.h5fn = os.path.join(VCR_ANNOTS_DIR, f'{self.embs_to_load}_{self.mode}_{self.split}.h5')
        print("Loading embeddings from {}".format(self.h5fn), flush=True)

Source File: mesim_wn_simi_v1_3.py From combine-FEVER-NSMN with MIT License

4 votes

def hidden_eval_fever():
    """Run a saved model over the upstream dev file and print its FEVER score.

    The checkpoint path and the upstream sentence-retrieval file are hard-coded
    below. The function returns nothing; results are printed.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    # dev_biterator = BasicIterator(batch_size=batch_size * 2)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    # Deserialize onto the CPU: the model has not been moved yet (that happens
    # in model.to(device) below). Without map_location a checkpoint saved from
    # a GPU cannot be loaded on a CUDA-less host, defeating the CPU fallback.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    # Strip the 'label' key from every item before scoring.
    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode))

Source File: simple_nnmodel_refactor.py From combine-FEVER-NSMN with MIT License

4 votes

def pipeline_first_sent_selection_list(org_t_file, upstream_in_file, model_save_path, top_k):
    """Run the saved sentence-selection model over upstream results.

    :param org_t_file: First argument forwarded to ``get_full_list_from_list_d``.
    :param upstream_in_file: Upstream input forwarded to ``get_full_list_from_list_d``.
    :param model_save_path: Path of the state dict restored via ``torch.load``.
    :param top_k: Forwarded to ``get_full_list_from_list_d`` as ``top_k``.
    :return: Whatever ``hidden_eval`` returns for the evaluated data.
    """
    batch_size = 64
    lazy = True
    SAVE_PATH = model_save_path
    print("Model From:", SAVE_PATH)

    dev_upstream_file = upstream_in_file

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list_from_list_d(org_t_file, dev_upstream_file, pred=True, top_k=top_k)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # Deserialize onto the CPU: the model has not been moved yet (that happens
    # in model.to(device) below). Without map_location a checkpoint saved from
    # a GPU cannot be loaded on a CUDA-less host, defeating the CPU fallback.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list

Source File: simple_nnmodel_refactor.py From combine-FEVER-NSMN with MIT License

4 votes

def pipeline_first_sent_selection(org_t_file, upstream_in_file, model_save_path, top_k):
    """Run the saved sentence-selection model over upstream results.

    :param org_t_file: First argument forwarded to ``get_full_list``.
    :param upstream_in_file: Upstream input forwarded to ``get_full_list``.
    :param model_save_path: Path of the state dict restored via ``torch.load``.
    :param top_k: Forwarded to ``get_full_list`` as ``top_k``.
    :return: Whatever ``hidden_eval`` returns for the evaluated data.
    """
    batch_size = 64
    lazy = True
    SAVE_PATH = model_save_path
    print("Model From:", SAVE_PATH)

    dev_upstream_file = upstream_in_file

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list(org_t_file, dev_upstream_file, pred=True, top_k=top_k)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # Deserialize onto the CPU: the model has not been moved yet (that happens
    # in model.to(device) below). Without map_location a checkpoint saved from
    # a GPU cannot be loaded on a CUDA-less host, defeating the CPU fallback.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list

Source File: faster_nnmodel.py From combine-FEVER-NSMN with MIT License

4 votes

def eval_fever():
    """Evaluate a saved model on the two MNLI dev files and print score/loss.

    The checkpoint path and data paths are hard-coded below. The function
    returns nothing; results are printed.
    """
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/06-07-21:58:06_esim_elmo/i(60900)_epoch(4)_um_dev(80.03458096013019)_m_dev(79.174732552216)_seed(12)"
    save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:40:01_esim_elmo_linear_amr_cs_score_filtering_0.5/i(5900)_epoch(3)_um_dev(39.73759153783564)_m_dev(40.18339276617422)_seed(12)"
    # save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:42:34_esim_elmo_cs_score_filtering_0.7/i(1300)_epoch(4)_um_dev(32.55695687550855)_m_dev(32.42995415180846)_seed(12)"
    batch_size = 32

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Only examples whose 'cs_score' is at least 0.7 pass the filter.
    csnli_dataset_reader = CNLIReader(token_indexers=token_indexers,
                                      example_filter=lambda x: float(x['cs_score']) >= 0.7)

    # mnli_train_data_path = config.DATA_ROOT / "mnli/multinli_1.0_train.jsonl"
    mnli_m_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_mdev.jsonl.cs"
    mnli_um_dev_data_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_umdev.jsonl.cs"

    # mnli_train_instances = csnli_dataset_reader.read(mnli_train_data_path)
    mnli_m_dev_instances = csnli_dataset_reader.read(mnli_m_dev_data_path)
    mnli_um_dev_instances = csnli_dataset_reader.read(mnli_um_dev_data_path)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300)

    # Deserialize onto the CPU: the model has not been moved yet (that happens
    # in model.to(device) below). Without map_location a checkpoint saved from
    # a GPU cannot be loaded on a CUDA-less host, defeating the CPU fallback.
    model.load_state_dict(torch.load(save_path, map_location="cpu"))

    model.display()
    model.to(device)

    criterion = nn.CrossEntropyLoss()

    eval_iter = biterator(mnli_m_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    m_dev_score, m_dev_loss = eval_model(model, eval_iter, criterion)

    eval_iter = biterator(mnli_um_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    um_dev_score, um_dev_loss = eval_model(model, eval_iter, criterion)

    print(f"Dev(M):{m_dev_score}/{m_dev_loss}")
    print(f"Dev(UM):{um_dev_score}/{um_dev_loss}")

Source File: simple_nnmodel.py From combine-FEVER-NSMN with MIT License

4 votes

def get_score_multihop(t_data_file, additional_file, model_path, item_key='prioritized_docids_aside', top_k=6):
    """Run the saved sentence-selection model over the additional sentence list.

    :param t_data_file: First argument forwarded to ``get_additional_list``.
    :param additional_file: Second argument forwarded to ``get_additional_list``.
    :param model_path: Path of the state dict restored via ``torch.load``.
    :param item_key: Forwarded to ``get_additional_list`` as ``item_key``.
    :param top_k: Forwarded to ``get_additional_list`` as ``top_k``.
    :return: Whatever ``hidden_eval`` returns for the additional sentence list.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = model_path
    print("Model From:", SAVE_PATH)

    additional_sentence_list = get_additional_list(t_data_file, additional_file, item_key=item_key, top_k=top_k)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    print("Additional Dev size:", len(additional_sentence_list))
    dev_instances = dev_fever_data_reader.read(additional_sentence_list)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # Deserialize onto the CPU: the model has not been moved yet (that happens
    # in model.to(device) below). Without map_location a checkpoint saved from
    # a GPU cannot be loaded on a CUDA-less host, defeating the CPU fallback.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    # NOTE(review): device_num is computed above but not passed as cuda_device
    # here; confirm whether the model moves batches to its own device.
    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1)
    additional_sentence_list = hidden_eval(model, eval_iter, additional_sentence_list)

    return additional_sentence_list

Source File: simple_nnmodel.py From combine-FEVER-NSMN with MIT License

4 votes

def pipeline_first_sent_selection_list(org_t_file, upstream_in_file, model_save_path, top_k):
    """Run the saved sentence-selection model over upstream results.

    :param org_t_file: First argument forwarded to ``get_full_list_from_list_d``.
    :param upstream_in_file: Upstream input forwarded to ``get_full_list_from_list_d``.
    :param model_save_path: Path of the state dict restored via ``torch.load``.
    :param top_k: Forwarded to ``get_full_list_from_list_d`` as ``top_k``.
    :return: Whatever ``hidden_eval`` returns for the evaluated data.
    """
    batch_size = 64
    lazy = True
    SAVE_PATH = model_save_path
    print("Model From:", SAVE_PATH)

    dev_upstream_file = upstream_in_file

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list_from_list_d(org_t_file, dev_upstream_file, pred=True, top_k=top_k)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    # Deserialize onto the CPU: the model has not been moved yet (that happens
    # in model.to(device) below). Without map_location a checkpoint saved from
    # a GPU cannot be loaded on a CUDA-less host, defeating the CPU fallback.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list

Source File: simple_nnmodel.py From combine-FEVER-NSMN with MIT License

4 votes

def pipeline_first_sent_selection(org_t_file, upstream_in_file, model_save_path, top_k):
    """Run a saved sentence-selection model over upstream candidates.

    Args:
        org_t_file: Forwarded to ``get_full_list`` as its first argument.
        upstream_in_file: Forwarded to ``get_full_list`` as its second argument.
        model_save_path: Checkpoint path handed to ``torch.load``.
        top_k: Forwarded to ``get_full_list`` as ``top_k``.

    Returns:
        Whatever ``hidden_eval`` returns for the candidate list.
    """
    batch_size = 64
    lazy = True
    print("Model From:", model_save_path)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list(org_t_file, upstream_in_file, pred=True, top_k=top_k)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important: the label namespace has to be registered before indexing.
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    dev_biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    # Device index for the iterator: -1 on CPU, 0 on CUDA.
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)

    model.load_state_dict(torch.load(model_save_path))
    model.display()
    model.to(device)

    # Hand the device index to the iterator so batches are placed on the same
    # device as the model, matching the other evaluation entry points.
    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    dev_sent_full_list = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    return dev_sent_full_list

Source File: simple_nnmodel.py From combine-FEVER-NSMN with MIT License

4 votes

def eval_fever():
    """Evaluate a saved model on the matched and mismatched MNLI dev files.

    Loads the checkpoint at the hard-coded ``save_path``, reads both dev files
    through ``CNLIReader`` (keeping only examples whose ``cs_score`` is at
    least 0.7) and prints the score/loss pair that ``eval_model`` returns for
    each split.
    """
    # Other checkpoints that were tried with this function:
    # "/home/easonnie/projects/MiscEnc/saved_models/06-07-21:58:06_esim_elmo/i(60900)_epoch(4)_um_dev(80.03458096013019)_m_dev(79.174732552216)_seed(12)"
    # "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:42:34_esim_elmo_cs_score_filtering_0.7/i(1300)_epoch(4)_um_dev(32.55695687550855)_m_dev(32.42995415180846)_seed(12)"
    save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:40:01_esim_elmo_linear_amr_cs_score_filtering_0.5/i(5900)_epoch(3)_um_dev(39.73759153783564)_m_dev(40.18339276617422)_seed(12)"

    # Each token is indexed twice: as a single id and as ELMo character ids.
    indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    def keep_example(example):
        # Only examples with a cs_score of at least 0.7 are evaluated.
        return float(example['cs_score']) >= 0.7

    reader = CNLIReader(token_indexers=indexers, example_filter=keep_example)

    # Read the matched split first, then the mismatched one.
    mnli_m_dev_instances = reader.read(config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_mdev.jsonl.cs")
    mnli_um_dev_instances = reader.read(config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_umdev.jsonl.cs")

    # Vocabulary and pretrained embedding weights come from the cache.
    batcher = BasicIterator(batch_size=32)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    batcher.index_with(vocab)

    # Pick the device; the iterator takes -1 for CPU and 0 for CUDA.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu", index=0)
    device_num = 0 if use_cuda else -1

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
    )
    model.load_state_dict(torch.load(save_path))
    model.display()
    model.to(device)

    criterion = nn.CrossEntropyLoss()

    # Matched split.
    m_dev_score, m_dev_loss = eval_model(
        model,
        batcher(mnli_m_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num),
        criterion,
    )

    # Mismatched split.
    um_dev_score, um_dev_loss = eval_model(
        model,
        batcher(mnli_um_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num),
        criterion,
    )

    print(f"Dev(M):{m_dev_score}/{m_dev_loss}")
    print(f"Dev(UM):{um_dev_score}/{um_dev_loss}")

Source File: nn_doc_model.py From combine-FEVER-NSMN with MIT License

4 votes

def get_score_multihop(t_data_file, additional_file, model_path, item_key='prioritized_docids_aside', top_k=6):
    """Score the additional sentence candidates with a saved selection model.

    Args:
        t_data_file: Forwarded to ``get_additional_list`` as its first argument.
        additional_file: Forwarded to ``get_additional_list`` as its second argument.
        model_path: Checkpoint path handed to ``torch.load``.
        item_key: Forwarded to ``get_additional_list`` as ``item_key``.
        top_k: Forwarded to ``get_additional_list`` as ``top_k``.

    Returns:
        Whatever ``hidden_eval`` returns for the additional candidate list.
    """
    print("Model From:", model_path)

    candidates = get_additional_list(t_data_file, additional_file, item_key=item_key, top_k=top_k)

    # Each token is indexed twice: as a single id and as ELMo character ids.
    indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }
    reader = SSelectorReader(token_indexers=indexers, lazy=True)

    print("Additional Dev size:", len(candidates))
    instances = reader.read(candidates)

    # Vocabulary and pretrained embedding weights come from the cache.
    batcher = BasicIterator(batch_size=64)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")

    # Important: register the selection labels before indexing.
    for label in ("true", "false", "hidden"):
        vocab.add_token_to_namespace(label, namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    batcher.index_with(vocab)

    # Pick the device; the iterator takes -1 for CPU and 0 for CUDA.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu", index=0)
    device_num = 0 if use_cuda else -1

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
        max_l=300,
        num_of_class=2,
    )
    model.load_state_dict(torch.load(model_path))
    model.display()
    model.to(device)

    batches = batcher(instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    return hidden_eval(model, batches, candidates)

Source File: nn_doc_model.py From combine-FEVER-NSMN with MIT License

4 votes

def pipeline_first_sent_selection(org_t_file, upstream_in_file, model_save_path):
    """Run a saved sentence-selection model over upstream candidates.

    Args:
        org_t_file: Forwarded to ``get_full_list`` as its first argument.
        upstream_in_file: Forwarded to ``get_full_list`` as its second argument.
        model_save_path: Checkpoint path handed to ``torch.load``.

    Returns:
        Whatever ``hidden_eval`` returns for the candidate list.
    """
    print("Model From:", model_save_path)

    # Each token is indexed twice: as a single id and as ELMo character ids.
    indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }
    reader = SSelectorReader(token_indexers=indexers, lazy=True)

    candidates = get_full_list(org_t_file, upstream_in_file, pred=True)
    print("Dev size:", len(candidates))
    instances = reader.read(candidates)

    # Vocabulary and pretrained embedding weights come from the cache.
    batcher = BasicIterator(batch_size=128)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")

    # Important: register the selection labels before indexing.
    for label in ("true", "false", "hidden"):
        vocab.add_token_to_namespace(label, namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    batcher.index_with(vocab)

    # Pick the device; the iterator takes -1 for CPU and 0 for CUDA.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu", index=0)
    device_num = 0 if use_cuda else -1

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
        max_l=300,
        num_of_class=2,
    )
    model.load_state_dict(torch.load(model_save_path))
    model.display()
    model.to(device)

    batches = batcher(instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    return hidden_eval(model, batches, candidates)

Source File: nn_doc_model.py From combine-FEVER-NSMN with MIT License

4 votes

def pipeline_first_sent_selection(org_t_file, upstream_in_file, model_save_path):
    """Score upstream sentence candidates with a saved selection model.

    Args:
        org_t_file: First positional argument given to ``get_full_list``.
        upstream_in_file: Second positional argument given to ``get_full_list``.
        model_save_path: Location of the checkpoint read by ``torch.load``.

    Returns:
        The result of ``hidden_eval`` on the candidate list.
    """
    print("Model From:", model_save_path)

    # Tokens are indexed both as single ids and as ELMo character ids.
    reader = SSelectorReader(
        token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='tokens'),
            'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
        },
        lazy=True,
    )

    upstream_items = get_full_list(org_t_file, upstream_in_file, pred=True)
    print("Dev size:", len(upstream_items))
    lazy_instances = reader.read(upstream_items)

    # Load the cached vocabulary together with the embedding weights.
    iterator = BasicIterator(batch_size=128)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")

    # Important: the selection labels must exist before indexing.
    label_namespace = "selection_labels"
    for label in ("true", "false", "hidden"):
        vocab.add_token_to_namespace(label, namespace=label_namespace)
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace=label_namespace)
    vocab.get_index_to_token_vocabulary(label_namespace)

    print(vocab.get_token_to_index_vocabulary(label_namespace))
    print(vocab.get_vocab_size('tokens'))

    iterator.index_with(vocab)

    # Build the model and move it to the chosen device.
    on_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if on_gpu else "cpu", index=0)
    device_num = 0 if on_gpu else -1  # iterator convention: -1 means CPU

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
        max_l=300,
        num_of_class=2,
    )
    model.load_state_dict(torch.load(model_save_path))
    model.display()
    model.to(device)

    eval_batches = iterator(lazy_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    return hidden_eval(model, eval_batches, upstream_items)

Source File: esim.py From combine-FEVER-NSMN with MIT License

4 votes

def eval_fever():
    """Report dev score and loss of a saved model on both MNLI dev splits.

    The checkpoint path is hard-coded. Examples are read with ``CNLIReader``
    and kept only when their ``cs_score`` is at least 0.7. The matched split is
    evaluated first, then the mismatched one, and both results are printed.
    """
    # Other checkpoints that were tried with this function:
    # "/home/easonnie/projects/MiscEnc/saved_models/06-07-21:58:06_esim_elmo/i(60900)_epoch(4)_um_dev(80.03458096013019)_m_dev(79.174732552216)_seed(12)"
    # "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:42:34_esim_elmo_cs_score_filtering_0.7/i(1300)_epoch(4)_um_dev(32.55695687550855)_m_dev(32.42995415180846)_seed(12)"
    save_path = "/home/easonnie/projects/MiscEnc/saved_models/07-02-14:40:01_esim_elmo_linear_amr_cs_score_filtering_0.5/i(5900)_epoch(3)_um_dev(39.73759153783564)_m_dev(40.18339276617422)_seed(12)"

    def passes_cs_filter(example):
        # Keep an example only when its cs_score reaches 0.7.
        return float(example['cs_score']) >= 0.7

    # Tokens are indexed both as single ids and as ELMo character ids.
    reader = CNLIReader(
        token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='tokens'),
            'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
        },
        example_filter=passes_cs_filter,
    )

    matched_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_mdev.jsonl.cs"
    mismatched_path = config.DATA_ROOT / "amrs/mnli_amr_ln/mnli_umdev.jsonl.cs"

    mnli_m_dev_instances = reader.read(matched_path)
    mnli_um_dev_instances = reader.read(mismatched_path)

    # Load the cached vocabulary together with the embedding weights.
    iterator = BasicIterator(batch_size=32)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    iterator.index_with(vocab)

    # Build the model and move it to the chosen device.
    on_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if on_gpu else "cpu", index=0)
    device_num = 0 if on_gpu else -1  # iterator convention: -1 means CPU

    model = Model(
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        embedding_dim=300,
    )
    model.load_state_dict(torch.load(save_path))
    model.display()
    model.to(device)

    criterion = nn.CrossEntropyLoss()

    matched_batches = iterator(mnli_m_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    m_dev_score, m_dev_loss = eval_model(model, matched_batches, criterion)

    mismatched_batches = iterator(mnli_um_dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    um_dev_score, um_dev_loss = eval_model(model, mismatched_batches, criterion)

    print(f"Dev(M):{m_dev_score}/{m_dev_loss}")
    print(f"Dev(UM):{um_dev_score}/{um_dev_loss}")

Source File: elmo_indexer_test.py From allennlp with Apache License 2.0

4 votes

def test_bos_to_char_ids(self):
        # Indexing the single token "<S>" must yield one 50-id row:
        # three leading ids followed by 47 copies of 261.
        bos_token = Token("<S>")
        indexed = ELMoTokenCharactersIndexer().tokens_to_indices([bos_token], Vocabulary())
        expected_row = [259, 257, 260] + [261] * 47
        assert indexed == {"elmo_tokens": [expected_row]}

Source File: elmo_indexer_test.py From allennlp with Apache License 2.0

4 votes

def test_eos_to_char_ids(self):
        # Indexing the single token "</S>" must yield one 50-id row:
        # three leading ids followed by 47 copies of 261.
        eos_token = Token("</S>")
        indexed = ELMoTokenCharactersIndexer().tokens_to_indices([eos_token], Vocabulary())
        expected_row = [259, 258, 260] + [261] * 47
        assert indexed == {"elmo_tokens": [expected_row]}

Source File: elmo_indexer_test.py From allennlp with Apache License 2.0

4 votes

def test_unicode_to_char_ids(self):
        # A token made of chr(256) followed by "t" must yield one 50-id row:
        # five leading ids followed by 45 copies of 261.
        unicode_token = Token(chr(256) + "t")
        indexed = ELMoTokenCharactersIndexer().tokens_to_indices([unicode_token], Vocabulary())
        expected_row = [259, 197, 129, 117, 260] + [261] * 45
        assert indexed == {"elmo_tokens": [expected_row]}

Source File: elmo_indexer_test.py From allennlp with Apache License 2.0

4 votes

def test_elmo_indexer_with_additional_tokens(self):
        # With "<first>" registered through tokens_to_add, indexing that token
        # must yield one 50-id row: 259, 2, 260, then 47 copies of 261.
        extra_tokens = {"<first>": 1}
        indexer = ELMoTokenCharactersIndexer(tokens_to_add=extra_tokens)
        indexed = indexer.tokens_to_indices([Token("<first>")], Vocabulary())
        expected_rows = [[259, 2, 260] + [261] * 47]
        assert indexed["elmo_tokens"] == expected_rows

Python allennlp.data.token_indexers.ELMoTokenCharactersIndexer() Examples