Python allennlp.data.Vocabulary() Examples

The following are 30 code examples of allennlp.data.Vocabulary(), drawn from open-source projects. The project and source file each example comes from is noted above it. You may also want to check out the other available functions and classes of the allennlp.data module.
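Before the examples, here is a minimal sketch (not taken from any of the projects below) of the core Vocabulary API they all rely on: tokens are added to named namespaces and mapped to integer indices. It assumes AllenNLP's defaults, where the "tokens" namespace reserves index 0 for padding and index 1 for @@UNKNOWN@@, while "labels" is a non-padded namespace.

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("word", namespace="tokens")
vocab.add_token_to_namespace("N", namespace="labels")

# "tokens" is padded: index 0 is padding and index 1 is @@UNKNOWN@@,
# so the first real token lands at index 2.
assert vocab.get_token_index("word", namespace="tokens") == 2
assert vocab.get_vocab_size("tokens") == 3

# "labels" is non-padded by default, so it contains only what was added.
assert vocab.get_vocab_size("labels") == 1
assert vocab.get_token_from_index(0, namespace="labels") == "N"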
Example #1
Source File: bert_text_classifier.py    From scibert with Apache License 2.0
def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 verbose_metrics: bool = False,
                 dropout: float = 0.2,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 ) -> None:
        super(TextClassifier, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.dropout = torch.nn.Dropout(dropout)
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.classifier_feedforward = torch.nn.Linear(self.text_field_embedder.get_output_dim(), self.num_classes)

        self.label_accuracy = CategoricalAccuracy()
        self.label_f1_metrics = {}

        self.verbose_metrics = verbose_metrics

        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = F1Measure(positive_label=i)
        self.loss = torch.nn.CrossEntropyLoss()

        initializer(self) 
Example #2
Source File: embedding_test.py    From allennlp with Apache License 2.0
def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("the")
        vocab.add_token_to_namespace("a")
        params = Params(
            {
                "pretrained_file": str(
                    self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz"
                ),
                "embedding_dim": 300,
                "projection_dim": 20,
            }
        )
        embedding_layer = Embedding.from_params(params, vocab=vocab)
        input_tensor = torch.LongTensor([[3, 2, 1, 0]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20) 
Example #3
Source File: embedding_test.py    From allennlp with Apache License 2.0
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
        vocab.add_token_to_namespace(unicode_space)
        embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
        with gzip.open(embeddings_filename, "wb") as embeddings_file:
            embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
            embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
        params = Params({"pretrained_file": embeddings_filename, "embedding_dim": 3})
        embedding_layer = Embedding.from_params(params, vocab=vocab)
        word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
        assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
        word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
        assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
        word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
        assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0])) 
Example #4
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_indices_to_tokens(self):
        allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
        indexer_max_length = PretrainedTransformerIndexer(
            model_name="bert-base-uncased", max_length=4
        )
        indexer_no_max_length = PretrainedTransformerIndexer(model_name="bert-base-uncased")
        string_no_specials = "AllenNLP is great"

        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer_no_max_length.tokens_to_indices(allennlp_tokens, vocab)
        tokens_from_indices = indexer_no_max_length.indices_to_tokens(indexed, vocab)

        self._assert_tokens_equal(allennlp_tokens, tokens_from_indices)

        indexed = indexer_max_length.tokens_to_indices(allennlp_tokens, vocab)
        tokens_from_indices = indexer_max_length.indices_to_tokens(indexed, vocab)

        # For now we are not removing special tokens introduced from max_length
        sep_cls = [allennlp_tokens[-1], allennlp_tokens[0]]
        expected = (
            allennlp_tokens[:3] + sep_cls + allennlp_tokens[3:5] + sep_cls + allennlp_tokens[5:]
        )

        self._assert_tokens_equal(expected, tokens_from_indices) 
Example #5
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_long_sequence_splitting(self):
        tokenizer = cached_transformers.get_tokenizer("bert-base-uncased")
        allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
        indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)
        string_specials = "[CLS] AllenNLP is great [SEP]"
        string_no_specials = "AllenNLP is great"
        tokens = tokenizer.tokenize(string_specials)
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        assert len(expected_ids) == 7  # just to make sure it's what we're expecting
        cls_id, sep_id = expected_ids[0], expected_ids[-1]
        expected_ids = (
            expected_ids[:3]
            + [sep_id, cls_id]
            + expected_ids[3:5]
            + [sep_id, cls_id]
            + expected_ids[5:]
        )

        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        assert indexed["token_ids"] == expected_ids
        assert indexed["segment_concat_mask"] == [True] * len(expected_ids)
        assert indexed["mask"] == [True] * 7  # original length 
Example #6
Source File: embedding_test.py    From allennlp with Apache License 2.0
def test_embedding_vocab_extension_with_specified_namespace(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word1", "tokens_a")
        vocab.add_token_to_namespace("word2", "tokens_a")
        embedding_params = Params({"vocab_namespace": "tokens_a", "embedding_dim": 10})
        embedder = Embedding.from_params(embedding_params, vocab=vocab)
        original_weight = embedder.weight

        assert original_weight.shape[0] == 4

        extension_counter = {"tokens_a": {"word3": 1}}
        vocab._extend(extension_counter)

        embedder.extend_vocab(vocab, "tokens_a")  # specified namespace

        extended_weight = embedder.weight
        assert extended_weight.shape[0] == 5
        assert torch.all(extended_weight[:4, :] == original_weight[:4, :]) 
Example #7
Source File: embedding_test.py    From allennlp with Apache License 2.0
def test_embedding_vocab_extension_with_default_namespace(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word1")
        vocab.add_token_to_namespace("word2")
        embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
        embedder = Embedding.from_params(embedding_params, vocab=vocab)
        original_weight = embedder.weight

        assert original_weight.shape[0] == 4

        extension_counter = {"tokens": {"word3": 1}}
        vocab._extend(extension_counter)

        embedder.extend_vocab(vocab)  # default namespace

        extended_weight = embedder.weight
        assert extended_weight.shape[0] == 5
        assert torch.all(extended_weight[:4, :] == original_weight[:4, :]) 
Example #8
Source File: adjacency_field_test.py    From allennlp with Apache License 2.0
def test_adjacency_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("a", namespace="labels")
        vocab.add_token_to_namespace("b", namespace="labels")
        vocab.add_token_to_namespace("c", namespace="labels")

        labels = ["a", "b"]
        indices = [(0, 1), (2, 1)]
        adjacency_field = AdjacencyField(indices, self.text, labels)
        adjacency_field.index(vocab)
        tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths())
        numpy.testing.assert_equal(
            tensor.numpy(),
            numpy.array(
                [
                    [-1, 0, -1, -1, -1],
                    [-1, -1, -1, -1, -1],
                    [-1, 1, -1, -1, -1],
                    [-1, -1, -1, -1, -1],
                    [-1, -1, -1, -1, -1],
                ]
            ),
        ) 
Example #9
Source File: bag_of_word_counts_token_embedder.py    From allennlp with Apache License 2.0
def __init__(
        self,
        vocab: Vocabulary,
        vocab_namespace: str = "tokens",
        projection_dim: int = None,
        ignore_oov: bool = False,
    ) -> None:
        super().__init__()
        self.vocab = vocab
        self.vocab_size = vocab.get_vocab_size(vocab_namespace)
        if projection_dim:
            self._projection = torch.nn.Linear(self.vocab_size, projection_dim)
        else:
            self._projection = None
        self._ignore_oov = ignore_oov
        oov_token = vocab._oov_token
        self._oov_idx = vocab.get_token_to_index_vocabulary(vocab_namespace).get(oov_token)
        if self._oov_idx is None:
            raise ConfigurationError(
                "OOV token does not exist in vocabulary namespace {}".format(vocab_namespace)
            )
        self.output_dim = projection_dim or self.vocab_size 
Example #10
Source File: embedding.py    From allennlp with Apache License 2.0
def _read_embeddings_from_hdf5(
    embeddings_filename: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Reads from a hdf5 formatted file. The embedding matrix is assumed to
    be keyed by 'embedding' and of size `(num_tokens, embedding_dim)`.
    """
    with h5py.File(embeddings_filename, "r") as fin:
        embeddings = fin["embedding"][...]

    if list(embeddings.shape) != [vocab.get_vocab_size(namespace), embedding_dim]:
        raise ConfigurationError(
            "Read shape {0} embeddings from the file, but expected {1}".format(
                list(embeddings.shape), [vocab.get_vocab_size(namespace), embedding_dim]
            )
        )

    return torch.FloatTensor(embeddings) 
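The docstring above fully specifies the expected file layout, so a compatible file can be produced with h5py alone. The snippet below is an illustrative sketch, not part of the original source; the file name and sizes are arbitrary.

import h5py
import numpy

num_tokens, embedding_dim = 25, 300  # hypothetical sizes
matrix = numpy.random.rand(num_tokens, embedding_dim).astype("float32")
with h5py.File("embeddings.hdf5", "w") as fout:
    # _read_embeddings_from_hdf5 expects a single dataset keyed by 'embedding'
    # with shape (num_tokens, embedding_dim).
    fout.create_dataset("embedding", data=matrix)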
Example #11
Source File: embedding_test.py    From allennlp with Apache License 2.0
def test_embedding_vocab_extension_without_stored_namespace(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word1", "tokens_a")
        vocab.add_token_to_namespace("word2", "tokens_a")
        embedding_params = Params({"vocab_namespace": "tokens_a", "embedding_dim": 10})
        embedder = Embedding.from_params(embedding_params, vocab=vocab)

        # Previous models won't have _vocab_namespace attribute. Force it to be None
        embedder._vocab_namespace = None
        original_weight = embedder.weight

        assert original_weight.shape[0] == 4

        extension_counter = {"tokens_a": {"word3": 1}}
        vocab._extend(extension_counter)

        embedder.extend_vocab(vocab, "tokens_a")  # specified namespace

        extended_weight = embedder.weight
        assert extended_weight.shape[0] == 5
        assert torch.all(extended_weight[:4, :] == original_weight[:4, :]) 
Example #12
Source File: crf_tagger.py    From didyprog with MIT License
def from_params(cls, vocab: Vocabulary, params: Params) -> 'CrfTagger':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
        encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
        label_namespace = params.pop("label_namespace", "labels")
        constraint_type = params.pop("constraint_type", None)
        dropout = params.pop("dropout", None)
        include_start_end_transitions = params.pop("include_start_end_transitions", True)
        initializer = InitializerApplicator.from_params(params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

        params.assert_empty(cls.__name__)

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   encoder=encoder,
                   label_namespace=label_namespace,
                   constraint_type=constraint_type,
                   dropout=dropout,
                   include_start_end_transitions=include_start_end_transitions,
                   initializer=initializer,
                   regularizer=regularizer) 
Example #13
Source File: embedding_test.py    From allennlp with Apache License 2.0
def test_embedding_constructed_directly_with_pretrained_file(self):

        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
        vocab.add_token_to_namespace(unicode_space)
        embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
        with gzip.open(embeddings_filename, "wb") as embeddings_file:
            embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
            embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))

        num_embeddings = vocab.get_vocab_size()
        embedding_layer = Embedding(
            embedding_dim=3,
            num_embeddings=num_embeddings,
            pretrained_file=embeddings_filename,
            vocab=vocab,
        )
        word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
        assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
        word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
        assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
        word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
        assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0])) 
Example #14
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_transformers_vocab_sizes(self):
        def check_vocab_size(model_name: str):
            namespace = "tags"
            tokenizer = cached_transformers.get_tokenizer(model_name)
            allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
            indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
            allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
            vocab = Vocabulary()
            # here we copy entire transformers vocab
            indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
            del indexed
            assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size

        check_vocab_size("roberta-base")
        check_vocab_size("bert-base-cased")
        check_vocab_size("xlm-mlm-ende-1024") 
Example #15
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
        tokenizer = cached_transformers.get_tokenizer("bert-base-cased")
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-cased", add_special_tokens=False
        )
        indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
        default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
        tokens = tokenizer.tokenize(default_format)
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        allennlp_tokens = allennlp_tokenizer.add_special_tokens(
            allennlp_tokenizer.tokenize("AllenNLP is great!"),
            allennlp_tokenizer.tokenize("Really it is!"),
        )
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        assert indexed["token_ids"] == expected_ids 
Example #16
Source File: character_token_indexer_test.py    From allennlp with Apache License 2.0
def test_start_and_end_tokens(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("s", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("t", namespace="characters")  # 6
        vocab.add_token_to_namespace("c", namespace="characters")  # 7
        vocab.add_token_to_namespace("<", namespace="characters")  # 8
        vocab.add_token_to_namespace(">", namespace="characters")  # 9
        vocab.add_token_to_namespace("/", namespace="characters")  # 10

        indexer = TokenCharactersIndexer(
            "characters", start_tokens=["<s>"], end_tokens=["</s>"], min_padding_length=1
        )
        indices = indexer.tokens_to_indices([Token("sentential")], vocab)
        assert indices == {
            "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
        } 
Example #17
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_as_array_produces_token_sequence_bert_uncased(self):
        tokenizer = cached_transformers.get_tokenizer("bert-base-uncased")
        allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
        indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
        string_specials = "[CLS] AllenNLP is great [SEP]"
        string_no_specials = "AllenNLP is great"
        tokens = tokenizer.tokenize(string_specials)
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        # tokens tokenized with our pretrained tokenizer have indices in them
        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        assert indexed["token_ids"] == expected_ids 
Example #18
Source File: make_vocab_test.py    From magnitude with MIT License
def test_make_vocab_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / u'existing'
        extended_serialization_dir = self.TEST_DIR / u'extended'
        existing_vocab_path = existing_serialization_dir / u'vocabulary'
        extended_vocab_path = extended_serialization_dir / u'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace(u'some_weird_token_1', namespace=u'tokens')
        vocab.add_token_to_namespace(u'some_weird_token_2', namespace=u'tokens')
        # If extend is False, it is the user's responsibility to make sure that dataset instances
        # will be indexable by the provided vocabulary. At least @@UNKNOWN@@ should be present in
        # any namespace that may see OOV entries during indexing.
        # The `tokens` namespace will see new words, but it already has the @@UNKNOWN@@ token;
        # the 'labels' namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
        vocab.add_token_to_namespace(u'N', namespace=u'labels')
        vocab.add_token_to_namespace(u'V', namespace=u'labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params[u'vocabulary'] = {}
        self.params[u'vocabulary'][u'directory_path'] = existing_vocab_path
        self.params[u'vocabulary'][u'extend'] = False
        make_vocab_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / u'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == u'@@UNKNOWN@@'
        assert tokens[1] == u'some_weird_token_1'
        assert tokens[2] == u'some_weird_token_2'
        assert len(tokens) == 3 
Example #19
Source File: test_elmo.py    From pytorch-fast-elmo with MIT License
def _sentences_to_ids(sentences):
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids'] 
Example #20
Source File: simple_overlap.py    From scitail with Apache License 2.0
def from_params(cls, vocab: Vocabulary, params: Params) -> 'SimpleOverlap':
        classifier = FeedForward.from_params(params.pop('classifier'))

        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None
                       else InitializerApplicator())

        return cls(vocab=vocab,
                   classifier=classifier,
                   initializer=initializer) 
Example #21
Source File: simple_overlap.py    From scitail with Apache License 2.0
def __init__(self, vocab: Vocabulary,
                 classifier: FeedForward,
                 initializer: InitializerApplicator = InitializerApplicator()) -> None:
        super(SimpleOverlap, self).__init__(vocab)
        self.linear_mlp = classifier
        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
        initializer(self) 
Example #22
Source File: tree_attention.py    From scitail with Apache License 2.0
def from_params(cls, vocab: Vocabulary, params: Params) -> 'TreeAttention':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)

        premise_encoder_params = params.pop("premise_encoder", None)
        premise_encoder = Seq2SeqEncoder.from_params(premise_encoder_params)

        attention_similarity = SimilarityFunction.from_params(params.pop('attention_similarity'))
        phrase_probability = FeedForward.from_params(params.pop('phrase_probability'))
        edge_probability = FeedForward.from_params(params.pop('edge_probability'))

        edge_embedding = Embedding.from_params(vocab, params.pop('edge_embedding'))
        use_encoding_for_node = params.pop('use_encoding_for_node')
        ignore_edges = params.pop('ignore_edges', False)

        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None
                       else InitializerApplicator())

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   phrase_probability=phrase_probability,
                   edge_probability=edge_probability,
                   premise_encoder=premise_encoder,
                   edge_embedding=edge_embedding,
                   use_encoding_for_node=use_encoding_for_node,
                   attention_similarity=attention_similarity,
                   ignore_edges=ignore_edges,
                   initializer=initializer) 
Example #23
Source File: elmo.py    From magnitude with MIT License
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    u"""
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens,
                          {u'character_ids': indexer})
        instance = Instance({u"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()[u'elmo'][u'character_ids'] 
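A short usage sketch for batch_to_ids (not from the original file): each token is encoded as a fixed-width sequence of 50 character ids, matching the ELMo character encoding used in the other examples.

sentences = [["AllenNLP", "is", "great"], ["Hello"]]
character_ids = batch_to_ids(sentences)
# (len(batch), max sentence length, max word length) -> (2, 3, 50)
assert character_ids.shape == (2, 3, 50)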
Example #24
Source File: pretrained_transformer_mismatched_indexer_test.py    From allennlp with Apache License 2.0
def test_long_sequence_splitting(self):
        tokenizer = cached_transformers.get_tokenizer("bert-base-uncased")
        indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased", max_length=4)
        text = ["AllenNLP", "is", "great"]
        tokens = tokenizer.tokenize(" ".join(["[CLS]"] + text + ["[SEP]"]))
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        assert len(expected_ids) == 7  # just to make sure it's what we're expecting
        cls_id, sep_id = expected_ids[0], expected_ids[-1]
        expected_ids = (
            expected_ids[:3]
            + [sep_id, cls_id]
            + expected_ids[3:5]
            + [sep_id, cls_id]
            + expected_ids[5:]
        )

        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices([Token(word) for word in text], vocab)

        assert indexed["token_ids"] == expected_ids
        # [CLS] allen ##nl [SEP] [CLS] ##p is [SEP] [CLS] great [SEP]
        assert indexed["segment_concat_mask"] == [True] * len(expected_ids)
        # allennlp is great
        assert indexed["mask"] == [True] * len(text)
        # [CLS] allen ##nl ##p is great [SEP]
        assert indexed["wordpiece_mask"] == [True] * 7 
Example #25
Source File: dry_run_test.py    From magnitude with MIT License
def test_dry_run_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / u'existing'
        extended_serialization_dir = self.TEST_DIR / u'extended'
        existing_vocab_path = existing_serialization_dir / u'vocabulary'
        extended_vocab_path = extended_serialization_dir / u'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace(u'some_weird_token_1', namespace=u'tokens')
        vocab.add_token_to_namespace(u'some_weird_token_2', namespace=u'tokens')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params[u'vocabulary'] = {}
        self.params[u'vocabulary'][u'directory_path'] = existing_vocab_path
        self.params[u'vocabulary'][u'extend'] = True
        self.params[u'vocabulary'][u'min_count'] = {u"tokens" : 3}
        dry_run_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == set([u'labels.txt', u'non_padded_namespaces.txt', u'tokens.txt'])

        with open(extended_vocab_path / u'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == u'@@UNKNOWN@@'
        assert tokens[1] == u'some_weird_token_1'
        assert tokens[2] == u'some_weird_token_2'

        tokens.sort()
        assert tokens == [u'.', u'@@UNKNOWN@@', u'animals', u'are',
                          u'some_weird_token_1', u'some_weird_token_2']

        with open(extended_vocab_path / u'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == [u'N', u'V'] 
Example #26
Source File: elmo_indexer_test.py    From allennlp with Apache License 2.0
def test_elmo_empty_token_list(self):
        # Basic test
        indexer = ELMoTokenCharactersIndexer()
        assert {"elmo_tokens": []} == indexer.get_empty_token_list()
        # Real world test
        indexer = {"elmo": indexer}
        tokens_1 = TextField([Token("Apple")], indexer)
        targets_1 = ListField([TextField([Token("Apple")], indexer)])
        tokens_2 = TextField([Token("Screen"), Token("device")], indexer)
        targets_2 = ListField(
            [TextField([Token("Screen")], indexer), TextField([Token("Device")], indexer)]
        )
        instance_1 = Instance({"tokens": tokens_1, "targets": targets_1})
        instance_2 = Instance({"tokens": tokens_2, "targets": targets_2})
        a_batch = Batch([instance_1, instance_2])
        a_batch.index_instances(Vocabulary())
        batch_tensor = a_batch.as_tensor_dict()
        elmo_target_token_indices = batch_tensor["targets"]["elmo"]["elmo_tokens"]
        # The TextField that is empty should have been created using the
        # `get_empty_token_list` and then padded with zeros.
        empty_target = elmo_target_token_indices[0][1].numpy()
        np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
        non_empty_targets = [
            elmo_target_token_indices[0][0],
            elmo_target_token_indices[1][0],
            elmo_target_token_indices[1][1],
        ]
        for non_empty_target in non_empty_targets:
            with pytest.raises(AssertionError):
                np.testing.assert_array_equal(np.zeros((1, 50)), non_empty_target) 
Example #27
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_transformers_vocabs_added_correctly(self):
        namespace, model_name = "tags", "roberta-base"
        tokenizer = cached_transformers.get_tokenizer(model_name)
        allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
        indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
        allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
        vocab = Vocabulary()
        # here we copy entire transformers vocab
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        del indexed
        assert vocab.get_token_to_index_vocabulary(namespace=namespace) == tokenizer.encoder 
Example #28
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_as_array_produces_token_sequence_bert_cased(self):
        tokenizer = cached_transformers.get_tokenizer("bert-base-cased")
        allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
        indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
        string_specials = "[CLS] AllenNLP is great [SEP]"
        string_no_specials = "AllenNLP is great"
        tokens = tokenizer.tokenize(string_specials)
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        # tokens tokenized with our pretrained tokenizer have indices in them
        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        assert indexed["token_ids"] == expected_ids 
Example #29
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_as_array_produces_token_sequence_roberta(self):
        tokenizer = cached_transformers.get_tokenizer("roberta-base")
        allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
        indexer = PretrainedTransformerIndexer(model_name="roberta-base")
        string_specials = "<s> AllenNLP is great </s>"
        string_no_specials = "AllenNLP is great"
        tokens = tokenizer.tokenize(string_specials)
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        # tokens tokenized with our pretrained tokenizer have indices in them
        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        assert indexed["token_ids"] == expected_ids 
Example #30
Source File: pretrained_transformer_indexer_test.py    From allennlp with Apache License 2.0
def test_mask(self):
        # We try these models, because
        #  - BERT pads tokens with 0
        #  - RoBERTa pads tokens with 1
        #  - GPT2 has no padding token, so we choose 0
        for model in ["bert-base-uncased", "roberta-base", "gpt2"]:
            allennlp_tokenizer = PretrainedTransformerTokenizer(model)
            indexer = PretrainedTransformerIndexer(model_name=model)
            string_no_specials = "AllenNLP is great"
            allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
            vocab = Vocabulary()
            indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
            expected_masks = [True] * len(indexed["token_ids"])
            assert indexed["mask"] == expected_masks
            max_length = 10
            padding_lengths = {key: max_length for key in indexed.keys()}
            padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
            padding_length = max_length - len(indexed["mask"])
            expected_masks = expected_masks + ([False] * padding_length)
            assert len(padded_tokens["mask"]) == max_length
            assert padded_tokens["mask"].tolist() == expected_masks

            assert len(padded_tokens["token_ids"]) == max_length
            pad_token_id = allennlp_tokenizer.tokenizer.pad_token_id
            if pad_token_id is None:
                pad_token_id = 0
            padding_suffix = [pad_token_id] * padding_length
            assert padded_tokens["token_ids"][-padding_length:].tolist() == padding_suffix