Python allennlp.data.Vocabulary() Examples
The following are 30 code examples of allennlp.data.Vocabulary(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data, or try the search function.
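Most of the examples below share one pattern: construct a Vocabulary, register tokens in a namespace, then look up sizes, indices, and tokens. Here is a minimal, illustrative sketch of that pattern before the full examples; the token strings and the "labels" namespace are placeholders chosen for this sketch, not taken from any particular example.

from allennlp.data import Vocabulary

# Build an empty vocabulary and register tokens in a namespace.
# Token strings and the "labels" namespace are illustrative only.
vocab = Vocabulary()
vocab.add_token_to_namespace("positive", namespace="labels")
vocab.add_token_to_namespace("negative", namespace="labels")

# Namespaces ending in "labels" are non-padded by default, so only the
# two added tokens are present.
assert vocab.get_vocab_size("labels") == 2

# Map between tokens and integer indices within the namespace.
index = vocab.get_token_index("positive", namespace="labels")
assert vocab.get_token_from_index(index, namespace="labels") == "positive"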
Example #1
Source File: bert_text_classifier.py From scibert with Apache License 2.0 | 6 votes |
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             verbose_metrics: bool = False,
             dropout: float = 0.2,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             ) -> None:
    super(TextClassifier, self).__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.dropout = torch.nn.Dropout(dropout)
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.classifier_feedforward = torch.nn.Linear(self.text_field_embedder.get_output_dim(), self.num_classes)

    self.label_accuracy = CategoricalAccuracy()
    self.label_f1_metrics = {}
    self.verbose_metrics = verbose_metrics
    for i in range(self.num_classes):
        self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = F1Measure(positive_label=i)
    self.loss = torch.nn.CrossEntropyLoss()

    initializer(self)
Example #2
Source File: embedding_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_forward_works_with_projection_layer(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("the")
    vocab.add_token_to_namespace("a")
    params = Params(
        {
            "pretrained_file": str(
                self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz"
            ),
            "embedding_dim": 300,
            "projection_dim": 20,
        }
    )
    embedding_layer = Embedding.from_params(params, vocab=vocab)
    input_tensor = torch.LongTensor([[3, 2, 1, 0]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 4, 20)

    input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 4, 20)
Example #3
Source File: embedding_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
    params = Params({"pretrained_file": embeddings_filename, "embedding_dim": 3})
    embedding_layer = Embedding.from_params(params, vocab=vocab)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
    assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
Example #4
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_indices_to_tokens(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer_max_length = PretrainedTransformerIndexer(
        model_name="bert-base-uncased", max_length=4
    )
    indexer_no_max_length = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    string_no_specials = "AllenNLP is great"

    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer_no_max_length.tokens_to_indices(allennlp_tokens, vocab)
    tokens_from_indices = indexer_no_max_length.indices_to_tokens(indexed, vocab)

    self._assert_tokens_equal(allennlp_tokens, tokens_from_indices)

    indexed = indexer_max_length.tokens_to_indices(allennlp_tokens, vocab)
    tokens_from_indices = indexer_max_length.indices_to_tokens(indexed, vocab)

    # For now we are not removing special tokens introduced from max_length
    sep_cls = [allennlp_tokens[-1], allennlp_tokens[0]]
    expected = (
        allennlp_tokens[:3] + sep_cls + allennlp_tokens[3:5] + sep_cls + allennlp_tokens[5:]
    )

    self._assert_tokens_equal(expected, tokens_from_indices)
Example #5
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_long_sequence_splitting(self):
    tokenizer = cached_transformers.get_tokenizer("bert-base-uncased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    assert len(expected_ids) == 7  # just to make sure it's what we're expecting
    cls_id, sep_id = expected_ids[0], expected_ids[-1]
    expected_ids = (
        expected_ids[:3]
        + [sep_id, cls_id]
        + expected_ids[3:5]
        + [sep_id, cls_id]
        + expected_ids[5:]
    )

    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
    assert indexed["segment_concat_mask"] == [True] * len(expected_ids)
    assert indexed["mask"] == [True] * 7  # original length
Example #6
Source File: embedding_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_embedding_vocab_extension_with_specified_namespace(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1", "tokens_a")
    vocab.add_token_to_namespace("word2", "tokens_a")
    embedding_params = Params({"vocab_namespace": "tokens_a", "embedding_dim": 10})
    embedder = Embedding.from_params(embedding_params, vocab=vocab)
    original_weight = embedder.weight

    assert original_weight.shape[0] == 4

    extension_counter = {"tokens_a": {"word3": 1}}
    vocab._extend(extension_counter)

    embedder.extend_vocab(vocab, "tokens_a")  # specified namespace

    extended_weight = embedder.weight
    assert extended_weight.shape[0] == 5
    assert torch.all(extended_weight[:4, :] == original_weight[:4, :])
Example #7
Source File: embedding_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_embedding_vocab_extension_with_default_namespace(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1")
    vocab.add_token_to_namespace("word2")
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_params(embedding_params, vocab=vocab)
    original_weight = embedder.weight

    assert original_weight.shape[0] == 4

    extension_counter = {"tokens": {"word3": 1}}
    vocab._extend(extension_counter)

    embedder.extend_vocab(vocab)  # default namespace

    extended_weight = embedder.weight
    assert extended_weight.shape[0] == 5
    assert torch.all(extended_weight[:4, :] == original_weight[:4, :])
Example #8
Source File: adjacency_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_adjacency_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("a", namespace="labels")
    vocab.add_token_to_namespace("b", namespace="labels")
    vocab.add_token_to_namespace("c", namespace="labels")
    labels = ["a", "b"]
    indices = [(0, 1), (2, 1)]
    adjacency_field = AdjacencyField(indices, self.text, labels)
    adjacency_field.index(vocab)
    tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths())
    numpy.testing.assert_equal(
        tensor.numpy(),
        numpy.array(
            [
                [-1, 0, -1, -1, -1],
                [-1, -1, -1, -1, -1],
                [-1, 1, -1, -1, -1],
                [-1, -1, -1, -1, -1],
                [-1, -1, -1, -1, -1],
            ]
        ),
    )
Example #9
Source File: bag_of_word_counts_token_embedder.py From allennlp with Apache License 2.0 | 6 votes |
def __init__(
    self,
    vocab: Vocabulary,
    vocab_namespace: str = "tokens",
    projection_dim: int = None,
    ignore_oov: bool = False,
) -> None:
    super().__init__()
    self.vocab = vocab
    self.vocab_size = vocab.get_vocab_size(vocab_namespace)
    if projection_dim:
        self._projection = torch.nn.Linear(self.vocab_size, projection_dim)
    else:
        self._projection = None
    self._ignore_oov = ignore_oov
    oov_token = vocab._oov_token
    self._oov_idx = vocab.get_token_to_index_vocabulary(vocab_namespace).get(oov_token)
    if self._oov_idx is None:
        raise ConfigurationError(
            "OOV token does not exist in vocabulary namespace {}".format(vocab_namespace)
        )
    self.output_dim = projection_dim or self.vocab_size
Example #10
Source File: embedding.py From allennlp with Apache License 2.0 | 6 votes |
def _read_embeddings_from_hdf5(
    embeddings_filename: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Reads from a hdf5 formatted file. The embedding matrix is assumed to
    be keyed by 'embedding' and of size `(num_tokens, embedding_dim)`.
    """
    with h5py.File(embeddings_filename, "r") as fin:
        embeddings = fin["embedding"][...]

    if list(embeddings.shape) != [vocab.get_vocab_size(namespace), embedding_dim]:
        raise ConfigurationError(
            "Read shape {0} embeddings from the file, but expected {1}".format(
                list(embeddings.shape), [vocab.get_vocab_size(namespace), embedding_dim]
            )
        )

    return torch.FloatTensor(embeddings)
Example #11
Source File: embedding_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_embedding_vocab_extension_without_stored_namespace(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1", "tokens_a")
    vocab.add_token_to_namespace("word2", "tokens_a")
    embedding_params = Params({"vocab_namespace": "tokens_a", "embedding_dim": 10})
    embedder = Embedding.from_params(embedding_params, vocab=vocab)

    # Previous models won't have _vocab_namespace attribute. Force it to be None
    embedder._vocab_namespace = None
    original_weight = embedder.weight

    assert original_weight.shape[0] == 4

    extension_counter = {"tokens_a": {"word3": 1}}
    vocab._extend(extension_counter)

    embedder.extend_vocab(vocab, "tokens_a")  # specified namespace

    extended_weight = embedder.weight
    assert extended_weight.shape[0] == 5
    assert torch.all(extended_weight[:4, :] == original_weight[:4, :])
Example #12
Source File: crf_tagger.py From didyprog with MIT License | 6 votes |
def from_params(cls, vocab: Vocabulary, params: Params) -> 'CrfTagger':
    embedder_params = params.pop("text_field_embedder")
    text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
    label_namespace = params.pop("label_namespace", "labels")
    constraint_type = params.pop("constraint_type", None)
    dropout = params.pop("dropout", None)
    include_start_end_transitions = params.pop("include_start_end_transitions", True)
    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

    params.assert_empty(cls.__name__)

    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               encoder=encoder,
               label_namespace=label_namespace,
               constraint_type=constraint_type,
               dropout=dropout,
               include_start_end_transitions=include_start_end_transitions,
               initializer=initializer,
               regularizer=regularizer)
Example #13
Source File: embedding_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_embedding_constructed_directly_with_pretrained_file(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))

    num_embeddings = vocab.get_vocab_size()
    embedding_layer = Embedding(
        embedding_dim=3,
        num_embeddings=num_embeddings,
        pretrained_file=embeddings_filename,
        vocab=vocab,
    )
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
    assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
Example #14
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_transformers_vocab_sizes(self):
    def check_vocab_size(model_name: str):
        namespace = "tags"
        tokenizer = cached_transformers.get_tokenizer(model_name)
        allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
        indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
        allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
        vocab = Vocabulary()  # here we copy entire transformers vocab
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        del indexed
        assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size

    check_vocab_size("roberta-base")
    check_vocab_size("bert-base-cased")
    check_vocab_size("xlm-mlm-ende-1024")
Example #15
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
    tokenizer = cached_transformers.get_tokenizer("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", add_special_tokens=False
    )
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.add_special_tokens(
        allennlp_tokenizer.tokenize("AllenNLP is great!"),
        allennlp_tokenizer.tokenize("Really it is!"),
    )
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
Example #16
Source File: character_token_indexer_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_start_and_end_tokens(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace="characters")  # 2
    vocab.add_token_to_namespace("s", namespace="characters")  # 3
    vocab.add_token_to_namespace("e", namespace="characters")  # 4
    vocab.add_token_to_namespace("n", namespace="characters")  # 5
    vocab.add_token_to_namespace("t", namespace="characters")  # 6
    vocab.add_token_to_namespace("c", namespace="characters")  # 7
    vocab.add_token_to_namespace("<", namespace="characters")  # 8
    vocab.add_token_to_namespace(">", namespace="characters")  # 9
    vocab.add_token_to_namespace("/", namespace="characters")  # 10

    indexer = TokenCharactersIndexer(
        "characters", start_tokens=["<s>"], end_tokens=["</s>"], min_padding_length=1
    )
    indices = indexer.tokens_to_indices([Token("sentential")], vocab)
    assert indices == {
        "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
    }
Example #17
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_array_produces_token_sequence_bert_uncased(self):
    tokenizer = cached_transformers.get_tokenizer("bert-base-uncased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
Example #18
Source File: make_vocab_test.py From magnitude with MIT License | 5 votes |
def test_make_vocab_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / u'existing'
    extended_serialization_dir = self.TEST_DIR / u'extended'
    existing_vocab_path = existing_serialization_dir / u'vocabulary'
    extended_vocab_path = extended_serialization_dir / u'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace(u'some_weird_token_1', namespace=u'tokens')
    vocab.add_token_to_namespace(u'some_weird_token_2', namespace=u'tokens')
    # if extend is False, its users responsibility to make sure that dataset instances
    # will be indexible by provided vocabulary. At least @@UNKNOWN@@ should be present in
    # namespace for which there could be OOV entries seen in dataset during indexing.
    # For `tokens` ns, new words will be seen but `tokens` has @@UNKNOWN@@ token.
    # but for 'labels' ns, there is no @@UNKNOWN@@ so required to add 'N', 'V' upfront.
    vocab.add_token_to_namespace(u'N', namespace=u'labels')
    vocab.add_token_to_namespace(u'V', namespace=u'labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params[u'vocabulary'] = {}
    self.params[u'vocabulary'][u'directory_path'] = existing_vocab_path
    self.params[u'vocabulary'][u'extend'] = False
    make_vocab_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / u'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == u'@@UNKNOWN@@'
    assert tokens[1] == u'some_weird_token_1'
    assert tokens[2] == u'some_weird_token_2'
    assert len(tokens) == 3
Example #19
Source File: test_elmo.py From pytorch-fast-elmo with MIT License | 5 votes |
def _sentences_to_ids(sentences):
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
Example #20
Source File: simple_overlap.py From scitail with Apache License 2.0 | 5 votes |
def from_params(cls, vocab: Vocabulary, params: Params) -> 'SimpleOverlap':
    classifier = FeedForward.from_params(params.pop('classifier'))
    init_params = params.pop('initializer', None)
    initializer = (InitializerApplicator.from_params(init_params)
                   if init_params is not None
                   else InitializerApplicator())
    return cls(vocab=vocab,
               classifier=classifier,
               initializer=initializer)
Example #21
Source File: simple_overlap.py From scitail with Apache License 2.0 | 5 votes |
def __init__(self, vocab: Vocabulary,
             classifier: FeedForward,
             initializer: InitializerApplicator = InitializerApplicator()) -> None:
    super(SimpleOverlap, self).__init__(vocab)
    self.linear_mlp = classifier
    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
Example #22
Source File: tree_attention.py From scitail with Apache License 2.0 | 5 votes |
def from_params(cls, vocab: Vocabulary, params: Params) -> 'TreeAttention':
    embedder_params = params.pop("text_field_embedder")
    text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
    premise_encoder_params = params.pop("premise_encoder", None)
    premise_encoder = Seq2SeqEncoder.from_params(premise_encoder_params)

    attention_similarity = SimilarityFunction.from_params(params.pop('attention_similarity'))
    phrase_probability = FeedForward.from_params(params.pop('phrase_probability'))
    edge_probability = FeedForward.from_params(params.pop('edge_probability'))
    edge_embedding = Embedding.from_params(vocab, params.pop('edge_embedding'))
    use_encoding_for_node = params.pop('use_encoding_for_node')
    ignore_edges = params.pop('ignore_edges', False)

    init_params = params.pop('initializer', None)
    initializer = (InitializerApplicator.from_params(init_params)
                   if init_params is not None
                   else InitializerApplicator())

    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               phrase_probability=phrase_probability,
               edge_probability=edge_probability,
               premise_encoder=premise_encoder,
               edge_embedding=edge_embedding,
               use_encoding_for_node=use_encoding_for_node,
               attention_similarity=attention_similarity,
               ignore_edges=ignore_edges,
               initializer=initializer)
Example #23
Source File: elmo.py From magnitude with MIT License | 5 votes |
def batch_to_ids(batch):
    u"""
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {u'character_ids': indexer})
        instance = Instance({u"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
Example #24
Source File: pretrained_transformer_mismatched_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_long_sequence_splitting(self):
    tokenizer = cached_transformers.get_tokenizer("bert-base-uncased")
    indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased", max_length=4)
    text = ["AllenNLP", "is", "great"]
    tokens = tokenizer.tokenize(" ".join(["[CLS]"] + text + ["[SEP]"]))
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    assert len(expected_ids) == 7  # just to make sure it's what we're expecting
    cls_id, sep_id = expected_ids[0], expected_ids[-1]
    expected_ids = (
        expected_ids[:3]
        + [sep_id, cls_id]
        + expected_ids[3:5]
        + [sep_id, cls_id]
        + expected_ids[5:]
    )

    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices([Token(word) for word in text], vocab)
    assert indexed["token_ids"] == expected_ids
    # [CLS] allen ##nl [SEP] [CLS] #p is [SEP] [CLS] great [SEP]
    assert indexed["segment_concat_mask"] == [True] * len(expected_ids)
    # allennlp is great
    assert indexed["mask"] == [True] * len(text)
    # [CLS] allen #nl #p is great [SEP]
    assert indexed["wordpiece_mask"] == [True] * 7
Example #25
Source File: dry_run_test.py From magnitude with MIT License | 5 votes |
def test_dry_run_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / u'existing'
    extended_serialization_dir = self.TEST_DIR / u'extended'
    existing_vocab_path = existing_serialization_dir / u'vocabulary'
    extended_vocab_path = extended_serialization_dir / u'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace(u'some_weird_token_1', namespace=u'tokens')
    vocab.add_token_to_namespace(u'some_weird_token_2', namespace=u'tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params[u'vocabulary'] = {}
    self.params[u'vocabulary'][u'directory_path'] = existing_vocab_path
    self.params[u'vocabulary'][u'extend'] = True
    self.params[u'vocabulary'][u'min_count'] = {u"tokens": 3}
    dry_run_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == set([u'labels.txt', u'non_padded_namespaces.txt', u'tokens.txt'])

    with open(extended_vocab_path / u'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == u'@@UNKNOWN@@'
    assert tokens[1] == u'some_weird_token_1'
    assert tokens[2] == u'some_weird_token_2'

    tokens.sort()
    assert tokens == [u'.', u'@@UNKNOWN@@', u'animals', u'are',
                      u'some_weird_token_1', u'some_weird_token_2']

    with open(extended_vocab_path / u'labels.txt') as f:
        labels = [line.strip() for line in f]

    labels.sort()
    assert labels == [u'N', u'V']
Example #26
Source File: elmo_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_elmo_empty_token_list(self):
    # Basic test
    indexer = ELMoTokenCharactersIndexer()
    assert {"elmo_tokens": []} == indexer.get_empty_token_list()

    # Real world test
    indexer = {"elmo": indexer}
    tokens_1 = TextField([Token("Apple")], indexer)
    targets_1 = ListField([TextField([Token("Apple")], indexer)])
    tokens_2 = TextField([Token("Screen"), Token("device")], indexer)
    targets_2 = ListField(
        [TextField([Token("Screen")], indexer), TextField([Token("Device")], indexer)]
    )
    instance_1 = Instance({"tokens": tokens_1, "targets": targets_1})
    instance_2 = Instance({"tokens": tokens_2, "targets": targets_2})
    a_batch = Batch([instance_1, instance_2])
    a_batch.index_instances(Vocabulary())
    batch_tensor = a_batch.as_tensor_dict()
    elmo_target_token_indices = batch_tensor["targets"]["elmo"]["elmo_tokens"]
    # The TextField that is empty should have been created using the
    # `get_empty_token_list` and then padded with zeros.
    empty_target = elmo_target_token_indices[0][1].numpy()
    np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
    non_empty_targets = [
        elmo_target_token_indices[0][0],
        elmo_target_token_indices[1][0],
        elmo_target_token_indices[1][1],
    ]
    for non_empty_target in non_empty_targets:
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(np.zeros((1, 50)), non_empty_target)
Example #27
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_transformers_vocabs_added_correctly(self):
    namespace, model_name = "tags", "roberta-base"
    tokenizer = cached_transformers.get_tokenizer(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()  # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_token_to_index_vocabulary(namespace=namespace) == tokenizer.encoder
Example #28
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_array_produces_token_sequence_bert_cased(self):
    tokenizer = cached_transformers.get_tokenizer("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
Example #29
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_array_produces_token_sequence_roberta(self):
    tokenizer = cached_transformers.get_tokenizer("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    string_specials = "<s> AllenNLP is great </s>"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
Example #30
Source File: pretrained_transformer_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_mask(self):
    # We try these models, because:
    #  - BERT pads tokens with 0
    #  - RoBERTa pads tokens with 1
    #  - GPT2 has no padding token, so we choose 0
    for model in ["bert-base-uncased", "roberta-base", "gpt2"]:
        allennlp_tokenizer = PretrainedTransformerTokenizer(model)
        indexer = PretrainedTransformerIndexer(model_name=model)
        string_no_specials = "AllenNLP is great"
        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        expected_masks = [True] * len(indexed["token_ids"])
        assert indexed["mask"] == expected_masks
        max_length = 10
        padding_lengths = {key: max_length for key in indexed.keys()}
        padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
        padding_length = max_length - len(indexed["mask"])
        expected_masks = expected_masks + ([False] * padding_length)
        assert len(padded_tokens["mask"]) == max_length
        assert padded_tokens["mask"].tolist() == expected_masks

        assert len(padded_tokens["token_ids"]) == max_length
        pad_token_id = allennlp_tokenizer.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = 0
        padding_suffix = [pad_token_id] * padding_length
        assert padded_tokens["token_ids"][-padding_length:].tolist() == padding_suffix