Python allennlp.data.vocabulary.Vocabulary() Examples
The following are 30 code examples of allennlp.data.vocabulary.Vocabulary().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module allennlp.data.vocabulary, or try the search function.
Example #1
Source File: model.py From DISTRE with Apache License 2.0 | 6 votes |
def __init__(self,
             embedder: TextFieldEmbedder,
             vocab: Vocabulary,
             lm_head: LanguageModelHead=None,
             clf_head: ClassificationHead=None,
             language_model_weight: float=.5) -> None:
    """Build a model that owns an embedder plus up to two output heads.

    Parameters
    ----------
    embedder : stored as ``self.embedder``.
    vocab : passed to the parent constructor and also kept as ``self.vocab``.
    lm_head : optional language-model head, stored as ``self.lm_head``.
    clf_head : optional classification head, stored as ``self.clf_head``.
    language_model_weight : stored unchanged as ``self.language_model_weight``;
        this constructor does not use it otherwise.

    An ``AssertionError`` is raised when both heads are ``None``.
    """
    super().__init__(vocab)

    # At least one of the two heads must have been supplied.
    has_a_head = lm_head is not None or clf_head is not None
    assert has_a_head

    self.embedder = embedder
    self.clf_head = clf_head
    self.lm_head = lm_head

    self.language_model_weight = language_model_weight
    self.vocab = vocab
Example #2
Source File: field.py From allennlp with Apache License 2.0 | 6 votes |
def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
    """
    Hook for counting the strings of this field that a :class:`Vocabulary` must
    later map to integers; the counts decide which tokens end up in or out of
    the vocabulary.  A `Field` without any such strings does not need to
    implement this method, and this default implementation does nothing.

    About `counter`: `Fields` can stand for conceptually different things, so
    vocabulary items are kept apart by `namespace`.  One shared mechanism then
    handles every string-to-integer mapping, while words in a `TextField` never
    share ids with labels in a `LabelField` (e.g., "entailment" or
    "contradiction" are labels in an entailment task).

    A single `Field` may also need several namespaces.  A `TextField` can be
    represented by both word ids and character ids, and words and characters
    should not share a vocabulary: "a" as a word should get a different id from
    "a" as a character, and the two vocabularies differ greatly in size.

    For that reason the first key of `counter` is a `namespace` such as
    "tokens", "token_characters", "tags", or "labels", and the second key is
    the actual vocabulary item.
    """
Example #3
Source File: mv_lstm.py From transformer-kernel-ranking with Apache License 2.0 | 6 votes |
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary,
             lstm_hidden_dim: int,
             top_k: int,
             cuda_device: int) -> None:
    """Create the two bidirectional LSTM encoders and the scoring layers.

    Parameters
    ----------
    word_embeddings : embedder whose ``get_output_dim()`` sets the LSTM input size.
    vocab : passed to the parent constructor.
    lstm_hidden_dim : hidden size of both LSTMs.
    top_k : stored as ``self.top_k`` and used as input width of the first dense layer.
    cuda_device : accepted but not used anywhere in this constructor.
    """
    super().__init__(vocab)

    self.word_embeddings = word_embeddings

    def build_encoder():
        # Query and document encoders share the same configuration
        # but are separate modules with separate weights.
        return nn.LSTM(self.word_embeddings.get_output_dim(),
                       lstm_hidden_dim,
                       batch_first=True,
                       bidirectional=True)

    self.query_rep = build_encoder()
    self.doc_rep = build_encoder()

    # this does not really do "attention" - just a plain cosine matrix
    # calculation (without learnable weights)
    self.cosine_module = CosineMatrixAttention()

    self.top_k = top_k

    hidden_units = 20
    self.dense = nn.Linear(top_k, out_features=hidden_units, bias=True)
    self.dense2 = nn.Linear(hidden_units, out_features=hidden_units, bias=True)
    self.dense3 = nn.Linear(hidden_units, out_features=1, bias=False)
Example #4
Source File: token_characters_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[List[int]]]:
    """Turn every token into the list of indices of its characters.

    The configured start tokens, ``tokens`` and the configured end tokens are
    processed in that order.  Each token's text is split by
    ``self._character_tokenizer``.

    Returns a dict with the single key ``"token_characters"`` holding one list
    of character indices per token.  Raises ``ConfigurationError`` when a
    token's ``text`` is ``None``.
    """

    def index_of(character):
        # `text_id` being set on the token means that we aren't using the
        # vocab, we just use this id instead.
        preset_id = getattr(character, "text_id", None)
        if preset_id is not None:
            return preset_id
        return vocabulary.get_token_index(character.text, self._namespace)

    per_token_indices = []
    for tok in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        if tok.text is None:
            raise ConfigurationError(
                "TokenCharactersIndexer needs a tokenizer that retains text"
            )
        characters = self._character_tokenizer.tokenize(tok.text)
        per_token_indices.append([index_of(character) for character in characters])

    return {"token_characters": per_token_indices}
Example #5
Source File: lstm_character.py From allennlp_tutorial with MIT License | 6 votes |
def __init__(self,
             vocab: Vocabulary,
             word_embedder: TextFieldEmbedder,
             character_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             character_encoder: Seq2VecEncoder) -> None:
    """Store the embedders/encoders and build the label classifier and F1 metric.

    The classifier is a linear layer from ``encoder.get_output_dim()`` to the
    size of the ``'labels'`` vocabulary namespace.
    """
    super().__init__(vocab)

    self._word_embedder = word_embedder
    self._character_embedder = character_embedder
    self._character_encoder = character_encoder
    self._encoder = encoder

    encoder_width = encoder.get_output_dim()
    label_count = vocab.get_vocab_size('labels')
    self._classifier = torch.nn.Linear(in_features=encoder_width,
                                       out_features=label_count)

    self._f1 = SpanBasedF1Measure(vocab, 'labels')
Example #6
Source File: single_id_token_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[int]]:
    """Map each token to a single id.

    Start tokens, ``tokens`` and end tokens are processed in that order.  The
    value returned by ``self._get_feature_value`` is used directly when
    ``self.namespace`` is ``None``; otherwise it is optionally lower-cased and
    looked up in ``vocabulary`` under that namespace.

    Returns a dict with the single key ``"tokens"``.
    """
    token_ids = []
    for tok in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        value = self._get_feature_value(tok)
        if self.namespace is not None:
            if self.lowercase_tokens:
                value = value.lower()
            value = vocabulary.get_token_index(value, self.namespace)
        # With no namespace the feature value itself is stored; we could check
        # here that it is an int, but that is not done.
        token_ids.append(value)
    return {"tokens": token_ids}
Example #7
Source File: lstm_crf.py From allennlp_tutorial with MIT License | 6 votes |
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder) -> None:
    """Store embedder and encoder, then build the classifier, CRF and F1 metric.

    Both the linear classifier's output width and the CRF are sized by the
    ``'labels'`` vocabulary namespace.
    """
    super().__init__(vocab)

    self._embedder = embedder
    self._encoder = encoder

    encoder_width = encoder.get_output_dim()
    label_count = vocab.get_vocab_size('labels')

    self._classifier = torch.nn.Linear(in_features=encoder_width,
                                       out_features=label_count)
    self._crf = ConditionalRandomField(label_count)
    self._f1 = SpanBasedF1Measure(vocab, 'labels')
Example #8
Source File: pretrained_transformer_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
    """
    Copies tokens from ```transformers``` model's vocab to the specified namespace.

    The copy happens at most once per indexer: after the first call
    ``self._added_to_vocabulary`` is set and later calls return immediately.
    When the tokenizer's ``get_vocab()`` raises ``NotImplementedError``, the
    pairs are rebuilt by converting every id below ``vocab_size`` to a token.
    """
    if not self._added_to_vocabulary:
        tokenizer = self._tokenizer
        try:
            token_id_pairs = tokenizer.get_vocab().items()
        except NotImplementedError:
            token_id_pairs = (
                (tokenizer.convert_ids_to_tokens(token_id), token_id)
                for token_id in range(tokenizer.vocab_size)
            )

        # Write straight into the vocabulary's two lookup tables.
        for token, token_id in token_id_pairs:
            vocab._token_to_index[self._namespace][token] = token_id
            vocab._index_to_token[self._namespace][token_id] = token

        self._added_to_vocabulary = True
Example #9
Source File: multilabel_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_multilabel_field_empty_field_works(self):
    """A MultiLabelField with no labels, and its empty_field(), yield all-zero tensors."""
    namespace = "test_empty_labels"
    vocabulary = Vocabulary()
    for label in ("label1", "label2"):
        vocabulary.add_token_to_namespace(label, namespace=namespace)

    def padded_array(field):
        return field.as_tensor(field.get_padding_lengths()).detach().cpu().numpy()

    # Field created without any labels.
    no_labels = MultiLabelField([], label_namespace=namespace)
    no_labels.index(vocabulary)
    numpy.testing.assert_array_almost_equal(padded_array(no_labels), numpy.array([0, 0]))

    # Empty field derived from it.
    blank = no_labels.empty_field()
    blank.index(vocabulary)
    numpy.testing.assert_array_almost_equal(padded_array(blank), numpy.array([0, 0]))

    # Pre-indexed field with an explicit label count; its empty field is
    # converted without indexing and without padding lengths.
    pre_indexed = MultiLabelField(
        [0, 0, 1], label_namespace=namespace, num_labels=3, skip_indexing=True
    )
    blank_array = pre_indexed.empty_field().as_tensor(None).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(blank_array, numpy.array([0, 0, 0]))
Example #10
Source File: pretrained_transformer_mismatched_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
    """Index word-level tokens through a wordpiece tokenizer.

    The token texts are split by ``self._allennlp_tokenizer.intra_word_tokenize``,
    which yields the wordpieces and one offset entry per original token.  The
    assembled dict is handed to the matched indexer's ``_postprocess_output``
    and that result is returned.
    """
    matched = self._matched_indexer
    matched._add_encoding_to_vocabulary_if_needed(vocabulary)

    token_texts = [token.text for token in tokens]
    wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(token_texts)

    # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
    # That results in the embedding for the token to be all zeros.
    offsets = [(-1, -1) if span is None else span for span in offsets]

    token_ids = []
    type_ids = []
    for piece in wordpieces:
        token_ids.append(piece.text_id)
        type_ids.append(piece.type_id)

    output: IndexedTokenList = {
        "token_ids": token_ids,
        "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
        "type_ids": type_ids,
        "offsets": offsets,
        "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
    }
    return matched._postprocess_output(output)
Example #11
Source File: multilabel_f1_measure.py From ConvLab with MIT License | 6 votes |
def __init__(self,
             vocabulary: Vocabulary,
             namespace: str = "intent_labels",
             ignore_classes: List[str] = None,
             coarse: bool = True) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the label namespace.
    namespace : str, optional (default = "intent_labels").
        The vocabulary namespace for labels.
    ignore_classes : List[str], optional.
        Labels which will be ignored when computing metrics.  ``None`` (or any
        other falsy value) is stored as an empty list.
    coarse : bool, optional (default = True).
        Stored as ``self._coarse``; not otherwise used by this constructor.
    """
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(namespace)
    self._ignore_classes: List[str] = ignore_classes if ignore_classes else []
    self._coarse = coarse

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
Example #12
Source File: multilabel_field.py From stog with MIT License | 5 votes |
def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
    """Warn once per namespace when it does not end in "labels" or "tags".

    Parameters
    ----------
    label_namespace : the namespace to check.  It is used for the suffix test,
        for the "already warned" bookkeeping and in the logged message, so the
        warning always names the namespace that was actually checked.
    """
    if label_namespace.endswith(("labels", "tags")):
        return
    if label_namespace in self._already_warned_namespaces:
        # Each offending namespace is reported only once.
        return

    logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
                   "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                   "default to your vocabulary. See documentation for "
                   "`non_padded_namespaces` parameter in Vocabulary.",
                   label_namespace)
    self._already_warned_namespaces.add(label_namespace)
Example #13
Source File: knowledge_graph_field.py From stog with MIT License | 5 votes |
def index(self, vocab: Vocabulary):
    """Index every entity text with every token indexer.

    For each indexer, the per-entity results of ``tokens_to_indices`` are
    grouped by the key the indexer returned, one list entry per entity text.
    The groups are merged into ``self._indexed_entity_texts``, which is reset
    to an empty dict first.
    """
    self._indexed_entity_texts = {}
    for indexer_name, indexer in self._token_indexers.items():
        grouped = {}
        for entity_text in self.entity_texts:
            produced = indexer.tokens_to_indices(entity_text, vocab, indexer_name)
            for index_name, indexed in produced.items():
                grouped.setdefault(index_name, []).append(indexed)
        self._indexed_entity_texts.update(grouped)
Example #14
Source File: copynet.py From summarus with Apache License 2.0 | 5 votes |
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             attention: Attention,
             beam_size: int,
             max_decoding_steps: int,
             target_embedding_dim: int = None,
             copy_token: str = "@COPY@",
             source_namespace: str = "tokens",
             target_namespace: str = "target_tokens",
             tensor_based_metric: Metric = None,
             token_based_metric: Metric = None,
             tie_embeddings: bool = False) -> None:
    """Initialise the CopyNet parent and optionally tie source/target embeddings.

    ``target_embedding_dim`` falls back to ``source_embedder.get_output_dim()``
    when it is not given (or is otherwise falsy).  With ``tie_embeddings`` set,
    the source and target namespaces must be equal and the source embedder
    must have a child named ``"token_embedder_tokens"``; its weight is then
    assigned to the target embedder.  When no ``tensor_based_metric`` is
    passed, ``self._tensor_based_metric`` is set to ``None`` after the parent
    constructor has run.
    """
    if not target_embedding_dim:
        target_embedding_dim = source_embedder.get_output_dim()

    CopyNetSeq2Seq.__init__(
        self,
        vocab,
        source_embedder,
        encoder,
        attention,
        beam_size,
        max_decoding_steps,
        target_embedding_dim,
        copy_token,
        source_namespace,
        target_namespace,
        tensor_based_metric,
        token_based_metric
    )

    self._tie_embeddings = tie_embeddings
    if self._tie_embeddings:
        assert source_namespace == target_namespace
        embedder_children = dict(self._source_embedder.named_children())
        assert "token_embedder_tokens" in embedder_children
        shared_embedder = embedder_children["token_embedder_tokens"]
        self._target_embedder.weight = shared_embedder.weight

    if tensor_based_metric is None:
        self._tensor_based_metric = None
Example #15
Source File: tiny_single_id.py From optuna with MIT License | 5 votes |
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary, index_name: str
) -> Dict[str, List[int]]:
    """Look up each token's text in the ``"tokens"`` namespace.

    Start tokens, ``tokens`` and end tokens are processed in that order; text
    is lower-cased first when ``self.lowercase_tokens`` is set.  The resulting
    ids are returned under the caller-supplied ``index_name`` key.
    """

    def normalise(text):
        return text.lower() if self.lowercase_tokens else text

    stream = itertools.chain(self._start_tokens, tokens, self._end_tokens)
    token_ids = [
        vocabulary.get_token_index(normalise(tok.text), "tokens") for tok in stream
    ]
    return {index_name: token_ids}
Example #16
Source File: lstm.py From allennlp_tutorial with MIT License | 5 votes |
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder) -> None:
    """Store embedder and encoder, then build the label classifier and F1 metric.

    The classifier maps ``encoder.get_output_dim()`` features to the size of
    the ``'labels'`` namespace; the span F1 metric is created with ``'IOB1'``
    as its third argument.
    """
    super().__init__(vocab)

    self._embedder = embedder
    self._encoder = encoder

    encoder_width = encoder.get_output_dim()
    label_count = vocab.get_vocab_size('labels')
    self._classifier = torch.nn.Linear(in_features=encoder_width,
                                       out_features=label_count)

    self._f1 = SpanBasedF1Measure(vocab, 'labels', 'IOB1')
Example #17
Source File: custom_composed_seq2seq.py From summarus with Apache License 2.0 | 5 votes |
def __init__(self,
             vocab: Vocabulary,
             source_text_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             decoder: SeqDecoder,
             tied_source_embedder_key: Optional[str] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    """Compose an encoder/decoder model, optionally tying source and target embeddings.

    Parameters
    ----------
    vocab, regularizer : passed to the parent constructor.
    source_text_embedder : embedder for the source text.
    encoder, decoder : their ``get_output_dim()`` values must be equal.
    tied_source_embedder_key : when given, the source token embedder stored
        under this key is replaced by ``decoder.target_embedder``.
    initializer : called with the finished model as the last step.

    Raises
    ------
    ConfigurationError
        If encoder and decoder output dimensions differ, or if tying is
        requested but the source text embedder is not a
        ``BasicTextFieldEmbedder``, the selected embedder is not an
        ``Embedding``, or the source and target embedder dimensions differ.
    """
    super(CustomComposedSeq2Seq, self).__init__(vocab, regularizer)

    self._source_text_embedder = source_text_embedder
    self._encoder = encoder
    self._decoder = decoder

    if self._encoder.get_output_dim() != self._decoder.get_output_dim():
        raise ConfigurationError(f"Encoder output dimension {self._encoder.get_output_dim()} should be"
                                 f" equal to decoder dimension {self._decoder.get_output_dim()}.")

    if tied_source_embedder_key:
        # Adjacent string literals are joined verbatim, so each message below
        # carries its own separating space.
        if not isinstance(self._source_text_embedder, BasicTextFieldEmbedder):
            raise ConfigurationError("Unable to tie embeddings, "
                                     "Source text embedder is not an instance of `BasicTextFieldEmbedder`.")

        source_embedder = self._source_text_embedder._token_embedders[tied_source_embedder_key]
        if not isinstance(source_embedder, Embedding):
            raise ConfigurationError("Unable to tie embeddings, "
                                     "Selected source embedder is not an instance of `Embedding`.")

        if source_embedder.get_output_dim() != self._decoder.target_embedder.get_output_dim():
            raise ConfigurationError("Output Dimensions mismatch between "
                                     "source embedder and target embedder.")

        # Tie: the source side now uses the decoder's target embedder.
        self._source_text_embedder._token_embedders[tied_source_embedder_key] = self._decoder.target_embedder

    initializer(self)
Example #18
Source File: multilabel_field.py From gtos with MIT License | 5 votes |
def index(self, vocab: Vocabulary):
    """Resolve labels to ids and fill in the label count if still missing.

    Label ids are computed from ``self.labels`` only when ``self._label_ids``
    is ``None``.  ``self._num_labels`` is taken from the vocabulary size of
    the label namespace only when it is currently falsy.
    """
    namespace = self._label_namespace

    if self._label_ids is None:
        label_ids = []
        for label in self.labels:
            label_ids.append(vocab.get_token_index(label, namespace))  # type: ignore
        self._label_ids = label_ids

    if not self._num_labels:
        self._num_labels = vocab.get_vocab_size(namespace)
Example #19
Source File: multilabel_field.py From gtos with MIT License | 5 votes |
def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
    """Log a one-time warning for a namespace not ending in "labels" or "tags".

    Parameters
    ----------
    label_namespace : the namespace under inspection.  The suffix check, the
        de-duplication set and the logged message all use this same value, so
        the message cannot name a different namespace than the one checked.
    """
    recommended_suffixes = ("labels", "tags")
    needs_warning = (
        not label_namespace.endswith(recommended_suffixes)
        and label_namespace not in self._already_warned_namespaces
    )
    if needs_warning:
        logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
                       "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                       "default to your vocabulary. See documentation for "
                       "`non_padded_namespaces` parameter in Vocabulary.",
                       label_namespace)
        # Remember the namespace so it is reported only once.
        self._already_warned_namespaces.add(label_namespace)
Example #20
Source File: knowledge_graph_field.py From gtos with MIT License | 5 votes |
def index(self, vocab: Vocabulary):
    """Run every token indexer over every entity text and collect the results.

    ``self._indexed_entity_texts`` is reset, then for each indexer the arrays
    it returns are collected per returned key (one entry per entity text) and
    merged into that dict.
    """
    self._indexed_entity_texts = {}

    def collect(indexer_name, indexer):
        # One bucket per key the indexer returns, filled in entity order.
        buckets: Dict[str, List] = defaultdict(list)
        for entity_text in self.entity_texts:
            result = indexer.tokens_to_indices(entity_text, vocab, indexer_name)
            for index_name, indexed in result.items():
                buckets[index_name].append(indexed)
        return buckets

    for indexer_name, indexer in self._token_indexers.items():
        self._indexed_entity_texts.update(collect(indexer_name, indexer))
Example #21
Source File: generator.py From rupo with Apache License 2.0 | 5 votes |
def __init__(self,
             model: LanguageModel,
             token_vocabulary: Vocabulary,
             stress_vocabulary: StressVocabulary,
             eos_index: int):
    """Keep references to the model, both vocabularies and the end-of-sequence index.

    Every argument is stored unchanged on the instance under its own name.
    """
    self.model, self.token_vocabulary = model, token_vocabulary
    self.stress_vocabulary, self.eos_index = stress_vocabulary, eos_index
Example #22
Source File: vocabulary.py From rupo with Apache License 2.0 | 5 votes |
def inflate_stress_vocabulary(vocabulary: Vocabulary, stress_predictor: StressPredictor):
    """Build a ``StressVocabulary`` from the ``"tokens"`` namespace of ``vocabulary``.

    Each token is wrapped in a ``StressedWord`` carrying one primary stress
    per position returned by ``stress_predictor.predict`` and is added under
    its original index.  Returns the populated ``StressVocabulary``.
    """
    stress_vocab = StressVocabulary()
    token_table = vocabulary.get_index_to_token_vocabulary("tokens")
    for index, token in token_table.items():
        positions = stress_predictor.predict(token)
        primary_stresses = {Stress(position, Stress.Type.PRIMARY) for position in positions}
        stress_vocab.add_word(StressedWord(token, primary_stresses), index)
    return stress_vocab
Example #23
Source File: multilabel_field_test.py From magnitude with MIT License | 5 votes |
def test_multilabel_field_empty_field_works(self):
    u"""A MultiLabelField created without labels becomes an all-zero tensor."""
    namespace = u"test_empty_labels"
    vocabulary = Vocabulary()
    for label in (u"label1", u"label2"):
        vocabulary.add_token_to_namespace(label, namespace=namespace)

    field = MultiLabelField([], label_namespace=namespace)
    field.index(vocabulary)

    padding_lengths = field.get_padding_lengths()
    actual = field.as_tensor(padding_lengths).detach().cpu().numpy()
    expected = numpy.array([0, 0])
    numpy.testing.assert_array_almost_equal(actual, expected)
Example #24
Source File: label_field.py From magnitude with MIT License | 5 votes |
def _maybe_warn_for_namespace(self, label_namespace):
    u"""Warn once per namespace when it does not end in "labels" or "tags".

    ``label_namespace`` is the namespace to check.  The suffix test, the
    "already warned" bookkeeping and the logged message all use this argument,
    so the check and the de-duplication always refer to the same namespace
    instead of mixing it with ``self._label_namespace``.
    """
    if label_namespace.endswith((u"labels", u"tags")):
        return
    if label_namespace in self._already_warned_namespaces:
        # Each offending namespace is reported only once.
        return

    logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                   u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                   u"default to your vocabulary. See documentation for "
                   u"`non_padded_namespaces` parameter in Vocabulary.",
                   label_namespace)
    self._already_warned_namespaces.add(label_namespace)
Example #25
Source File: multilabel_field.py From stog with MIT License | 5 votes |
def index(self, vocab: Vocabulary):
    """Convert the field's labels to ids and record the number of labels.

    Nothing is recomputed when it is already available: label ids are looked
    up only while ``self._label_ids`` is ``None``, and ``self._num_labels`` is
    read from the vocabulary only while it is falsy.
    """
    ids_missing = self._label_ids is None
    if ids_missing:
        def to_id(label):
            return vocab.get_token_index(label, self._label_namespace)  # type: ignore

        self._label_ids = list(map(to_id, self.labels))

    if self._num_labels:
        return
    self._num_labels = vocab.get_vocab_size(self._label_namespace)
Example #26
Source File: multilabel_field_test.py From magnitude with MIT License | 5 votes |
def test_multilabel_field_can_index_with_vocab(self):
    u"""Labels present in the field are marked 1 at their vocabulary index."""
    namespace = u"rel_labels"
    vocabulary = Vocabulary()
    for relation in (u"rel0", u"rel1", u"rel2"):
        vocabulary.add_token_to_namespace(relation, namespace=namespace)

    field = MultiLabelField([u"rel1", u"rel0"], label_namespace=namespace)
    field.index(vocabulary)

    padding_lengths = field.get_padding_lengths()
    actual = field.as_tensor(padding_lengths).detach().cpu().numpy()
    expected = numpy.array([1, 1, 0])
    numpy.testing.assert_array_almost_equal(actual, expected)
Example #27
Source File: sequence_label_field.py From magnitude with MIT License | 5 votes |
def _maybe_warn_for_namespace(self, label_namespace):
    u"""Log a one-time warning for a namespace not ending in "labels" or "tags".

    ``label_namespace`` is the namespace under inspection.  It is the single
    value used for the suffix check, for the de-duplication set and for the
    logged message, rather than checking ``self._label_namespace`` while
    de-duplicating on the argument.
    """
    recommended_suffixes = (u"labels", u"tags")
    needs_warning = (
        not label_namespace.endswith(recommended_suffixes)
        and label_namespace not in self._already_warned_namespaces
    )
    if needs_warning:
        logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                       u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                       u"default to your vocabulary. See documentation for "
                       u"`non_padded_namespaces` parameter in Vocabulary.",
                       label_namespace)
        # Remember the namespace so it is reported only once.
        self._already_warned_namespaces.add(label_namespace)
Example #28
Source File: multilabel_field.py From magnitude with MIT License | 5 votes |
def _maybe_warn_for_namespace(self, label_namespace):
    u"""Warn the first time a namespace without a "labels"/"tags" suffix is seen.

    ``label_namespace`` is the namespace to check; it is also the value
    recorded in ``self._already_warned_namespaces`` and the value named in the
    logged message, so the warning reports the namespace that was checked.
    """
    if label_namespace.endswith((u"labels", u"tags")):
        return

    already_warned = self._already_warned_namespaces
    if label_namespace not in already_warned:
        logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                       u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                       u"default to your vocabulary. See documentation for "
                       u"`non_padded_namespaces` parameter in Vocabulary.",
                       label_namespace)
        already_warned.add(label_namespace)
Example #29
Source File: field.py From magnitude with MIT License | 5 votes |
def index(self, vocab):
    u"""
    Convert every string held by this field into (typically) an integer,
    using the given :class:`Vocabulary`.  The ``Field`` object is `modified`
    in place and nothing is returned.

    A ``Field`` with no strings that need converting into indices does not
    need to implement this method; this default implementation does nothing.
    """
Example #30
Source File: span_based_f1_measure.py From magnitude with MIT License | 5 votes |
def __init__(self,
             vocabulary,
             tag_namespace=u"tags",
             ignore_classes=None,
             label_encoding=u"BIO"):
    u"""
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : str, optional (default = "tags")
        Namespace whose index-to-token mapping is used as the label
        vocabulary.  This metric assumes that a BIO format is used in which
        the labels are of the format: ["B-LABEL", "I-LABEL"].
    ignore_classes : List[str], optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example by passing:

        ``ignore_classes=["V"]``
        the following sequence would not consider the "V" span at index (2, 3)
        when computing the precision, recall and F1 metrics.

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful for instance, to avoid computing metrics for "V"
        spans in a BIO tagging scheme which are typically not included.
    label_encoding : ``str``, optional (default = "BIO")
        The encoding used to specify label span endpoints in the sequence.
        Valid options are "BIO", "IOB1", or "BIOUL".  Any other value raises
        ``ConfigurationError``.
    """
    supported_encodings = (u"BIO", u"IOB1", u"BIOUL")
    if label_encoding not in supported_encodings:
        raise ConfigurationError(u"Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL'.")

    self._label_encoding = label_encoding
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes = ignore_classes if ignore_classes else []

    # These will hold per label span counts.
    self._true_positives = defaultdict(int)
    self._false_positives = defaultdict(int)
    self._false_negatives = defaultdict(int)