Python Examples of allennlp.data.vocabulary.Vocabulary

Source File: model.py From DISTRE with Apache License 2.0

6 votes

def __init__(self,
                 embedder: TextFieldEmbedder,
                 vocab: Vocabulary,
                 lm_head: LanguageModelHead=None,
                 clf_head: ClassificationHead=None,
                 language_model_weight: float=.5) -> None:
        """
        Store the embedder, the two optional heads and the language-model weight.

        At least one of ``lm_head`` and ``clf_head`` has to be supplied; the
        assertion below fails when both are ``None``.
        """
        super().__init__(vocab)

        # Equivalent (De Morgan) to "not (both heads are missing)".
        assert lm_head is not None or clf_head is not None

        self.embedder = embedder
        self.clf_head, self.lm_head = clf_head, lm_head
        self.language_model_weight = language_model_weight
        self.vocab = vocab

Source File: field.py From allennlp with Apache License 2.0

6 votes

def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
        """
        Hook for counting the strings in this field that a :class:`Vocabulary` will later convert
        into integers; the counts are used to decide which tokens are in or out of the vocabulary.

        A `Field` that holds no strings needing conversion into indices does not have to
        implement this method. This implementation does nothing.

        Layout of `counter`: the first key is a `namespace` (for example "tokens",
        "token_characters", "tags", or "labels") and the second key is the actual vocabulary item.

        Why namespaces: `Fields` can represent conceptually different things, so vocabulary items
        are separated by namespace.  That gives one shared mechanism for every string-to-integer
        mapping in every field, while keeping words in a `TextField` from sharing ids with labels
        in a `LabelField` (e.g., "entailment" or "contradiction" are labels in an entailment task).

        One `Field` may also use several namespaces.  `TextFields` can be represented as a
        combination of word ids and character ids, and words and characters should not share a
        vocabulary: "a" as a word should get a different id from "a" as a character, and the
        vocabulary sizes of words and characters are very different.
        """
        return None

Source File: mv_lstm.py From transformer-kernel-ranking with Apache License 2.0

6 votes

def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 vocab: Vocabulary,
                 lstm_hidden_dim: int,
                 top_k: int,
                 cuda_device: int) -> None:
        """
        Create the layers of the model: two bidirectional, batch-first LSTMs (one for the
        query, one for the document), a cosine-matrix module and three linear layers
        (``top_k`` -> 20 -> 20 -> 1).

        ``cuda_device`` is accepted but not used inside this constructor.
        """
        super().__init__(vocab)

        self.word_embeddings = word_embeddings

        # Query and document get separate LSTM instances built with the same settings.
        lstm_settings = dict(batch_first=True, bidirectional=True)
        self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, **lstm_settings)
        self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, **lstm_settings)

        # Not really "attention": a plain cosine matrix calculation without learnable weights.
        self.cosine_module = CosineMatrixAttention()

        self.top_k = top_k

        hidden_width = 20
        self.dense = nn.Linear(top_k, hidden_width, bias=True)
        self.dense2 = nn.Linear(hidden_width, hidden_width, bias=True)
        self.dense3 = nn.Linear(hidden_width, 1, bias=False)

Source File: token_characters_indexer.py From allennlp with Apache License 2.0

6 votes

def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        """
        Turn every token (with the configured start/end tokens around them) into the list
        of indices of its characters.

        Returns a dict with the single key ``"token_characters"`` holding one index list per
        token.  Raises ``ConfigurationError`` when a token has no text.
        """
        def _index_of(char) -> int:
            # A `text_id` set on the character means the vocab is bypassed and that id is used.
            preset = getattr(char, "text_id", None)
            if preset is None:
                return vocabulary.get_token_index(char.text, self._namespace)
            return preset

        per_token: List[List[int]] = []
        for tok in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            if tok.text is None:
                raise ConfigurationError(
                    "TokenCharactersIndexer needs a tokenizer that retains text"
                )
            per_token.append(
                [_index_of(char) for char in self._character_tokenizer.tokenize(tok.text)]
            )
        return {"token_characters": per_token}

Source File: lstm_character.py From allennlp_tutorial with MIT License

6 votes

def __init__(self,
                 vocab: Vocabulary,
                 word_embedder: TextFieldEmbedder,
                 character_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 character_encoder: Seq2VecEncoder) -> None:
        """
        Keep the embedders and encoders, and build a linear classifier that maps the
        encoder output dimension to the size of the 'labels' namespace, plus a
        ``SpanBasedF1Measure`` over that same namespace.
        """
        super().__init__(vocab)

        self._word_embedder = word_embedder
        self._character_embedder = character_embedder
        self._character_encoder = character_encoder
        self._encoder = encoder

        label_namespace = 'labels'
        self._classifier = torch.nn.Linear(encoder.get_output_dim(),
                                           vocab.get_vocab_size(label_namespace))
        self._f1 = SpanBasedF1Measure(vocab, label_namespace)

Source File: single_id_token_indexer.py From allennlp with Apache License 2.0

6 votes

def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[int]]:
        """
        Map each token (surrounded by the configured start/end tokens) to a single id.

        With ``self.namespace`` set to ``None`` the feature value is used as the id as-is;
        otherwise it is optionally lowercased and looked up in ``vocabulary``.
        The result is a dict with the single key ``"tokens"``.
        """
        ids: List[int] = []

        for tok in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            value = self._get_feature_value(tok)
            if self.namespace is not None:
                if self.lowercase_tokens:
                    value = value.lower()
                value = vocabulary.get_token_index(value, self.namespace)
            # Without a namespace `value` is appended untouched (no check that it is an int).
            ids.append(value)

        return {"tokens": ids}

Source File: lstm_crf.py From allennlp_tutorial with MIT License

6 votes

def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        """
        Keep the embedder and encoder, then build the tagging head: a linear layer from the
        encoder output dimension to the size of the 'labels' namespace, a
        ``ConditionalRandomField`` of that same size, and a ``SpanBasedF1Measure``
        over 'labels'.
        """
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder

        self._classifier = torch.nn.Linear(encoder.get_output_dim(),
                                           vocab.get_vocab_size('labels'))
        self._crf = ConditionalRandomField(vocab.get_vocab_size('labels'))

        self._f1 = SpanBasedF1Measure(vocab, 'labels')

Source File: pretrained_transformer_indexer.py From allennlp with Apache License 2.0

6 votes

def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
        """
        Copy the tokenizer's token/id pairs into ``vocab`` under ``self._namespace``.

        Does nothing once ``self._added_to_vocabulary`` is set; the flag is set at the end
        of the first copy.
        """
        if not self._added_to_vocabulary:
            tokenizer = self._tokenizer
            try:
                pairs = tokenizer.get_vocab().items()
            except NotImplementedError:
                # Fallback when `get_vocab` is not implemented: walk the ids `0..vocab_size-1`.
                pairs = (
                    (tokenizer.convert_ids_to_tokens(token_id), token_id)
                    for token_id in range(tokenizer.vocab_size)
                )

            # Write both directions of the mapping directly into the vocab's internal tables.
            for token, token_id in pairs:
                vocab._token_to_index[self._namespace][token] = token_id
                vocab._index_to_token[self._namespace][token_id] = token

            self._added_to_vocabulary = True

Source File: multilabel_field_test.py From allennlp with Apache License 2.0

6 votes

def test_multilabel_field_empty_field_works(self):
        """
        A ``MultiLabelField`` without labels, and the fields returned by ``empty_field()``,
        must all produce all-zero tensors.
        """
        namespace = "test_empty_labels"
        vocab = Vocabulary()
        for label in ("label1", "label2"):
            vocab.add_token_to_namespace(label, namespace=namespace)

        # Field constructed with no labels.
        no_labels = MultiLabelField([], label_namespace=namespace)
        no_labels.index(vocab)
        as_array = no_labels.as_tensor(no_labels.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(as_array, numpy.array([0, 0]))

        # Empty field derived from it.
        derived = no_labels.empty_field()
        derived.index(vocab)
        as_array = derived.as_tensor(derived.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(as_array, numpy.array([0, 0]))

        # Empty field derived from a pre-indexed field with an explicit label count.
        pre_indexed = MultiLabelField(
            [0, 0, 1], label_namespace=namespace, num_labels=3, skip_indexing=True
        )
        as_array = pre_indexed.empty_field().as_tensor(None).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(as_array, numpy.array([0, 0, 0]))

Source File: pretrained_transformer_mismatched_indexer.py From allennlp with Apache License 2.0

6 votes

def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        """
        Wordpiece-tokenize the given word-level tokens and assemble the indexed output.

        The output carries wordpiece ids and type ids, word-level and wordpiece-level masks,
        and per-word offsets; it is passed through the matched indexer's
        ``_postprocess_output`` before being returned.
        """
        matched = self._matched_indexer
        matched._add_encoding_to_vocabulary_if_needed(vocabulary)

        words = [token.text for token in tokens]
        wordpieces, raw_offsets = self._allennlp_tokenizer.intra_word_tokenize(words)

        # Tokens without any word pieces get (-1, -1) as offsets, which results in the
        # embedding for that token being all zeros.
        offsets = [(-1, -1) if span is None else span for span in raw_offsets]

        return matched._postprocess_output(
            {
                "token_ids": [piece.text_id for piece in wordpieces],
                "mask": [True] * len(tokens),  # word-level (original tokens)
                "type_ids": [piece.type_id for piece in wordpieces],
                "offsets": offsets,
                "wordpiece_mask": [True] * len(wordpieces),  # subword-level
            }
        )

Source File: multilabel_f1_measure.py From ConvLab with MIT License

6 votes

def __init__(self,
                 vocabulary: Vocabulary,
                 namespace: str = "intent_labels",
                 ignore_classes: List[str] = None,
                 coarse: bool = True) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the label namespace.
        namespace : str, optional (default = "intent_labels")
            The vocabulary namespace for labels.
        ignore_classes : List[str], optional.
            Labels which will be ignored when computing metrics.
        coarse : bool, optional (default = True)
            Stored unchanged as ``self._coarse``; how it is consumed is not visible in
            this constructor.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(namespace)
        # A missing (or empty) ignore list becomes a fresh empty list.
        self._ignore_classes: List[str] = ignore_classes if ignore_classes else []
        self._coarse = coarse

        # Per-label counters; unseen labels start at zero.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)

Source File: multilabel_field.py From stog with MIT License

5 votes

def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
        """
        Log a warning when ``label_namespace`` ends with neither "labels" nor "tags".

        Namespaces already present in ``self._already_warned_namespaces`` are skipped, and a
        namespace is added to that set after it has been reported.  Note that the value
        written to the log is ``self._label_namespace``, not the argument.
        """
        # `str.endswith` accepts a tuple of suffixes.
        if label_namespace.endswith(("labels", "tags")):
            return
        if label_namespace in self._already_warned_namespaces:
            return

        logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
                       "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                       "default to your vocabulary.  See documentation for "
                       "`non_padded_namespaces` parameter in Vocabulary.",
                       self._label_namespace)
        self._already_warned_namespaces.add(label_namespace)

Source File: knowledge_graph_field.py From stog with MIT License

5 votes

def index(self, vocab: Vocabulary):
        """
        Index every entity text with every token indexer.

        ``self._indexed_entity_texts`` is reset and then filled with one list per index name;
        each list holds one entry per entity text, in the order of ``self.entity_texts``.
        """
        self._indexed_entity_texts = {}
        for name, token_indexer in self._token_indexers.items():
            grouped: Dict[str, List] = defaultdict(list)

            # Lazily index one entity text at a time and append each result under its key.
            per_entity = (
                token_indexer.tokens_to_indices(text, vocab, name) for text in self.entity_texts
            )
            for produced in per_entity:
                for key, values in produced.items():
                    grouped[key].append(values)

            self._indexed_entity_texts.update(grouped)

Source File: copynet.py From summarus with Apache License 2.0

5 votes

def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 attention: Attention,
                 beam_size: int,
                 max_decoding_steps: int,
                 target_embedding_dim: int = None,
                 copy_token: str = "@COPY@",
                 source_namespace: str = "tokens",
                 target_namespace: str = "target_tokens",
                 tensor_based_metric: Metric = None,
                 token_based_metric: Metric = None,
                 tie_embeddings: bool = False) -> None:
        """
        Initialise through ``CopyNetSeq2Seq.__init__`` and optionally tie the target
        embedding weights to the source token embedder.

        A falsy ``target_embedding_dim`` is replaced by the source embedder's output
        dimension.  With ``tie_embeddings`` set, source and target namespaces must be equal
        and the source embedder must have a child named "token_embedder_tokens".
        """
        CopyNetSeq2Seq.__init__(
            self,
            vocab,
            source_embedder,
            encoder,
            attention,
            beam_size,
            max_decoding_steps,
            target_embedding_dim or source_embedder.get_output_dim(),
            copy_token,
            source_namespace,
            target_namespace,
            tensor_based_metric,
            token_based_metric
        )
        self._tie_embeddings = tie_embeddings

        if self._tie_embeddings:
            assert source_namespace == target_namespace
            embedder_children = dict(self._source_embedder.named_children())
            assert "token_embedder_tokens" in embedder_children
            # Share the weight object so that both embedders use the same parameters.
            self._target_embedder.weight = embedder_children["token_embedder_tokens"].weight

        # Explicitly reset the metric when the caller did not pass one.
        if tensor_based_metric is None:
            self._tensor_based_metric = None

Source File: tiny_single_id.py From optuna with MIT License

5 votes

def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary, index_name: str
    ) -> Dict[str, List[int]]:
        """
        Look up every token (surrounded by the configured start/end tokens) in the "tokens"
        namespace of ``vocabulary``, lowercasing the text first when
        ``self.lowercase_tokens`` is set.  The ids are returned under ``index_name``.
        """
        with_boundaries = itertools.chain(self._start_tokens, tokens, self._end_tokens)
        ids: List[int] = [
            vocabulary.get_token_index(
                tok.text.lower() if self.lowercase_tokens else tok.text, "tokens"
            )
            for tok in with_boundaries
        ]
        return {index_name: ids}

Source File: lstm.py From allennlp_tutorial with MIT License

5 votes

def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        """
        Keep the embedder and encoder, and build a linear classifier from the encoder
        output dimension to the size of the 'labels' namespace together with a
        ``SpanBasedF1Measure`` over 'labels' using the 'IOB1' encoding.
        """
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder

        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'),
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels', 'IOB1')

Source File: custom_composed_seq2seq.py From summarus with Apache License 2.0

5 votes

def __init__(self,
                 vocab: Vocabulary,
                 source_text_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 decoder: SeqDecoder,
                 tied_source_embedder_key: Optional[str] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """
        Compose an encoder and a decoder into one model, optionally tying the source
        embedder selected by ``tied_source_embedder_key`` to the decoder's target embedder.

        Raises ``ConfigurationError`` when the encoder and decoder output dimensions differ,
        or, when tying is requested, when the source text embedder is not a
        ``BasicTextFieldEmbedder``, the selected embedder is not an ``Embedding``, or its
        output dimension differs from the target embedder's.

        ``initializer`` is applied to the whole model as the last step.
        """
        super(CustomComposedSeq2Seq, self).__init__(vocab, regularizer)

        self._source_text_embedder = source_text_embedder
        self._encoder = encoder
        self._decoder = decoder

        if self._encoder.get_output_dim() != self._decoder.get_output_dim():
            raise ConfigurationError(f"Encoder output dimension {self._encoder.get_output_dim()} should be"
                                     f" equal to decoder dimension {self._decoder.get_output_dim()}.")
        if tied_source_embedder_key:
            # Adjacent string literals are joined verbatim, so each continuation below starts
            # with an explicit space to keep the messages readable.
            if not isinstance(self._source_text_embedder, BasicTextFieldEmbedder):
                raise ConfigurationError("Unable to tie embeddings,"
                                         " Source text embedder is not an instance of `BasicTextFieldEmbedder`.")
            source_embedder = self._source_text_embedder._token_embedders[tied_source_embedder_key]
            if not isinstance(source_embedder, Embedding):
                raise ConfigurationError("Unable to tie embeddings,"
                                         " Selected source embedder is not an instance of `Embedding`.")
            if source_embedder.get_output_dim() != self._decoder.target_embedder.get_output_dim():
                raise ConfigurationError("Output Dimensions mismatch between"
                                         " source embedder and target embedder.")
            # Tying: the source side now uses the decoder's target embedder object.
            self._source_text_embedder._token_embedders[tied_source_embedder_key] = self._decoder.target_embedder
        initializer(self)

Source File: multilabel_field.py From gtos with MIT License

5 votes

def index(self, vocab: Vocabulary):
        """
        Resolve the labels to ids (unless ids are already present) and fill in the number of
        labels from the vocabulary size of the label namespace when it is not set.
        """
        if self._label_ids is None:
            self._label_ids = [
                vocab.get_token_index(name, self._label_namespace)  # type: ignore
                for name in self.labels
            ]
        if self._num_labels:
            # A truthy label count is kept as it is.
            return
        self._num_labels = vocab.get_vocab_size(self._label_namespace)

Source File: multilabel_field.py From gtos with MIT License

5 votes

def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
        """
        Emit a warning for a label namespace whose name ends with neither "labels" nor
        "tags", unless that namespace is already in ``self._already_warned_namespaces``.

        The namespace is added to the set after the warning; the logged value is
        ``self._label_namespace``.
        """
        follows_convention = any(
            label_namespace.endswith(suffix) for suffix in ("labels", "tags")
        )
        if not follows_convention and label_namespace not in self._already_warned_namespaces:
            logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
                           "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                           "default to your vocabulary.  See documentation for "
                           "`non_padded_namespaces` parameter in Vocabulary.",
                           self._label_namespace)
            self._already_warned_namespaces.add(label_namespace)

Source File: knowledge_graph_field.py From gtos with MIT License

5 votes

def index(self, vocab: Vocabulary):
        """
        Rebuild ``self._indexed_entity_texts`` by running every token indexer over every
        entity text and collecting the results per index name.
        """
        self._indexed_entity_texts = {}
        for key, token_indexer in self._token_indexers.items():
            columns: Dict[str, List] = {}

            for text in self.entity_texts:
                indexed_text = token_indexer.tokens_to_indices(text, vocab, key)
                for column, values in indexed_text.items():
                    # One entry per entity text, appended in iteration order.
                    columns.setdefault(column, []).append(values)

            self._indexed_entity_texts.update(columns)

Source File: generator.py From rupo with Apache License 2.0

5 votes

def __init__(self,
                 model: LanguageModel,
                 token_vocabulary: Vocabulary,
                 stress_vocabulary: StressVocabulary,
                 eos_index: int):
        """
        Keep references to the language model, the token vocabulary, the stress vocabulary
        and the end-of-sequence index; nothing else is computed here.
        """
        self.model, self.token_vocabulary = model, token_vocabulary
        self.stress_vocabulary, self.eos_index = stress_vocabulary, eos_index

Source File: vocabulary.py From rupo with Apache License 2.0

5 votes

def inflate_stress_vocabulary(vocabulary: Vocabulary, stress_predictor: StressPredictor):
    """
    Build a ``StressVocabulary`` from the "tokens" namespace of ``vocabulary``.

    Every token is wrapped in a ``StressedWord`` whose stresses are the positions predicted
    by ``stress_predictor``, all marked as primary, and is added under its original index.
    """
    stress_vocabulary = StressVocabulary()
    index_to_token = vocabulary.get_index_to_token_vocabulary("tokens")
    for token_index, token in index_to_token.items():
        primary_stresses = {
            Stress(position, Stress.Type.PRIMARY)
            for position in stress_predictor.predict(token)
        }
        stress_vocabulary.add_word(StressedWord(token, primary_stresses), token_index)
    return stress_vocabulary

Source File: multilabel_field_test.py From magnitude with MIT License

5 votes

def test_multilabel_field_empty_field_works(self):
        """A ``MultiLabelField`` built without labels must index to an all-zero tensor."""
        namespace = u"test_empty_labels"
        vocab = Vocabulary()
        for label in (u"label1", u"label2"):
            vocab.add_token_to_namespace(label, namespace=namespace)

        no_labels = MultiLabelField([], label_namespace=namespace)
        no_labels.index(vocab)
        as_array = no_labels.as_tensor(no_labels.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(as_array, numpy.array([0, 0]))

Source File: label_field.py From magnitude with MIT License

5 votes

def _maybe_warn_for_namespace(self, label_namespace):
        u"""
        Log a warning when ``self._label_namespace`` ends with neither "labels" nor "tags".

        The suffix check and the logged value use ``self._label_namespace``; the
        already-warned bookkeeping uses the ``label_namespace`` argument.
        """
        own_namespace = self._label_namespace
        if own_namespace.endswith(u"labels") or own_namespace.endswith(u"tags"):
            return
        if label_namespace in self._already_warned_namespaces:
            return

        logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                       u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                       u"default to your vocabulary.  See documentation for "
                       u"`non_padded_namespaces` parameter in Vocabulary.",
                       own_namespace)
        self._already_warned_namespaces.add(label_namespace)

    #overrides

Source File: multilabel_field.py From stog with MIT License

5 votes

def index(self, vocab: Vocabulary):
        """
        Fill in ``self._label_ids`` from ``self.labels`` when no ids are present yet, and
        default ``self._num_labels`` to the vocabulary size of the label namespace.
        """
        needs_indexing = self._label_ids is None
        if needs_indexing:
            lookup = vocab.get_token_index
            self._label_ids = [lookup(label, self._label_namespace)  # type: ignore
                               for label in self.labels]

        # Any falsy label count (None or 0) is replaced by the namespace size.
        if not self._num_labels:
            self._num_labels = vocab.get_vocab_size(self._label_namespace)

Source File: multilabel_field_test.py From magnitude with MIT License

5 votes

def test_multilabel_field_can_index_with_vocab(self):
        """
        Index a two-label ``MultiLabelField`` against a three-label namespace and compare
        the resulting tensor with ``[1, 1, 0]``.
        """
        namespace = u"rel_labels"
        vocab = Vocabulary()
        for relation in (u"rel0", u"rel1", u"rel2"):
            vocab.add_token_to_namespace(relation, namespace=namespace)

        field = MultiLabelField([u"rel1", u"rel0"], label_namespace=namespace)
        field.index(vocab)
        multi_hot = field.as_tensor(field.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(multi_hot, numpy.array([1, 1, 0]))

Source File: sequence_label_field.py From magnitude with MIT License

5 votes

def _maybe_warn_for_namespace(self, label_namespace):
        u"""
        Warn about a label namespace that ends with neither "labels" nor "tags".

        The suffix test and the logged value read ``self._label_namespace``; whether a
        warning was already issued is tracked per ``label_namespace`` argument in
        ``self._already_warned_namespaces``.
        """
        recommended_suffixes = (u"labels", u"tags")
        if (not self._label_namespace.endswith(recommended_suffixes)
                and label_namespace not in self._already_warned_namespaces):
            logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                           u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                           u"default to your vocabulary.  See documentation for "
                           u"`non_padded_namespaces` parameter in Vocabulary.",
                           self._label_namespace)
            self._already_warned_namespaces.add(label_namespace)

    #overrides

Source File: multilabel_field.py From magnitude with MIT License

5 votes

def _maybe_warn_for_namespace(self, label_namespace):
        u"""
        Log a warning when ``label_namespace`` ends with neither "labels" nor "tags" and has
        not been reported before.

        Reported namespaces are collected in ``self._already_warned_namespaces``.  The value
        written to the log is ``self._label_namespace`` rather than the argument.
        """
        if label_namespace.endswith(u"labels"):
            return
        if label_namespace.endswith(u"tags"):
            return
        already_warned = self._already_warned_namespaces
        if label_namespace not in already_warned:
            logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                           u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                           u"default to your vocabulary.  See documentation for "
                           u"`non_padded_namespaces` parameter in Vocabulary.",
                           self._label_namespace)
            already_warned.add(label_namespace)

    #overrides

Source File: field.py From magnitude with MIT License

5 votes

def index(self, vocab):
        u"""
        Convert every string held by this field into (typically) an integer, using the given
        :class:`Vocabulary`.  The ``Field`` object is `modified` in place; nothing is returned.

        A ``Field`` without strings that need converting into indices does not have to
        implement this method.  This implementation does nothing.
        """
        return None

Source File: span_based_f1_measure.py From magnitude with MIT License

5 votes

def __init__(self,
                 vocabulary,
                 tag_namespace=u"tags",
                 ignore_classes=None,
                 label_encoding=u"BIO"):
        u"""
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, optional (default = "tags")
            The vocabulary namespace holding the tags.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme which are typically not included.
        label_encoding : ``str``, optional (default = "BIO")
            The encoding used to specify label span endpoints in the sequence.
            Valid options are "BIO", "IOB1", or "BIOUL".

        Raises ``ConfigurationError`` for any other ``label_encoding``.
        """
        supported_encodings = (u"BIO", u"IOB1", u"BIOUL")
        if label_encoding not in supported_encodings:
            raise ConfigurationError(u"Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL'.")

        self._label_encoding = label_encoding
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
        # A missing (or empty) ignore list becomes a fresh empty list.
        self._ignore_classes = ignore_classes if ignore_classes else []

        # Per-label span counters; unseen labels start at zero.
        self._true_positives = defaultdict(int)
        self._false_positives = defaultdict(int)
        self._false_negatives = defaultdict(int)

Python allennlp.data.vocabulary.Vocabulary() Examples