Python allennlp.data.vocabulary.Vocabulary() Examples

The following are 30 code examples of allennlp.data.vocabulary.Vocabulary(), drawn from open-source projects. The line above each example names its source file, project, and license. You may also want to check out all available functions/classes of the module allennlp.data.vocabulary.
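Before the examples, here is a minimal sketch of the core Vocabulary calls the snippets below rely on; the namespaces and tokens are illustrative, not taken from any one project.

from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
# Separate namespaces keep word ids, character ids, and label ids apart.
vocab.add_token_to_namespace("cat", namespace="tokens")
vocab.add_token_to_namespace("entailment", namespace="labels")

token_id = vocab.get_token_index("cat", namespace="tokens")  # id within "tokens"
num_labels = vocab.get_vocab_size("labels")                  # size of the "labels" namespace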
Example #1
Source File: model.py    From DISTRE with Apache License 2.0
def __init__(self,
                 embedder: TextFieldEmbedder,
                 vocab: Vocabulary,
                 lm_head: LanguageModelHead = None,
                 clf_head: ClassificationHead = None,
                 language_model_weight: float = 0.5) -> None:
        
        super().__init__(vocab)
        
        assert not (lm_head is None and clf_head is None)
        
        self.embedder = embedder
        self.clf_head = clf_head
        self.lm_head = lm_head
        self.language_model_weight = language_model_weight
        self.vocab = vocab 
Example #2
Source File: field.py    From allennlp with Apache License 2.0
def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
        """
        If there are strings in this field that need to be converted into integers through a
        :class:`Vocabulary`, here is where we count them, to determine which tokens are in or out
        of the vocabulary.

        If your `Field` does not have any strings that need to be converted into indices, you do
        not need to implement this method.

        A note on this `counter`: because `Fields` can represent conceptually different things,
        we separate the vocabulary items by `namespaces`.  This way, we can use a single shared
        mechanism to handle all mappings from strings to integers in all fields, while keeping
        words in a `TextField` from sharing the same ids with labels in a `LabelField` (e.g.,
        "entailment" or "contradiction" are labels in an entailment task)

        Additionally, a single `Field` might want to use multiple namespaces - `TextFields` can
        be represented as a combination of word ids and character ids, and you don't want words and
        characters to share the same vocabulary - "a" as a word should get a different id from "a"
        as a character, and the vocabulary sizes of words and characters are very different.

        Because of this, the first key in the `counter` object is a `namespace`, like "tokens",
        "token_characters", "tags", or "labels", and the second key is the actual vocabulary item.
        """
        pass 
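As a concrete illustration of the two-level counter layout the docstring describes (the namespaces and tokens here are hypothetical):

from collections import defaultdict
from typing import Dict

# namespace -> vocabulary item -> count
counter: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
counter["tokens"]["a"] += 1            # "a" counted as a word...
counter["token_characters"]["a"] += 1  # ...and, separately, as a character
counter["labels"]["entailment"] += 1   # labels never share ids with words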
Example #3
Source File: mv_lstm.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 vocab: Vocabulary,
                 lstm_hidden_dim: int,
                 top_k: int,
                 cuda_device: int) -> None:
        super().__init__(vocab)

        self.word_embeddings = word_embeddings

        self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True)
        self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True)

        # This does not really do "attention" - just a plain cosine matrix calculation (without learnable weights).
        self.cosine_module = CosineMatrixAttention()

        self.top_k = top_k

        self.dense = nn.Linear(top_k, out_features=20, bias=True)
        self.dense2 = nn.Linear(20, out_features=20, bias=True)
        self.dense3 = nn.Linear(20, out_features=1, bias=False) 
Example #4
Source File: token_characters_indexer.py    From allennlp with Apache License 2.0
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        indices: List[List[int]] = []
        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            token_indices: List[int] = []
            if token.text is None:
                raise ConfigurationError(
                    "TokenCharactersIndexer needs a tokenizer that retains text"
                )
            for character in self._character_tokenizer.tokenize(token.text):
                if getattr(character, "text_id", None) is not None:
                    # `text_id` being set on the token means that we aren't using the vocab, we just
                    # use this id instead.
                    index = character.text_id
                else:
                    index = vocabulary.get_token_index(character.text, self._namespace)
                token_indices.append(index)
            indices.append(token_indices)
        return {"token_characters": indices} 
Example #5
Source File: lstm_character.py    From allennlp_tutorial with MIT License
def __init__(self,
                 vocab: Vocabulary,
                 word_embedder: TextFieldEmbedder,
                 character_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 character_encoder: Seq2VecEncoder) -> None:
        super().__init__(vocab)

        self._word_embedder = word_embedder
        self._character_embedder = character_embedder
        self._character_encoder = character_encoder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels')
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels') 
Example #6
Source File: single_id_token_indexer.py    From allennlp with Apache License 2.0
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            text = self._get_feature_value(token)
            if self.namespace is None:
                # We could have a check here that `text` is an int; not sure it's worth it.
                indices.append(text)  # type: ignore
            else:
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {"tokens": indices} 
Example #7
Source File: lstm_crf.py    From allennlp_tutorial with MIT License
def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels')
        )
        self._crf = ConditionalRandomField(
            vocab.get_vocab_size('labels')
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels') 
Example #8
Source File: pretrained_transformer_indexer.py    From allennlp with Apache License 2.0
def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
        """
        Copies tokens from the ``transformers`` model's vocab to the specified namespace.
        """
        if self._added_to_vocabulary:
            return

        try:
            vocab_items = self._tokenizer.get_vocab().items()
        except NotImplementedError:
            vocab_items = (
                (self._tokenizer.convert_ids_to_tokens(idx), idx)
                for idx in range(self._tokenizer.vocab_size)
            )
        for word, idx in vocab_items:
            vocab._token_to_index[self._namespace][word] = idx
            vocab._index_to_token[self._namespace][idx] = word

        self._added_to_vocabulary = True 
Example #9
Source File: multilabel_field_test.py    From allennlp with Apache License 2.0
def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
        g = f.empty_field()
        g.index(vocab)
        tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

        h = MultiLabelField(
            [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
        )
        tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0])) 
Example #10
Source File: pretrained_transformer_mismatched_indexer.py    From allennlp with Apache License 2.0
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize([t.text for t in tokens])

        # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
        # That results in the embedding for that token being all zeros.
        offsets = [x if x is not None else (-1, -1) for x in offsets]

        output: IndexedTokenList = {
            "token_ids": [t.text_id for t in wordpieces],
            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
            "type_ids": [t.type_id for t in wordpieces],
            "offsets": offsets,
            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
        }

        return self._matched_indexer._postprocess_output(output) 
Example #11
Source File: multilabel_f1_measure.py    From ConvLab with MIT License
def __init__(self,
                 vocabulary: Vocabulary,
                 namespace: str = "intent_labels",
                 ignore_classes: List[str] = None,
                 coarse: bool = True) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the label namespace.
        namespace : str, optional (default = "intent_labels").
            The vocabulary namespace for labels.
        ignore_classes : List[str], optional.
            Labels which will be ignored when computing metrics.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(namespace)
        self._ignore_classes: List[str] = ignore_classes or []
        self._coarse = coarse

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int) 
Example #12
Source File: multilabel_field.py    From stog with MIT License
def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
        if not (label_namespace.endswith("labels") or label_namespace.endswith("tags")):
            if label_namespace not in self._already_warned_namespaces:
                logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
                               "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                               "default to your vocabulary.  See documentation for "
                               "`non_padded_namespaces` parameter in Vocabulary.",
                               self._label_namespace)
                self._already_warned_namespaces.add(label_namespace) 
Example #13
Source File: knowledge_graph_field.py    From stog with MIT License
def index(self, vocab: Vocabulary):
        self._indexed_entity_texts = {}
        for indexer_name, indexer in self._token_indexers.items():
            indexer_arrays: Dict[str, List] = defaultdict(list)

            for entity_text in self.entity_texts:
                for index_name, indexed in indexer.tokens_to_indices(entity_text, vocab, indexer_name).items():
                    indexer_arrays[index_name].append(indexed)

            self._indexed_entity_texts.update(indexer_arrays) 
Example #14
Source File: copynet.py    From summarus with Apache License 2.0
def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 attention: Attention,
                 beam_size: int,
                 max_decoding_steps: int,
                 target_embedding_dim: int = None,
                 copy_token: str = "@COPY@",
                 source_namespace: str = "tokens",
                 target_namespace: str = "target_tokens",
                 tensor_based_metric: Metric = None,
                 token_based_metric: Metric = None,
                 tie_embeddings: bool = False) -> None:
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
        CopyNetSeq2Seq.__init__(
            self,
            vocab,
            source_embedder,
            encoder,
            attention,
            beam_size,
            max_decoding_steps,
            target_embedding_dim,
            copy_token,
            source_namespace,
            target_namespace,
            tensor_based_metric,
            token_based_metric
        )
        self._tie_embeddings = tie_embeddings

        if self._tie_embeddings:
            assert source_namespace == target_namespace
            assert "token_embedder_tokens" in dict(self._source_embedder.named_children())
            source_token_embedder = dict(self._source_embedder.named_children())["token_embedder_tokens"]
            self._target_embedder.weight = source_token_embedder.weight

        if tensor_based_metric is None:
            self._tensor_based_metric = None 
Example #15
Source File: tiny_single_id.py    From optuna with MIT License
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary, index_name: str
    ) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            text = token.text
            if self.lowercase_tokens:
                text = text.lower()
            indices.append(vocabulary.get_token_index(text, "tokens"))

        return {index_name: indices} 
Example #16
Source File: lstm.py    From allennlp_tutorial with MIT License
def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                           out_features=vocab.get_vocab_size('labels'))

        self._f1 = SpanBasedF1Measure(vocab, 'labels', 'IOB1') 
Example #17
Source File: custom_composed_seq2seq.py    From summarus with Apache License 2.0
def __init__(self,
                 vocab: Vocabulary,
                 source_text_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 decoder: SeqDecoder,
                 tied_source_embedder_key: Optional[str] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:

        super(CustomComposedSeq2Seq, self).__init__(vocab, regularizer)

        self._source_text_embedder = source_text_embedder
        self._encoder = encoder
        self._decoder = decoder

        if self._encoder.get_output_dim() != self._decoder.get_output_dim():
            raise ConfigurationError(f"Encoder output dimension {self._encoder.get_output_dim()} should be"
                                     f" equal to decoder dimension {self._decoder.get_output_dim()}.")
        if tied_source_embedder_key:
            if not isinstance(self._source_text_embedder, BasicTextFieldEmbedder):
                raise ConfigurationError("Unable to tie embeddings,"
                                         "Source text embedder is not an instance of `BasicTextFieldEmbedder`.")
            source_embedder = self._source_text_embedder._token_embedders[tied_source_embedder_key]
            if not isinstance(source_embedder, Embedding):
                raise ConfigurationError("Unable to tie embeddings,"
                                         "Selected source embedder is not an instance of `Embedding`.")
            if source_embedder.get_output_dim() != self._decoder.target_embedder.get_output_dim():
                raise ConfigurationError(f"Output Dimensions mismatch between"
                                         f"source embedder and target embedder.")
            self._source_text_embedder._token_embedders[tied_source_embedder_key] = self._decoder.target_embedder
        initializer(self) 
Example #18
Source File: multilabel_field.py    From gtos with MIT License
def index(self, vocab: Vocabulary):
        if self._label_ids is None:
            self._label_ids = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                               for label in self.labels]
        if not self._num_labels:
            self._num_labels = vocab.get_vocab_size(self._label_namespace) 
Example #19
Source File: multilabel_field.py    From gtos with MIT License
def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
        if not (label_namespace.endswith("labels") or label_namespace.endswith("tags")):
            if label_namespace not in self._already_warned_namespaces:
                logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
                               "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                               "default to your vocabulary.  See documentation for "
                               "`non_padded_namespaces` parameter in Vocabulary.",
                               self._label_namespace)
                self._already_warned_namespaces.add(label_namespace) 
Example #20
Source File: knowledge_graph_field.py    From gtos with MIT License
def index(self, vocab: Vocabulary):
        self._indexed_entity_texts = {}
        for indexer_name, indexer in self._token_indexers.items():
            indexer_arrays: Dict[str, List] = defaultdict(list)

            for entity_text in self.entity_texts:
                for index_name, indexed in indexer.tokens_to_indices(entity_text, vocab, indexer_name).items():
                    indexer_arrays[index_name].append(indexed)

            self._indexed_entity_texts.update(indexer_arrays) 
Example #21
Source File: generator.py    From rupo with Apache License 2.0
def __init__(self,
                 model: LanguageModel,
                 token_vocabulary: Vocabulary,
                 stress_vocabulary: StressVocabulary,
                 eos_index: int):
        self.model = model  # type: LanguageModel
        self.token_vocabulary = token_vocabulary  # type: Vocabulary
        self.stress_vocabulary = stress_vocabulary  # type: StressVocabulary
        self.eos_index = eos_index 
Example #22
Source File: vocabulary.py    From rupo with Apache License 2.0
def inflate_stress_vocabulary(vocabulary: Vocabulary, stress_predictor: StressPredictor):
    vocab = StressVocabulary()
    for index, word in vocabulary.get_index_to_token_vocabulary("tokens").items():
        stresses = [Stress(pos, Stress.Type.PRIMARY) for pos in stress_predictor.predict(word)]
        word = StressedWord(word, set(stresses))
        vocab.add_word(word, index)
    return vocab 
Example #23
Source File: multilabel_field_test.py    From magnitude with MIT License
def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace(u"label1", namespace=u"test_empty_labels")
        vocab.add_token_to_namespace(u"label2", namespace=u"test_empty_labels")

        f = MultiLabelField([], label_namespace=u"test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0])) 
Example #24
Source File: label_field.py    From magnitude with MIT License
def _maybe_warn_for_namespace(self, label_namespace):
        if not (self._label_namespace.endswith(u"labels") or self._label_namespace.endswith(u"tags")):
            if label_namespace not in self._already_warned_namespaces:
                logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                               u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                               u"default to your vocabulary.  See documentation for "
                               u"`non_padded_namespaces` parameter in Vocabulary.",
                               self._label_namespace)
                self._already_warned_namespaces.add(label_namespace)
Example #25
Source File: multilabel_field.py    From stog with MIT License
def index(self, vocab: Vocabulary):
        if self._label_ids is None:
            self._label_ids = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                               for label in self.labels]
        if not self._num_labels:
            self._num_labels = vocab.get_vocab_size(self._label_namespace) 
Example #26
Source File: multilabel_field_test.py    From magnitude with MIT License
def test_multilabel_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace(u"rel0", namespace=u"rel_labels")
        vocab.add_token_to_namespace(u"rel1", namespace=u"rel_labels")
        vocab.add_token_to_namespace(u"rel2", namespace=u"rel_labels")

        f = MultiLabelField([u"rel1", u"rel0"], label_namespace=u"rel_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0])) 
Example #27
Source File: sequence_label_field.py    From magnitude with MIT License
def _maybe_warn_for_namespace(self, label_namespace):
        if not (self._label_namespace.endswith(u"labels") or self._label_namespace.endswith(u"tags")):
            if label_namespace not in self._already_warned_namespaces:
                logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                               u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                               u"default to your vocabulary.  See documentation for "
                               u"`non_padded_namespaces` parameter in Vocabulary.",
                               self._label_namespace)
                self._already_warned_namespaces.add(label_namespace)
Example #28
Source File: multilabel_field.py    From magnitude with MIT License
def _maybe_warn_for_namespace(self, label_namespace):
        if not (label_namespace.endswith(u"labels") or label_namespace.endswith(u"tags")):
            if label_namespace not in self._already_warned_namespaces:
                logger.warning(u"Your label namespace was '%s'. We recommend you use a namespace "
                               u"ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                               u"default to your vocabulary.  See documentation for "
                               u"`non_padded_namespaces` parameter in Vocabulary.",
                               self._label_namespace)
                self._already_warned_namespaces.add(label_namespace)
Example #29
Source File: field.py    From magnitude with MIT License
def index(self, vocab):
        u"""
        Given a :class:`Vocabulary`, converts all strings in this field into (typically) integers.
        This `modifies` the ``Field`` object, it does not return anything.

        If your ``Field`` does not have any strings that need to be converted into indices, you do
        not need to implement this method.
        """
        pass 
Example #30
Source File: span_based_f1_measure.py    From magnitude with MIT License
def __init__(self,
                 vocabulary,
                 tag_namespace=u"tags",
                 ignore_classes=None,
                 label_encoding=u"BIO"):
        u"""
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, required.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme which are typically not included.
        label_encoding : ``str``, optional (default = "BIO")
            The encoding used to specify label span endpoints in the sequence.
            Valid options are "BIO", "IOB1", or "BIOUL".
        """
        if label_encoding not in [u"BIO", u"IOB1", u"BIOUL"]:
            raise ConfigurationError(u"Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL'.")

        self._label_encoding = label_encoding
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
        self._ignore_classes = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives = defaultdict(int)
        self._false_positives = defaultdict(int)
        self._false_negatives = defaultdict(int)
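For reference, a short usage sketch of this metric's interface, written against the current allennlp SpanBasedF1Measure (which takes the same constructor arguments as the magnitude variant above); the tag set and tensors are illustrative.

import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import SpanBasedF1Measure

vocab = Vocabulary()  # "tags" is a non-padded namespace by default
for tag in ["O", "B-V", "I-V", "B-ARG1", "I-ARG1"]:
    vocab.add_token_to_namespace(tag, namespace="tags")

metric = SpanBasedF1Measure(vocab, tag_namespace="tags", ignore_classes=["V"])

# The docstring's example sequence, with one-hot "predictions" matching gold.
gold = torch.tensor([[0, 0, 1, 2, 3, 4]])  # O O B-V I-V B-ARG1 I-ARG1
predictions = torch.nn.functional.one_hot(gold, num_classes=5).float()
mask = torch.ones_like(gold).bool()

metric(predictions, gold, mask)
print(metric.get_metric())  # the "V" span is ignored; the ARG1 span scores 1.0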