Python allennlp.data.tokenizers.token.Token() Examples

The following are 30 code examples of allennlp.data.tokenizers.token.Token(), drawn from open-source projects. Each example names the source file and the project it comes from, so you can follow it back to the original code. You may also want to check out the other available functions and classes of the allennlp.data.tokenizers.token module.
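As a quick orientation before the examples, here is a minimal, hedged sketch of what a Token carries (surface text, a character offset, and optional precomputed ids); the values are illustrative:

from allennlp.data.tokenizers import Token

# A Token can be built from raw text, an offset into the original string,
# or a precomputed vocabulary id (text_id), as the examples below show.
token = Token(text="car", idx=3)
print(token.text, token.idx)        # car 3
byte_token = Token(text_id=42)      # id-only token, e.g. for byte encodings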
Example #1
Source File: pretrained_transformer_mismatched_indexer.py    From allennlp with Apache License 2.0
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize([t.text for t in tokens])

        # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
        # That results in the embedding for the token to be all zeros.
        offsets = [x if x is not None else (-1, -1) for x in offsets]

        output: IndexedTokenList = {
            "token_ids": [t.text_id for t in wordpieces],
            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
            "type_ids": [t.type_id for t in wordpieces],
            "offsets": offsets,
            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
        }

        return self._matched_indexer._postprocess_output(output) 
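A hedged usage sketch for this indexer; the model name and the empty Vocabulary are illustrative assumptions, and running it downloads the transformer weights:

from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer

# Assumes "bert-base-uncased" is available; any HuggingFace model name works.
indexer = PretrainedTransformerMismatchedIndexer(model_name="bert-base-uncased")
indexed = indexer.tokens_to_indices([Token("AllenNLP"), Token("rocks")], Vocabulary())
# indexed["offsets"][i] gives the wordpiece span of the i-th original token;
# tokens that produce no wordpieces are mapped to (-1, -1), as described above.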
Example #2
Source File: tokenizer.py    From allennlp with Apache License 2.0
def add_special_tokens(
        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
    ) -> List[Token]:
        """
        Adds special tokens to tokenized text. These are tokens like [CLS] or [SEP].

        Not all tokenizers do this. The default is to just return the tokens unchanged.

        # Parameters

        tokens1 : `List[Token]`
            The list of tokens to add special tokens to.
        tokens2 : `Optional[List[Token]]`
            An optional second list of tokens. This will be concatenated with `tokens1`. Special tokens will be
            added as appropriate.

        # Returns
        tokens : `List[Token]`
            The combined list of tokens, with special tokens added.
        """
        return tokens1 + (tokens2 or []) 
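Since the base class just concatenates, a minimal hedged sketch of the default behavior (WhitespaceTokenizer is used here only because it inherits this default; the token texts are illustrative):

from allennlp.data.tokenizers import Token, WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()  # inherits the default add_special_tokens
tokens1 = [Token("hello"), Token("world")]
tokens2 = [Token("again")]
# No markers are inserted here; subclasses such as the pretrained-transformer
# tokenizer (Example #15) override this to add [CLS]/[SEP]-style tokens.
combined = tokenizer.add_special_tokens(tokens1, tokens2)
assert [t.text for t in combined] == ["hello", "world", "again"]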
Example #3
Source File: token_characters_indexer.py    From allennlp with Apache License 2.0
def __init__(
        self,
        namespace: str = "token_characters",
        character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        min_padding_length: int = 0,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        if min_padding_length == 0:
            url = "https://github.com/allenai/allennlp/issues/1954"
            warnings.warn(
                "You are using the default value (0) of `min_padding_length`, "
                f"which can cause some subtle bugs (more info see {url}). "
                "Strongly recommend to set a value, usually the maximum size "
                "of the convolutional layer size when using CnnEncoder.",
                UserWarning,
            )
        self._min_padding_length = min_padding_length
        self._namespace = namespace
        self._character_tokenizer = character_tokenizer

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])] 
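To avoid the warning above, a hedged construction sketch that sets min_padding_length to the largest CNN ngram filter width (the value 5 is an illustrative assumption):

from allennlp.data.token_indexers import TokenCharactersIndexer

# min_padding_length should be at least the widest convolution in your
# CnnEncoder so that very short words still produce valid feature maps.
char_indexer = TokenCharactersIndexer(namespace="token_characters", min_padding_length=5)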
Example #4
Source File: token_characters_indexer.py    From allennlp with Apache License 2.0
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        indices: List[List[int]] = []
        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            token_indices: List[int] = []
            if token.text is None:
                raise ConfigurationError(
                    "TokenCharactersIndexer needs a tokenizer that retains text"
                )
            for character in self._character_tokenizer.tokenize(token.text):
                if getattr(character, "text_id", None) is not None:
                    # `text_id` being set on the token means that we aren't using the vocab, we just
                    # use this id instead.
                    index = character.text_id
                else:
                    index = vocabulary.get_token_index(character.text, self._namespace)
                token_indices.append(index)
            indices.append(token_indices)
        return {"token_characters": indices} 
Example #5
Source File: prolocal_dataset_reader.py    From propara with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         sentence_tokens: List[str],
                         verb_vector: List[int],
                         entity_vector: List[int],
                         state_change_types: Optional[List[str]] = None,
                         state_change_tags: Optional[List[str]] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        # encode inputs
        token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
        fields['tokens'] = token_field
        fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
        fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

        # encode outputs
        if state_change_types:
            fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
        if state_change_tags:
            fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

        return Instance(fields) 
Example #6
Source File: single_id_token_indexer.py    From allennlp with Apache License 2.0
def __init__(
        self,
        namespace: Optional[str] = "tokens",
        lowercase_tokens: bool = False,
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        feature_name: str = "text",
        default_value: str = _DEFAULT_VALUE,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        self.namespace = namespace
        self.lowercase_tokens = lowercase_tokens

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
        self._feature_name = feature_name
        self._default_value = default_value 
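The feature_name parameter lets this indexer read a field other than text from each Token. A hedged sketch that indexes POS tags instead of surface forms (the namespace and default value are illustrative):

from allennlp.data.token_indexers import SingleIdTokenIndexer

# Index Token.tag_ (the POS tag) rather than Token.text; tokens without a tag
# fall back to default_value instead of raising.
pos_indexer = SingleIdTokenIndexer(
    namespace="pos_tags",
    feature_name="tag_",
    default_value="NONE",
)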
Example #7
Source File: knowledge_graph_field.py    From gtos with MIT License
def _span_overlap_fraction(self,
                               entity: str,
                               entity_text: List[Token],
                               token: Token,
                               token_index: int,
                               tokens: List[Token]) -> float:
        entity_words = set(entity_token.text for entity_token in entity_text)
        if not entity_words:
            # Some tables have empty cells.
            return 0
        seen_entity_words = set()
        token_index_left = token_index
        while token_index < len(tokens) and tokens[token_index].text in entity_words:
            seen_entity_words.add(tokens[token_index].text)
            token_index += 1
        while token_index_left >= 0 and tokens[token_index_left].text in entity_words:
            seen_entity_words.add(tokens[token_index_left].text)
            token_index_left -= 1
        return len(seen_entity_words) / len(entity_words) 
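The bidirectional scan above can be exercised in isolation. A hedged standalone adaptation of the same logic (the function name and example tokens are illustrative, not part of the gtos code):

from typing import List
from allennlp.data.tokenizers import Token

def span_overlap_fraction(entity_text: List[Token], token_index: int, tokens: List[Token]) -> float:
    # Walk right and then left from token_index, collecting contiguous tokens
    # that appear in the entity text, and report the fraction covered.
    entity_words = set(t.text for t in entity_text)
    if not entity_words:
        return 0.0
    seen, right, left = set(), token_index, token_index
    while right < len(tokens) and tokens[right].text in entity_words:
        seen.add(tokens[right].text)
        right += 1
    while left >= 0 and tokens[left].text in entity_words:
        seen.add(tokens[left].text)
        left -= 1
    return len(seen) / len(entity_words)

tokens = [Token("new"), Token("york"), Token("city")]
print(span_overlap_fraction([Token("new"), Token("york")], 0, tokens))  # 1.0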
Example #8
Source File: knowledge_graph_field.py    From gtos with MIT License
def _span_lemma_overlap_fraction(self,
                                     entity: str,
                                     entity_text: List[Token],
                                     token: Token,
                                     token_index: int,
                                     tokens: List[Token]) -> float:
        entity_lemmas = set(entity_token.lemma_ for entity_token in entity_text)
        if not entity_lemmas:
            # Some tables have empty cells.
            return 0
        seen_entity_lemmas = set()
        token_index_left = token_index
        while token_index < len(tokens) and tokens[token_index].lemma_ in entity_lemmas:
            seen_entity_lemmas.add(tokens[token_index].lemma_)
            token_index += 1
        while token_index_left >= 0 and tokens[token_index_left].lemma_ in entity_lemmas:
            seen_entity_lemmas.add(tokens[token_index_left].lemma_)
            token_index_left -= 1
        return len(seen_entity_lemmas) / len(entity_lemmas)

    # pylint: enable=unused-argument,no-self-use 
Example #9
Source File: character_tokenizer.py    From allennlp with Apache License 2.0
def tokenize(self, text: str) -> List[Token]:
        if self._lowercase_characters:
            text = text.lower()
        if self._byte_encoding is not None:
            # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
            # of this.
            tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
        else:
            tokens = [Token(t) for t in list(text)]
        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)
        return tokens 
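A hedged usage sketch of the character tokenizer (the input strings are illustrative):

from allennlp.data.tokenizers import CharacterTokenizer

tokenizer = CharacterTokenizer()
print([t.text for t in tokenizer.tokenize("Hi!")])   # ['H', 'i', '!']

# With byte_encoding set, tokens carry text_id values (byte value + 1, so 0 stays a pad id).
byte_tokenizer = CharacterTokenizer(byte_encoding="utf-8")
print([t.text_id for t in byte_tokenizer.tokenize("Hi")])  # [73, 106]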
Example #10
Source File: knowledge_graph_field.py    From gtos with MIT License
def _number_token_match(self,
                            entity: str,
                            entity_text: List[Token],
                            token: Token,
                            token_index: int,
                            tokens: List[Token]) -> float:
        # PNP had a "spanFeatures" function that said whether an entity was a-priori known to link
        # to a token or set of tokens in the question.  This was only used for numbers, and it's
        # not totally clear to me how this number feature overlapped with the token match features
        # in the original implementation (I think in most cases it was the same, except for things
        # like "four million", because the token match is derived from the entity name, which would
        # be 4000000, and wouldn't match "four million").
        #
        # Our implementation basically just adds a duplicate token match feature that's specific to
        # numbers.  It'll break in some rare cases (e.g., "Which four had four million ..."), but
        # those shouldn't be a big deal.
        if entity.startswith('fb:'):
            # This check works because numbers are the only entities that don't start with "fb:".
            return 0.0
        return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens) 
Example #11
Source File: text_field.py    From allennlp with Apache License 2.0
def get_padding_lengths(self) -> Dict[str, int]:
        """
        The `TextField` has a list of `Tokens`, and each `Token` gets converted into arrays by
        (potentially) several `TokenIndexers`.  This method gets the max length (over tokens)
        associated with each of these arrays.
        """
        if self._indexed_tokens is None:
            raise ConfigurationError(
                "You must call .index(vocabulary) on a field before determining padding lengths."
            )

        padding_lengths = {}
        for indexer_name, indexer in self._token_indexers.items():
            indexer_lengths = indexer.get_padding_lengths(self._indexed_tokens[indexer_name])
            for key, length in indexer_lengths.items():
                padding_lengths[f"{indexer_name}___{key}"] = length
        return padding_lengths 
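A hedged sketch of the resulting key scheme (indexer_name___key), using a single-id indexer; the field contents are illustrative:

from allennlp.data import Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

field = TextField([Token("a"), Token("short"), Token("sentence")],
                  {"tokens": SingleIdTokenIndexer()})
vocab = Vocabulary()
field.index(vocab)
# Keys combine the indexer name with the indexer's own length key,
# e.g. something like {"tokens___tokens": 3} for this three-token field.
print(field.get_padding_lengths())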
Example #12
Source File: pretrained_transformer_tokenizer.py    From allennlp with Apache License 2.0
def _intra_word_tokenize(
        self, string_tokens: List[str]
    ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
        tokens: List[Token] = []
        offsets: List[Optional[Tuple[int, int]]] = []
        for token_string in string_tokens:
            wordpieces = self.tokenizer.encode_plus(
                token_string,
                add_special_tokens=False,
                return_tensors=None,
                return_offsets_mapping=False,
                return_attention_mask=False,
                return_token_type_ids=False,
            )
            wp_ids = wordpieces["input_ids"]

            if len(wp_ids) > 0:
                offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
                tokens.extend(
                    Token(text=wp_text, text_id=wp_id)
                    for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids))
                )
            else:
                offsets.append(None)
        return tokens, offsets 
Example #13
Source File: pretrained_transformer_tokenizer.py    From allennlp with Apache License 2.0
def intra_word_tokenize_sentence_pair(
        self, string_tokens_a: List[str], string_tokens_b: List[str]
    ) -> Tuple[List[Token], List[Tuple[int, int]], List[Tuple[int, int]]]:
        """
        Tokenizes each word into wordpieces separately and returns the wordpiece IDs.
        Also calculates offsets such that wordpieces[offsets[i][0]:offsets[i][1] + 1]
        corresponds to the original i-th token.

        This function inserts special tokens.
        """
        tokens_a, offsets_a = self._intra_word_tokenize(string_tokens_a)
        tokens_b, offsets_b = self._intra_word_tokenize(string_tokens_b)
        offsets_b = self._increment_offsets(
            offsets_b,
            (
                len(self.sequence_pair_start_tokens)
                + len(tokens_a)
                + len(self.sequence_pair_mid_tokens)
            ),
        )
        tokens_a = self.add_special_tokens(tokens_a, tokens_b)
        offsets_a = self._increment_offsets(offsets_a, len(self.sequence_pair_start_tokens))

        return tokens_a, offsets_a, offsets_b 
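A hedged usage sketch (the model name and word lists are illustrative; running it fetches the transformer's tokenizer):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
wordpieces, offsets_a, offsets_b = tokenizer.intra_word_tokenize_sentence_pair(
    ["AllenNLP", "rocks"], ["It", "does"]
)
# offsets_a[i] / offsets_b[i] index into `wordpieces`, which already includes
# the [CLS]/[SEP]-style special tokens inserted around and between the pair.
print([t.text for t in wordpieces])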
Example #14
Source File: semeval_2010_task_8_reader.py    From DISTRE with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         entity_1: Tuple[int],
                         entity_2: Tuple[int],
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        tokens = [OpenAISplitter._standardize(token) for token in tokens]
        tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']
            
        sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
        fields['sentence'] = sentence
        #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
        #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example #15
Source File: pretrained_transformer_tokenizer.py    From allennlp with Apache License 2.0
def add_special_tokens(
        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
    ) -> List[Token]:
        # Make sure we don't change the input parameters
        tokens1 = copy.deepcopy(tokens1)
        tokens2 = copy.deepcopy(tokens2)

        # We add special tokens and also set token type ids.
        if tokens2 is None:
            for token in tokens1:
                token.type_id = self.single_sequence_token_type_id
            return self.single_sequence_start_tokens + tokens1 + self.single_sequence_end_tokens
        else:
            for token in tokens1:
                token.type_id = self.sequence_pair_first_token_type_id
            for token in tokens2:
                token.type_id = self.sequence_pair_second_token_type_id
            return (
                self.sequence_pair_start_tokens
                + tokens1
                + self.sequence_pair_mid_tokens
                + tokens2
                + self.sequence_pair_end_tokens
            ) 
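A hedged sketch of what this override produces for a BERT-style model (the model name and texts are illustrative):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer, Token

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
pair = tokenizer.add_special_tokens([Token("hello")], [Token("world")])
# For BERT-style vocabularies this typically yields
# [CLS] hello [SEP] world [SEP], with type_id 0 for the first segment and 1 for the second.
print([(t.text, t.type_id) for t in pair])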
Example #16
Source File: propara_dataset_reader_test.py    From propara with Apache License 2.0
def test_find_span(self):
        sentence = [Token("My"), Token("car"), Token("is"), Token("-"), Token("grey"), Token("?")]

        # Single token
        assert _find_span([Token("car")], sentence) == (1, 1)

        # Multi token
        assert _find_span([Token("My"), Token("car")], sentence) == (0, 1)

        # Case insensitive
        assert _find_span([Token("my"), Token("car")], sentence) == (0, 1)

        # Not in sentence
        assert _find_span([Token("my"), Token("truck")], sentence) == (-1, -1)

        # Unknown
        assert _find_span([Token("?")], sentence) == (-2, -2)

        # Absent
        assert _find_span([Token("-")], sentence) == (-3, -3) 
Example #17
Source File: character_tokenizer.py    From magnitude with MIT License
def tokenize(self, text: str) -> List[Token]:
        if self._lowercase_characters:
            text = text.lower()
        if self._byte_encoding is not None:
            # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
            # of this.
            tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
        else:
            tokens = [Token(t) for t in list(text)]
        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)
        return tokens 
Example #18
Source File: word_stemmer.py    From magnitude with MIT License
def stem_word(self, word: Token) -> Token:
        u"""
        Returns a new ``Token`` with ``word.text`` replaced by a stemmed word.
        """
        raise NotImplementedError 
Example #19
Source File: bling_fire_tokenizer.py    From transformer-kernel-ranking with Apache License 2.0
def tokenize(self, sentence: str) -> List[Token]:
        return [Token(t) for t in text_to_words(sentence).split()] 
Example #20
Source File: bert_field.py    From r2c with MIT License
def __init__(self, tokens: List[Token], embs: numpy.ndarray, padding_value: int = 0,
            token_indexers=None) -> None:
        self.tokens = tokens
        self.embs = embs
        self.padding_value = padding_value

        if len(self.tokens) != self.embs.shape[0]:
            raise ValueError("The tokens you passed into the BERTField, {} "
                             "aren't the same size as the embeddings of shape {}".format(self.tokens, self.embs.shape))
        assert len(self.tokens) == self.embs.shape[0] 
Example #21
Source File: word_splitter_test.py    From magnitude with MIT License
def test_tokenize_handles_unicode_letters(self):
        sentence = u"HAL9000   and    Ångström"
        expected_tokens = [Token(u"HAL", 0), Token(u"9000", 3), Token(u"and", 10), Token(u"Ångström", 17)]
        tokens = self.word_splitter.split_words(sentence)
        assert [t.text for t in tokens] == [t.text for t in expected_tokens]
        assert [t.idx for t in tokens] == [t.idx for t in expected_tokens] 
Example #22
Source File: text_field.py    From magnitude with MIT License
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens = None
        self._indexer_name_to_indexed_token = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError(u"TextFields must be passed Tokens. "
                                     u"Found: {} with types {}.".format(tokens, [type(x) for x in tokens]))

    #overrides 
Example #23
Source File: knowledge_graph_field.py    From gtos with MIT License
def _exact_token_match(self,
                           entity: str,
                           entity_text: List[Token],
                           token: Token,
                           token_index: int,
                           tokens: List[Token]) -> float:
        if len(entity_text) != 1:
            return 0.0
        return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens) 
Example #24
Source File: tokenizer.py    From magnitude with MIT License
def tokenize(self, text: str) -> List[Token]:
        u"""
        Actually implements splitting words into tokens.

        Returns
        -------
        tokens : ``List[Token]``
        """
        raise NotImplementedError 
Example #25
Source File: word_tokenizer.py    From magnitude with MIT License
def _filter_and_stem(self, words: List[Token]) -> List[Token]:
        filtered_words = self._word_filter.filter_words(words)
        stemmed_words = [self._word_stemmer.stem_word(word) for word in filtered_words]
        for start_token in self._start_tokens:
            stemmed_words.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            stemmed_words.append(Token(end_token, -1))
        return stemmed_words 
Example #26
Source File: word_splitter.py    From magnitude with MIT License
def split_words(self, sentence: str) -> List[Token]:
        # This works because our Token class matches spacy's.
        return _remove_spaces(self.spacy(sentence)) 
Example #27
Source File: word_splitter.py    From magnitude with MIT License
def split_words(self, sentence: str) -> List[Token]:
        return [Token(t) for t in sentence.split()] 
Example #28
Source File: word_splitter.py    From magnitude with MIT License
def split_words(self, sentence: str) -> List[Token]:
        u"""
        Splits a sentence into word tokens.  We handle four kinds of things: words with punctuation
        that should be ignored as a special case (Mr. Mrs., etc.), contractions/genitives (isn't,
        don't, Matt's), and beginning and ending punctuation ("antennagate", (parentheticals), and
        such.).

        The basic outline is to split on whitespace, then check each of these cases.  First, we
        strip off beginning punctuation, then strip off ending punctuation, then strip off
        contractions.  When we strip something off the beginning of a word, we can add it to the
        list of tokens immediately.  When we strip it off the end, we have to save it to be added
        to after the word itself has been added.  Before stripping off any part of a token, we
        first check to be sure the token isn't in our list of special cases.
        """
        fields = sentence.split()
        tokens: List[Token] = []
        for field in fields:
            add_at_end: List[Token] = []
            while self._can_split(field) and field[0] in self.beginning_punctuation:
                tokens.append(Token(field[0]))
                field = field[1:]
            while self._can_split(field) and field[-1] in self.ending_punctuation:
                add_at_end.insert(0, Token(field[-1]))
                field = field[:-1]

            # There could (rarely) be several contractions in a word, but we check contractions
            # sequentially, in a random order.  If we've removed one, we need to check again to be
            # sure there aren't others.
            remove_contractions = True
            while remove_contractions:
                remove_contractions = False
                for contraction in self.contractions:
                    if self._can_split(field) and field.lower().endswith(contraction):
                        add_at_end.insert(0, Token(field[-len(contraction):]))
                        field = field[:-len(contraction)]
                        remove_contractions = True
            if field:
                tokens.append(Token(field))
            tokens.extend(add_at_end)
        return tokens 
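The strip-from-both-ends procedure described in the docstring can also be sketched standalone; the punctuation and contraction sets below are illustrative assumptions, not magnitude's actual special-case lists:

from allennlp.data.tokenizers import Token

BEGINNING_PUNCT = {'"', "(", "'"}
ENDING_PUNCT = {'"', ")", ".", ",", "!", "?", "'"}
CONTRACTIONS = {"n't", "'s", "'re"}

def simple_split(sentence):
    tokens = []
    for field in sentence.split():
        add_at_end = []
        # Strip leading punctuation straight into the token list.
        while field and field[0] in BEGINNING_PUNCT:
            tokens.append(Token(field[0]))
            field = field[1:]
        # Strip trailing punctuation, saving it to append after the word.
        while field and field[-1] in ENDING_PUNCT:
            add_at_end.insert(0, Token(field[-1]))
            field = field[:-1]
        # Peel off a trailing contraction, if any.
        for contraction in CONTRACTIONS:
            if field.lower().endswith(contraction):
                add_at_end.insert(0, Token(field[-len(contraction):]))
                field = field[:-len(contraction)]
                break
        if field:
            tokens.append(Token(field))
        tokens.extend(add_at_end)
    return tokens

print([t.text for t in simple_split('He said, "it isn\'t here."')])
# ['He', 'said', ',', '"', 'it', 'is', "n't", 'here', '.', '"']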
Example #29
Source File: word_splitter.py    From magnitude with MIT License
def split_words(self, sentence: str) -> List[Token]:
        u"""
        Splits ``sentence`` into a list of :class:`Token` objects.
        """
        raise NotImplementedError 
Example #30
Source File: dep_label_indexer.py    From magnitude with MIT License
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        dep_label = token.dep_
        if not dep_label:
            if token.text not in self._logged_errors:
                logger.warning(u"Token had no dependency label: %s", token.text)
                self._logged_errors.add(token.text)
            dep_label = u'NONE'
        counter[self.namespace][dep_label] += 1

    #overrides