Python allennlp.data.tokenizers.token.Token() Examples
The following are 30 code examples of allennlp.data.tokenizers.token.Token(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.tokenizers.token, or try the search function.
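Before the examples, here is a minimal illustrative sketch (not taken from any of the projects below) of how Token objects are constructed directly. It assumes an AllenNLP installation where Token is importable from allennlp.data.tokenizers.token, and it only exercises the attributes that appear repeatedly in the examples: text, idx, and text_id.

# A minimal, illustrative sketch (not from any project below), assuming allennlp
# is installed and Token is importable from allennlp.data.tokenizers.token.
from allennlp.data.tokenizers.token import Token

# A plain word-level token: only `text` is set.
word = Token("car")
print(word.text)            # -> 'car'

# A token that also records its character offset into the original string.
located = Token(text="car", idx=3)
print(located.idx)          # -> 3

# A token carrying a precomputed vocabulary id, as byte/wordpiece tokenizers do.
prenumbered = Token(text="##ar", text_id=1234)
print(prenumbered.text_id)  # -> 1234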
Example #1
Source File: pretrained_transformer_mismatched_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
    self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

    wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize([t.text for t in tokens])

    # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
    # That results in the embedding for the token to be all zeros.
    offsets = [x if x is not None else (-1, -1) for x in offsets]

    output: IndexedTokenList = {
        "token_ids": [t.text_id for t in wordpieces],
        "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
        "type_ids": [t.type_id for t in wordpieces],
        "offsets": offsets,
        "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
    }

    return self._matched_indexer._postprocess_output(output)
Example #2
Source File: tokenizer.py From allennlp with Apache License 2.0 | 6 votes |
def add_special_tokens(
    self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
) -> List[Token]:
    """
    Adds special tokens to tokenized text. These are tokens like [CLS] or [SEP].

    Not all tokenizers do this. The default is to just return the tokens unchanged.

    # Parameters

    tokens1 : `List[Token]`
        The list of tokens to add special tokens to.
    tokens2 : `Optional[List[Token]]`
        An optional second list of tokens. This will be concatenated with `tokens1`.
        Special tokens will be added as appropriate.

    # Returns

    tokens : `List[Token]`
        The combined list of tokens, with special tokens added.
    """
    return tokens1 + (tokens2 or [])
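For orientation, a short usage sketch of the default behavior above. It is not part of the original source file and assumes an allennlp 1.x installation where WhitespaceTokenizer inherits this default add_special_tokens (it does not override it, so the two token lists are simply concatenated).

# Usage sketch (an assumption, not from the original file): WhitespaceTokenizer
# does not override add_special_tokens, so the default concatenation applies.
from allennlp.data.tokenizers import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
tokens1 = tokenizer.tokenize("first sentence")
tokens2 = tokenizer.tokenize("second one")
combined = tokenizer.add_special_tokens(tokens1, tokens2)
print([t.text for t in combined])  # -> ['first', 'sentence', 'second', 'one']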
Example #3
Source File: token_characters_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def __init__(
    self,
    namespace: str = "token_characters",
    character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
    start_tokens: List[str] = None,
    end_tokens: List[str] = None,
    min_padding_length: int = 0,
    token_min_padding_length: int = 0,
) -> None:
    super().__init__(token_min_padding_length)
    if min_padding_length == 0:
        url = "https://github.com/allenai/allennlp/issues/1954"
        warnings.warn(
            "You are using the default value (0) of `min_padding_length`, "
            f"which can cause some subtle bugs (more info see {url}). "
            "Strongly recommend to set a value, usually the maximum size "
            "of the convolutional layer size when using CnnEncoder.",
            UserWarning,
        )
    self._min_padding_length = min_padding_length
    self._namespace = namespace
    self._character_tokenizer = character_tokenizer
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
Example #4
Source File: token_characters_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[List[int]]]:
    indices: List[List[int]] = []
    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        token_indices: List[int] = []
        if token.text is None:
            raise ConfigurationError(
                "TokenCharactersIndexer needs a tokenizer that retains text"
            )
        for character in self._character_tokenizer.tokenize(token.text):
            if getattr(character, "text_id", None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just
                # use this id instead.
                index = character.text_id
            else:
                index = vocabulary.get_token_index(character.text, self._namespace)
            token_indices.append(index)
        indices.append(token_indices)
    return {"token_characters": indices}
Example #5
Source File: prolocal_dataset_reader.py From propara with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     sentence_tokens: List[str],
                     verb_vector: List[int],
                     entity_vector: List[int],
                     state_change_types: Optional[List[str]] = None,
                     state_change_tags: Optional[List[str]] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    # encode inputs
    token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
    fields['tokens'] = token_field
    fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
    fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

    # encode outputs
    if state_change_types:
        fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
    if state_change_tags:
        fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

    return Instance(fields)
Example #6
Source File: single_id_token_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def __init__(
    self,
    namespace: Optional[str] = "tokens",
    lowercase_tokens: bool = False,
    start_tokens: List[str] = None,
    end_tokens: List[str] = None,
    feature_name: str = "text",
    default_value: str = _DEFAULT_VALUE,
    token_min_padding_length: int = 0,
) -> None:
    super().__init__(token_min_padding_length)
    self.namespace = namespace
    self.lowercase_tokens = lowercase_tokens

    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
    self._feature_name = feature_name
    self._default_value = default_value
Example #7
Source File: knowledge_graph_field.py From gtos with MIT License | 6 votes |
def _span_overlap_fraction(self,
                           entity: str,
                           entity_text: List[Token],
                           token: Token,
                           token_index: int,
                           tokens: List[Token]) -> float:
    entity_words = set(entity_token.text for entity_token in entity_text)
    if not entity_words:
        # Some tables have empty cells.
        return 0
    seen_entity_words = set()
    token_index_left = token_index
    while token_index < len(tokens) and tokens[token_index].text in entity_words:
        seen_entity_words.add(tokens[token_index].text)
        token_index += 1
    while token_index_left >= 0 and tokens[token_index_left].text in entity_words:
        seen_entity_words.add(tokens[token_index_left].text)
        token_index_left -= 1
    return len(seen_entity_words) / len(entity_words)
Example #8
Source File: knowledge_graph_field.py From gtos with MIT License | 6 votes |
def _span_lemma_overlap_fraction(self,
                                 entity: str,
                                 entity_text: List[Token],
                                 token: Token,
                                 token_index: int,
                                 tokens: List[Token]) -> float:
    entity_lemmas = set(entity_token.lemma_ for entity_token in entity_text)
    if not entity_lemmas:
        # Some tables have empty cells.
        return 0
    seen_entity_lemmas = set()
    token_index_left = token_index
    while token_index < len(tokens) and tokens[token_index].lemma_ in entity_lemmas:
        seen_entity_lemmas.add(tokens[token_index].lemma_)
        token_index += 1
    while token_index_left >= 0 and tokens[token_index_left].lemma_ in entity_lemmas:
        seen_entity_lemmas.add(tokens[token_index_left].lemma_)
        token_index_left -= 1
    return len(seen_entity_lemmas) / len(entity_lemmas)
# pylint: enable=unused-argument,no-self-use
Example #9
Source File: character_tokenizer.py From allennlp with Apache License 2.0 | 6 votes |
def tokenize(self, text: str) -> List[Token]:
    if self._lowercase_characters:
        text = text.lower()
    if self._byte_encoding is not None:
        # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
        # of this.
        tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
    else:
        tokens = [Token(t) for t in list(text)]
    for start_token in self._start_tokens:
        if isinstance(start_token, int):
            token = Token(text_id=start_token, idx=0)
        else:
            token = Token(text=start_token, idx=0)
        tokens.insert(0, token)
    for end_token in self._end_tokens:
        if isinstance(end_token, int):
            token = Token(text_id=end_token, idx=0)
        else:
            token = Token(text=end_token, idx=0)
        tokens.append(token)
    return tokens
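A brief usage sketch for the method above. It is not from the original file and assumes CharacterTokenizer is exported from allennlp.data.tokenizers; start and end tokens are optional strings (or ints when a byte encoding is used).

# Usage sketch (not from the original file), assuming CharacterTokenizer is
# importable from allennlp.data.tokenizers.
from allennlp.data.tokenizers import CharacterTokenizer

tokenizer = CharacterTokenizer(start_tokens=["<s>"], end_tokens=["</s>"])
tokens = tokenizer.tokenize("Hi!")
print([t.text for t in tokens])  # -> ['<s>', 'H', 'i', '!', '</s>']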
Example #10
Source File: knowledge_graph_field.py From gtos with MIT License | 6 votes |
def _number_token_match(self,
                        entity: str,
                        entity_text: List[Token],
                        token: Token,
                        token_index: int,
                        tokens: List[Token]) -> float:
    # PNP had a "spanFeatures" function that said whether an entity was a-priori known to link
    # to a token or set of tokens in the question.  This was only used for numbers, and it's
    # not totally clear to me how this number feature overlapped with the token match features
    # in the original implementation (I think in most cases it was the same, except for things
    # like "four million", because the token match is derived from the entity name, which would
    # be 4000000, and wouldn't match "four million").
    #
    # Our implementation basically just adds a duplicate token match feature that's specific to
    # numbers.  It'll break in some rare cases (e.g., "Which four had four million ..."), but
    # those shouldn't be a big deal.
    if entity.startswith('fb:'):
        # This check works because numbers are the only entities that don't start with "fb:".
        return 0.0
    return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens)
Example #11
Source File: text_field.py From allennlp with Apache License 2.0 | 6 votes |
def get_padding_lengths(self) -> Dict[str, int]:
    """
    The `TextField` has a list of `Tokens`, and each `Token` gets converted into arrays by
    (potentially) several `TokenIndexers`. This method gets the max length (over tokens)
    associated with each of these arrays.
    """
    if self._indexed_tokens is None:
        raise ConfigurationError(
            "You must call .index(vocabulary) on a field before determining padding lengths."
        )

    padding_lengths = {}
    for indexer_name, indexer in self._token_indexers.items():
        indexer_lengths = indexer.get_padding_lengths(self._indexed_tokens[indexer_name])
        for key, length in indexer_lengths.items():
            padding_lengths[f"{indexer_name}___{key}"] = length
    return padding_lengths
Example #12
Source File: pretrained_transformer_tokenizer.py From allennlp with Apache License 2.0 | 6 votes |
def _intra_word_tokenize(
    self, string_tokens: List[str]
) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
    tokens: List[Token] = []
    offsets: List[Optional[Tuple[int, int]]] = []
    for token_string in string_tokens:
        wordpieces = self.tokenizer.encode_plus(
            token_string,
            add_special_tokens=False,
            return_tensors=None,
            return_offsets_mapping=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        wp_ids = wordpieces["input_ids"]

        if len(wp_ids) > 0:
            offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
            tokens.extend(
                Token(text=wp_text, text_id=wp_id)
                for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids))
            )
        else:
            offsets.append(None)
    return tokens, offsets
Example #13
Source File: pretrained_transformer_tokenizer.py From allennlp with Apache License 2.0 | 6 votes |
def intra_word_tokenize_sentence_pair(
    self, string_tokens_a: List[str], string_tokens_b: List[str]
) -> Tuple[List[Token], List[Tuple[int, int]], List[Tuple[int, int]]]:
    """
    Tokenizes each word into wordpieces separately and returns the wordpiece IDs.
    Also calculates offsets such that wordpieces[offsets[i][0]:offsets[i][1] + 1]
    corresponds to the original i-th token.

    This function inserts special tokens.
    """
    tokens_a, offsets_a = self._intra_word_tokenize(string_tokens_a)
    tokens_b, offsets_b = self._intra_word_tokenize(string_tokens_b)
    offsets_b = self._increment_offsets(
        offsets_b,
        (
            len(self.sequence_pair_start_tokens)
            + len(tokens_a)
            + len(self.sequence_pair_mid_tokens)
        ),
    )
    tokens_a = self.add_special_tokens(tokens_a, tokens_b)
    offsets_a = self._increment_offsets(offsets_a, len(self.sequence_pair_start_tokens))

    return tokens_a, offsets_a, offsets_b
Example #14
Source File: semeval_2010_task_8_reader.py From DISTRE with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int],
                     entity_2: Tuple[int],
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    tokens = [OpenAISplitter._standardize(token) for token in tokens]
    tokens = (['__start__'] + tokens[entity_1[0]:entity_1[1] + 1] + ['__del1__']
              + tokens[entity_2[0]:entity_2[1] + 1] + ['__del2__'] + tokens + ['__clf__'])

    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    # fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    # fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
Example #15
Source File: pretrained_transformer_tokenizer.py From allennlp with Apache License 2.0 | 6 votes |
def add_special_tokens(
    self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
) -> List[Token]:
    # Make sure we don't change the input parameters
    tokens1 = copy.deepcopy(tokens1)
    tokens2 = copy.deepcopy(tokens2)

    # We add special tokens and also set token type ids.
    if tokens2 is None:
        for token in tokens1:
            token.type_id = self.single_sequence_token_type_id
        return self.single_sequence_start_tokens + tokens1 + self.single_sequence_end_tokens
    else:
        for token in tokens1:
            token.type_id = self.sequence_pair_first_token_type_id
        for token in tokens2:
            token.type_id = self.sequence_pair_second_token_type_id
        return (
            self.sequence_pair_start_tokens
            + tokens1
            + self.sequence_pair_mid_tokens
            + tokens2
            + self.sequence_pair_end_tokens
        )
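A possible usage sketch for the sequence-pair path above. It is not from the original file; it downloads a Hugging Face model at runtime and assumes the allennlp 1.x constructor signature PretrainedTransformerTokenizer(model_name, add_special_tokens=...), so treat the names and the exact output as assumptions.

# Usage sketch (an assumption, not from the original file). Tokenize two pieces of
# text without special tokens, then let add_special_tokens combine them and set type ids.
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased", add_special_tokens=False)
tokens_a = tokenizer.tokenize("first sentence")
tokens_b = tokenizer.tokenize("second one")
combined = tokenizer.add_special_tokens(tokens_a, tokens_b)
print([t.text for t in combined])
# For a BERT-style model this is expected to look like:
# ['[CLS]', 'first', 'sentence', '[SEP]', 'second', 'one', '[SEP]']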
Example #16
Source File: propara_dataset_reader_test.py From propara with Apache License 2.0 | 6 votes |
def test_find_span(self):
    sentence = [Token("My"), Token("car"), Token("is"), Token("-"), Token("grey"), Token("?")]

    # Single token
    assert _find_span([Token("car")], sentence) == (1, 1)

    # Multi token
    assert _find_span([Token("My"), Token("car")], sentence) == (0, 1)

    # Case insensitive
    assert _find_span([Token("my"), Token("car")], sentence) == (0, 1)

    # Not in sentence
    assert _find_span([Token("my"), Token("truck")], sentence) == (-1, -1)

    # Unknown
    assert _find_span([Token("?")], sentence) == (-2, -2)

    # Absent
    assert _find_span([Token("-")], sentence) == (-3, -3)
Example #17
Source File: character_tokenizer.py From magnitude with MIT License | 6 votes |
def tokenize(self, text):
    if self._lowercase_characters:
        text = text.lower()
    if self._byte_encoding is not None:
        # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
        # of this.
        tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
    else:
        tokens = [Token(t) for t in list(text)]
    for start_token in self._start_tokens:
        if isinstance(start_token, int):
            token = Token(text_id=start_token, idx=0)
        else:
            token = Token(text=start_token, idx=0)
        tokens.insert(0, token)
    for end_token in self._end_tokens:
        if isinstance(end_token, int):
            token = Token(text_id=end_token, idx=0)
        else:
            token = Token(text=end_token, idx=0)
        tokens.append(token)
    return tokens
Example #18
Source File: word_stemmer.py From magnitude with MIT License | 5 votes |
def stem_word(self, word):
    u"""
    Returns a new ``Token`` with ``word.text`` replaced by a stemmed word.
    """
    raise NotImplementedError
Example #19
Source File: bling_fire_tokenizer.py From transformer-kernel-ranking with Apache License 2.0 | 5 votes |
def tokenize(self, sentence: str) -> List[Token]:
    return [Token(t) for t in text_to_words(sentence).split()]
Example #20
Source File: bert_field.py From r2c with MIT License | 5 votes |
def __init__(self,
             tokens: List[Token],
             embs: numpy.ndarray,
             padding_value: int = 0,
             token_indexers=None) -> None:
    self.tokens = tokens
    self.embs = embs
    self.padding_value = padding_value

    if len(self.tokens) != self.embs.shape[0]:
        raise ValueError("The tokens you passed into the BERTField, {} "
                         "aren't the same size as the embeddings of "
                         "shape {}".format(self.tokens, self.embs.shape))
    assert len(self.tokens) == self.embs.shape[0]
Example #21
Source File: word_splitter_test.py From magnitude with MIT License | 5 votes |
def test_tokenize_handles_unicode_letters(self):
    # Note: the extra spaces in the sentence are deliberate; the expected `idx`
    # offsets (0, 3, 10, 17) only line up with the padded string.
    sentence = u"HAL9000   and    Ångström"
    expected_tokens = [Token(u"HAL", 0),
                       Token(u"9000", 3),
                       Token(u"and", 10),
                       Token(u"Ångström", 17)]
    tokens = self.word_splitter.split_words(sentence)
    assert [t.text for t in tokens] == [t.text for t in expected_tokens]
    assert [t.idx for t in tokens] == [t.idx for t in expected_tokens]
Example #22
Source File: text_field.py From magnitude with MIT License | 5 votes |
def __init__(self, tokens, token_indexers):
    self.tokens = tokens
    self._token_indexers = token_indexers
    self._indexed_tokens = None
    self._indexer_name_to_indexed_token = None

    if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
        raise ConfigurationError(u"TextFields must be passed Tokens. "
                                 u"Found: {} with types {}.".format(tokens, [type(x) for x in tokens]))

#overrides
Example #23
Source File: knowledge_graph_field.py From gtos with MIT License | 5 votes |
def _exact_token_match(self,
                       entity: str,
                       entity_text: List[Token],
                       token: Token,
                       token_index: int,
                       tokens: List[Token]) -> float:
    if len(entity_text) != 1:
        return 0.0
    return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens)
Example #24
Source File: tokenizer.py From magnitude with MIT License | 5 votes |
def tokenize(self, text):
    u"""
    Actually implements splitting words into tokens.

    Returns
    -------
    tokens : ``List[Token]``
    """
    raise NotImplementedError
Example #25
Source File: word_tokenizer.py From magnitude with MIT License | 5 votes |
def _filter_and_stem(self, words):
    filtered_words = self._word_filter.filter_words(words)
    stemmed_words = [self._word_stemmer.stem_word(word) for word in filtered_words]
    for start_token in self._start_tokens:
        stemmed_words.insert(0, Token(start_token, 0))
    for end_token in self._end_tokens:
        stemmed_words.append(Token(end_token, -1))
    return stemmed_words
Example #26
Source File: word_splitter.py From magnitude with MIT License | 5 votes |
def split_words(self, sentence):
    # This works because our Token class matches spacy's.
    return _remove_spaces(self.spacy(sentence))
Example #27
Source File: word_splitter.py From magnitude with MIT License | 5 votes |
def split_words(self, sentence):
    return [Token(t) for t in sentence.split()]
Example #28
Source File: word_splitter.py From magnitude with MIT License | 5 votes |
def split_words(self, sentence):
    u"""
    Splits a sentence into word tokens.  We handle four kinds of things: words with punctuation
    that should be ignored as a special case (Mr. Mrs., etc.), contractions/genitives (isn't,
    don't, Matt's), and beginning and ending punctuation ("antennagate", (parentheticals), and
    such.).

    The basic outline is to split on whitespace, then check each of these cases.  First, we
    strip off beginning punctuation, then strip off ending punctuation, then strip off
    contractions.  When we strip something off the beginning of a word, we can add it to the
    list of tokens immediately.  When we strip it off the end, we have to save it to be added
    to after the word itself has been added.  Before stripping off any part of a token, we
    first check to be sure the token isn't in our list of special cases.
    """
    fields = sentence.split()
    tokens = []
    for field in fields:
        add_at_end = []
        while self._can_split(field) and field[0] in self.beginning_punctuation:
            tokens.append(Token(field[0]))
            field = field[1:]
        while self._can_split(field) and field[-1] in self.ending_punctuation:
            add_at_end.insert(0, Token(field[-1]))
            field = field[:-1]

        # There could (rarely) be several contractions in a word, but we check contractions
        # sequentially, in a random order.  If we've removed one, we need to check again to be
        # sure there aren't others.
        remove_contractions = True
        while remove_contractions:
            remove_contractions = False
            for contraction in self.contractions:
                if self._can_split(field) and field.lower().endswith(contraction):
                    add_at_end.insert(0, Token(field[-len(contraction):]))
                    field = field[:-len(contraction)]
                    remove_contractions = True
        if field:
            tokens.append(Token(field))
        tokens.extend(add_at_end)
    return tokens
Example #29
Source File: word_splitter.py From magnitude with MIT License | 5 votes |
def split_words(self, sentence):
    u"""
    Splits ``sentence`` into a list of :class:`Token` objects.
    """
    raise NotImplementedError
Example #30
Source File: dep_label_indexer.py From magnitude with MIT License | 5 votes |
def count_vocab_items(self, token, counter):
    dep_label = token.dep_
    if not dep_label:
        if token.text not in self._logged_errors:
            logger.warning(u"Token had no dependency label: %s", token.text)
            self._logged_errors.add(token.text)
        dep_label = u'NONE'
    counter[self.namespace][dep_label] += 1

#overrides