Python allennlp.data.fields.TextField() Examples
The following are 30 code examples of allennlp.data.fields.TextField(), drawn from open-source projects. The source file, originating project, and license are noted above each example. Related functions and classes can be found in the allennlp.data.fields module.
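Before the examples, here is a minimal sketch of the typical TextField lifecycle: construct the field from tokens and token indexers, build a Vocabulary, index the field, then pad and convert it to tensors. This is an illustrative sketch assuming a recent AllenNLP (1.x/2.x) API, not taken from any of the projects below; the field and namespace names are arbitrary.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Wrap pre-tokenized text in a TextField; the dict maps an arbitrary indexer
# name to the TokenIndexer that will turn tokens into ids.
tokens = [Token(t) for t in ["This", "is", "a", "sentence", "."]]
field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"text": field})

# Build a vocabulary from the instance, index the field against it,
# then pad and convert to tensors.
vocab = Vocabulary.from_instances([instance])
field.index(vocab)
padding_lengths = field.get_padding_lengths()
tensor_dict = field.as_tensor(padding_lengths)  # nested dict of torch tensors

Most of the examples below follow this same pattern, either inside a DatasetReader's text_to_instance()/_read() methods or in unit tests of field behavior.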
Example #1
Source File: instance_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_duplicate(self):
    # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
    # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
    instance = Instance(
        {
            "words": TextField(
                [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
            )
        }
    )
    other = instance.duplicate()
    assert other == instance

    # Adding new fields to the original instance should not affect the duplicate.
    instance.add_field("labels", LabelField("some_label"))
    assert "labels" not in other.fields
    assert other != instance  # sanity check on the '__eq__' method.
Example #2
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    # print([t.text for t in context_tokens])
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #3
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field})
Example #4
Source File: ebmnlp.py From scibert with Apache License 2.0 | 6 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                tokens_, _, _, pico_tags = fields
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, pico_tags)
Example #5
Source File: semeval_2010_task_8_reader.py From DISTRE with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int],
                     entity_2: Tuple[int],
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    tokens = [OpenAISplitter._standardize(token) for token in tokens]
    tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']

    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    # fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    # fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
Example #6
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]

    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example #7
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #8
Source File: dataset_test.py From allennlp with Apache License 2.0 | 6 votes |
def get_instances(self):
    field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
    )
    field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
        self.token_indexer,
    )
    field3 = TextField(
        [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
    )
    field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
    instances = [
        Instance({"text1": field1, "text2": field2}),
        Instance({"text1": field3, "text2": field4}),
    ]
    return instances
Example #9
Source File: vocabulary_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding="utf-8")
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)

    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)
    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
    assert indexed_tokens == indexed_tokens2
Example #10
Source File: vocabulary_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField(
        [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
    )
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, `directory` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)
Example #11
Source File: vocabulary_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_max_vocab_size_partial_dict(self):
    indexers = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }
    instance = Instance(
        {
            "text": TextField(
                [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
            )
        }
    )
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})

    vocab = Vocabulary.from_params(params=params, instances=dataset)
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2
Example #12
Source File: index_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_equality(self):
    index_field1 = IndexField(4, self.text)
    index_field2 = IndexField(4, self.text)
    index_field3 = IndexField(
        4,
        TextField(
            [Token(t) for t in ["AllenNLP", "is", "the", "bomb", "!"]],
            {"words": SingleIdTokenIndexer("words")},
        ),
    )

    assert index_field1 == 4
    assert index_field1 == index_field1
    assert index_field1 == index_field2

    assert index_field1 != index_field3
    assert index_field2 != index_field3
    assert index_field3 == index_field3
Example #13
Source File: text_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_as_tensor_handles_characters(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={
            "characters": TokenCharactersIndexer("characters", min_padding_length=1)
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    expected_character_array = numpy.array(
        [
            [1, 1, 1, 3, 0, 0, 0, 0],
            [1, 3, 0, 0, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, 0, 0, 0],
            [3, 4, 5, 6, 4, 5, 7, 4],
            [1, 0, 0, 0, 0, 0, 0, 0],
        ]
    )
    numpy.testing.assert_array_almost_equal(
        tensor_dict["characters"]["token_characters"].detach().cpu().numpy(),
        expected_character_array,
    )
Example #14
Source File: text_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_token_padding_lengths_are_computed_correctly(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
            "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
            "characters": TokenCharactersIndexer(
                "characters", min_padding_length=1, token_min_padding_length=3
            ),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 3,
        "words___tokens": 3,
        "characters___token_characters": 3,
        "characters___num_token_characters": 8,
    }
    tensors = field.as_tensor(padding_lengths)
    assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
    assert tensors["words"]["tokens"].tolist()[-1] == 0
    assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8
Example #15
Source File: conll2003.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in izip(*fields)]
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens]

                yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
Example #16
Source File: snli.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     premise,
                     hypothesis,
                     label=None):
    # pylint: disable=arguments-differ
    fields = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
    fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
    fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields[u'label'] = LabelField(label)

    metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    fields[u"metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #17
Source File: semantic_role_labeling.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     verb_label,
                     tags=None):
    u"""
    We take `pre-tokenized` input here, along with a verb label. The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    fields = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields[u'tokens'] = text_field
    fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if tags:
        fields[u'tags'] = SequenceLabelField(tags, text_field)

    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text

    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                         u"verb": verb})
    return Instance(fields)
Example #18
Source File: seq2seq.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, source_string, target_string=None):  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
    else:
        return Instance({u'source_tokens': source_field})
Example #19
Source File: ebmnlp.py From scibert with Apache License 2.0 | 5 votes |
def text_to_instance(self, tokens: List[Token], pico_tags: List[str] = None):
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Set the field 'labels' according to the specified PIO element
    if pico_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pico_tags, sequence, self.label_namespace)

    return Instance(instance_fields)
Example #20
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_tensor_handles_longer_lengths(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    padding_lengths["words___tokens"] = 10
    tensor_dict = field.as_tensor(padding_lengths)

    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"].detach().cpu().numpy(),
        numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]),
    )
Example #21
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_tensor_handles_words(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)

    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])
    )
Example #22
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    with pytest.raises(ConfigurationError):
        field.get_padding_lengths()
Example #23
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace="words")
    capital_a_index = vocab.add_token_to_namespace("A", namespace="words")
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace="characters")
    s_index = vocab.add_token_to_namespace("s", namespace="characters")
    e_index = vocab.add_token_to_namespace("e", namespace="characters")
    n_index = vocab.add_token_to_namespace("n", namespace="characters")
    t_index = vocab.add_token_to_namespace("t", namespace="characters")
    c_index = vocab.add_token_to_namespace("c", namespace="characters")

    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"words": SingleIdTokenIndexer(namespace="words")},
    )
    field.index(vocab)

    assert field._indexed_tokens["words"]["tokens"] == [capital_a_index, sentence_index]

    field1 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1)},
    )
    field1.index(vocab)
    assert field1._indexed_tokens["characters"]["token_characters"] == [
        [capital_a_char_index],
        [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
    ]
    field2 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "words": SingleIdTokenIndexer(namespace="words"),
            "characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1),
        },
    )
    field2.index(vocab)
    assert field2._indexed_tokens["words"]["tokens"] == [capital_a_index, sentence_index]
    assert field2._indexed_tokens["characters"]["token_characters"] == [
        [capital_a_char_index],
        [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
    ]
Example #24
Source File: ontonotes_ner.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    # Add "tag label" to instance
    if ner_tags is not None:
        if self._coding_scheme == u"BIOUL":
            ner_tags = to_bioul(ner_tags, encoding=u"BIO")
        instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
Example #25
Source File: classification_dataset_reader.py From scibert with Apache License 2.0 | 5 votes |
def text_to_instance(self, text: str, label: str = None, metadata: Any = None) -> Instance:  # type: ignore
    text_tokens = self._tokenizer.tokenize(text)
    fields = {
        'text': TextField(text_tokens, self._token_indexers),
    }
    if label is not None:
        fields['label'] = LabelField(label)

    if metadata:
        fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
Example #26
Source File: index_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def setup_method(self):
    super().setup_method()
    self.text = TextField(
        [Token(t) for t in ["here", "is", "a", "sentence", "."]],
        {"words": SingleIdTokenIndexer("words")},
    )
Example #27
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_empty_list_can_be_tensorized(self):
    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    list_field = ListField([text_field.empty_field()])
    fields = {
        "list": list_field,
        "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
    }
    instance = Instance(fields)
    instance.index_fields(self.vocab)
    instance.as_tensor_dict()
Example #28
Source File: sequence_tagging.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, tokens, tags=None):  # type: ignore
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields = {}
    sequence = TextField(tokens, self._token_indexers)
    fields[u"tokens"] = sequence
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    if tags is not None:
        fields[u"tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #29
Source File: span_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_equality(self):
    span_field1 = SpanField(2, 3, self.text)
    span_field2 = SpanField(2, 3, self.text)
    span_field3 = SpanField(
        2, 3, TextField([Token(t) for t in ["not", "the", "same", "tokens"]], self.indexers)
    )

    assert span_field1 == (2, 3)
    assert span_field1 == span_field1
    assert span_field1 == span_field2

    assert span_field1 != span_field3
    assert span_field2 != span_field3
    assert span_field3 == span_field3
Example #30
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_token_indexer_returns_dict(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(),
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 2,
        "words___tokens": 2,
        "characters___token_characters": 2,
        "characters___num_token_characters": 8,
    }
    padding_lengths["field_with_dict___token_ids"] = 7
    padding_lengths["field_with_dict___additional_key"] = 3
    padding_lengths["words___tokens"] = 4
    padding_lengths["characters___token_characters"] = 4
    tensors = field.as_tensor(padding_lengths)
    assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
    assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
    assert list(tensors["words"]["tokens"].shape) == [4]
    assert list(tensors["characters"]["token_characters"].shape) == [4, 8]