Python allennlp.common.util.ensure_list() Examples

The following are 30 code examples of allennlp.common.util.ensure_list(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.common.util, or try the search function.
Example #1
Source File: quora_paraphrase_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_read_from_file(self, lazy):
        """Check that the Quora paraphrase TSV fixture yields the expected instances."""
        reader = QuoraParaphraseDatasetReader(lazy=lazy)
        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'quora_paraphrase.tsv'))

        expected = [
            {u"premise": u"What should I do to avoid sleeping in class ?".split(),
             u"hypothesis": u"How do I not sleep in a boring class ?".split(),
             u"label": u"1"},
            {u"premise": u"Do women support each other more than men do ?".split(),
             u"hypothesis": u"Do women need more compliments than men ?".split(),
             u"label": u"0"},
            {u"premise": u"How can one root android devices ?".split(),
             u"hypothesis": u"How do I root an Android device ?".split(),
             u"label": u"1"},
        ]

        assert len(instances) == 3

        for instance, expected_instance in izip(instances, expected):
            fields = instance.fields
            assert [t.text for t in fields[u"premise"].tokens] == expected_instance[u"premise"]
            assert [t.text for t in fields[u"hypothesis"].tokens] == expected_instance[u"hypothesis"]
            assert fields[u"label"].label == expected_instance[u"label"]
Example #2
Source File: imdb_test.py    From topic-rnn with Apache License 2.0 6 votes vote down vote up
def test_read_from_file(self):
        """The IMDB language-modeling fixture parses into ten source/target instances."""
        # pylint: disable=R0201
        reader = IMDBLanguageModelingReader()
        instances = ensure_list(reader.read(TestIMDBReader.DATASET_PATH))

        assert len(instances) == 10
        # Spot-check a few instances against the fixture's expected values.
        for index, expected in [(0, TestIMDBReader.INSTANCE_0),
                                (1, TestIMDBReader.INSTANCE_1),
                                (7, TestIMDBReader.INSTANCE_7)]:
            fields = instances[index].fields
            assert [t.text for t in fields["source"].tokens] == expected["source"]
            assert [t.text for t in fields["target"].tokens] == expected["target"]
Example #3
Source File: conll2003_dataset_reader_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_read_from_file(self, lazy, coding_scheme):
        """Read the CoNLL-2003 fixture and verify tokens and tags for each coding scheme."""
        conll_reader = Conll2003DatasetReader(lazy=lazy, coding_scheme=coding_scheme)
        instances = ensure_list(
            conll_reader.read(unicode(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'conll2003.txt')))

        if coding_scheme == u'IOB1':
            expected_labels = [u'I-ORG', u'O', u'I-PER', u'O', u'O', u'I-LOC', u'O']
        else:
            expected_labels = [u'U-ORG', u'O', u'U-PER', u'O', u'O', u'U-LOC', u'O']

        expected_sentences = [
            [u'U.N.', u'official', u'Ekeus', u'heads', u'for', u'Baghdad', u'.'],
            [u'AI2', u'engineer', u'Joel', u'lives', u'in', u'Seattle', u'.'],
        ]
        # Both sentences share the same tag pattern under either scheme.
        for index, expected_tokens in enumerate(expected_sentences):
            fields = instances[index].fields
            assert [t.text for t in fields[u'tokens'].tokens] == expected_tokens
            assert fields[u"tags"].labels == expected_labels
Example #4
Source File: sequence_tagging_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_brown_corpus_format(self):
        """Slash-delimited Brown-corpus lines should produce four tagged sentences."""
        reader = SequenceTaggingDatasetReader(word_tag_delimiter=u'/')
        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'brown_corpus.txt'))

        assert len(instances) == 4
        # All four sentences are "<subject> are animals ." with identical tags.
        expected_tags = [u"N", u"V", u"N", u"N"]
        for index, subject in enumerate([u"cats", u"dogs", u"snakes", u"birds"]):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == [subject, u"are", u"animals", u"."]
            assert fields[u"tags"].labels == expected_tags
Example #5
Source File: sequence_tagging_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_default_format(self, lazy):
        """The default TSV format yields four token/tag-annotated sentences."""
        reader = SequenceTaggingDatasetReader(lazy=lazy)
        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv'))

        assert len(instances) == 4
        # All four sentences are "<subject> are animals ." with identical tags.
        expected_tags = [u"N", u"V", u"N", u"N"]
        for index, subject in enumerate([u"cats", u"dogs", u"snakes", u"birds"]):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == [subject, u"are", u"animals", u"."]
            assert fields[u"tags"].labels == expected_tags
Example #6
Source File: stanford_sentiment_tree_bank_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_2_class(self):
        """Binary granularity collapses SST sentiment to labels 0/1."""
        reader = StanfordSentimentTreeBankDatasetReader(granularity=u"2-class")
        instances = ensure_list(reader.read(self.sst_path))

        expected = [
            ([u"The", u"actors", u"are", u"fantastic", u"."], u"1"),
            ([u"It", u"was", u"terrible", u"."], u"0"),
        ]

        assert len(instances) == 2
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == tokens
            assert fields[u"label"].label == label
Example #7
Source File: stanford_sentiment_tree_bank_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_3_class(self):
        """Ternary granularity maps SST sentiment onto labels 0/1/2."""
        reader = StanfordSentimentTreeBankDatasetReader(granularity=u"3-class")
        instances = ensure_list(reader.read(self.sst_path))

        expected = [
            ([u"The", u"actors", u"are", u"fantastic", u"."], u"2"),
            ([u"It", u"was", u"terrible", u"."], u"0"),
            ([u"Chomp", u"chomp", u"!"], u"1"),
        ]

        assert len(instances) == 3
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == tokens
            assert fields[u"label"].label == label
Example #8
Source File: stanford_sentiment_tree_bank_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_use_subtrees(self):
        """With use_subtrees every labelled subtree becomes its own instance."""
        reader = StanfordSentimentTreeBankDatasetReader(use_subtrees=True)
        instances = ensure_list(reader.read(self.sst_path))

        expected = [
            ([u"The", u"actors", u"are", u"fantastic", u"."], u"4"),
            ([u"The", u"actors"], u"2"),
            ([u"The"], u"2"),
        ]

        # 21 subtrees come out of the two-sentence fixture; check the first three.
        assert len(instances) == 21
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == tokens
            assert fields[u"label"].label == label
Example #9
Source File: stanford_sentiment_tree_bank_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_read_from_file(self, lazy):
        """Default SST reading yields one five-way-labelled instance per sentence."""
        reader = StanfordSentimentTreeBankDatasetReader(lazy=lazy)
        instances = ensure_list(reader.read(self.sst_path))

        expected = [
            ([u"The", u"actors", u"are", u"fantastic", u"."], u"4"),
            ([u"It", u"was", u"terrible", u"."], u"0"),
            ([u"Chomp", u"chomp", u"!"], u"2"),
        ]

        assert len(instances) == 3
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == tokens
            assert fields[u"label"].label == label
Example #10
Source File: language_modeling_dataset_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_read_from_file(self, lazy):
        """Three-token LM windows should slide over the fixture text."""
        reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)

        instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'language_modeling.txt'))
        # The last potential instance is left out, which is ok, because we don't have an end token
        # in here, anyway.
        assert len(instances) == 5

        expected_pairs = [
            ([u"This", u"is", u"a"], [u"is", u"a", u"sentence"]),
            ([u"sentence", u"for", u"language"], [u"for", u"language", u"modelling"]),
            ([u"modelling", u".", u"Here"], [u".", u"Here", u"'s"]),
            ([u"'s", u"another", u"one"], [u"another", u"one", u"for"]),
            ([u"for", u"extra", u"language"], [u"extra", u"language", u"modelling"]),
        ]
        # Output is the input shifted one token to the right.
        for index, (input_tokens, output_tokens) in enumerate(expected_pairs):
            fields = instances[index].fields
            assert [t.text for t in fields[u"input_tokens"].tokens] == input_tokens
            assert [t.text for t in fields[u"output_tokens"].tokens] == output_tokens
Example #11
Source File: test_tacred_reader.py    From kb with Apache License 2.0 6 votes vote down vote up
def test_tacred_dataset_reader(self):
        """The TACRED fixture yields three instances with expected tokens and label."""
        reader = get_reader()
        instances = ensure_list(reader.read('tests/fixtures/tacred/LDC2018T24.json'))

        # Check number of instances is correct
        self.assertEqual(len(instances), 3)

        # The wordpiece sequence opens with [CLS] subj [SEP] obj [SEP] ...
        first_tokens = [token.text for token in instances[0]['tokens']]
        self.assertListEqual(
            first_tokens[:6],
            ['[CLS]', 'douglas', 'flint', '[SEP]', 'chairman', '[SEP]'])
        # ... and closes with the sentence tail plus a final [SEP].
        self.assertListEqual(
            first_tokens[-6:],
            ['a', 'govern', '##ment', '[UNK]', '.', '[SEP]'])

        # Check that first instance's label is correct
        self.assertEqual(instances[0]['label_ids'].label, LABEL_MAP['per:title'])
Example #12
Source File: test_tacred_reader.py    From kb with Apache License 2.0 6 votes vote down vote up
def test_entity_mask(self):
        """Entity-masking modes replace subject/object tokens as configured."""
        reader = get_reader()

        # 'mask' mode: both entity positions become the generic [MASK] token.
        reader.entity_masking = 'mask'
        instances = ensure_list(reader.read('tests/fixtures/tacred/LDC2018T24.json'))
        tokens = [token.text for token in instances[0]['tokens']]
        self.assertEqual(tokens[14], '[MASK]')  # subject position
        self.assertEqual(tokens[17], '[MASK]')  # object position

        # 'type/role' mode: positions encode entity type plus subject/object role.
        reader.entity_masking = 'type/role'
        instances = ensure_list(reader.read('tests/fixtures/tacred/LDC2018T24.json'))
        tokens = [token.text for token in instances[0]['tokens']]
        self.assertEqual(tokens[14], '[s-person]')  # subject position
        self.assertEqual(tokens[17], '[o-title]')   # object position
Example #13
Source File: test_dataset_reader_main.py    From scicite with Apache License 2.0 6 votes vote down vote up
def test_read_from_file(self):
        """Each ACL-ARC reader variant should load its JSONL fixture correctly."""
        reader = AclarcDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/aclarc-train.jsonl'))
        expected = {"citation_text": ['Typical', 'examples', 'are', 'Bulgarian']}
        assert len(instances) == 10
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens][:4] == expected['citation_text']

        reader = AclSectionTitleDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/aclarc-section-title.jsonl'))
        expected = {"section_name": 'related work', "citation_text": ['With', 'C99']}
        assert len(instances) == 10
        fields = instances[1].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens][:2] == expected['citation_text']
        assert fields['section_label'].label == expected['section_name']

        reader = AclCiteWorthinessDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/aclarc-cite-worthiness.jsonl'))
        expected = {"is_citation": 'False'}
        fields = instances[1].fields
        assert isinstance(instances, list)
        assert fields['is_citation'].label == expected['is_citation']
Example #14
Source File: sequence_tagging_test.py    From allennlp with Apache License 2.0 6 votes vote down vote up
def test_brown_corpus_format(self):
        """Slash-delimited Brown-corpus lines should produce four tagged sentences."""
        reader = SequenceTaggingDatasetReader(word_tag_delimiter="/")
        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt"))

        assert len(instances) == 4
        # All four sentences are "<subject> are animals ." with identical tags.
        expected_tags = ["N", "V", "N", "N"]
        for index, subject in enumerate(["cats", "dogs", "snakes", "birds"]):
            fields = instances[index].fields
            assert [t.text for t in fields["tokens"].tokens] == [subject, "are", "animals", "."]
            assert fields["tags"].labels == expected_tags
Example #15
Source File: semisupervised_text_classification_json_test.py    From vampire with Apache License 2.0 6 votes vote down vote up
def test_read_from_file_and_truncates_properly(self):
        """max_sequence_length=5 should truncate every document to five tokens."""
        reader = SemiSupervisedTextClassificationJsonReader(max_sequence_length=5)
        ag_path = self.FIXTURES_ROOT / "imdb" / "train.jsonl"
        instances = ensure_list(reader.read(ag_path))

        expected = [
            (['...', 'And', 'I', 'never', 'thought'], "neg"),
            (['The', 'fight', 'scenes', 'were', 'great'], "pos"),
            (['The', 'only', 'way', 'this', 'is'], "neg"),
        ]

        assert len(instances) == 3
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields["tokens"].tokens] == tokens
            assert fields["label"].label == label
Example #16
Source File: seq2seq_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_default_format(self, lazy):
        """Seq2seq TSV rows gain @start@/@end@ markers on both sides."""
        reader = Seq2SeqDatasetReader(lazy=lazy)
        instances = ensure_list(
            reader.read(unicode(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'seq2seq_copy.tsv')))

        expected_sequences = [
            [u"@start@", u"this", u"is", u"a", u"sentence", u"@end@"],
            [u"@start@", u"this", u"is", u"another", u"@end@"],
            [u"@start@", u"all", u"these", u"sentences", u"should", u"get", u"copied", u"@end@"],
        ]

        assert len(instances) == 3
        # This is a copy task, so source and target token sequences are identical.
        for index, expected in enumerate(expected_sequences):
            fields = instances[index].fields
            assert [t.text for t in fields[u"source_tokens"].tokens] == expected
            assert [t.text for t in fields[u"target_tokens"].tokens] == expected
Example #17
Source File: text_classification_json_test.py    From allennlp with Apache License 2.0 6 votes vote down vote up
def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy):
        """AG News documents are read and truncated to max_sequence_length tokens."""
        reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "ag_news_corpus.jsonl"
        )
        instances = ensure_list(reader.read(ag_path))

        expected = [
            (["Memphis", "Rout", "Still", "Stings", "for"], "2"),
            (["AP", "-", "Eli", "Manning", "has"], "2"),
            (["A", "conference", "dedicated", "to", "online"], "4"),
        ]

        assert len(instances) == 3
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields["tokens"].tokens] == tokens
            assert fields["label"].label == label
Example #18
Source File: srl_dataset_reader_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_srl_reader_can_filter_by_domain(self):
        """Restricting to one Ontonotes subdomain filters out other folders' instances."""
        reader = SrlReader(domain_identifier=u"subdomain2")
        instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'conll_2012'))
        # If we'd included the folder, we'd have 9 instances.
        assert len(instances) == 2
Example #19
Source File: copynet_test.py    From nlp-models with MIT License 5 votes vote down vote up
def setUp(self):
        """Load the CopyNet fixture reader, its instances, and the vocabulary."""
        super(TestCopyNetReader, self).setUp()
        config = Params.from_file("nlpete/tests/fixtures/copynet/experiment.json")
        self.reader = DatasetReader.from_params(config["dataset_reader"])
        dataset = self.reader.read("nlpete/tests/fixtures/copynet/copyover.tsv")
        self.instances = ensure_list(dataset)
        # Vocabulary is built from the (possibly lazy) dataset, not the listed copy.
        self.vocab = Vocabulary.from_params(params=config["vocabulary"], instances=dataset)
Example #20
Source File: nl2bash_test.py    From nlp-models with MIT License 5 votes vote down vote up
def setUp(self):
        """Read the NL2Bash training fixture into a concrete instance list."""
        super(TestNL2BashReader, self).setUp()
        self.reader = NL2BashDatasetReader("target_tokens")
        self.instances = ensure_list(self.reader.read("nlpete/tests/fixtures/nl2bash/train.tsv"))
Example #21
Source File: prolocal_dataset_reader_test.py    From propara with Apache License 2.0 5 votes vote down vote up
def test_read_from_file(self):
        """The ProLocal toy TSV should yield instances with spans, labels, and tags.

        Fix: removed a leftover debug ``print(fields)`` that polluted test output.
        """
        sc_reader = ProLocalDatasetReader()
        instances = ensure_list(sc_reader.read('tests/fixtures/prolocal_toy_data.tsv'))

        # First instance: "Green plants absorb water from the soil" (MOVE).
        fields = instances[0].fields
        correct_tokens = ["Green", "plants", "absorb", "water", "from", "the", "soil"]
        read_tokens = [t.text for t in fields["tokens"].tokens]
        assert correct_tokens == read_tokens
        assert fields["entity_span"].labels == [0, 0, 0, 1, 0, 0, 0]
        assert fields["verb_span"].labels == [0, 0, 1, 0, 0, 0, 0]

        assert fields["state_change_type_labels"].label == 'MOVE'
        assert fields["state_change_tags"].labels == ['B-LOC-TO', 'I-LOC-TO', 'O', 'O', 'O', 'B-LOC-FROM', 'I-LOC-FROM']

        # Second instance: "Rocks in the shore break" (DESTROY).
        fields = instances[1].fields
        read_tokens = [t.text for t in fields["tokens"].tokens]
        assert read_tokens == ["Rocks", "in", "the", "shore", "break"]
        assert fields["entity_span"].labels == [1, 0, 0, 0, 0]
        assert fields["verb_span"].labels == [0, 0, 0, 0, 1]

        assert fields["state_change_type_labels"].label == 'DESTROY'
        assert fields["state_change_tags"].labels == ['O', 'O', 'B-LOC-FROM', 'I-LOC-FROM', 'O']
Example #22
Source File: text_classification_json_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_set_skip_indexing_true(self, lazy):
        """Integer labels pass through unindexed; string labels raise ValueError."""
        reader = TextClassificationJsonReader(lazy=lazy, skip_label_indexing=True)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "integer_labels.jsonl"
        )
        instances = ensure_list(reader.read(ag_path))

        expected = [
            (["This", "text", "has", "label", "0"], 0),
            (["This", "text", "has", "label", "1"], 1),
        ]

        assert len(instances) == 2
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields["tokens"].tokens] == tokens
            assert fields["label"].label == label

        # Non-integer labels must be rejected when indexing is skipped.
        with pytest.raises(ValueError) as exec_info:
            ag_path = (
                AllenNlpTestCase.FIXTURES_ROOT
                / "data"
                / "text_classification_json"
                / "imdb_corpus.jsonl"
            )
            ensure_list(reader.read(ag_path))
        assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True."
Example #23
Source File: test_kg_probe_reader.py    From kb with Apache License 2.0 5 votes vote down vote up
def test_kg_probe_reader(self):
        """Masked spans in the KG probe fixture produce [MASK] tokens and indicators."""
        reader = get_reader()
        instances = ensure_list(reader.read('tests/fixtures/kg_probe/file1.txt'))

        # Check instances are correct length
        self.assertEqual(len(instances), 2)

        expected_cases = [
            (['[CLS]', '[MASK]', '[MASK]', '[UNK]', 'quick',
              '##est', '.', '[SEP]'],
             [0, 1, 1, 0, 0, 0, 0, 0]),
            (['[CLS]', 'the', 'brown', 'fox', 'jumped', 'over',
              'the', '[MASK]', '[MASK]', '[MASK]', '[MASK]',
              '.', '[SEP]'],
             [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0]),
        ]
        # The mask indicator flags exactly the [MASK] positions.
        for index, (expected_tokens, expected_mask) in enumerate(expected_cases):
            tokens = [x.text for x in instances[index]['tokens'].tokens]
            self.assertListEqual(expected_tokens, tokens)
            mask_indicator = instances[index]['mask_indicator'].array
            assert np.allclose(np.array(expected_mask, dtype=np.uint8), mask_indicator)
Example #24
Source File: test_ultra_fine_reader.py    From kb with Apache License 2.0 5 votes vote down vote up
def test_ultra_fine_reader_entity_markers(self):
        """Entity-marker mode wraps the mention with [e1start]/[e1end] in segment 0."""
        reader = get_reader("entity_markers")
        instances = ensure_list(reader.read('tests/fixtures/evaluation/ultra_fine/train.json'))

        # Check number of instances is correct
        self.assertEqual(len(instances), 2)

        tokens_0 = [x.text for x in instances[0]['tokens']]
        segments_0 = list(instances[0]['segment_ids'].array)
        actual = list(zip(tokens_0, segments_0))
        # Every token of this example sits in segment 0.
        expected_words = ['[CLS]', 'the', 'british', 'information', 'commissioner',
                          "'s", 'office', 'invites', '[e1start]', 'web', 'users',
                          '[e1end]', 'to', 'locate', 'its', 'add', '##ress',
                          'using', 'google', '[UNK]', '.', '[SEP]']
        expected = [(word, 0) for word in expected_words]
        self.assertListEqual(actual, expected)

        # index_a points at the opening entity marker.
        self.assertEqual(actual[instances[0]['index_a'].label], ('[e1start]', 0))
Example #25
Source File: test_dataset_reader_main.py    From scicite with Apache License 2.0 5 votes vote down vote up
def test_read_from_file_scicite(self):
        """Each SciCite reader variant should load its JSONL fixture correctly.

        Fix: removed a leftover debug ``print(fields.keys())`` that polluted
        test output.
        """
        reader = SciciteDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/scicite-train.jsonl'))
        instance1 = {"citation_text": ['These', 'results', 'are', 'in']}
        assert len(instances) == 10
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens][:4] == instance1['citation_text']
        assert fields['labels'].label == "result"

        reader = SciciteSectitleDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/scicite-section-title.jsonl'))
        instance1 = {"section_name": 'introduction', "citation_text": ['SVM', 'and']}
        assert len(instances) == 10
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens][:2] == instance1['citation_text']
        assert fields['section_label'].label == instance1['section_name']
        # The section-title variant should not carry cite-worthiness fields.
        assert 'is_citation' not in fields

        reader = SciCiteWorthinessDataReader()
        instances = ensure_list(reader.read('tests/fixtures/scicite-cite-worthiness.jsonl'))
        instance1 = {"is_citation": 'True'}
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert fields['is_citation'].label == instance1['is_citation']
        # Conversely, the cite-worthiness variant has no section metadata.
        assert 'section_name' not in fields.keys()
Example #26
Source File: semisupervised_text_classification_json_test.py    From vampire with Apache License 2.0 5 votes vote down vote up
def test_samples_properly(self):
        """With sample=1 and a fixed seed, exactly one known instance is drawn."""
        reader = SemiSupervisedTextClassificationJsonReader(sample=1, max_sequence_length=5)
        ag_path = self.FIXTURES_ROOT / "imdb" / "train.jsonl"
        # Seed all RNGs so the single sampled document is deterministic.
        prepare_environment(Params({"random_seed": 5, "numpy_seed": 5, "pytorch_seed": 5}))
        instances = ensure_list(reader.read(ag_path))

        assert len(instances) == 1
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == ['The', 'fight', 'scenes', 'were', 'great']
        assert fields["label"].label == "pos"
Example #27
Source File: semisupervised_text_classification_json_test.py    From vampire with Apache License 2.0 5 votes vote down vote up
def test_ignores_label_properly(self):
        """With ignore_labels=True no 'label' field should be attached."""
        imdb_labeled_path = self.FIXTURES_ROOT / "imdb" / "train.jsonl"
        reader = SemiSupervisedTextClassificationJsonReader(ignore_labels=True)
        instances = ensure_list(reader.read(imdb_labeled_path))
        # Every instance's field dict should lack a label entry entirely.
        labels = [instance.fields.get('label') for instance in instances]
        assert labels == [None] * 3
Example #28
Source File: seq2seq_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_source_add_start_token(self):
        u"""When source_add_start_token=False, the source side lacks @start@ but keeps @end@."""
        reader = Seq2SeqDatasetReader(source_add_start_token=False)
        data_file = unicode(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'seq2seq_copy.tsv')
        read_instances = ensure_list(reader.read(data_file))

        assert len(read_instances) == 3
        first_fields = read_instances[0].fields
        source_texts = [tok.text for tok in first_fields[u"source_tokens"].tokens]
        target_texts = [tok.text for tok in first_fields[u"target_tokens"].tokens]
        # Only the target sequence carries the @start@ sentinel; both end with @end@.
        assert source_texts == [u"this", u"is", u"a", u"sentence", u"@end@"]
        assert target_texts == [u"@start@", u"this", u"is", u"a", u"sentence", u"@end@"]
Example #29
Source File: snli_reader_test.py — from the magnitude project (MIT License)
def test_read_from_file(self, lazy):
        u"""The SNLI fixture should yield three premise/hypothesis/label instances."""
        reader = SnliReader(lazy=lazy)
        read_instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'snli.jsonl'))

        # All three fixture examples share the same premise sentence.
        shared_premise = [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                          u"down", u"airplane", u"."]
        expected = [
            {u"premise": shared_premise,
             u"hypothesis": [u"A", u"person", u"is", u"training", u"his", u"horse", u"for", u"a",
                             u"competition", u"."],
             u"label": u"neutral"},
            {u"premise": shared_premise,
             u"hypothesis": [u"A", u"person", u"is", u"at", u"a", u"diner", u",", u"ordering", u"an",
                             u"omelette", u"."],
             u"label": u"contradiction"},
            {u"premise": shared_premise,
             u"hypothesis": [u"A", u"person", u"is", u"outdoors", u",", u"on", u"a", u"horse", u"."],
             u"label": u"entailment"},
        ]

        assert len(read_instances) == len(expected)
        for instance, want in izip(read_instances, expected):
            got = instance.fields
            assert [tok.text for tok in got[u"premise"].tokens] == want[u"premise"]
            assert [tok.text for tok in got[u"hypothesis"].tokens] == want[u"hypothesis"]
            assert got[u"label"].label == want[u"label"]
Example #30
Source File: triviaqa_test.py — from the magnitude project (MIT License)
def test_read(self, lazy):
        u"""Reading the sampled TriviaQA tarball should yield three span-annotated instances."""
        params = Params({
                u'base_tarball_path': unicode(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'triviaqa-sample.tgz'),
                u'lazy': lazy
                })
        reader = TriviaQaReader.from_params(params)
        read_instances = ensure_list(reader.read(u'web-train.json'))
        assert len(read_instances) == 3

        url = u"http://www.nobelprize.org/nobel_prizes/literature/laureates/1930/"
        # Per instance: (question prefix, passage prefix, passage suffix, span start, span end).
        expected = [
            ([u"Which", u"American", u"-"], [u"The", u"Nobel", u"Prize"], [u"<", url, u">"], 12, 13),
            ([u"Which", u"American", u"-"], [u"Why", u"Do", u"n’t"], [u"adults", u",", u"and"], 38, 39),
            ([u"Where", u"in", u"England"], [u"Judi", u"Dench", u"-"], [u")", u"(", u"special"], 16, 16),
        ]
        for instance, (q_prefix, p_prefix, p_suffix, span_start, span_end) in izip(read_instances, expected):
            got = instance.fields
            assert [tok.text for tok in got[u"question"].tokens[:3]] == q_prefix
            assert [tok.text for tok in got[u"passage"].tokens[:3]] == p_prefix
            assert [tok.text for tok in got[u"passage"].tokens[-3:]] == p_suffix
            assert got[u"span_start"].sequence_index == span_start
            assert got[u"span_end"].sequence_index == span_end