Python allennlp.common.util.ensure_list() Examples

The following are 30 code examples of allennlp.common.util.ensure_list(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.common.util, or try the search function.
Example #1
Source File: quora_paraphrase_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_read_from_file(self, lazy):
        """Check that the Quora paraphrase TSV fixture yields the expected instances."""
        reader = QuoraParaphraseDatasetReader(lazy=lazy)
        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'quora_paraphrase.tsv'))

        expected = [
            {u"premise": u"What should I do to avoid sleeping in class ?".split(),
             u"hypothesis": u"How do I not sleep in a boring class ?".split(),
             u"label": u"1"},
            {u"premise": u"Do women support each other more than men do ?".split(),
             u"hypothesis": u"Do women need more compliments than men ?".split(),
             u"label": u"0"},
            {u"premise": u"How can one root android devices ?".split(),
             u"hypothesis": u"How do I root an Android device ?".split(),
             u"label": u"1"},
        ]

        assert len(instances) == 3

        for instance, expected_instance in izip(instances, expected):
            fields = instance.fields
            assert [t.text for t in fields[u"premise"].tokens] == expected_instance[u"premise"]
            assert [t.text for t in fields[u"hypothesis"].tokens] == expected_instance[u"hypothesis"]
            assert fields[u"label"].label == expected_instance[u"label"]
Example #2
Source File: imdb_test.py    From topic-rnn with Apache License 2.0 6 votes vote down vote up
def test_read_from_file(self):
        """The IMDB language-modeling fixture parses into ten source/target instances."""
        # pylint: disable=R0201
        reader = IMDBLanguageModelingReader()
        instances = ensure_list(reader.read(TestIMDBReader.DATASET_PATH))

        assert len(instances) == 10
        # Spot-check a few instances against the fixture's expected values.
        for index, expected in [(0, TestIMDBReader.INSTANCE_0),
                                (1, TestIMDBReader.INSTANCE_1),
                                (7, TestIMDBReader.INSTANCE_7)]:
            fields = instances[index].fields
            assert [t.text for t in fields["source"].tokens] == expected["source"]
            assert [t.text for t in fields["target"].tokens] == expected["target"]
Example #3
Source File: conll2003_dataset_reader_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_read_from_file(self, lazy, coding_scheme):
        """Read the CoNLL-2003 fixture and verify tokens and tags for each coding scheme."""
        conll_reader = Conll2003DatasetReader(lazy=lazy, coding_scheme=coding_scheme)
        instances = ensure_list(
            conll_reader.read(unicode(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'conll2003.txt')))

        if coding_scheme == u'IOB1':
            expected_labels = [u'I-ORG', u'O', u'I-PER', u'O', u'O', u'I-LOC', u'O']
        else:
            expected_labels = [u'U-ORG', u'O', u'U-PER', u'O', u'O', u'U-LOC', u'O']

        expected_sentences = [
            [u'U.N.', u'official', u'Ekeus', u'heads', u'for', u'Baghdad', u'.'],
            [u'AI2', u'engineer', u'Joel', u'lives', u'in', u'Seattle', u'.'],
        ]
        # Both sentences share the same tag pattern under either scheme.
        for index, expected_tokens in enumerate(expected_sentences):
            fields = instances[index].fields
            assert [t.text for t in fields[u'tokens'].tokens] == expected_tokens
            assert fields[u"tags"].labels == expected_labels
Example #4
Source File: sequence_tagging_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_brown_corpus_format(self):
        """Slash-delimited Brown-corpus lines should produce four tagged sentences."""
        reader = SequenceTaggingDatasetReader(word_tag_delimiter=u'/')
        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'brown_corpus.txt'))

        assert len(instances) == 4
        # All four sentences are "<subject> are animals ." with identical tags.
        expected_tags = [u"N", u"V", u"N", u"N"]
        for index, subject in enumerate([u"cats", u"dogs", u"snakes", u"birds"]):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == [subject, u"are", u"animals", u"."]
            assert fields[u"tags"].labels == expected_tags
Example #5
Source File: sequence_tagging_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_default_format(self, lazy):
        """The default TSV format yields four token/tag-annotated sentences."""
        reader = SequenceTaggingDatasetReader(lazy=lazy)
        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv'))

        assert len(instances) == 4
        # All four sentences are "<subject> are animals ." with identical tags.
        expected_tags = [u"N", u"V", u"N", u"N"]
        for index, subject in enumerate([u"cats", u"dogs", u"snakes", u"birds"]):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == [subject, u"are", u"animals", u"."]
            assert fields[u"tags"].labels == expected_tags
Example #6
Source File: stanford_sentiment_tree_bank_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_2_class(self):
        """Binary granularity collapses SST sentiment to labels 0/1."""
        reader = StanfordSentimentTreeBankDatasetReader(granularity=u"2-class")
        instances = ensure_list(reader.read(self.sst_path))

        expected = [
            ([u"The", u"actors", u"are", u"fantastic", u"."], u"1"),
            ([u"It", u"was", u"terrible", u"."], u"0"),
        ]

        assert len(instances) == 2
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == tokens
            assert fields[u"label"].label == label
Example #7
Source File: stanford_sentiment_tree_bank_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_3_class(self):
        """Ternary granularity maps SST sentiment onto labels 0/1/2."""
        reader = StanfordSentimentTreeBankDatasetReader(granularity=u"3-class")
        instances = ensure_list(reader.read(self.sst_path))

        expected = [
            ([u"The", u"actors", u"are", u"fantastic", u"."], u"2"),
            ([u"It", u"was", u"terrible", u"."], u"0"),
            ([u"Chomp", u"chomp", u"!"], u"1"),
        ]

        assert len(instances) == 3
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == tokens
            assert fields[u"label"].label == label
Example #8
Source File: stanford_sentiment_tree_bank_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_use_subtrees(self):
        """With use_subtrees every labelled subtree becomes its own instance."""
        reader = StanfordSentimentTreeBankDatasetReader(use_subtrees=True)
        instances = ensure_list(reader.read(self.sst_path))

        expected = [
            ([u"The", u"actors", u"are", u"fantastic", u"."], u"4"),
            ([u"The", u"actors"], u"2"),
            ([u"The"], u"2"),
        ]

        # 21 subtrees come out of the two-sentence fixture; check the first three.
        assert len(instances) == 21
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == tokens
            assert fields[u"label"].label == label
Example #9
Source File: stanford_sentiment_tree_bank_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_read_from_file(self, lazy):
        """Default SST reading yields one five-way-labelled instance per sentence."""
        reader = StanfordSentimentTreeBankDatasetReader(lazy=lazy)
        instances = ensure_list(reader.read(self.sst_path))

        expected = [
            ([u"The", u"actors", u"are", u"fantastic", u"."], u"4"),
            ([u"It", u"was", u"terrible", u"."], u"0"),
            ([u"Chomp", u"chomp", u"!"], u"2"),
        ]

        assert len(instances) == 3
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields[u"tokens"].tokens] == tokens
            assert fields[u"label"].label == label
Example #10
Source File: language_modeling_dataset_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_read_from_file(self, lazy):
        """Three-token LM windows should slide over the fixture text."""
        reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)

        instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'language_modeling.txt'))
        # The last potential instance is left out, which is ok, because we don't have an end token
        # in here, anyway.
        assert len(instances) == 5

        expected_pairs = [
            ([u"This", u"is", u"a"], [u"is", u"a", u"sentence"]),
            ([u"sentence", u"for", u"language"], [u"for", u"language", u"modelling"]),
            ([u"modelling", u".", u"Here"], [u".", u"Here", u"'s"]),
            ([u"'s", u"another", u"one"], [u"another", u"one", u"for"]),
            ([u"for", u"extra", u"language"], [u"extra", u"language", u"modelling"]),
        ]
        # Output is the input shifted one token to the right.
        for index, (input_tokens, output_tokens) in enumerate(expected_pairs):
            fields = instances[index].fields
            assert [t.text for t in fields[u"input_tokens"].tokens] == input_tokens
            assert [t.text for t in fields[u"output_tokens"].tokens] == output_tokens
Example #11
Source File: test_tacred_reader.py    From kb with Apache License 2.0 6 votes vote down vote up
def test_tacred_dataset_reader(self):
        """The TACRED fixture yields three instances with expected tokens and label."""
        reader = get_reader()
        instances = ensure_list(reader.read('tests/fixtures/tacred/LDC2018T24.json'))

        # Check number of instances is correct
        self.assertEqual(len(instances), 3)

        # The wordpiece sequence opens with [CLS] subj [SEP] obj [SEP] ...
        first_tokens = [token.text for token in instances[0]['tokens']]
        self.assertListEqual(
            first_tokens[:6],
            ['[CLS]', 'douglas', 'flint', '[SEP]', 'chairman', '[SEP]'])
        # ... and closes with the sentence tail plus a final [SEP].
        self.assertListEqual(
            first_tokens[-6:],
            ['a', 'govern', '##ment', '[UNK]', '.', '[SEP]'])

        # Check that first instance's label is correct
        self.assertEqual(instances[0]['label_ids'].label, LABEL_MAP['per:title'])
Example #12
Source File: test_tacred_reader.py    From kb with Apache License 2.0 6 votes vote down vote up
def test_entity_mask(self):
        """Entity-masking modes replace subject/object tokens as configured."""
        reader = get_reader()

        # 'mask' mode: both entity positions become the generic [MASK] token.
        reader.entity_masking = 'mask'
        instances = ensure_list(reader.read('tests/fixtures/tacred/LDC2018T24.json'))
        tokens = [token.text for token in instances[0]['tokens']]
        self.assertEqual(tokens[14], '[MASK]')  # subject position
        self.assertEqual(tokens[17], '[MASK]')  # object position

        # 'type/role' mode: positions encode entity type plus subject/object role.
        reader.entity_masking = 'type/role'
        instances = ensure_list(reader.read('tests/fixtures/tacred/LDC2018T24.json'))
        tokens = [token.text for token in instances[0]['tokens']]
        self.assertEqual(tokens[14], '[s-person]')  # subject position
        self.assertEqual(tokens[17], '[o-title]')   # object position
Example #13
Source File: test_dataset_reader_main.py    From scicite with Apache License 2.0 6 votes vote down vote up
def test_read_from_file(self):
        """Each ACL-ARC reader variant should load its JSONL fixture correctly."""
        reader = AclarcDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/aclarc-train.jsonl'))
        expected = {"citation_text": ['Typical', 'examples', 'are', 'Bulgarian']}
        assert len(instances) == 10
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens][:4] == expected['citation_text']

        reader = AclSectionTitleDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/aclarc-section-title.jsonl'))
        expected = {"section_name": 'related work', "citation_text": ['With', 'C99']}
        assert len(instances) == 10
        fields = instances[1].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens][:2] == expected['citation_text']
        assert fields['section_label'].label == expected['section_name']

        reader = AclCiteWorthinessDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/aclarc-cite-worthiness.jsonl'))
        expected = {"is_citation": 'False'}
        fields = instances[1].fields
        assert isinstance(instances, list)
        assert fields['is_citation'].label == expected['is_citation']
Example #14
Source File: sequence_tagging_test.py    From allennlp with Apache License 2.0 6 votes vote down vote up
def test_brown_corpus_format(self):
        """Slash-delimited Brown-corpus lines should produce four tagged sentences."""
        reader = SequenceTaggingDatasetReader(word_tag_delimiter="/")
        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt"))

        assert len(instances) == 4
        # All four sentences are "<subject> are animals ." with identical tags.
        expected_tags = ["N", "V", "N", "N"]
        for index, subject in enumerate(["cats", "dogs", "snakes", "birds"]):
            fields = instances[index].fields
            assert [t.text for t in fields["tokens"].tokens] == [subject, "are", "animals", "."]
            assert fields["tags"].labels == expected_tags
Example #15
Source File: semisupervised_text_classification_json_test.py    From vampire with Apache License 2.0 6 votes vote down vote up
def test_read_from_file_and_truncates_properly(self):
        """max_sequence_length=5 should truncate every document to five tokens."""
        reader = SemiSupervisedTextClassificationJsonReader(max_sequence_length=5)
        ag_path = self.FIXTURES_ROOT / "imdb" / "train.jsonl"
        instances = ensure_list(reader.read(ag_path))

        expected = [
            (['...', 'And', 'I', 'never', 'thought'], "neg"),
            (['The', 'fight', 'scenes', 'were', 'great'], "pos"),
            (['The', 'only', 'way', 'this', 'is'], "neg"),
        ]

        assert len(instances) == 3
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields["tokens"].tokens] == tokens
            assert fields["label"].label == label
Example #16
Source File: seq2seq_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_default_format(self, lazy):
        """Seq2seq TSV rows gain @start@/@end@ markers on both sides."""
        reader = Seq2SeqDatasetReader(lazy=lazy)
        instances = ensure_list(
            reader.read(unicode(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'seq2seq_copy.tsv')))

        expected_sequences = [
            [u"@start@", u"this", u"is", u"a", u"sentence", u"@end@"],
            [u"@start@", u"this", u"is", u"another", u"@end@"],
            [u"@start@", u"all", u"these", u"sentences", u"should", u"get", u"copied", u"@end@"],
        ]

        assert len(instances) == 3
        # This is a copy task, so source and target token sequences are identical.
        for index, expected in enumerate(expected_sequences):
            fields = instances[index].fields
            assert [t.text for t in fields[u"source_tokens"].tokens] == expected
            assert [t.text for t in fields[u"target_tokens"].tokens] == expected
Example #17
Source File: text_classification_json_test.py    From allennlp with Apache License 2.0 6 votes vote down vote up
def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy):
        """AG News documents are read and truncated to max_sequence_length tokens."""
        reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "ag_news_corpus.jsonl"
        )
        instances = ensure_list(reader.read(ag_path))

        expected = [
            (["Memphis", "Rout", "Still", "Stings", "for"], "2"),
            (["AP", "-", "Eli", "Manning", "has"], "2"),
            (["A", "conference", "dedicated", "to", "online"], "4"),
        ]

        assert len(instances) == 3
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields["tokens"].tokens] == tokens
            assert fields["label"].label == label
Example #18
Source File: srl_dataset_reader_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_srl_reader_can_filter_by_domain(self):
        """Restricting to one Ontonotes subdomain filters out other folders' instances."""
        reader = SrlReader(domain_identifier=u"subdomain2")
        instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'conll_2012'))
        # If we'd included the folder, we'd have 9 instances.
        assert len(instances) == 2
Example #19
Source File: copynet_test.py    From nlp-models with MIT License 5 votes vote down vote up
def setUp(self):
        """Load the CopyNet fixture reader, its instances, and the vocabulary."""
        super(TestCopyNetReader, self).setUp()
        config = Params.from_file("nlpete/tests/fixtures/copynet/experiment.json")
        self.reader = DatasetReader.from_params(config["dataset_reader"])
        dataset = self.reader.read("nlpete/tests/fixtures/copynet/copyover.tsv")
        self.instances = ensure_list(dataset)
        # Vocabulary is built from the (possibly lazy) dataset, not the listed copy.
        self.vocab = Vocabulary.from_params(params=config["vocabulary"], instances=dataset)
Example #20
Source File: nl2bash_test.py    From nlp-models with MIT License 5 votes vote down vote up
def setUp(self):
        """Read the NL2Bash training fixture into a concrete instance list."""
        super(TestNL2BashReader, self).setUp()
        self.reader = NL2BashDatasetReader("target_tokens")
        self.instances = ensure_list(self.reader.read("nlpete/tests/fixtures/nl2bash/train.tsv"))
Example #21
Source File: prolocal_dataset_reader_test.py    From propara with Apache License 2.0 5 votes vote down vote up
def test_read_from_file(self):
        """The ProLocal toy TSV should yield instances with spans, labels, and tags.

        Fix: removed a leftover debug ``print(fields)`` that polluted test output.
        """
        sc_reader = ProLocalDatasetReader()
        instances = ensure_list(sc_reader.read('tests/fixtures/prolocal_toy_data.tsv'))

        # First instance: "Green plants absorb water from the soil" (MOVE).
        fields = instances[0].fields
        correct_tokens = ["Green", "plants", "absorb", "water", "from", "the", "soil"]
        read_tokens = [t.text for t in fields["tokens"].tokens]
        assert correct_tokens == read_tokens
        assert fields["entity_span"].labels == [0, 0, 0, 1, 0, 0, 0]
        assert fields["verb_span"].labels == [0, 0, 1, 0, 0, 0, 0]

        assert fields["state_change_type_labels"].label == 'MOVE'
        assert fields["state_change_tags"].labels == ['B-LOC-TO', 'I-LOC-TO', 'O', 'O', 'O', 'B-LOC-FROM', 'I-LOC-FROM']

        # Second instance: "Rocks in the shore break" (DESTROY).
        fields = instances[1].fields
        read_tokens = [t.text for t in fields["tokens"].tokens]
        assert read_tokens == ["Rocks", "in", "the", "shore", "break"]
        assert fields["entity_span"].labels == [1, 0, 0, 0, 0]
        assert fields["verb_span"].labels == [0, 0, 0, 0, 1]

        assert fields["state_change_type_labels"].label == 'DESTROY'
        assert fields["state_change_tags"].labels == ['O', 'O', 'B-LOC-FROM', 'I-LOC-FROM', 'O']
Example #22
Source File: text_classification_json_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_set_skip_indexing_true(self, lazy):
        """Integer labels pass through unindexed; string labels raise ValueError."""
        reader = TextClassificationJsonReader(lazy=lazy, skip_label_indexing=True)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "integer_labels.jsonl"
        )
        instances = ensure_list(reader.read(ag_path))

        expected = [
            (["This", "text", "has", "label", "0"], 0),
            (["This", "text", "has", "label", "1"], 1),
        ]

        assert len(instances) == 2
        for index, (tokens, label) in enumerate(expected):
            fields = instances[index].fields
            assert [t.text for t in fields["tokens"].tokens] == tokens
            assert fields["label"].label == label

        # Non-integer labels must be rejected when indexing is skipped.
        with pytest.raises(ValueError) as exec_info:
            ag_path = (
                AllenNlpTestCase.FIXTURES_ROOT
                / "data"
                / "text_classification_json"
                / "imdb_corpus.jsonl"
            )
            ensure_list(reader.read(ag_path))
        assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True."
Example #23
Source File: test_kg_probe_reader.py    From kb with Apache License 2.0 5 votes vote down vote up
def test_kg_probe_reader(self):
        """Masked spans in the KG probe fixture produce [MASK] tokens and indicators."""
        reader = get_reader()
        instances = ensure_list(reader.read('tests/fixtures/kg_probe/file1.txt'))

        # Check instances are correct length
        self.assertEqual(len(instances), 2)

        expected_cases = [
            (['[CLS]', '[MASK]', '[MASK]', '[UNK]', 'quick',
              '##est', '.', '[SEP]'],
             [0, 1, 1, 0, 0, 0, 0, 0]),
            (['[CLS]', 'the', 'brown', 'fox', 'jumped', 'over',
              'the', '[MASK]', '[MASK]', '[MASK]', '[MASK]',
              '.', '[SEP]'],
             [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0]),
        ]
        # The mask indicator flags exactly the [MASK] positions.
        for index, (expected_tokens, expected_mask) in enumerate(expected_cases):
            tokens = [x.text for x in instances[index]['tokens'].tokens]
            self.assertListEqual(expected_tokens, tokens)
            mask_indicator = instances[index]['mask_indicator'].array
            assert np.allclose(np.array(expected_mask, dtype=np.uint8), mask_indicator)
Example #24
Source File: test_ultra_fine_reader.py    From kb with Apache License 2.0 5 votes vote down vote up
def test_ultra_fine_reader_entity_markers(self):
        """Entity-marker mode wraps the mention with [e1start]/[e1end] in segment 0."""
        reader = get_reader("entity_markers")
        instances = ensure_list(reader.read('tests/fixtures/evaluation/ultra_fine/train.json'))

        # Check number of instances is correct
        self.assertEqual(len(instances), 2)

        tokens_0 = [x.text for x in instances[0]['tokens']]
        segments_0 = list(instances[0]['segment_ids'].array)
        actual = list(zip(tokens_0, segments_0))
        # Every token of this example sits in segment 0.
        expected_words = ['[CLS]', 'the', 'british', 'information', 'commissioner',
                          "'s", 'office', 'invites', '[e1start]', 'web', 'users',
                          '[e1end]', 'to', 'locate', 'its', 'add', '##ress',
                          'using', 'google', '[UNK]', '.', '[SEP]']
        expected = [(word, 0) for word in expected_words]
        self.assertListEqual(actual, expected)

        # index_a points at the opening entity marker.
        self.assertEqual(actual[instances[0]['index_a'].label], ('[e1start]', 0))
Example #25
Source File: test_dataset_reader_main.py    From scicite with Apache License 2.0 5 votes vote down vote up
def test_read_from_file_scicite(self):
        """Each SciCite reader variant should load its JSONL fixture correctly.

        Fix: removed a leftover debug ``print(fields.keys())`` that polluted
        test output.
        """
        reader = SciciteDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/scicite-train.jsonl'))
        instance1 = {"citation_text": ['These', 'results', 'are', 'in']}
        assert len(instances) == 10
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens][:4] == instance1['citation_text']
        assert fields['labels'].label == "result"

        reader = SciciteSectitleDatasetReader()
        instances = ensure_list(reader.read('tests/fixtures/scicite-section-title.jsonl'))
        instance1 = {"section_name": 'introduction', "citation_text": ['SVM', 'and']}
        assert len(instances) == 10
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens][:2] == instance1['citation_text']
        assert fields['section_label'].label == instance1['section_name']
        # The section-title variant should not carry cite-worthiness fields.
        assert 'is_citation' not in fields

        reader = SciCiteWorthinessDataReader()
        instances = ensure_list(reader.read('tests/fixtures/scicite-cite-worthiness.jsonl'))
        instance1 = {"is_citation": 'True'}
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert fields['is_citation'].label == instance1['is_citation']
        # Conversely, the cite-worthiness variant has no section metadata.
        assert 'section_name' not in fields.keys()
Example #26
Source File: semisupervised_text_classification_json_test.py    From vampire with Apache License 2.0 5 votes vote down vote up
def test_samples_properly(self):
        """With sample=1 and a fixed seed, exactly one known instance is drawn."""
        reader = SemiSupervisedTextClassificationJsonReader(sample=1, max_sequence_length=5)
        ag_path = self.FIXTURES_ROOT / "imdb" / "train.jsonl"
        # Seed all RNGs so the single sampled document is deterministic.
        prepare_environment(Params({"random_seed": 5, "numpy_seed": 5, "pytorch_seed": 5}))
        instances = ensure_list(reader.read(ag_path))

        assert len(instances) == 1
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == ['The', 'fight', 'scenes', 'were', 'great']
        assert fields["label"].label == "pos"
Example #27
Source File: semisupervised_text_classification_json_test.py    From vampire with Apache License 2.0 5 votes vote down vote up
def test_ignores_label_properly(self):
        """With ignore_labels=True no 'label' field should be attached."""
        imdb_labeled_path = self.FIXTURES_ROOT / "imdb" / "train.jsonl"
        reader = SemiSupervisedTextClassificationJsonReader(ignore_labels=True)
        instances = ensure_list(reader.read(imdb_labeled_path))
        # Every instance's field dict should lack a label entry entirely.
        labels = [instance.fields.get('label') for instance in instances]
        assert labels == [None] * 3
Example #28
Source File: seq2seq_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_source_add_start_token(self):
        u"""When source_add_start_token=False, the source side lacks @start@ but keeps @end@."""
        reader = Seq2SeqDatasetReader(source_add_start_token=False)
        data_file = unicode(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'seq2seq_copy.tsv')
        read_instances = ensure_list(reader.read(data_file))

        assert len(read_instances) == 3
        first_fields = read_instances[0].fields
        source_texts = [tok.text for tok in first_fields[u"source_tokens"].tokens]
        target_texts = [tok.text for tok in first_fields[u"target_tokens"].tokens]
        # Only the target sequence carries the @start@ sentinel; both end with @end@.
        assert source_texts == [u"this", u"is", u"a", u"sentence", u"@end@"]
        assert target_texts == [u"@start@", u"this", u"is", u"a", u"sentence", u"@end@"]
Example #29
Source File: snli_reader_test.py — from the magnitude project (MIT License)
def test_read_from_file(self, lazy):
        u"""The SNLI fixture should yield three premise/hypothesis/label instances."""
        reader = SnliReader(lazy=lazy)
        read_instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'snli.jsonl'))

        # All three fixture examples share the same premise sentence.
        shared_premise = [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                          u"down", u"airplane", u"."]
        expected = [
            {u"premise": shared_premise,
             u"hypothesis": [u"A", u"person", u"is", u"training", u"his", u"horse", u"for", u"a",
                             u"competition", u"."],
             u"label": u"neutral"},
            {u"premise": shared_premise,
             u"hypothesis": [u"A", u"person", u"is", u"at", u"a", u"diner", u",", u"ordering", u"an",
                             u"omelette", u"."],
             u"label": u"contradiction"},
            {u"premise": shared_premise,
             u"hypothesis": [u"A", u"person", u"is", u"outdoors", u",", u"on", u"a", u"horse", u"."],
             u"label": u"entailment"},
        ]

        assert len(read_instances) == len(expected)
        for instance, want in izip(read_instances, expected):
            got = instance.fields
            assert [tok.text for tok in got[u"premise"].tokens] == want[u"premise"]
            assert [tok.text for tok in got[u"hypothesis"].tokens] == want[u"hypothesis"]
            assert got[u"label"].label == want[u"label"]
Example #30
Source File: triviaqa_test.py — from the magnitude project (MIT License)
def test_read(self, lazy):
        u"""Reading the sampled TriviaQA tarball should yield three span-annotated instances."""
        params = Params({
                u'base_tarball_path': unicode(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'triviaqa-sample.tgz'),
                u'lazy': lazy
                })
        reader = TriviaQaReader.from_params(params)
        read_instances = ensure_list(reader.read(u'web-train.json'))
        assert len(read_instances) == 3

        url = u"http://www.nobelprize.org/nobel_prizes/literature/laureates/1930/"
        # Per instance: (question prefix, passage prefix, passage suffix, span start, span end).
        expected = [
            ([u"Which", u"American", u"-"], [u"The", u"Nobel", u"Prize"], [u"<", url, u">"], 12, 13),
            ([u"Which", u"American", u"-"], [u"Why", u"Do", u"n’t"], [u"adults", u",", u"and"], 38, 39),
            ([u"Where", u"in", u"England"], [u"Judi", u"Dench", u"-"], [u")", u"(", u"special"], 16, 16),
        ]
        for instance, (q_prefix, p_prefix, p_suffix, span_start, span_end) in izip(read_instances, expected):
            got = instance.fields
            assert [tok.text for tok in got[u"question"].tokens[:3]] == q_prefix
            assert [tok.text for tok in got[u"passage"].tokens[:3]] == p_prefix
            assert [tok.text for tok in got[u"passage"].tokens[-3:]] == p_suffix
            assert got[u"span_start"].sequence_index == span_start
            assert got[u"span_end"].sequence_index == span_end