Python sklearn.datasets.fetch_20newsgroups() Examples

The following are 30 code examples of sklearn.datasets.fetch_20newsgroups(), collected from open-source projects. Each example notes the project and source file it was taken from. You may also want to check out the other functions and classes available in the sklearn.datasets module.
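
Before the examples, here is a minimal sketch of the call itself: fetch_20newsgroups() downloads (and caches) the dataset and returns a Bunch whose data attribute holds the raw post texts and whose target attribute holds the integer topic labels; target_names maps those labels back to newsgroup names.

from sklearn.datasets import fetch_20newsgroups

# Fetch the training split; the first call downloads and caches the data.
bunch = fetch_20newsgroups(subset='train')

print(len(bunch.data))          # 11314 raw post texts
print(bunch.target[:5])         # integer labels, one per post
print(bunch.target_names[:3])   # ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc']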
Example #1
Source File: datasets.py    From flambe with MIT License
def __init__(self,
                 cache: bool = False,
                 transform: Dict[str, Union[Field, Dict]] = None) -> None:
        """Initialize the NewsGroupDataset builtin."""
        try:
            from sklearn.datasets import fetch_20newsgroups
        except ImportError:
            raise ImportError("Install sklearn to use the NewsGroupDataset")

        train = fetch_20newsgroups(subset='train')
        test = fetch_20newsgroups(subset='test')

        train = [(' '.join(d.split()), str(t)) for d, t in zip(train['data'], train['target'])]
        test = [(' '.join(d.split()), str(t)) for d, t in zip(test['data'], test['target'])]

        named_cols = ['text', 'label']
        super().__init__(
            train=train,
            val=None,
            test=test,
            cache=cache,
            named_columns=named_cols,
            transform=transform
        ) 
Example #2
Source File: DataModule.py    From sgd-influence with MIT License
def load(self):
        categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
        newsgroups_train = fetch_20newsgroups(
            subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
        newsgroups_test = fetch_20newsgroups(
            subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)
        vectorizer = TfidfVectorizer(stop_words='english', min_df=0.001, max_df=0.20)
        vectors = vectorizer.fit_transform(newsgroups_train.data)
        vectors_test = vectorizer.transform(newsgroups_test.data)
        x1 = vectors
        y1 = newsgroups_train.target
        x2 = vectors_test
        y2 = newsgroups_test.target
        x = np.array(np.r_[x1.todense(), x2.todense()])
        y = np.r_[y1, y2]
        return x, y 
Example #3
Source File: _validateSchema.py    From nyoka with Apache License 2.0
def test_validate_sklearn_sgd_with_text_cv(self):
        categories = ['alt.atheism','talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_CountVec_.pmml'
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features , target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True) 
Example #4
Source File: test_corpusFromScikit.py    From scattertext with Apache License 2.0
def _te_ss_t_build(self):
		from sklearn.datasets import fetch_20newsgroups
		from sklearn.feature_extraction.text import CountVectorizer

		newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
		count_vectorizer = CountVectorizer()
		X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
		corpus = CorpusFromScikit(
			X=X_counts,
			y=newsgroups_train.target,
			feature_vocabulary=count_vectorizer.vocabulary_,
			category_names=newsgroups_train.target_names,
			raw_texts=newsgroups_train.data
		).build()
		self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
		self.assertEqual(corpus
		                 .get_term_freq_df()
		                 .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
		                 .sort_values(by='score', ascending=False).index.tolist()[:5],
		                 ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
		self.assertGreater(len(corpus.get_texts()[0]), 5) 
Example #5
Source File: test_minhash_encoder.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def test_MinHashEncoder(n_sample=70, minmax_hash=False):
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_sample]

    for minmax_hash in [True, False]:
        for hashing in ['fast', 'murmur']:

            if minmax_hash and hashing == 'murmur':
                continue  # minmax hashing is not implemented for murmur; skip this combination

            # Test output shape
            encoder = MinHashEncoder(n_components=50, hashing=hashing)
            encoder.fit(X)
            y = encoder.transform(X)
            assert y.shape == (n_sample, 50), str(y.shape)
            assert len(set(y[0])) == 50

            # Test same seed return the same output
            encoder = MinHashEncoder(50, hashing=hashing)
            encoder.fit(X)
            y2 = encoder.transform(X)
            np.testing.assert_array_equal(y, y2)

            # Test min property
            if not minmax_hash:
                X_substring = [x[:x.find(' ')] for x in X]
                encoder = MinHashEncoder(50, hashing=hashing)
                encoder.fit(X_substring)
                y_substring = encoder.transform(X_substring)
                np.testing.assert_array_less(y - y_substring, 0.0001) 
Example #6
Source File: _validateSchema.py    From nyoka with Apache License 2.0
def test_validate_sklearn_sgd_with_text(self):
        categories = ['alt.atheism','talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_TfIdfVec_.pmml'
        pipeline = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features , target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True) 
Example #7
Source File: documents.py    From text_embedding with MIT License
def ng(partitions=['train', 'test']):
  '''loads 20 NewsGroups topic classification dataset
  Args:
    partitions: component(s) of data to load; can be a string (for one partition) or list of strings
  Returns:
    ((list of documents, list of labels) for each partition)
  '''

  if isinstance(partitions, str):
    data = fetch_20newsgroups(subset=partitions)
    return data['data'], list(data['target'])
  output = []
  for partition in partitions:
    data = fetch_20newsgroups(subset=partition)
    output.append((data['data'], list(data['target'])))
  return output 
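
A hypothetical usage of the ng() helper above: passing a single partition name returns one (documents, labels) pair, while the default list returns one pair per partition.

# Hypothetical usage of the ng() helper defined above.
test_docs, test_labels = ng('test')                               # single partition
(train_docs, train_labels), (test_docs, test_labels) = ng()       # default: ['train', 'test']
print(len(train_docs), len(test_docs))  # 11314 and 7532 posts, respectively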
Example #8
Source File: test_split.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_20_newsgroups():
    data = fetch_20newsgroups()
    X, y = data.data, data.target
    r = dask_ml.model_selection.train_test_split(X, y)
    X_train, X_test, y_train, y_test = r
    for X in [X_train, X_test]:
        assert isinstance(X, list)
        assert isinstance(X[0], str)
    for y in [y_train, y_test]:
        assert isinstance(y, np.ndarray)
        assert y.dtype == int 
Example #9
Source File: 20newsgroup.py    From OpenNE with MIT License
def fetch_data(path):
    from sklearn.datasets import fetch_20newsgroups
    categories = ['comp.graphics', 'rec.sport.baseball', 'talk.politics.guns']
    dataset = fetch_20newsgroups(path, categories=categories)
    return dataset 
Example #10
Source File: test_spacy_model_export.py    From mlflow with Apache License 2.0
def _get_train_test_dataset(cats_to_fetch, limit=100):
    newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), shuffle=True,
                                    categories=cats_to_fetch)
    X = newsgroups.data[:limit]
    y = newsgroups.target[:limit]

    X = [six.text_type(x) for x in X]  # Ensure all strings are unicode, for Python 2.7 compatibility

    # Category 0 is comp.graphics, category 1 is rec.sport.baseball. We can treat this as a binary classification task.
    cats = [{"comp.graphics": not bool(el), "rec.sport.baseball": bool(el)} for el in y]

    split = int(len(X) * 0.8)
    return X[:split], cats[:split], X[split:], cats[split:] 
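
As a usage sketch (assuming the helper above is importable), fetching the two categories mentioned in the comment yields an 80/20 split of the first 100 posts:

# Hypothetical usage of _get_train_test_dataset() defined above.
train_x, train_cats, dev_x, dev_cats = _get_train_test_dataset(
    ['comp.graphics', 'rec.sport.baseball'], limit=100)
print(len(train_x), len(dev_x))   # 80 and 20 documents
print(train_cats[0])              # a dict of booleans, e.g. {'comp.graphics': True, 'rec.sport.baseball': False}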
Example #11
Source File: datasets.py    From DEC-keras with MIT License
def load_newsgroups():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.datasets import fetch_20newsgroups
    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

    vectorizer = TfidfVectorizer(max_features=2000, dtype=np.float64, sublinear_tf=True)
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    x = np.asarray(x_sparse.todense())
    y = newsgroups.target
    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y 
Example #12
Source File: classification.py    From text-analytics-with-python with Apache License 2.0
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data 
Example #13
Source File: test_sklearn_tfidf_vectorizer_converter_dataset.py    From sklearn-onnx with MIT License
def test_tfidf_20newsgroups(self):
        data = fetch_20newsgroups()
        X, y = np.array(data.data)[:100], np.array(data.target)[:100]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=42)

        model = TfidfVectorizer().fit(X_train)
        onnx_model = convert_sklearn(
            model, 'cv', [('input', StringTensorType(X_test.shape))])
        dump_data_and_model(
            X_test, model, onnx_model,
            basename="SklearnTfidfVectorizer20newsgroups",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.4.0')") 
Example #14
Source File: test_sklearn_tfidf_vectorizer_converter_dataset.py    From sklearn-onnx with MIT License
def test_tfidf_20newsgroups_nolowercase(self):
        data = fetch_20newsgroups()
        X, y = np.array(data.data)[:100], np.array(data.target)[:100]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=42)

        model = TfidfVectorizer(lowercase=False).fit(X_train)
        onnx_model = convert_sklearn(
            model, 'cv', [('input', StringTensorType(X_test.shape))])
        dump_data_and_model(
            X_test, model, onnx_model,
            basename="SklearnTfidfVectorizer20newsgroupsNOLower",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.4.0')") 
Example #15
Source File: test_sklearn_tfidf_transformer_converter_sparse.py    From sklearn-onnx with MIT License
def test_model_tfidf_transform_bug(self):
        categories = [
            "alt.atheism",
            "soc.religion.christian",
            "comp.graphics",
            "sci.med",
        ]
        twenty_train = fetch_20newsgroups(subset="train",
                                          categories=categories,
                                          shuffle=True,
                                          random_state=0)
        text_clf = Pipeline([("vect", CountVectorizer()),
                             ("tfidf", TfidfTransformer())])
        twenty_train.data[0] = "bruît " + twenty_train.data[0]
        text_clf.fit(twenty_train.data, twenty_train.target)
        model_onnx = convert_sklearn(
            text_clf,
            name="DocClassifierCV-Tfidf",
            initial_types=[("input", StringTensorType([5]))],
        )
        dump_data_and_model(
            twenty_train.data[5:10],
            text_clf,
            model_onnx,
            basename="SklearnPipelineTfidfTransformer",
            # Operator mul is not implemented in onnxruntime
            allow_failure="StrictVersion(onnx.__version__)"
                          " <= StrictVersion('1.5')",
        ) 
Example #16
Source File: test_sklearn_documentation.py    From sklearn-onnx with MIT License
def test_pipeline_tfidf(self):
        categories = ["alt.atheism", "talk.religion.misc"]
        train = fetch_20newsgroups(random_state=1,
                                   subset="test",
                                   categories=categories)
        train_data = SubjectBodyExtractor().fit_transform(train.data)
        tfi = TfidfVectorizer(min_df=30)
        tdata = train_data[:300, :1]
        tfi.fit(tdata.ravel())
        extra = {
            TfidfVectorizer: {
                "separators": [
                    " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
                ]
            }
        }
        model_onnx = convert_sklearn(
            tfi,
            "tfidf",
            initial_types=[("input", StringTensorType([1]))],
            options=extra,
        )
        dump_data_and_model(
            tdata[:5],
            tfi,
            model_onnx,
            basename="SklearnDocumentationTfIdf-OneOff-SklCol",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.4.0')",
        ) 
Example #17
Source File: graphs.py    From text-gcn-chainer with Creative Commons Zero v1.0 Universal
def load_20newsgroups(validation_ratio, normalization):
    """Load text network (20 news group)

    Arguments:
        validation_ratio (float): Ratio of validation split
        normalization (str): Variant of normalization method to use.

    Returns:
        adj (chainer.utils.sparse.CooMatrix): (Node, Node) shape
            normalized adjacency matrix.
        labels (np.ndarray): (Node, ) shape labels array
        idx_train (np.ndarray): Indices of the train
        idx_val (np.ndarray): Indices of val array
        idx_test (np.ndarray): Indices of test array
    """
    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')
    adj = create_text_adjacency_matrix(
        [tokenize(t) for t in (train['data'] + test['data'])])
    if normalization == 'gcn':
        adj = normalize(adj)
    else:
        adj = normalize_pygcn(adj)
    n_train = int(len(train['data']) * (1.0 - validation_ratio))
    n_all = len(train['data']) + len(test['data'])
    idx_train = np.array(list(range(n_train)), np.int32)
    idx_val = np.array(list(range(n_train, len(train['data']))), np.int32)
    idx_test = np.array(list(range(len(train['data']), n_all)), np.int32)

    labels = np.concatenate(
        (train['target'], test['target'], np.full([adj.shape[0] - n_all], -1)))
    labels = labels.astype(np.int32)
    adj = to_chainer_sparse_variable(adj)

    return adj, labels, idx_train, idx_val, idx_test 
Example #18
Source File: common_utils.py    From interpret-community with MIT License
def create_binary_sparse_newsgroups_data():
    categories = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    class_names = ['atheism', 'christian']
    x_train = newsgroups_train.data
    x_test = newsgroups_test.data
    y_train = newsgroups_train.target
    y_validation = newsgroups_test.target
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=2**16)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, y_train, y_validation, class_names, vectorizer 
Example #19
Source File: common_utils.py    From interpret-community with MIT License
def create_multiclass_sparse_newsgroups_data():
    remove = ('headers', 'footers', 'quotes')
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    from sklearn.datasets import fetch_20newsgroups
    ngroups = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42, remove=remove)
    x_train, x_test, y_train, y_validation = train_test_split(ngroups.data, ngroups.target,
                                                              test_size=0.02, random_state=42)
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=2**16)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, y_train, y_validation, categories, vectorizer 
Example #20
Source File: datasets.py    From MLPrimitives with MIT License
def load_newsgroups():
    """20 News Groups dataset.

    The data of this dataset is a 1d numpy array containing the texts
    of 11314 newsgroup posts, and the target is a 1d numpy integer array
    containing the label of the topic (one of 20) that each post is about.
    """
    dataset = datasets.fetch_20newsgroups()
    return Dataset(load_newsgroups.__doc__, np.array(dataset.data), dataset.target,
                   accuracy_score, 'text', 'classification', 'multiclass', stratify=True) 
Example #21
Source File: test_20news.py    From twitter-stock-recommendation with MIT License
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names,
                 data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2) 
Example #22
Source File: test_20news.py    From twitter-stock-recommendation with MIT License
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')
    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames)) 
Example #23
Source File: test_20news.py    From twitter-stock-recommendation with MIT License
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64) 
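
For comparison with the raw-text loader, here is a minimal sketch of the fetch_20newsgroups_vectorized() helper exercised above; it returns pre-extracted features as a sparse CSR matrix, so no text vectorizer needs to be fitted.

from sklearn.datasets import fetch_20newsgroups_vectorized
import scipy.sparse as sp

# The vectorized loader returns numeric features instead of raw post texts.
bunch = fetch_20newsgroups_vectorized(subset='train')
print(sp.isspmatrix_csr(bunch.data), bunch.data.shape)  # True (11314, 130107)
print(bunch.target.shape)                               # (11314,)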
Example #24
Source File: common_utils.py    From interpret-text with MIT License
def create_binary_newsgroups_data():
    categories = ["alt.atheism", "soc.religion.christian"]
    newsgroups_train = fetch_20newsgroups(subset="train", categories=categories)
    newsgroups_test = fetch_20newsgroups(subset="test", categories=categories)
    class_names = ["atheism", "christian"]
    return newsgroups_train, newsgroups_test, class_names 
Example #25
Source File: test_fast_hash.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def test_fast_hash():

    from sklearn import datasets
    data = datasets.fetch_20newsgroups()
    a = data.data[0]

    min_hash = ngram_min_hash(a, seed=0)
    min_hash2 = ngram_min_hash(a, seed=0)
    assert min_hash == min_hash2

    list_min_hash = [ngram_min_hash(a, seed=seed) for seed in range(50)]
    assert len(set(list_min_hash)) > 45, 'Too many hash collisions'

    min_hash4 = ngram_min_hash(a, seed=0, return_minmax=True)
    assert len(min_hash4) == 2 
Example #26
Source File: ctm.py    From pgmult with MIT License
def load_newsgroup_data(V, cats, sort_data=True):
    from sklearn.datasets import fetch_20newsgroups
    print("Downloading newsgroups data...")
    print('cats = %s' % cats)
    newsgroups = fetch_20newsgroups(
        subset="train", categories=cats, remove=('headers', 'footers', 'quotes'))
    return get_sparse_repr(newsgroups.data, V, sort_data) 
Example #27
Source File: cnn_text_util.py    From opentc with MIT License
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups
    :param subset: train, test or all
    :param categories: List of newsgroup name
    :param shuffle: shuffle the list or not
    :param random_state: seed integer to shuffle the dataset
    :return: data and labels of the newsgroup
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
    return datasets 
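
A hypothetical call to the helper above, restricted to two newsgroups (the same helper appears again in Example #29):

# Hypothetical usage of get_datasets_20newsgroup() defined above.
dataset = get_datasets_20newsgroup(subset='train',
                                   categories=['rec.autos', 'sci.space'],
                                   shuffle=True, random_state=42)
print(dataset.target_names)   # ['rec.autos', 'sci.space']
print(len(dataset.data))      # number of posts in the two categories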
Example #28
Source File: twenty_newsgroup.py    From opentc with MIT License
def __init__(self, cfg=None):
        super().__init__()
        self.__dataset__ = fetch_20newsgroups(subset=cfg['subset'], categories=cfg['categories'],
                                              shuffle=cfg['shuffle'], random_state=cfg['random_state']) 
Example #29
Source File: data_helpers.py    From text-classification with Apache License 2.0
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups
    :param subset: train, test or all
    :param categories: List of newsgroup name
    :param shuffle: shuffle the list or not
    :param random_state: seed integer to shuffle the dataset
    :return: data and labels of the newsgroup
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
    return datasets 
Example #30
Source File: test_20news.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names,
                 data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)