Python sklearn.datasets.fetch_20newsgroups() Examples
The following are 30 code examples of sklearn.datasets.fetch_20newsgroups(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.datasets, or try the search function.
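Before the project-specific examples, here is a minimal, self-contained sketch of the call pattern most of them rely on: pick a subset, optionally restrict categories, optionally strip headers/footers/quotes, then vectorize the returned bunch's .data. This sketch is not taken from any of the projects below; the two category names and the TF-IDF step are illustrative assumptions.

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch only two topics from the training split and strip the metadata
# (headers, footers, quotes) that otherwise leaks label information.
categories = ['alt.atheism', 'sci.space']
bunch = fetch_20newsgroups(subset='train',
                           categories=categories,
                           remove=('headers', 'footers', 'quotes'),
                           shuffle=True, random_state=42)

print(len(bunch.data))      # list of raw post texts
print(bunch.target[:10])    # integer labels, aligned with bunch.data
print(bunch.target_names)   # ['alt.atheism', 'sci.space']

# Turn the raw texts into a sparse TF-IDF matrix for downstream estimators.
X = TfidfVectorizer(stop_words='english').fit_transform(bunch.data)
print(X.shape)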
Example #1
Source File: datasets.py From flambe with MIT License | 10 votes |
def __init__(self, cache: bool = False,
             transform: Dict[str, Union[Field, Dict]] = None) -> None:
    """Initialize the NewsGroupDataset builtin."""
    try:
        from sklearn.datasets import fetch_20newsgroups
    except ImportError:
        raise ImportError("Install sklearn to use the NewsGroupDataset")

    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')

    train = [(' '.join(d.split()), str(t)) for d, t in zip(train['data'], train['target'])]
    test = [(' '.join(d.split()), str(t)) for d, t in zip(test['data'], test['target'])]
    named_cols = ['text', 'label']
    super().__init__(
        train=train,
        val=None,
        test=test,
        cache=cache,
        named_columns=named_cols,
        transform=transform
    )
Example #2
Source File: DataModule.py From sgd-influence with MIT License | 8 votes |
def load(self):
    categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
    newsgroups_train = fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
    newsgroups_test = fetch_20newsgroups(
        subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)
    vectorizer = TfidfVectorizer(stop_words='english', min_df=0.001, max_df=0.20)
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)
    x1 = vectors
    y1 = newsgroups_train.target
    x2 = vectors_test
    y2 = newsgroups_test.target
    x = np.array(np.r_[x1.todense(), x2.todense()])
    y = np.r_[y1, y2]
    return x, y
Example #3
Source File: _validateSchema.py From nyoka with Apache License 2.0 | 7 votes |
def test_validate_sklearn_sgd_with_text_cv(self):
    categories = ['alt.atheism', 'talk.religion.misc']
    data = fetch_20newsgroups(subset='train', categories=categories)
    X = data.data[:4]
    Y = data.target[:4]
    features = ['input']
    target = 'output'
    model = SGDClassifier(loss="log")
    file_name = model.__class__.__name__ + '_CountVec_.pmml'
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', model)
    ])
    pipeline.fit(X, Y)
    skl_to_pmml(pipeline, features, target, file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
Example #4
Source File: test_corpusFromScikit.py From scattertext with Apache License 2.0 | 6 votes |
def _te_ss_t_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer

    newsgroups_train = fetch_20newsgroups(
        subset='train',
        remove=('headers', 'footers', 'quotes')
    )
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
Example #5
Source File: test_minhash_encoder.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_MinHashEncoder(n_sample=70, minmax_hash=False):
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_sample]

    for minmax_hash in [True, False]:
        for hashing in ['fast', 'murmur']:

            if minmax_hash and hashing == 'murmur':
                pass  # not implemented

            # Test output shape
            encoder = MinHashEncoder(n_components=50, hashing=hashing)
            encoder.fit(X)
            y = encoder.transform(X)
            assert y.shape == (n_sample, 50), str(y.shape)
            assert len(set(y[0])) == 50

            # Test same seed return the same output
            encoder = MinHashEncoder(50, hashing=hashing)
            encoder.fit(X)
            y2 = encoder.transform(X)
            np.testing.assert_array_equal(y, y2)

            # Test min property
            if not minmax_hash:
                X_substring = [x[:x.find(' ')] for x in X]
                encoder = MinHashEncoder(50, hashing=hashing)
                encoder.fit(X_substring)
                y_substring = encoder.transform(X_substring)
                np.testing.assert_array_less(y - y_substring, 0.0001)
Example #6
Source File: _validateSchema.py From nyoka with Apache License 2.0 | 6 votes |
def test_validate_sklearn_sgd_with_text(self):
    categories = ['alt.atheism', 'talk.religion.misc']
    data = fetch_20newsgroups(subset='train', categories=categories)
    X = data.data[:4]
    Y = data.target[:4]
    features = ['input']
    target = 'output'
    model = SGDClassifier(loss="log")
    file_name = model.__class__.__name__ + '_TfIdfVec_.pmml'
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', model)
    ])
    pipeline.fit(X, Y)
    skl_to_pmml(pipeline, features, target, file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
Example #7
Source File: documents.py From text_embedding with MIT License | 6 votes |
def ng(partitions=['train', 'test']):
    '''loads 20 NewsGroups topic classification dataset
    Args:
        partitions: component(s) of data to load; can be a string (for one partition) or list of strings
    Returns:
        ((list of documents, list of labels) for each partition)
    '''
    if type(partitions) == str:
        data = fetch_20newsgroups(subset=partitions)
        return data['data'], list(data['target'])
    output = []
    for partition in partitions:
        data = fetch_20newsgroups(subset=partition)
        output.append((data['data'], list(data['target'])))
    return output
Example #8
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_20_newsgroups():
    data = fetch_20newsgroups()
    X, y = data.data, data.target
    r = dask_ml.model_selection.train_test_split(X, y)
    X_train, X_test, y_train, y_test = r
    for X in [X_train, X_test]:
        assert isinstance(X, list)
        assert isinstance(X[0], str)
    for y in [y_train, y_test]:
        assert isinstance(y, np.ndarray)
        assert y.dtype == int
Example #9
Source File: 20newsgroup.py From OpenNE with MIT License | 5 votes |
def fetch_data(path):
    from sklearn.datasets import fetch_20newsgroups
    categories = ['comp.graphics', 'rec.sport.baseball', 'talk.politics.guns']
    dataset = fetch_20newsgroups(path, categories=categories)
    return dataset
Example #10
Source File: test_spacy_model_export.py From mlflow with Apache License 2.0 | 5 votes |
def _get_train_test_dataset(cats_to_fetch, limit=100):
    newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'),
                                    shuffle=True, categories=cats_to_fetch)
    X = newsgroups.data[:limit]
    y = newsgroups.target[:limit]
    X = [six.text_type(x) for x in X]  # Ensure all strings are unicode for Python 2.7 compatibility
    # Category 0 is comp.graphics, 1 is rec.sport.baseball; we can treat it as a binary class.
    cats = [{"comp.graphics": not bool(el), "rec.sport.baseball": bool(el)} for el in y]
    split = int(len(X) * 0.8)
    return X[:split], cats[:split], X[split:], cats[split:]
Example #11
Source File: datasets.py From DEC-keras with MIT License | 5 votes |
def load_newsgroups():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.datasets import fetch_20newsgroups
    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

    vectorizer = TfidfVectorizer(max_features=2000, dtype=np.float64, sublinear_tf=True)
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    x = np.asarray(x_sparse.todense())
    y = newsgroups.target

    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y
Example #12
Source File: classification.py From text-analytics-with-python with Apache License 2.0 | 5 votes |
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data
Example #13
Source File: test_sklearn_tfidf_vectorizer_converter_dataset.py From sklearn-onnx with MIT License | 5 votes |
def test_tfidf_20newsgroups(self):
    data = fetch_20newsgroups()
    X, y = np.array(data.data)[:100], np.array(data.target)[:100]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42)

    model = TfidfVectorizer().fit(X_train)
    onnx_model = convert_sklearn(
        model, 'cv', [('input', StringTensorType(X_test.shape))])
    dump_data_and_model(
        X_test, model, onnx_model,
        basename="SklearnTfidfVectorizer20newsgroups",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')")
Example #14
Source File: test_sklearn_tfidf_vectorizer_converter_dataset.py From sklearn-onnx with MIT License | 5 votes |
def test_tfidf_20newsgroups_nolowercase(self):
    data = fetch_20newsgroups()
    X, y = np.array(data.data)[:100], np.array(data.target)[:100]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42)

    model = TfidfVectorizer(lowercase=False).fit(X_train)
    onnx_model = convert_sklearn(
        model, 'cv', [('input', StringTensorType(X_test.shape))])
    dump_data_and_model(
        X_test, model, onnx_model,
        basename="SklearnTfidfVectorizer20newsgroupsNOLower",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')")
Example #15
Source File: test_sklearn_tfidf_transformer_converter_sparse.py From sklearn-onnx with MIT License | 5 votes |
def test_model_tfidf_transform_bug(self):
    categories = [
        "alt.atheism",
        "soc.religion.christian",
        "comp.graphics",
        "sci.med",
    ]
    twenty_train = fetch_20newsgroups(subset="train",
                                      categories=categories,
                                      shuffle=True,
                                      random_state=0)
    text_clf = Pipeline([("vect", CountVectorizer()),
                         ("tfidf", TfidfTransformer())])
    twenty_train.data[0] = "bruît " + twenty_train.data[0]
    text_clf.fit(twenty_train.data, twenty_train.target)
    model_onnx = convert_sklearn(
        text_clf,
        name="DocClassifierCV-Tfidf",
        initial_types=[("input", StringTensorType([5]))],
    )
    dump_data_and_model(
        twenty_train.data[5:10],
        text_clf,
        model_onnx,
        basename="SklearnPipelineTfidfTransformer",
        # Operator mul is not implemented in onnxruntime
        allow_failure="StrictVersion(onnx.__version__)"
                      " <= StrictVersion('1.5')",
    )
Example #16
Source File: test_sklearn_documentation.py From sklearn-onnx with MIT License | 5 votes |
def test_pipeline_tfidf(self):
    categories = ["alt.atheism", "talk.religion.misc"]
    train = fetch_20newsgroups(random_state=1,
                               subset="test",
                               categories=categories)
    train_data = SubjectBodyExtractor().fit_transform(train.data)
    tfi = TfidfVectorizer(min_df=30)
    tdata = train_data[:300, :1]
    tfi.fit(tdata.ravel())
    extra = {
        TfidfVectorizer: {
            "separators": [
                " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
            ]
        }
    }
    model_onnx = convert_sklearn(
        tfi,
        "tfidf",
        initial_types=[("input", StringTensorType([1]))],
        options=extra,
    )
    dump_data_and_model(
        tdata[:5],
        tfi,
        model_onnx,
        basename="SklearnDocumentationTfIdf-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')",
    )
Example #17
Source File: graphs.py From text-gcn-chainer with Creative Commons Zero v1.0 Universal | 5 votes |
def load_20newsgroups(validation_ratio, normalization):
    """Load text network (20 news group)

    Arguments:
        validation_ratio (float): Ratio of validation split
        normalization (str): Variant of normalization method to use.

    Returns:
        adj (chainer.utils.sparse.CooMatrix): (Node, Node) shape
            normalized adjacency matrix.
        labels (np.ndarray): (Node, ) shape labels array
        idx_train (np.ndarray): Indices of the train array
        idx_val (np.ndarray): Indices of val array
        idx_test (np.ndarray): Indices of test array
    """
    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')
    adj = create_text_adjacency_matrix(
        [tokenize(t) for t in (train['data'] + test['data'])])
    if normalization == 'gcn':
        adj = normalize(adj)
    else:
        adj = normalize_pygcn(adj)

    n_train = int(len(train['data']) * (1.0 - validation_ratio))
    n_all = len(train['data']) + len(test['data'])
    idx_train = np.array(list(range(n_train)), np.int32)
    idx_val = np.array(list(range(n_train, len(train['data']))), np.int32)
    idx_test = np.array(list(range(len(train['data']), n_all)), np.int32)

    labels = np.concatenate(
        (train['target'], test['target'], np.full([adj.shape[0] - n_all], -1)))
    labels = labels.astype(np.int32)
    adj = to_chainer_sparse_variable(adj)

    return adj, labels, idx_train, idx_val, idx_test
Example #18
Source File: common_utils.py From interpret-community with MIT License | 5 votes |
def create_binary_sparse_newsgroups_data():
    categories = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    class_names = ['atheism', 'christian']
    x_train = newsgroups_train.data
    x_test = newsgroups_test.data
    y_train = newsgroups_train.target
    y_validation = newsgroups_test.target
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=2**16)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, y_train, y_validation, class_names, vectorizer
Example #19
Source File: common_utils.py From interpret-community with MIT License | 5 votes |
def create_multiclass_sparse_newsgroups_data():
    remove = ('headers', 'footers', 'quotes')
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    from sklearn.datasets import fetch_20newsgroups
    ngroups = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42, remove=remove)
    x_train, x_test, y_train, y_validation = train_test_split(
        ngroups.data, ngroups.target, test_size=0.02, random_state=42)
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=2**16)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, y_train, y_validation, categories, vectorizer
Example #20
Source File: datasets.py From MLPrimitives with MIT License | 5 votes |
def load_newsgroups():
    """20 News Groups dataset.

    The data of this dataset is a 1d numpy array vector containing the texts
    from 11314 newsgroups posts, and the target is a 1d numpy integer array
    containing the label of one of the 20 topics that they are about.
    """
    dataset = datasets.fetch_20newsgroups()
    return Dataset(load_newsgroups.__doc__, np.array(dataset.data), dataset.target,
                   accuracy_score, 'text', 'classification', 'multiclass', stratify=True)
Example #21
Source File: test_20news.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names, data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
Example #22
Source File: test_20news.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')
    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames))
Example #23
Source File: test_20news.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all', download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Example #24
Source File: common_utils.py From interpret-text with MIT License | 5 votes |
def create_binary_newsgroups_data():
    categories = ["alt.atheism", "soc.religion.christian"]
    newsgroups_train = fetch_20newsgroups(subset="train", categories=categories)
    newsgroups_test = fetch_20newsgroups(subset="test", categories=categories)
    class_names = ["atheism", "christian"]
    return newsgroups_train, newsgroups_test, class_names
Example #25
Source File: test_fast_hash.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_fast_hash():
    from sklearn import datasets
    data = datasets.fetch_20newsgroups()
    a = data.data[0]

    min_hash = ngram_min_hash(a, seed=0)
    min_hash2 = ngram_min_hash(a, seed=0)
    assert min_hash == min_hash2

    list_min_hash = [ngram_min_hash(a, seed=seed) for seed in range(50)]
    assert len(set(list_min_hash)) > 45, 'Too many hash collisions'

    min_hash4 = ngram_min_hash(a, seed=0, return_minmax=True)
    assert len(min_hash4) == 2
Example #26
Source File: ctm.py From pgmult with MIT License | 5 votes |
def load_newsgroup_data(V, cats, sort_data=True):
    from sklearn.datasets import fetch_20newsgroups
    print("Downloading newsgroups data...")
    print('cats = %s' % cats)
    newsgroups = fetch_20newsgroups(
        subset="train", categories=cats, remove=('headers', 'footers', 'quotes'))
    return get_sparse_repr(newsgroups.data, V, sort_data)
Example #27
Source File: cnn_text_util.py From opentc with MIT License | 5 votes |
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups
    :param subset: train, test or all
    :param categories: List of newsgroup names
    :param shuffle: shuffle the list or not
    :param random_state: seed integer to shuffle the dataset
    :return: data and labels of the newsgroup
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories,
                                  shuffle=shuffle, random_state=random_state)
    return datasets
Example #28
Source File: twenty_newsgroup.py From opentc with MIT License | 5 votes |
def __init__(self, cfg=None):
    super().__init__()
    self.__dataset__ = fetch_20newsgroups(subset=cfg['subset'],
                                          categories=cfg['categories'],
                                          shuffle=cfg['shuffle'],
                                          random_state=cfg['random_state'])
Example #29
Source File: data_helpers.py From text-classification with Apache License 2.0 | 5 votes |
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups
    :param subset: train, test or all
    :param categories: List of newsgroup names
    :param shuffle: shuffle the list or not
    :param random_state: seed integer to shuffle the dataset
    :return: data and labels of the newsgroup
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories,
                                  shuffle=shuffle, random_state=random_state)
    return datasets
Example #30
Source File: test_20news.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names, data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)