Python sklearn.utils.shuffle() Examples
The following are 30 code examples of sklearn.utils.shuffle().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module sklearn.utils, or try the search function.
Example #1
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_power_transformer_nans(method):
    """Lambda estimation must ignore NaNs and transform() must pass them through."""
    clean = np.abs(X_1col)
    transformer = PowerTransformer(method=method)
    transformer.fit(clean)
    lam_clean = transformer.lambdas_[0]

    # Append a NaN block of equal size, shuffle, and refit: the estimated
    # lambda should be unchanged by the NaN rows.
    with_nans = shuffle(np.concatenate([clean, np.full_like(clean, np.nan)]),
                        random_state=0)
    transformer.fit(with_nans)
    lam_nans = transformer.lambdas_[0]
    assert_almost_equal(lam_clean, lam_nans, decimal=5)

    # transform() must keep NaNs exactly where they were in the input.
    transformed = transformer.transform(with_nans)
    assert_array_equal(np.isnan(transformed), np.isnan(with_nans))
Example #2
Source File: vqc.py From qiskit-aqua with Apache License 2.0 | 6 votes |
def batch_data(self, data, labels=None, minibatch_size=-1):
    """Shuffle and split data (and optional labels) into minibatches.

    If minibatch_size is not in (0, len(data)), the whole set is returned
    as a single batch.
    """
    label_batches = None
    if 0 < minibatch_size < len(data):
        # NOTE(review): np.array_split(a, n) produces n chunks, so
        # minibatch_size effectively acts as a batch *count* here — confirm
        # against callers.
        n_parts = min(minibatch_size, len(data))
        if labels is None:
            samples = shuffle(data, random_state=aqua_globals.random_seed)
        else:
            samples, targets = shuffle(data, labels,
                                       random_state=aqua_globals.random_seed)
            label_batches = np.array_split(targets, n_parts)
        batches = np.array_split(samples, n_parts)
    else:
        batches = np.asarray([data])
        label_batches = np.asarray([labels])
    return batches, label_batches
Example #3
Source File: helper.py From Kitchen2D with MIT License | 6 votes |
def gen_biased_data(func, pos_ratio, N):
    """Sample N points on func so that pos_ratio of them have a positive label.

    Returns (inputs, labels) as two arrays after shuffling.
    """
    pos, neg = [], []
    want_pos = pos_ratio * N
    want_neg = N - pos_ratio * N
    # Keep drawing uniform samples until both quotas are met.
    # NOTE(review): loops forever if func never yields the needed sign — verify.
    while len(pos) < want_pos or len(neg) < want_neg:
        x = np.random.uniform(func.x_range[0], func.x_range[1])
        y = func(x)
        if y > 0:
            if len(pos) < want_pos:
                pos.append(np.hstack((x, y)))
        elif len(neg) < want_neg:
            neg.append(np.hstack((x, y)))
    xy = shuffle(np.vstack((pos, neg)))
    return xy[:, :-1], xy[:, -1]
Example #4
Source File: train.py From models with MIT License | 6 votes |
def run_epoch():
    """Run one training epoch over shuffled (trX, trM, trYt) minibatches."""
    global n_updates
    batches = iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
                        n_batch=n_batch_train, truncate=True, verbose=True)
    for xmb, mmb, ymb in batches:
        x_arr = model.xp.asarray(xmb)
        y_arr = model.xp.asarray(ymb)
        m_arr = model.xp.asarray(mmb)
        hidden = model(x_arr)
        lm_logits = lm_head(hidden)
        clf_logits = clf_head(hidden, x_arr)
        compute_loss_fct(x_arr, y_arr, m_arr, clf_logits, lm_logits)
        n_updates += 1
        # Log at fixed update milestones during the first epoch only.
        if n_epochs == 0 and n_updates in [1000, 2000, 4000, 8000, 16000, 32000]:
            log()
Example #5
Source File: spec.py From BirdCLEF-Baseline with MIT License | 6 votes |
def getSpecs(path):
    """Extract mel-spectrograms and their signal-to-noise ratios from a file."""
    specs, noise = [], []
    # Get one spectrogram per audio chunk according to the global config.
    for spec in audio.specsFromFile(path,
                                    rate=cfg.SAMPLE_RATE,
                                    seconds=cfg.SPEC_LENGTH,
                                    overlap=cfg.SPEC_OVERLAP,
                                    minlen=cfg.SPEC_MINLEN,
                                    fmin=cfg.SPEC_FMIN,
                                    fmax=cfg.SPEC_FMAX,
                                    spec_type=cfg.SPEC_TYPE,
                                    shape=(cfg.IM_SIZE[1], cfg.IM_SIZE[0])):
        specs.append(spec)
        noise.append(audio.signal2noise(spec))
    # Shuffle both lists in unison so later selection is random.
    specs, noise = shuffle(specs, noise, random_state=RANDOM)
    return specs, noise
Example #6
Source File: functional_autoencoder_test.py From FATE with Apache License 2.0 | 6 votes |
def getKaggleMNIST(file_path):
    """Load the Kaggle MNIST CSV and return shuffled train/test splits.

    MNIST data layout:
      column 0 is the label; columns 1-785 are pixel values in 0..255;
      total CSV size: (42000, 1, 28, 28).

    Returns:
        (Xtrain, Ytrain, Xtest, Ytest) — the last 1000 shuffled rows form
        the test split; pixels are scaled to [0, 1].
    """
    train = pd.read_csv(file_path)
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # to_numpy() is the supported replacement and yields the same ndarray.
    train = train.to_numpy()
    train = shuffle(train)
    Xtrain = train[:-1000, 1:] / 255
    Ytrain = train[:-1000, 0].astype(np.int32)
    Xtest = train[-1000:, 1:] / 255
    Ytest = train[-1000:, 0].astype(np.int32)
    return Xtrain, Ytrain, Xtest, Ytest
Example #7
Source File: iterators.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def reset(self):
    """Resets the iterator to the beginning of the data."""
    self.curr_idx = 0
    # Shuffle the bucket order, then the contents of each bucket in unison.
    random.shuffle(self.idx)
    for b in range(len(self.sentences)):
        self.indices[b], self.sentences[b], self.characters[b], self.label[b] = shuffle(
            self.indices[b], self.sentences[b], self.characters[b], self.label[b])
    # Rebuild the ndarray views of every bucket.
    self.ndindex = []
    self.ndsent = []
    self.ndchar = []
    self.ndlabel = []
    for b in range(len(self.sentences)):
        self.ndindex.append(ndarray.array(self.indices[b], dtype=self.dtype))
        self.ndsent.append(ndarray.array(self.sentences[b], dtype=self.dtype))
        self.ndchar.append(ndarray.array(self.characters[b], dtype=self.dtype))
        self.ndlabel.append(ndarray.array(self.label[b], dtype=self.dtype))
Example #8
Source File: test_combination.py From pyod with BSD 2-Clause "Simplified" License | 6 votes |
def test_aom_static_norepeat(self):
    """aom with static, non-bootstrap groups must match a manual computation."""
    score = aom(self.scores, 3, method='static',
                bootstrap_estimators=False, random_state=42)
    assert_equal(score.shape, (4,))

    # Reproduce the grouping: shuffle detector indices 0..5 with the same
    # seed, then take max within each group of two and mean across groups.
    shuffled_list = shuffle(list(range(0, 6, 1)), random_state=42)
    manual_scores = np.zeros([4, 3])
    for g in range(3):
        cols = shuffled_list[2 * g:2 * g + 2]
        manual_scores[:, g] = np.max(self.scores[:, cols], axis=1)
    manual_score = np.mean(manual_scores, axis=1)
    assert_array_equal(score, manual_score)
Example #9
Source File: test_combination.py From pyod with BSD 2-Clause "Simplified" License | 6 votes |
def test_moa_static_norepeat(self):
    """moa with static, non-bootstrap groups must match a manual computation."""
    score = moa(self.scores, 3, method='static',
                bootstrap_estimators=False, random_state=42)
    assert_equal(score.shape, (4,))

    # Reproduce the grouping: shuffle detector indices 0..5 with the same
    # seed, then take mean within each group of two and max across groups.
    shuffled_list = shuffle(list(range(0, 6, 1)), random_state=42)
    manual_scores = np.zeros([4, 3])
    for g in range(3):
        cols = shuffled_list[2 * g:2 * g + 2]
        manual_scores[:, g] = np.mean(self.scores[:, cols], axis=1)
    manual_score = np.max(manual_scores, axis=1)
    assert_array_equal(score, manual_score)
Example #10
Source File: lda_model.py From redshells with MIT License | 6 votes |
def fit(self,
        texts: List[List[str]],
        adjust_passes=True,
        test_size=0.1,
        random_state=123,
        dictionary: Optional[gensim.corpora.Dictionary] = None) -> None:
    """Train an LDA model on tokenized texts and record its test perplexity."""
    texts = shuffle(texts)
    dictionary = dictionary or self._make_dictionary(texts)
    corpus = self._make_corpus(texts=texts, dictionary=dictionary)
    train, test = train_test_split(corpus, test_size=test_size,
                                   random_state=random_state)
    # Scale the number of passes inversely with corpus size, capped to [1, 20].
    if adjust_passes:
        passes = np.clip(int(round(100000 / (len(corpus) + 1))), 1, 20)
    else:
        passes = 1
    self._lda = gensim.models.LdaModel(
        alpha='auto',
        corpus=train,
        num_topics=self.n_topics,
        id2word=dictionary,
        iterations=self.iterations,
        passes=passes)
    self.log_perplexity = self._lda.log_perplexity(test)
    logger.info('log_perplexity=%s', self.log_perplexity)
Example #11
Source File: test_weight_boosting.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_importances():
    """AdaBoost feature importances must rank the 3 informative features first."""
    X, y = datasets.make_classification(n_samples=2000, n_features=10,
                                        n_informative=3, n_redundant=0,
                                        n_repeated=0, shuffle=False,
                                        random_state=1)
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(X, y)
        importances = clf.feature_importances_
        assert_equal(importances.shape[0], 10)
        # With shuffle=False the informative features are columns 0-2.
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)
Example #12
Source File: iterators.py From training_results_v0.6 with Apache License 2.0 | 6 votes |
def reset(self):
    """Resets the iterator to the beginning of the data."""
    self.curr_idx = 0
    # Randomize the bucket visiting order.
    random.shuffle(self.idx)
    # Shuffle each bucket's parallel lists together so rows stay aligned.
    for i, _ in enumerate(self.sentences):
        shuffled = shuffle(self.indices[i], self.sentences[i],
                           self.characters[i], self.label[i])
        self.indices[i], self.sentences[i], self.characters[i], self.label[i] = shuffled
    # Convert every bucket into ndarray form for the iterator.
    self.ndindex, self.ndsent, self.ndchar, self.ndlabel = [], [], [], []
    for i, _ in enumerate(self.sentences):
        self.ndindex.append(ndarray.array(self.indices[i], dtype=self.dtype))
        self.ndsent.append(ndarray.array(self.sentences[i], dtype=self.dtype))
        self.ndchar.append(ndarray.array(self.characters[i], dtype=self.dtype))
        self.ndlabel.append(ndarray.array(self.label[i], dtype=self.dtype))
Example #13
Source File: utils.py From adversarial-autoencoder with MIT License | 6 votes |
def _read_idx(path, header_len):
    """Read an IDX-format file as raw uint8, skipping header_len header bytes."""
    with open(path, 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    return data[header_len:]


def load_mnist():
    """Load the raw MNIST IDX files and return shuffled, scaled splits.

    Returns:
        (X_train, y_train, X_test, y_test) with images flattened to 784
        float32 features scaled to [0, 1] and uint8 labels.
    """
    # Image files carry a 16-byte header, label files an 8-byte header.
    X_train = _read_idx('mnist/train-images-idx3-ubyte', 16) \
        .reshape(60000, 28 * 28).astype(np.float32)
    y_train = _read_idx('mnist/train-labels-idx1-ubyte', 8) \
        .reshape(60000).astype(np.uint8)
    X_test = _read_idx('mnist/t10k-images-idx3-ubyte', 16) \
        .reshape(10000, 28 * 28).astype(np.float32)
    y_test = _read_idx('mnist/t10k-labels-idx1-ubyte', 8) \
        .reshape(10000).astype(np.uint8)
    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)
    X_train /= 255.
    X_test /= 255.
    return X_train, y_train, X_test, y_test
Example #14
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_learning_curve_batch_and_incremental_learning_are_equal():
    """Incremental and batch learning_curve must produce matching curves."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(max_iter=1, tol=None,
                                            shuffle=False)

    sizes_inc, scores_inc, test_inc = learning_curve(
        estimator, X, y, train_sizes=train_sizes, cv=3,
        exploit_incremental_learning=True)
    sizes_batch, scores_batch, test_batch = learning_curve(
        estimator, X, y, cv=3, train_sizes=train_sizes,
        exploit_incremental_learning=False)

    assert_array_equal(sizes_inc, sizes_batch)
    assert_array_almost_equal(scores_inc.mean(axis=1),
                              scores_batch.mean(axis=1))
    assert_array_almost_equal(test_inc.mean(axis=1),
                              test_batch.mean(axis=1))
Example #15
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def check_cross_val_predict_multiclass(est, X, y, method):
    """Helper for tests of cross_val_predict with multiclass classification"""
    cv = KFold(n_splits=3, shuffle=False)

    # Build the expected per-fold predictions by hand. Columns for classes
    # not present in a fold's training data keep the method's default value.
    float_min = np.finfo(np.float64).min
    default_values = {'decision_function': float_min,
                      'predict_log_proba': float_min,
                      'predict_proba': 0}
    expected_predictions = np.full((len(X), len(set(y))),
                                   default_values[method],
                                   dtype=np.float64)
    _, y_enc = np.unique(y, return_inverse=True)
    for train, test in cv.split(X, y_enc):
        est = clone(est).fit(X[train], y_enc[train])
        fold_preds = getattr(est, method)(X[test])
        i_cols_fit = np.unique(y_enc[train])
        expected_predictions[np.ix_(test, i_cols_fit)] = fold_preds

    # The result must be identical for several relabelings of y.
    for tg in [y, y + 1, y - 2, y.astype('str')]:
        assert_allclose(cross_val_predict(est, X, tg, method=method, cv=cv),
                        expected_predictions)
Example #16
Source File: dataset.py From Neural-Network-Programming-with-TensorFlow with MIT License | 6 votes |
def read_train_sets(train_path, image_size, classes, validation_size=0):
    """Load training images, shuffle them, and split into train/validation sets."""
    class DataSets(object):
        pass
    data_sets = DataSets()

    images, labels, ids, cls = load_train(train_path, image_size, classes)
    images, labels, ids, cls = shuffle(images, labels, ids, cls)

    # A float validation_size is interpreted as a fraction of the data.
    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    data_sets.valid = DataSet(images[:validation_size],
                              labels[:validation_size],
                              ids[:validation_size],
                              cls[:validation_size])
    data_sets.train = DataSet(images[validation_size:],
                              labels[validation_size:],
                              ids[validation_size:],
                              cls[validation_size:])
    return data_sets
Example #17
Source File: dataset.py From Neural-Network-Programming-with-TensorFlow with MIT License | 6 votes |
def next_batch(self, batch_size):
    """Return the next `batch_size` examples from this data set."""
    start = self._index_in_epoch
    self._index_in_epoch += batch_size
    if self._index_in_epoch > self._num_examples:
        # Epoch finished: count it and restart from the beginning.
        # (No reshuffling between epochs.)
        self._epochs_completed += 1
        start = 0
        self._index_in_epoch = batch_size
        assert batch_size <= self._num_examples
    end = self._index_in_epoch
    return (self._images[start:end],
            self._labels[start:end],
            self._ids[start:end],
            self._cls[start:end])
Example #18
Source File: dataset.py From cv-tricks.com with MIT License | 6 votes |
def read_train_sets(train_path, image_size, classes, validation_size):
    """Load training images, shuffle them, and split into train/validation sets."""
    class DataSets(object):
        pass
    data_sets = DataSets()

    images, labels, img_names, cls = load_train(train_path, image_size, classes)
    images, labels, img_names, cls = shuffle(images, labels, img_names, cls)

    # A float validation_size is interpreted as a fraction of the data.
    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    data_sets.valid = DataSet(images[:validation_size],
                              labels[:validation_size],
                              img_names[:validation_size],
                              cls[:validation_size])
    data_sets.train = DataSet(images[validation_size:],
                              labels[validation_size:],
                              img_names[validation_size:],
                              cls[validation_size:])
    return data_sets
Example #19
Source File: train_model.py From laughter-detection with MIT License | 5 votes |
def evaluate_on_parts(data_parts, label_parts, name):
    """Evaluate the global model over the data in chunks of 100; print and
    return the mean accuracy."""
    accs = []
    offset = 0
    while offset < len(data_parts):
        X_subset, y_subset = get_data_subset(data_parts, label_parts,
                                             offset, offset + 100)
        # model.evaluate returns [loss, accuracy]; keep the accuracy.
        accs.append(model.evaluate(X_subset, y_subset, verbose=False)[1])
        offset += 100
    print("%s accuracy %f " % (name, np.mean(accs)))
    return (np.mean(accs))
Example #20
Source File: value_function.py From PPO-Stein-Control-Variate with MIT License | 5 votes |
def fit(self, x, y):
    """ Fit model to current data batch + previous data batch

    Args:
        x: features
        y: target
    """
    n_batches = max(x.shape[0] // 256, 1)
    batch_len = x.shape[0] // n_batches

    # Explained variance before the update, for diagnostics.
    y_hat = self.predict(x)
    old_exp_var = 1 - np.var(y - y_hat) / np.var(y)

    # Train on this batch plus the previous one, then remember this batch.
    if self.replay_buffer_x is None:
        x_train, y_train = x, y
    else:
        x_train = np.concatenate([x, self.replay_buffer_x])
        y_train = np.concatenate([y, self.replay_buffer_y])
    self.replay_buffer_x = x
    self.replay_buffer_y = y

    for _ in range(self.epochs):
        x_train, y_train = shuffle(x_train, y_train)
        for j in range(n_batches):
            lo = j * batch_len
            hi = (j + 1) * batch_len
            feed_dict = {self.obs_ph: x_train[lo:hi, :],
                         self.val_ph: y_train[lo:hi]}
            _, l = self.sess.run([self.train_op, self.loss],
                                 feed_dict=feed_dict)

    # Post-update loss and explained variance (over-fitting diagnostic).
    y_hat = self.predict(x)
    loss = np.mean(np.square(y_hat - y))
    exp_var = 1 - np.var(y - y_hat) / np.var(y)
    logger.record_dicts({
        'VarFuncLoss': loss,
        'ExplainedVarNew': exp_var,
        'ExplainedVarOld': old_exp_var})
Example #21
Source File: train_model.py From laughter-detection with MIT License | 5 votes |
def train_on_parts(train_data_parts, train_label_parts, name):
    """Shuffle the part lists, then train the global model chunk by chunk
    (2000 parts at a time) and print the mean training accuracy."""
    train_data_parts, train_label_parts = shuffle(train_data_parts,
                                                  train_label_parts,
                                                  random_state=0)
    accs = []
    offset = 0
    while offset < len(train_data_parts):
        X_subset, y_subset = get_data_subset(train_data_parts,
                                             train_label_parts,
                                             offset, offset + 2000)
        model.fit(X_subset, y_subset, shuffle=True, batch_size=500,
                  epochs=1, verbose=False)
        # model.evaluate returns [loss, accuracy]; keep the accuracy.
        accs.append(model.evaluate(X_subset, y_subset, verbose=False)[1])
        offset += 2000
    print("%s accuracy %f" % (name, np.mean(accs)))
Example #22
Source File: value_function.py From PPO-Stein-Control-Variate with MIT License | 5 votes |
def fit(self, x, y):
    """ Fit model to current data batch + previous data batch

    Args:
        x: features
        y: target
    """
    num_batches = max(x.shape[0] // 256, 1)
    batch_size = x.shape[0] // num_batches

    # Record explained variance prior to the update.
    predictions = self.predict(x)
    old_exp_var = 1 - np.var(y - predictions) / np.var(y)

    # Combine with the replay buffer, then store the current batch for
    # the next call.
    if self.replay_buffer_x is None:
        x_train, y_train = x, y
    else:
        x_train = np.concatenate([x, self.replay_buffer_x])
        y_train = np.concatenate([y, self.replay_buffer_y])
    self.replay_buffer_x = x
    self.replay_buffer_y = y

    for e in range(self.epochs):
        x_train, y_train = shuffle(x_train, y_train)
        for j in range(num_batches):
            start = j * batch_size
            end = (j + 1) * batch_size
            feed_dict = {self.obs_ph: x_train[start:end, :],
                         self.val_ph: y_train[start:end]}
            _, l = self.sess.run([self.train_op, self.loss],
                                 feed_dict=feed_dict)

    # Explained variance after the update — diagnose over-fitting of the
    # value function.
    predictions = self.predict(x)
    loss = np.mean(np.square(predictions - y))
    exp_var = 1 - np.var(y - predictions) / np.var(y)
    logger.record_dicts({
        'VarFuncLoss': loss,
        'ExplainedVarNew': exp_var,
        'ExplainedVarOld': old_exp_var})
Example #23
Source File: image_loading.py From classifying-cancer with GNU General Public License v3.0 | 5 votes |
def read_img_sets(image_dir, image_size, validation_size=0):
    """Load images, shuffle, and split into train/test sets.

    Returns (data_sets, cls_map) where cls_map maps label indices to names.
    """
    class DataSets:
        pass
    data_sets = DataSets()

    images, labels, ids, cls, cls_map = load_data(image_dir, image_size)
    images, labels, ids, cls = shuffle(images, labels, ids, cls)

    # A float validation_size is interpreted as a fraction of the data.
    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    data_sets.test = DataSet(images[:validation_size],
                             labels[:validation_size],
                             ids[:validation_size],
                             cls[:validation_size])
    data_sets.train = DataSet(images[validation_size:],
                              labels[validation_size:],
                              ids[validation_size:],
                              cls[validation_size:])
    return data_sets, cls_map
Example #24
Source File: wordvec_regressor.py From Wordbatch with GNU General Public License v2.0 | 5 votes |
def fit_batch(self, texts, labels, rcount):
    """Shuffle one batch, vectorize it with wordbatch, and update the
    classifier incrementally (reset=False)."""
    texts, labels = shuffle(texts, labels)
    print("Transforming", rcount)
    features = self.wb.fit_transform(texts)
    print("Training", rcount)
    self.clf.fit(features, labels, reset=False)
Example #25
Source File: generator.py From KerasDeepSpeech with GNU Affero General Public License v3.0 | 5 votes |
def genshuffle(self):
    """Shuffle wav paths, transcripts, and finish markers in unison."""
    reordered = shuffle(self.wavpath, self.transcript, self.finish)
    self.wavpath, self.transcript, self.finish = reordered
Example #26
Source File: generator.py From KerasDeepSpeech with GNU Affero General Public License v3.0 | 5 votes |
def shuffle_data(self):
    """Shuffle wav paths, transcripts, and finish markers in unison."""
    self.wavpath, self.transcript, self.finish = shuffle(
        self.wavpath, self.transcript, self.finish)
    return
Example #27
Source File: shrink.py From CarND-Transfer-Learning-Lab with MIT License | 5 votes |
def main(_):
    """Shrink the bottleneck training set to FLAGS.size examples per class
    and pickle the result to FLAGS.output_file."""
    # Load bottleneck data. NOTE: pickle.load executes arbitrary code from
    # the file — only use with trusted local files.
    with open(FLAGS.training_file, 'rb') as f:
        train_data = pickle.load(f)
    X_train = train_data['features']
    y_train = train_data['labels']
    print(X_train.shape, y_train.shape)

    # Shuffle first so the per-class selection below is a random sample.
    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    # Keep at most FLAGS.size examples of each label.
    keep_indices = []
    keep_counter = Counter()
    for i, label in enumerate(y_train.reshape(-1)):
        if keep_counter[label] < FLAGS.size:
            keep_counter[label] += 1
            keep_indices.append(i)

    X_train_small = X_train[keep_indices]
    y_train_small = y_train[keep_indices]
    print(X_train_small.shape, y_train_small.shape)

    print("Writing to {}".format(FLAGS.output_file))
    data = {'features': X_train_small, 'labels': y_train_small}
    # Bug fix: the original opened the output file without closing it;
    # the context manager guarantees the handle is flushed and closed.
    with open(FLAGS.output_file, 'wb') as out:
        pickle.dump(data, out)
Example #28
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ovo_ties2():
    """Ties must be winnable by any label, not just the first two."""
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y_ref = np.array([2, 0, 1, 2])

    # Cycle the labels so each one gets to win the tie once.
    for offset in range(3):
        y = (y_ref + offset) % 3
        multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4,
                                                  tol=None))
        ovo_prediction = multi_clf.fit(X, y).predict(X)
        assert_equal(ovo_prediction[0], offset % 3)
Example #29
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ovo_ties():
    """Ties are broken using the decision function, not by defaulting to
    the smallest label."""
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4,
                                              tol=None))
    ovo_prediction = multi_clf.fit(X, y).predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2. decision_function encodes the
    # votes plus a normalized sum of confidences used to break vote ties.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # First point: one vote per class (a three-way tie).
    assert_array_equal(votes[0, :], 1)
    # Remaining points: no tie, prediction is the argmax of the votes.
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # Tied point: prediction is the class with the highest confidence score.
    assert_equal(ovo_prediction[0], normalized_confidences[0].argmax())
    # 0.23. warning about tol not having its correct default value.
Example #30
Source File: mnist.py From skorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_data(num_samples):
    """Fetch MNIST from OpenML and return a stratified, scaled train/test split
    of num_samples shuffled examples."""
    mnist = fetch_openml('mnist_784')
    torch.manual_seed(0)
    # NOTE(review): assumes fetch_openml returns array-like .data/.target
    # with .astype/.reshape — verify against the installed sklearn version.
    X = mnist.data.astype('float32').reshape(-1, 1, 28, 28)
    y = mnist.target.astype('int64')
    X, y = shuffle(X, y)
    X, y = X[:num_samples], y[:num_samples]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)
    # Scale pixel values to [0, 1].
    X_train /= 255
    X_test /= 255
    return X_train, X_test, y_train, y_test