Python sklearn.cross_validation.train_test_split() Examples
The following are 29 code examples of sklearn.cross_validation.train_test_split(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cross_validation, or try the search function.
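For orientation, here is a minimal sketch with made-up toy arrays, assuming scikit-learn 0.19 or earlier (sklearn.cross_validation was deprecated in 0.18 and removed in 0.20; the same function now lives in sklearn.model_selection):

# Minimal sketch: split a toy dataset 70/30 (assumes scikit-learn <= 0.19,
# where sklearn.cross_validation is still importable).
import numpy as np
from sklearn.cross_validation import train_test_split

X = np.arange(20).reshape(10, 2)  # 10 samples, 2 features (toy data)
y = np.arange(10)                 # one label per sample (toy data)

# Hold out 30% of the rows; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

print(X_train.shape, X_test.shape)  # (7, 2) (3, 2)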
Example #1
Source File: analysis.py From smallrnaseq with GNU General Public License v3.0 | 7 votes |
def classify(X, y, cl, name=''):
    """Classification using gene features"""

    from sklearn.metrics import classification_report, accuracy_score
    np.random.seed()
    ind = np.random.permutation(len(X))
    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.4)
    #print X
    cl.fit(Xtrain, ytrain)
    ypred = cl.predict(Xtest)

    print (classification_report(ytest, ypred))
    #print accuracy_score(ytest, ypred)
    from sklearn import cross_validation
    yl = pd.Categorical(y).labels
    sc = cross_validation.cross_val_score(cl, X, yl, scoring='roc_auc', cv=5)
    print("AUC: %0.2f (+/- %0.2f)" % (sc.mean(), sc.std() * 2))
    return cl
Example #2
Source File: label_digits.py From libact with BSD 2-Clause "Simplified" License | 7 votes |
def split_train_test(n_classes):
    from sklearn.datasets import load_digits

    n_labeled = 5
    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    while len(np.unique(y_train[:n_labeled])) < n_classes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33)

    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits
Example #3
Source File: data_preparation_tools.py From corpus-to-graph-ml with MIT License | 6 votes |
def split_to_test_and_train(data, labels, entities, test_size=DEFAULT_TEST_SIZE):
    d_train, d_test, l_train, l_test, c_train, c_test = train_test_split(data, labels, entities, test_size=test_size)
    d_test_2 = []
    l_test_2 = []
    c_test_2 = []
    train_dict = {}
    for d in d_train:
        train_dict[d] = 1
    for d, l, c in zip(d_test, l_test, c_test):
        if d in train_dict:
            continue
        d_test_2.append(d)
        l_test_2.append(l)
        c_test_2.append(c)
    return (d_train, d_test_2, l_train, l_test_2, c_train, c_test_2)

# utility to extract entities from preprocessed files
Example #4
Source File: p119_squential_backward_selection.py From PythonMachineLearningExamples with MIT License | 6 votes |
def fit(self, X, y):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=self.test_size,
                         random_state=self.random_state)

    dim = X_train.shape[1]
    self.indices_ = tuple(range(dim))
    self.subsets_ = [self.indices_]
    score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
    self.scores_ = [score]

    while dim > self.k_features:
        scores = []
        subsets = []

        for p in combinations(self.indices_, r=dim - 1):
            score = self._calc_score(X_train, y_train, X_test, y_test, p)
            scores.append(score)
            subsets.append(p)

        best = np.argmax(scores)
        self.indices_ = subsets[best]
        self.subsets_.append(self.indices_)
        dim -= 1

        self.scores_.append(scores[best])
    self.k_score_ = self.scores_[-1]

    return self
Example #5
Source File: sklearn-RS-demo-cf-item-test.py From AiLearning with GNU General Public License v3.0 | 6 votes |
def splitData(self, dataFile, test_size):
    # Load the dataset
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=header)

    self.n_users = df.user_id.unique().shape[0]
    self.n_items = df.item_id.unique().shape[0]
    print('Number of users = ' + str(self.n_users) +
          ' | Number of items = ' + str(self.n_items))

    # Split the dataset: users + movies
    self.train_data, self.test_data = cv.train_test_split(
        df, test_size=test_size)
    print('Train/test split succeeded', file=sys.stderr)
    print('len(train) = %s' % np.shape(self.train_data)[0], file=sys.stderr)
    print('len(test) = %s' % np.shape(self.test_data)[0], file=sys.stderr)
Example #6
Source File: prepare_data.py From personal-photos-model with Apache License 2.0 | 6 votes |
def _shuffle_images_for_target(self, data, target):
    """
    Takes all the non-paired images for a given person, slices them into training,
    validation, and test sets, and shuffles within each of these sets.
    """
    # train_test_split can only partition into two sets, so we have to partition into
    # two sets, then further partition the validation set into a test set.
    (train_data, other_data, train_target, other_target) = train_test_split(
        data, target, train_size=0.7, test_size=0.3, random_state=0)
    self._train["data"].extend(train_data)
    self._train["target"].extend(train_target)

    (validation_data, test_data, validation_target, test_target) = train_test_split(
        other_data, other_target, train_size=0.9, test_size=0.1, random_state=0)
    self._validation["data"].extend(validation_data)
    self._validation["target"].extend(validation_target)
    self._test["data"].extend(test_data)
    self._test["target"].extend(test_target)
Example #7
Source File: data_loader.py From datastories-semeval2017-task4 with MIT License | 6 votes |
def load_train_val_test(self, only_test=False):
    X_train, X_rest, y_train, y_rest = train_test_split(self.X, self.y,
                                                        test_size=0.3,
                                                        stratify=self.y,
                                                        random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest,
                                                    test_size=0.5,
                                                    stratify=y_rest,
                                                    random_state=42)
    if not only_test:
        print("\nPreparing training set...")
        training = prepare_dataset(X_train, y_train, self.pipeline, self.y_one_hot)
        print("\nPreparing validation set...")
        validation = prepare_dataset(X_val, y_val, self.pipeline, self.y_one_hot)
    print("\nPreparing test set...")
    testing = prepare_dataset(X_test, y_test, self.pipeline, self.y_one_hot)

    if only_test:
        return testing
    else:
        return training, validation, testing
Example #8
Source File: iris_run_config.py From deep_image_model with Apache License 2.0 | 6 votes |
def main(unused_argv):
    # Load dataset.
    iris = datasets.load_iris()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42)

    # You can define your configurations by providing a RunConfig object to
    # the estimator to control session configurations, e.g. num_cores
    # and gpu_memory_fraction.
    run_config = tf.contrib.learn.estimators.RunConfig(
        num_cores=3, gpu_memory_fraction=0.6)

    # Build 3 layer DNN with 10, 20, 10 units respectively.
    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
        x_train)
    classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                                hidden_units=[10, 20, 10],
                                                n_classes=3,
                                                config=run_config)

    # Fit and predict.
    classifier.fit(x_train, y_train, steps=200)
    predictions = list(classifier.predict(x_test, as_iterable=True))
    score = metrics.accuracy_score(y_test, predictions)
    print('Accuracy: {0:f}'.format(score))
Example #9
Source File: iris.py From deep_image_model with Apache License 2.0 | 6 votes |
def main(unused_argv):
    # Load dataset.
    iris = learn.datasets.load_dataset('iris')
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42)

    # Build 3 layer DNN with 10, 20, 10 units respectively.
    feature_columns = learn.infer_real_valued_columns_from_input(x_train)
    classifier = learn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)

    # Fit and predict.
    classifier.fit(x_train, y_train, steps=200)
    predictions = list(classifier.predict(x_test, as_iterable=True))
    score = metrics.accuracy_score(y_test, predictions)
    print('Accuracy: {0:f}'.format(score))
Example #10
Source File: iris_custom_decay_dnn.py From deep_image_model with Apache License 2.0 | 6 votes |
def main(unused_argv):
    iris = datasets.load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42)

    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
        x_train)
    classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                                hidden_units=[10, 20, 10],
                                                n_classes=3,
                                                optimizer=optimizer_exp_decay)

    classifier.fit(x_train, y_train, steps=800)
    predictions = list(classifier.predict(x_test, as_iterable=True))
    score = metrics.accuracy_score(y_test, predictions)
    print('Accuracy: {0:f}'.format(score))
Example #11
Source File: faces.py From ConvNetPy with MIT License | 6 votes |
def load_data():
    global training_data, testing_data

    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    xs = lfw_people.data
    ys = lfw_people.target

    inputs = []
    labels = list(ys)

    for face in xs:
        V = Vol(50, 37, 1, 0.0)
        V.w = list(face)
        inputs.append(augment(V, 30))

    x_tr, x_te, y_tr, y_te = train_test_split(inputs, labels, test_size=0.25)

    training_data = zip(x_tr, y_tr)
    testing_data = zip(x_te, y_te)

    print 'Dataset made...'
Example #12
Source File: test_display.py From diogenes with MIT License | 6 votes |
def test_get_top_features(self):
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    M_train, M_test, labels_train, labels_test = train_test_split(
        M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)
    ctrl_feat_importances = clf.feature_importances_
    ctrl_col_names = ['f{}'.format(i) for i in xrange(15)]
    ctrl_feat_ranks = np.argsort(ctrl_feat_importances)[::-1][:10]
    ctrl = utils.convert_to_sa(
        zip(ctrl_col_names, ctrl_feat_importances),
        col_names=('feat_name', 'score'))[ctrl_feat_ranks]
    res = dsp.get_top_features(clf, M, verbose=False)
    self.assertTrue(uft.array_equal(ctrl, res))
    res = dsp.get_top_features(clf, col_names=['f{}'.format(i) for i in xrange(15)],
                               verbose=False)
    self.assertTrue(uft.array_equal(ctrl, res))
Example #13
Source File: data_loader.py From datastories-semeval2017-task4 with MIT License | 5 votes |
def load_final(self):
    X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                        test_size=0.1,
                                                        stratify=self.y,
                                                        random_state=27)
    print("\nPreparing training set...")
    training = prepare_dataset(X_train, y_train, self.pipeline, self.y_one_hot)
    print("\nPreparing test set...")
    testing = prepare_dataset(X_test, y_test, self.pipeline, self.y_one_hot)
    return training, testing
Example #14
Source File: recipe_classification.py From Flavor-Network with GNU General Public License v3.0 | 5 votes |
def logistic_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print 'First round:', metrics.accuracy_score(y_test, y_pred)
    # tune parameter C
    crange = [0.01, 0.1, 1, 10, 100]
    for num in crange:
        model = LogisticRegression(C=num)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print 'C=', num, ',score=', metrics.accuracy_score(y_test, y_pred)
Example #15
Source File: stack5.py From semeval2017-scienceie with Apache License 2.0 | 5 votes |
def model_withValidation(X_train_total, Y_train_total, X_test=None, Y_test=None,
                         words_test=None, indices2labels=None, hiddenDim=250,
                         filename_x="none", filename_y="none"):
    X_train, X_dev, Y_train, Y_dev = train_test_split(X_train_total, Y_train_total,
                                                      test_size=0.10, random_state=0)
    model = Sequential()
    model.add(Dense(output_dim=hiddenDim, input_dim=X_train.shape[1]))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dense(3))
    model.add(Activation("softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adamax',
                  metrics=['accuracy'])
    weightsPath = "./tmp/myfooo2%s.dat" % (time.time())
    checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True)
    model.fit(X_train, Y_train, verbose=2, nb_epoch=100, batch_size=32,
              validation_data=(X_dev, Y_dev), callbacks=[checkpointer])
    model.load_weights(weightsPath)
    loss, acc = model.evaluate(X_test, Y_test, batch_size=32)
    print("loss : %0.5f Accuracy :%0.5f" % (loss, acc))
    cf = confusion_matrix(Y_test[:, 1], model.predict_classes(X_test))
    print(cf)
    predictions = model.predict_classes(X_test)
    print("-->", predictions)
    return model, predictions
Example #16
Source File: stack5.py From semeval2017-scienceie with Apache License 2.0 | 5 votes |
def model_withValidation(X_train_total, Y_train_total, X_test=None, Y_test=None,
                         words_test=None, indices2labels=None, hiddenDim=250,
                         filename_x="none", filename_y="none"):
    X_train, X_dev, Y_train, Y_dev = train_test_split(X_train_total, Y_train_total,
                                                      test_size=0.10, random_state=0)
    model = Sequential()
    model.add(Dense(output_dim=hiddenDim, input_dim=X_train.shape[1]))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    # model.add(Dense(output_dim=1000))
    # model.add(BatchNormalization())
    # model.add(Activation("relu"))
    # model.add(Dense(output_dim=20))
    # model.add(BatchNormalization())
    # model.add(Activation("relu"))
    model.add(Dense(3))
    model.add(Activation("softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adamax',
                  metrics=['accuracy'])
    weightsPath = "./tmp/myfooo2%s.dat" % (time.time())
    checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True)
    model.fit(X_train, Y_train, verbose=2, nb_epoch=100, batch_size=32,
              validation_data=(X_dev, Y_dev), callbacks=[checkpointer])
    model.load_weights(weightsPath)
    loss, acc = model.evaluate(X_test, Y_test, batch_size=32)
    print("loss : %0.5f Accuracy :%0.5f" % (loss, acc))
    cf = confusion_matrix(Y_test[:, 1], model.predict_classes(X_test))
    print(cf)
    predictions = model.predict_classes(X_test)
    print("-->", predictions)
    return model, predictions
Example #17
Source File: logistic_regression_updated.py From DataSciencePython with MIT License | 5 votes |
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc / N
Example #18
Source File: classifier.py From TextDetector with GNU General Public License v3.0 | 5 votes |
def data_load(self, datadir):
    mytime = timeLog('../timelogs/data_load')
    mytime.start()
    print 'Loading data ....',
    P, L = data_load(datadir)
    P = numpy.uint8(P)
    L = numpy.uint8(L)
    P_train, P_test, L_train, L_test = train_test_split(
        P, L, train_size=0.8, test_size=0.2, random_state=22)
    self.feature = P_train
    self.label = L_train
    self.feature_test = P_test
    self.label_test = L_test
    mytime.end()
    mytime.final()
Example #19
Source File: classification.py From text-analytics-with-python with Apache License 2.0 | 5 votes |
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                        test_size=test_data_proportion,
                                                        random_state=42)
    return train_X, test_X, train_Y, test_Y
Example #20
Source File: main.py From Python-DevOps with MIT License | 5 votes |
def train_bayes(corpus, tokenizing=True, cleaning=True, normalizing=True,
                stem=True, vector='tfidf', split=0.2):
    multinomial, labels, vectorize = None, None, None
    if vector.lower().find('tfidf') < 0 and vector.lower().find('bow') < 0:
        raise Exception('Invalid vectorization technique')
    if isinstance(corpus, str):
        trainset = sklearn.datasets.load_files(container_path=corpus, encoding='UTF-8')
        trainset.data, trainset.target = separate_dataset(trainset)
        data, target = trainset.data, trainset.target
        labels = trainset.target_names
    if isinstance(corpus, list) or isinstance(corpus, tuple):
        corpus = np.array(corpus)
        data, target = corpus[:, 0].tolist(), corpus[:, 1].tolist()
        labels = np.unique(target).tolist()
        target = LabelEncoder().fit_transform(target)
    c = list(zip(data, target))
    random.shuffle(c)
    data, target = zip(*c)
    data, target = list(data), list(target)
    if stem:
        for i in range(len(data)):
            data[i] = ' '.join([stemming(k) for k in data[i].split()])
    if cleaning:
        for i in range(len(data)):
            data[i] = clearstring(data[i], tokenizing)
    if vector.lower().find('tfidf') >= 0:
        vectorize = TfidfVectorizer().fit(data)
        vectors = vectorize.transform(data)
    else:
        vectorize = CountVectorizer().fit(data)
        vectors = vectorize.transform(data)
    multinomial = MultinomialNB()
    if split:
        train_X, test_X, train_Y, test_Y = train_test_split(vectors, target,
                                                            test_size=split)
        multinomial.partial_fit(train_X, train_Y, classes=np.unique(target))
        predicted = multinomial.predict(test_X)
        print(metrics.classification_report(test_Y, predicted, target_names=labels))
    else:
        multinomial.partial_fit(vectors, target, classes=np.unique(target))
        predicted = multinomial.predict(vectors)
        print(metrics.classification_report(target, predicted, target_names=labels))
    return USER_BAYES(multinomial, labels, vectorize)
Example #21
Source File: test_vae.py From smrt with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_autoencoder():
    mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
    all_data = np.asarray(mnist.train.images)
    seed = 42
    X_train, X_test = train_test_split(all_data, train_size=0.7, random_state=seed)

    # define
    ae = VariationalAutoEncoder(n_hidden=400, n_latent_factors=20, n_epochs=10,
                                learning_rate=0.01, batch_size=256, display_step=5,
                                activation_function='sigmoid', verbose=2,
                                random_state=seed, layer_type='gaussian')

    # fit
    ae.fit(X_train)

    # show we can get the shape
    _ = ae.topography_.shape

    # train error
    # assert_almost_equal(ae.train_cost_, 0.00380031)

    # assert transform works todo assert vals
    ae.transform(X_train)

    # generate a sample
    ae.generate()

    # get the error:
    # mse = ((X_test - reconstructed) ** 2).sum(axis=1).sum() / X_test.shape[0]
    # assert_almost_equal(mse, 4.40549573864)

    # try creating a few synthetic ones using the generate_from_sample method
    synth = ae.generate_from_sample(X_test[:5])
    assert synth.shape[0] == 5
Example #22
Source File: model.py From DeepNews with Apache License 2.0 | 5 votes |
def split_test_train(self, X, y, nb_val_samples=100):
    """
    split X, y data into training and testing
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, y,
                                                        test_size=nb_val_samples,
                                                        random_state=seed)
    return (X_train, X_test, Y_train, Y_test)
Example #23
Source File: functions.py From topicModelling with GNU General Public License v3.0 | 5 votes |
def perform_class(X, y, iterations=1):
    scores = []
    for i in range(iterations):
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.25,
                                                            random_state=42+iterations)
        parameters = {'C': [0.01, 0.1, 1, 10, 100]}
        clf_acc = GridSearchCV(svm.LinearSVC(), parameters, n_jobs=3, cv=3,
                               refit=True, scoring='accuracy')
        clf_acc.fit(X_train, y_train)
        scores.append([metrics.accuracy_score(y_test, clf_acc.predict(X_test)),
                       metrics.f1_score(y_test, clf_acc.predict(X_test), average='micro')])
    acc = np.mean([x[0] for x in scores]), np.std([x[0] for x in scores])
    mif = np.mean([x[1] for x in scores]), np.std([x[1] for x in scores])
    return acc, mif
Example #24
Source File: models.py From lentil with Apache License 2.0 | 5 votes |
def fit(self):
    """
    Estimate model parameters that fit the interaction history in self.history
    """
    X = self.feature_matrix_from_interactions(self.history)
    Y = np.array(self.history['outcome'].apply(lambda x: 1 if x else 0).values)

    Cs = [0.1, 1., 10.]
    def val_log_likelihood(C):
        """
        Compute average log-likelihood of IRT model with a specific
        regularization constant on a validation set

        :param float C: Coefficient of L2 regularization term
        :rtype: float
        :return: Average log-likelihood on validation set
        """
        train_idxes, val_idxes = cross_validation.train_test_split(
            np.arange(0, len(self.history), 1), train_size=0.7)
        model = LogisticRegression(penalty='l2', C=C)
        X_train = self.feature_matrix_from_interactions(self.history.ix[train_idxes])
        model.fit(X_train, Y[train_idxes])
        X_val = self.feature_matrix_from_interactions(self.history.ix[val_idxes])
        log_probas = model.predict_log_proba(X_val)
        idx_of_zero = 1 if model.classes_[1] == 0 else 0
        return np.mean(log_probas[np.arange(0, len(val_idxes), 1),
                                  idx_of_zero ^ Y[val_idxes]])

    self.model = LogisticRegression(penalty='l2', C=(
        1. if not self.select_regularization_constant else max(Cs, key=val_log_likelihood)))
    self.model.fit(X, Y)
Example #25
Source File: solution.py From Kaggle with MIT License | 5 votes |
def optimize_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    print u"Data info:\n", train_data.info()
    print u'Data description:\n', train_data.describe()
    #display_data(train_data)  # quick look at the data
    #display_with_process(train_data)  # lightly processed views of the data to check assumptions
    process_data = fe_preprocessData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    #print cross_validation.cross_val_score(clf, X, y, cv=5)

## map two features to polynomial features
Example #26
Source File: solution.py From Kaggle with MIT License | 5 votes |
def baseline_logisticRegression_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data, 'process_train_data')  # preprocess the training data
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix

    '''train the model'''
    X_train = train_np[:, 1:]
    y_train = train_np[:, 0]
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    print pd.DataFrame({'columns': list(train_data.columns[1:]), 'coef_': list(model.coef_.T)})

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:, 1:]
    y_cv = cv_np[:, 0]
    predictions = model.predict(X_cv)
    print np.float32(np.sum(predictions == y_cv)) / np.float32(predictions.shape[0])

    '''find the mispredicted original rows and save them to a file'''
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)
    #=print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=print np.float32(np.sum(prediction == y_test)) / np.float32(prediction.shape[0])

    '''predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data', optimize=True)  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'logisticRegression_result/prediction.csv', index=False)'''
    #clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    #print cross_validation.cross_val_score(clf, X, y, cv=5)
Example #27
Source File: solution.py From Kaggle with MIT License | 5 votes |
def baseline_svm_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = pre_processData(origin_train_data, 'process_train_data')  # preprocess the training data
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix

    '''train the model'''
    X_train = train_np[:, 1:]
    y_train = train_np[:, 0]
    model = svm.SVC(kernel='rbf', tol=1e-6).fit(X_train, y_train)
    #print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:, 1:]
    y_cv = cv_np[:, 0]
    predictions = model.predict(X_cv)
    print np.float32(np.sum(predictions == y_cv)) / np.float32(predictions.shape[0])

    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    # error_items = error_items.reset_index(drop=True)
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)

    '''predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data', optimize=False)  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'svm_result/prediction.csv', index=False)'''

# baseline crossValidate: logistic regression model with cross-validation
Example #28
Source File: solution.py From Kaggle with MIT License | 5 votes |
def baseline_randomForest():
    train_data = pd.read_csv(r"data/train.csv")
    print u"Data info:\n", train_data.info()
    print u'Data description:\n', train_data.describe()
    #display_data(train_data)  # quick look at the data
    #display_with_process(train_data)  # lightly processed views of the data to check assumptions
    process_data = pre_processData(train_data, 'process_train_data', optimize=False)  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = RandomForestClassifier(n_estimators=100).fit(X, y)
    #predictions = model.predict(X_test)
    #print np.float32(np.sum(predictions == y_test)) / np.float32(predictions.shape[0])

    '''predict'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data', optimize=False)  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_randomForest_result/prediction.csv', index=False)

# baseline crossValidate: SVM model with cross-validation
Example #29
Source File: solution.py From Kaggle with MIT License | 5 votes |
def baseline_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    #print u"Data info:\n", train_data.info()
    #print u'Data description:\n', train_data.describe()
    #display_data(train_data)  # quick look at the data
    #display_with_process(train_data)  # lightly processed views of the data to check assumptions
    process_data = pre_processData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to pull out the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=cv_error = pd.DataFrame(data=list(X_test[np.where(prediction != y_test)]), columns=list(train_data.columns)[1:])
    #=cv_error.to_csv(r'error.csv', index=True)
    #=print np.float32(np.sum(prediction == y_test)) / np.float32(prediction.shape[0])

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    #print cross_validation.cross_val_score(clf, X, y, cv=5)

# baseline: SVM model, 0.78947