Python sklearn.cross_validation.cross_val_score() Examples
The following are 30 code examples of sklearn.cross_validation.cross_val_score(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cross_validation, or try the search function.
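Note: sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; the same functionality now lives in sklearn.model_selection, so the examples below only run on old scikit-learn versions. For reference, a minimal sketch of the call in both spellings (the iris data and SVC classifier here are illustrative choices, not taken from any example below):

# Old API (scikit-learn < 0.20), as used in the examples on this page:
#   from sklearn.cross_validation import cross_val_score

# Modern equivalent (scikit-learn >= 0.18):
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

iris = load_iris()
clf = SVC(kernel='linear')
# 5-fold cross-validated accuracy: one score per fold
scores = cross_val_score(clf, iris.data, iris.target, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))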
Example #1
Source File: analysis.py From smallrnaseq with GNU General Public License v3.0 | 7 votes |
def classify(X, y, cl, name=''):
    """Classification using gene features"""
    from sklearn.metrics import classification_report, accuracy_score
    np.random.seed()
    ind = np.random.permutation(len(X))
    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.4)
    #print X
    cl.fit(Xtrain, ytrain)
    ypred = cl.predict(Xtest)
    print(classification_report(ytest, ypred))
    #print accuracy_score(ytest, ypred)
    from sklearn import cross_validation
    yl = pd.Categorical(y).labels
    sc = cross_validation.cross_val_score(cl, X, yl, scoring='roc_auc', cv=5)
    print("AUC: %0.2f (+/- %0.2f)" % (sc.mean(), sc.std() * 2))
    return cl
Example #2
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
Example #3
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. coefficient of determination) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    neg_mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                          scoring="neg_mean_squared_error")
    expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
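Because mean squared error is a loss, scikit-learn negates it so that higher is always better, as the test above asserts. When an RMSE is wanted, the sign has to be flipped back before taking the square root; the same idiom appears, commented out, in Example #5 below. A small sketch under the modern sklearn.model_selection import, with the data setup mirroring the test above:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=30, n_features=20,
                       n_informative=5, random_state=0)
# neg_mean_squared_error returns -MSE per fold (higher is better)
neg_mse = cross_val_score(Ridge(), X, y, cv=5,
                          scoring="neg_mean_squared_error")
rmse_per_fold = np.sqrt(-neg_mse)  # flip the sign before the square root
print("RMSE: %0.2f (+/- %0.2f)" % (rmse_per_fold.mean(), rmse_per_fold.std()))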
Example #4
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_score_with_score_func_classification():
    iris = load_iris()
    clf = SVC(kernel='linear')

    # Default score (should be the accuracy score)
    scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # Correct classification score (aka. zero-one score) - should be the
    # same as the default estimator score
    zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="accuracy", cv=5)
    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # F1 score (classes are balanced, so f1_score should equal the
    # zero-one score)
    f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="f1_weighted", cv=5)
    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
Example #5
Source File: code.py From The_Ultimate_Student_Hunt with MIT License | 6 votes |
def run_model(model, dtrain, predictor_var, target,
              scoring_method='mean_squared_error'):
    cv_method = KFold(len(dtrain), 5)
    cv_scores = cross_val_score(model, dtrain[predictor_var], dtrain[target],
                                cv=cv_method, scoring=scoring_method)
    #print cv_scores, np.mean(cv_scores), np.sqrt((-1)*np.mean(cv_scores))
    dtrain_for_val = dtrain[dtrain['Year'] < 2000]
    dtest_for_val = dtrain[dtrain['Year'] > 1999]
    #cv_method = KFold(len(dtrain_for_val),5)
    #cv_scores_2 = cross_val_score(model,dtrain_for_val[predictor_var],dtrain_for_val[target],cv=cv_method,scoring=scoring_method)
    #print cv_scores_2, np.mean(cv_scores_2)
    dtrain_for_val_ini = dtrain_for_val[predictor_var]
    dtest_for_val_ini = dtest_for_val[predictor_var]
    model.fit(dtrain_for_val_ini, dtrain_for_val[target])
    pred_for_val = model.predict(dtest_for_val_ini)
    #print math.sqrt(mean_squared_error(dtest_for_val['Footfall'],pred_for_val))
Example #6
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_score_precomputed():
    # test for svm with precomputed kernel
    svm = SVC(kernel="precomputed")
    iris = load_iris()
    X, y = iris.data, iris.target
    linear_kernel = np.dot(X, X.T)
    score_precomputed = cval.cross_val_score(svm, linear_kernel, y)
    svm = SVC(kernel="linear")
    score_linear = cval.cross_val_score(svm, X, y)
    assert_array_equal(score_precomputed, score_linear)

    # Error raised for non-square X
    svm = SVC(kernel="precomputed")
    assert_raises(ValueError, cval.cross_val_score, svm, X, y)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cval.cross_val_score, svm,
                  linear_kernel.tolist(), y)
Example #7
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    cv_indices = cval.KFold(len(y), 5)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
    cv_indices = cval.KFold(len(y), 5)
    cv_masks = []
    for train, test in cv_indices:
        mask_train = np.zeros(len(y), dtype=np.bool)
        mask_test = np.zeros(len(y), dtype=np.bool)
        mask_train[train] = 1
        mask_test[test] = 1
        # append the boolean masks (not the index arrays), so that the
        # masked code path is actually exercised
        cv_masks.append((mask_train, mask_test))
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks)
Example #8
Source File: OutPutRes.py From ProFET with GNU General Public License v3.0 | 5 votes |
def CV_multi_stats(X, y, model, n=6):
    '''
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    This version uses multiclass (or multilabel) compatible metrics.

    May be expanded to use the cross_val_score helper function:
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''
    scores = cross_val_score(estimator=model, X=X, y=y,
                             cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16),
                             n_jobs=-1)  # Accuracy
    scores_f1 = cross_val_score(estimator=model, X=X, y=y,
                                cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16),
                                n_jobs=-1, scoring='f1')
    print("Model Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("Model f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
    return (scores.mean(), scores.std(), scores_f1.mean(), scores_f1.std())  # Removed * 2 from returned STD .. ?
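In the old API, splitters such as StratifiedShuffleSplit received the labels and n_iter in their constructor; in sklearn.model_selection the constructor takes only n_splits and test_size, and the data is supplied through cross_val_score's cv argument. A hedged sketch of the same accuracy/F1 computation on current scikit-learn: the iris data and logistic-regression estimator are stand-ins, and 'f1' is swapped for 'f1_weighted' because the plain 'f1' scorer is binary-only in current releases:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

X, y = load_iris(return_X_y=True)           # stand-in data for illustration
model = LogisticRegression(max_iter=1000)   # stand-in estimator
n = 6

# Old API: cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16)
cv = StratifiedShuffleSplit(n_splits=n, test_size=0.16, random_state=0)
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)  # accuracy by default
scores_f1 = cross_val_score(model, X, y, cv=cv, n_jobs=-1,
                            scoring='f1_weighted')  # weighted F1 for multiclass
print("Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
print("F1 (weighted): %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))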
Example #9
Source File: OutPutRes.py From ProFET with GNU General Public License v3.0 | 5 votes |
def PlotPerfPercentFeatures(X, y, est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova), varying the percentile
    of features selected (F-test).
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py
    '''
    transform = SelectPercentile(f_classif)
    clf = Pipeline([('anova', transform), ('est', est)])

    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1, 2, 3, 5, 7, 10, 13, 15, 20, 25, 33, 50, 65, 75, 90, 100)
    # percentiles = (1, 5, 10, 25, 50, 75, 90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,
                                      cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3),
                                      n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    print("Outputting Graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
Example #10
Source File: VisualizeBestFeatures.py From ProFET with GNU General Public License v3.0 | 5 votes |
def PlotPerfPercentFeatures(X, y, est=LinearSVC()):
    '''
    Display performance of a classifier (default: SVM), varying the
    percentile of features retained (F-test).
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py
    '''
    transform = SelectPercentile(f_classif)
    clf = Pipeline([('anova', transform), ('est', est)])

    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1, 2, 3, 5, 7, 10, 15, 20, 25, 33, 50, 66, 75, 90, 100)
    # percentiles = (1, 5, 10, 25, 50, 75, 90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,
                                      cv=StratifiedShuffleSplit(y, n_iter=5, test_size=0.4),
                                      n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    print("Outputting Graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
Example #11
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 5 votes |
def plot_BestKFeatures(X_train, y_train):
    '''
    http://nbviewer.ipython.org/github/gmonce/scikit-learn-book/blob/master/Chapter%204%20-%20Advanced%20Features%20-%20Feature%20Engineering%20and%20Selection.ipynb
    Find the best percentile of features to use, using cross-validation on
    the training set, and get the K best features.
    '''
    from sklearn import cross_validation
    from sklearn import feature_selection
    from sklearn import tree
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    # NOTE: the random forest below overrides the decision tree above
    dt = RandomForestClassifier(n_jobs=2, bootstrap=True, n_estimators=250,
                                criterion='gini')
    dt = dt.fit(X_train, y_train)

    percentiles = range(1, 95, 5)
    results = []
    for i in range(1, 95, 5):
        fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)  # Original
        fs = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=i)  # alt
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_validation.cross_val_score(dt, X_train_fs, y_train, cv=4)
        # print i, scores.mean()
        results = np.append(results, scores.mean())

    optimal_percentil = np.where(results == results.max())[0][0]  # index of the best percentile
    print("Optimal number of features: {0}".format(percentiles[optimal_percentil]), "\n")

    # Plot number of features VS. cross-validation scores
    import matplotlib.pylab as pl
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross-validation accuracy")
    pl.plot(percentiles, results)
    print("Mean scores:", results)
    return
Example #12
Source File: simulation.py From jstsp2015 with MIT License | 5 votes |
def compute_svm_score(K, y, n_folds, scoring='accuracy', random_state=0):
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                         random_state=random_state)
    clf = SVC(C=1.0, kernel='precomputed')
    scores = cross_val_score(clf, K, y, scoring=scoring, cv=cv, n_jobs=1)
    score = scores.mean()
    return score
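Examples #12 and #13 pass the precomputed kernel matrix K where X would normally go; cross_val_score slices a square matrix on both axes per fold when the estimator uses kernel='precomputed', which is exactly what Example #6 above tests. A sketch of the same score on current scikit-learn, where StratifiedKFold no longer takes y or n_folds in its constructor; the iris data and linear kernel here are illustrative:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
K = np.dot(X, X.T)  # linear kernel matrix, n_samples x n_samples

# Old API: StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
clf = SVC(C=1.0, kernel='precomputed')
scores = cross_val_score(clf, K, y, scoring='accuracy', cv=cv, n_jobs=1)
print("Mean accuracy: %0.3f" % scores.mean())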
Example #13
Source File: classif_and_ktst.py From jstsp2015 with MIT License | 5 votes |
def compute_svm_cv(K, y, C=100.0, n_folds=5, scoring=balanced_accuracy_scoring):
    """Compute cross-validated score of SVM with given precomputed kernel."""
    cv = StratifiedKFold(y, n_folds=n_folds)
    clf = SVC(C=C, kernel='precomputed', class_weight='auto')
    scores = cross_val_score(clf, K, y, scoring=scoring, cv=cv)
    return scores.mean()
Example #14
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cross_val_score():
    clf = MockClassifier()
    for a in range(-10, 10):
        clf.a = a
        # Smoke test
        scores = cval.cross_val_score(clf, X, y)
        assert_array_equal(scores, clf.score(X, y))

        # test with multioutput y
        scores = cval.cross_val_score(clf, X_sparse, X)
        assert_array_equal(scores, clf.score(X_sparse, X))

        scores = cval.cross_val_score(clf, X_sparse, y)
        assert_array_equal(scores, clf.score(X_sparse, y))

        # test with multioutput y
        scores = cval.cross_val_score(clf, X_sparse, X)
        assert_array_equal(scores, clf.score(X_sparse, X))

    # test with X and y as list
    list_check = lambda x: isinstance(x, list)
    clf = CheckingClassifier(check_X=list_check)
    scores = cval.cross_val_score(clf, X.tolist(), y.tolist())

    clf = CheckingClassifier(check_y=list_check)
    scores = cval.cross_val_score(clf, X, y.tolist())

    assert_raises(ValueError, cval.cross_val_score, clf, X, y,
                  scoring="sklearn")

    # test with 3d X
    X_3d = X[:, :, np.newaxis]
    clf = MockClassifier(allow_nd=True)
    scores = cval.cross_val_score(clf, X_3d, y)

    clf = MockClassifier(allow_nd=False)
    assert_raises(ValueError, cval.cross_val_score, clf, X_3d, y)
Example #15
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cross_val_score_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_score(clf, X_df, y_ser)
Example #16
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cross_val_score_score_func():
    clf = MockClassifier()
    _score_func_args = []

    def score_func(y_test, y_predict):
        _score_func_args.append((y_test, y_predict))
        return 1.0

    with warnings.catch_warnings(record=True):
        scoring = make_scorer(score_func)
        score = cval.cross_val_score(clf, X, y, scoring=scoring)
    assert_array_equal(score, [1.0, 1.0, 1.0])
    assert len(_score_func_args) == 3
Example #17
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cross_val_score_errors():
    class BrokenEstimator:
        pass
    assert_raises(TypeError, cval.cross_val_score, BrokenEstimator(), X)
Example #18
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def train_test_split_pandas():
    # check train_test_split doesn't destroy pandas dataframe
    types = [MockDataFrame]
    try:
        from pandas import DataFrame
        types.append(DataFrame)
    except ImportError:
        pass
    for InputFeatureType in types:
        # X dataframe
        X_df = InputFeatureType(X)
        X_train, X_test = cval.train_test_split(X_df)
        assert_true(isinstance(X_train, InputFeatureType))
        assert_true(isinstance(X_test, InputFeatureType))
Example #19
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cross_val_predict_pandas():
    # check cross_val_predict doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_predict(clf, X_df, y_ser)
Example #20
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_sparse_fit_params():
    iris = load_iris()
    X, y = iris.data, iris.target
    clf = MockClassifier()
    fit_params = {'sparse_sample_weight': coo_matrix(np.eye(X.shape[0]))}
    a = cval.cross_val_score(clf, X, y, fit_params=fit_params)
    assert_array_equal(a, np.ones(3))
Example #21
Source File: solution.py From Kaggle with MIT License | 5 votes |
def optimize_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    print u"Data info:\n", train_data.info()
    print u"Data description:\n", train_data.describe()
    #display_data(train_data)  # simple display of the data info
    #display_with_process(train_data)  # lightly process and display the data to verify assumptions
    process_data = fe_preprocessData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # extract the desired columns with a regex
    train_np = train_data.as_matrix()  # convert to a matrix

    '''Train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})

    '''Predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data')  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5)

## map pairs of terms to polynomial features
Example #22
Source File: ml.py From EDeN with MIT License | 5 votes |
def estimate_model(positive_data_matrix=None, negative_data_matrix=None,
                   target=None, estimator=None, n_jobs=4):
    """estimate_model."""
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    logger.info('Test set')
    logger.info(describe(X))
    logger.info('-' * 80)
    logger.info('Test Estimate')
    predictions = estimator.predict(X)
    margins = estimator.decision_function(X)
    logger.info(classification_report(y, predictions))
    apr = average_precision_score(y, margins)
    logger.info('APR: %.3f' % apr)
    roc = roc_auc_score(y, margins)
    logger.info('ROC: %.3f' % roc)
    logger.info('Cross-validated estimate')
    scoring_strings = ['accuracy', 'precision', 'recall', 'f1',
                       'average_precision', 'roc_auc']
    for scoring in scoring_strings:
        scores = cross_validation.cross_val_score(
            estimator, X, y, cv=5, scoring=scoring, n_jobs=n_jobs)
        logger.info('%20s: %.3f +- %.3f' %
                    (scoring, np.mean(scores), np.std(scores)))
    return roc, apr
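On current scikit-learn, the loop over scoring strings above can be collapsed into a single cross_validate call (added in 0.19), which fits each fold once and evaluates every metric on the fitted model instead of refitting per metric. A minimal sketch; the make_classification dataset and plain SVC are stand-in assumptions, not part of the EDeN code:

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, random_state=0)  # stand-in data
scorings = ['accuracy', 'precision', 'recall', 'f1',
            'average_precision', 'roc_auc']
# One fit per fold; every scorer is evaluated on the same fitted model
results = cross_validate(SVC(), X, y, cv=5, scoring=scorings, n_jobs=4)
for scoring in scorings:
    s = results['test_%s' % scoring]
    print('%20s: %.3f +- %.3f' % (scoring, s.mean(), s.std()))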
Example #23
Source File: Train Classifier and Test Video Feed.py From Emotion-Recognition-Using-SVMs with MIT License | 5 votes |
def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold cross-validation iterator
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by the estimator's
    # score method (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print "Scores: ", scores
    print ("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))

# Confusion Matrix and Results
Example #24
Source File: titanic.py From MachineLearning with Apache License 2.0 | 5 votes |
def compute_score(clf, X, y, scoring='accuracy'):
    xval = cross_val_score(clf, X, y, cv=5, scoring=scoring)
    return np.mean(xval)
Example #25
Source File: image-classification.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 5 votes |
def accuracy(features, labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn import cross_validation

    # We use logistic regression because it is very fast.
    # Feel free to experiment with other classifiers
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', LogisticRegression())])
    cv = cross_validation.LeaveOneOut(len(features))
    scores = cross_validation.cross_val_score(clf, features, labels, cv=cv)
    return scores.mean()
Example #26
Source File: rank_tags.py From TGIF-Release with BSD 3-Clause "New" or "Revised" License | 5 votes |
def stump(X, y):
    score = cross_val_score(LinearSVC(), X, y, cv=5, n_jobs=5,
                            scoring='average_precision')
    clf = LinearSVC()
    clf.fit(X, y)
    coef = clf.coef_[0, 0]
    inter = clf.intercept_[0]
    return np.mean(score), np.sign(coef), inter / np.abs(coef)
Example #27
Source File: sandpit.py From automl-phase-2 with MIT License | 5 votes |
def _f(x):
    # iris = load_iris()
    X, y = make_hastie_10_2(random_state=0)
    x = np.ravel(x)
    f = np.zeros(x.shape)
    for i in range(f.size):
        clf = RandomForestClassifier(n_estimators=1,
                                     min_samples_leaf=int(np.round(x[i])),
                                     random_state=0)
        # scores = cross_val_score(clf, iris.data, iris.target)
        scores = cross_val_score(clf, X, y, cv=5)
        f[i] = -scores.mean()
    return f.ravel()
Example #28
Source File: scorer.py From scan with GNU Affero General Public License v3.0 | 5 votes |
def train(self):
    feats = self.get_features()
    scores = np.array(self.scores)

    # Compute error metrics for the estimator.
    self.cv_scores = cross_validation.cross_val_score(self.classifier,
                                                      feats, scores)
    self.cv_score = self.cv_scores.mean()
    self.cv_dev = self.cv_scores.std()

    self.classifier.fit(feats, scores)
    self.fit_done = True
Example #29
Source File: sklearnbasemodel.py From Supply-demand-forecasting with MIT License | 5 votes |
def run_croos_validation(self):
    features, labels, cv = self.getFeaturesLabel()
    scores = cross_validation.cross_val_score(
        self.clf, features, labels, cv=cv,
        scoring=mean_absolute_percentage_error_scoring, n_jobs=-1)
    print "cross validation scores: means, {}, std, {}, details,{}".format(
        np.absolute(scores.mean()), scores.std(), np.absolute(scores))
    return -np.absolute(scores.mean())
Example #30
Source File: solution.py From Kaggle with MIT License | 5 votes |
def baseline_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    #print u"Data info:\n", train_data.info()
    #print u"Data description:\n", train_data.describe()
    #display_data(train_data)  # simple display of the data info
    #display_with_process(train_data)  # lightly process and display the data to verify assumptions
    process_data = pre_processData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # extract the desired columns with a regex
    train_np = train_data.as_matrix()  # convert to a matrix

    '''Train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=cv_error = pd.DataFrame(data=list(X_test[np.where(prediction!=y_test)]),columns=list(train_data.columns)[1:])
    #=cv_error.to_csv(r'error.csv',index=True)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''Predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data')  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5)

# baseline SVM model: 0.78947