Normalization matters a great deal in machine learning when feature ranges differ widely, and especially when features also interact with one another. In scikit-learn, standardization is done with preprocessing.scale(). After scaling, the model can learn much more effectively from the standardized data.
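To see concretely what scale() does, here is a minimal sketch (the small array is made up for illustration): it centres every column to zero mean and rescales it to unit variance.

from sklearn import preprocessing
import numpy as np

a = np.array([[10.,   2.7,  3.6],
              [-100., 5.,  -2.],
              [120.,  20., 40.]])
a_scaled = preprocessing.scale(a)
print(a_scaled.mean(axis=0))   # ~[0. 0. 0.]  each column centred
print(a_scaled.std(axis=0))    # [1. 1. 1.]   each column rescaled to unit variance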
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# generate a 2-feature classification dataset, deliberately on a large scale
X, y = make_classification(n_samples=300, n_features=2, n_redundant=0,
                           n_informative=2, random_state=22,
                           n_clusters_per_class=1, scale=100)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

X = preprocessing.scale(X)    # normalization step
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
clf = SVC()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))    # 0.944444444444
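In real pipelines you usually want to learn the scaling statistics from the training set only and reuse them on the test set, otherwise test information leaks into the preprocessing. A self-contained sketch with scikit-learn's StandardScaler (the same statistics as scale(), but stored in an object):

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=2, n_redundant=0,
                           n_informative=2, random_state=22,
                           n_clusters_per_class=1, scale=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # mean/std learned from the training set only
X_test = scaler.transform(X_test)        # the same mean/std reused on the test set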
Cross validation 1
Cross validation in sklearn is very helpful for choosing the right model and the right model parameters. With its help, we can see intuitively how different models or parameter settings affect the accuracy of the results.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X = iris.data
y = iris.target

# a single train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(knn.score(X_test, y_test))    # 0.973684210526

# the same model evaluated with 5-fold cross validation
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(scores)           # [ 0.96666667  1.  0.93333333  0.96666667  1. ]
print(scores.mean())    # 0.973333333333
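cross_val_score also accepts an explicit splitter object instead of a bare fold count, which is useful when you want shuffling or a fixed random seed. A small sketch (the shuffle and random_state choices are assumptions for illustration, not tuned values):

from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
# shuffle before splitting so each fold mixes all three iris classes
cv = KFold(n_splits=5, shuffle=True, random_state=4)
scores = cross_val_score(KNeighborsClassifier(n_neighbors=5), X, y,
                         cv=cv, scoring='accuracy')
print(scores.mean())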
import matplotlib.pyplot as plt

# continues from the snippet above (iris X, y and KNeighborsClassifier already in scope)
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # loss = -cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')  # for regression
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')  # for classification
    k_scores.append(scores.mean())

plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()
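The commented-out line above is the regression counterpart of the same scan. A self-contained sketch of it, using a KNN regressor; the diabetes dataset here is just a convenient stand-in, any regression dataset works:

from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor

X_r, y_r = load_diabetes(return_X_y=True)
k_losses = []
for k in range(1, 31):
    knn = KNeighborsRegressor(n_neighbors=k)
    # neg_mean_squared_error returns -MSE, so negate it to get a loss to minimize
    loss = -cross_val_score(knn, X_r, y_r, cv=10, scoring='neg_mean_squared_error')
    k_losses.append(loss.mean())
print(min(k_losses))   # loss of the best k in the scanned range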
Cross validation 2
learning_curve in sklearn gives a very direct view of how the model's performance evolves as it sees more training data, and by comparing the training and cross-validation curves we can tell whether there is an overfitting problem. We can then adjust the model to overcome the overfitting.
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

digits = load_digits()
X = digits.data
y = digits.target

# record training and cross-validation loss at 10%, 25%, 50%, 75% and 100% of the data
train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.01), X, y, cv=10, scoring='neg_mean_squared_error',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

plt.plot(train_sizes, train_loss_mean, 'o-', color="r", label="Training")
plt.plot(train_sizes, test_loss_mean, 'o-', color="g", label="Cross-validation")
plt.xlabel("Training examples")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()
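If the learning curve shows a persistent gap between training and cross-validation loss, one knob worth inspecting for the SVC above is gamma. A sketch using scikit-learn's validation_curve to scan it (the gamma range is an assumption for illustration, not a tuned choice):

from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import numpy as np

X, y = load_digits(return_X_y=True)
param_range = np.logspace(-6, -2.3, 5)   # assumed scan range, just for illustration
train_loss, test_loss = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range,
    cv=10, scoring='neg_mean_squared_error')
# where cross-validation loss starts rising again while training loss keeps
# falling, the model has begun to overfit
print(-np.mean(train_loss, axis=1))
print(-np.mean(test_loss, axis=1))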