3. 模型选择和调优
3.1 交叉验证
- 定义
- 目的:为了让模型的精度更加可信
3.2 超参数搜索 Grid Search
- 对K值进行选择。k=[1,2,3,4,5,6]循环遍历搜索。
- API
参数1:传入预估器。
参数2:超参数的取值,字典类型,{"超参数名称": [参数列表]}
参数3:cv 几折交叉验证
返回值:fit 之后可查看最佳参数(best_params_)、最佳结果(best_score_)、最佳估计器(best_estimator_)和交叉验证结果(cv_results_)。
3.3 鸢尾花案例增加K值调优
def KNN_optimal():
    """Tune ``n_neighbors`` for a KNN classifier with grid search and report results.

    Runs a 10-fold cross-validated grid search over
    ``n_neighbors in [1, 3, 5, 7, 9, 11]``, fits it on the training split,
    then prints the test-set predictions, the test accuracy and the
    grid-search summary attributes.

    NOTE(review): assumes ``load_data()`` returns the 4-tuple
    ``(x_train, x_test, y_train, y_test)``. The ``load_data`` defined further
    down in this file returns a single DataFrame, so this presumably refers to
    a different helper -- confirm which one is intended.

    Returns:
        None. All results are printed.
    """
    x_train, x_test, y_train, y_test = load_data()

    # Wrap a plain KNN estimator in a cross-validated grid search.
    candidate_params = {"n_neighbors": [1, 3, 5, 7, 9, 11]}
    search = GridSearchCV(KNeighborsClassifier(), param_grid=candidate_params, cv=10)
    search.fit(x_train, y_train)

    # Compare predictions against the held-out labels.
    y_predict = search.predict(x_test)
    print("预测值为:", y_predict, "\n真实值为:", y_test, "\n比较结果为:", y_test == y_predict)

    accuracy = search.score(x_test, y_test)
    print("准确率为: ", accuracy)

    # Grid-search summary.
    print("最佳参数:\n", search.best_params_)
    print("最佳结果:\n", search.best_score_)
    print("最佳估计器:\n", search.best_estimator_)
    print("交叉验证结果:\n", search.cv_results_)
3.4 预测 facebook 签到位置
- 数据集介绍
- 流程分析
1)获取数据
2)数据处理: - 特征值:x
- 目标值:y
- a.缩小范围:2
- b.time -> 年月日时分秒
- c.过滤签到次数少的地点
3)特征工程:特征提取,特征预处理:标准化,特征降维
4)算法训练:KNN算法的预估流程
5)模型评估:模型选择与调优
6)应用
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler


def load_data() -> pd.DataFrame:
    """Load the Facebook check-in training set and prepare it for modelling.

    Reads ``../../resources/FBlocation/train.csv``, derives ``day``,
    ``weekday`` and ``hour`` columns from the ``time`` column (interpreted as
    a count of seconds), and keeps only the rows whose ``place_id`` occurs
    more than 10 times. Intermediate results are printed along the way.

    Returns:
        The filtered DataFrame, including the derived time columns.
    """
    data = pd.read_csv("../../resources/FBlocation/train.csv")
    # NOTE(review): the accompanying notes list a "narrow the coordinate
    # range" step before the time conversion, but no such filter is applied
    # here -- confirm whether it was dropped intentionally.

    # Convert the raw timestamp into calendar features.
    time_value = pd.to_datetime(data["time"], unit="s")
    date = pd.DatetimeIndex(time_value)
    data["day"] = date.day
    data["weekday"] = date.weekday
    data["hour"] = date.hour

    # Count check-ins per place once and reuse the result.
    counts = data.groupby("place_id").count()
    print("计数count统计\n", counts)
    place_count = counts["row_id"]
    print("签到place的次数统计\n", place_count)

    # Keep only places with more than 10 check-ins.
    frequent_places = place_count[place_count > 10]
    print("过滤所有数据,筛选出签到次数大于10的\n", frequent_places)

    keep_mask = data["place_id"].isin(frequent_places.index.values)
    print("布尔值索引\n", keep_mask)

    final_data = data[keep_mask]
    print("处理后的data:\n", final_data)
    return final_data


def implement() -> None:
    """Train and evaluate a grid-searched KNN classifier on the check-in data.

    Splits the prepared data into train/test sets, standardises the features,
    runs a 4-fold grid search over ``n_neighbors in [5, 10, 15, 20]`` and
    prints the predictions, the test accuracy and the grid-search summary.
    """
    # Load once: load_data() reads the whole CSV and prints its intermediate
    # results, so calling it separately for features and target repeats all
    # of that work.
    final_data = load_data()
    used_data_x = final_data[["x", "y", "accuracy", "day", "weekday", "hour"]]
    used_data_y = final_data["place_id"]

    x_train, x_test, y_train, y_test = train_test_split(used_data_x, used_data_y)

    # Fit the scaler on the training split only, then apply it to the test split.
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    estimator = KNeighborsClassifier()
    param_dict = {"n_neighbors": [5, 10, 15, 20]}
    estimator = GridSearchCV(estimator, param_grid=param_dict, cv=4)
    estimator.fit(x_train, y_train)

    y_predict = estimator.predict(x_test)
    print("预测值为:", y_predict, "\n真实值为:", y_test, "\n比较结果为:", y_test == y_predict)

    score = estimator.score(x_test, y_test)
    print("准确率为: ", score)

    print("最佳参数:\n", estimator.best_params_)
    print("最佳结果:\n", estimator.best_score_)
    print("最佳估计器:\n", estimator.best_estimator_)
    print("交叉验证结果:\n", estimator.cv_results_)


if __name__ == '__main__':
    implement()