作者:以下犯上LOVE_845 | 来源:互联网 | 2023-09-24 20:30
首先引入所需的类:from sklearn.tree import DecisionTreeClassifier, export_graphviz;from sklearn.feature_extraction import DictVectorizer ……
首先引入所需的类:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,export_graphviz
然后编写函数 tree_titanic():
def tree_titanic(path=r"E:\data\titanic.csv", out_file="titanic_tree.dot"):
    """Train and evaluate a decision tree on the Titanic passenger data.

    Reads the CSV at ``path``, takes the ``pclass``, ``age`` and ``sex``
    columns as features and ``survived`` as the target, fills missing ages
    with the mean age, vectorizes the per-row dicts with ``DictVectorizer``,
    fits an entropy-criterion ``DecisionTreeClassifier``, prints the
    predictions and the accuracy on the held-out split, and writes the
    fitted tree to ``out_file`` in Graphviz DOT format.

    Args:
        path: Location of the Titanic CSV file. Defaults to the original
            hard-coded Windows path.
        out_file: Destination of the exported DOT file.

    Returns:
        None. Results are printed and the tree is written to ``out_file``.
    """
    # A raw string keeps the backslashes literal; without it sequences such
    # as "\t" (or the invalid "\d") would be interpreted as escapes.
    titanic = pd.read_csv(path)
    print("type(titanic):",type(titanic))

    # 1. Select feature columns and target column.
    # Selecting several columns needs a list inside the indexer. The explicit
    # copy makes ``x`` independent of ``titanic`` so the assignment below is
    # not a chained assignment on a view.
    x = titanic[["pclass", "age", "sex"]].copy()
    print(x)
    y = titanic["survived"]
    print(y)

    # 2. Feature preprocessing.
    # (1) Fill missing ages with the mean age. Note ``mean()`` must be
    # called; assigning the result avoids the unreliable ``inplace=True``
    # on a column selection.
    x["age"] = x["age"].fillna(x["age"].mean())
    # (2) Convert each row to a dict so DictVectorizer can consume it.
    x = x.to_dict(orient="records")
    print("x3:", x)

    # 3. Split into training and test sets (fixed seed for repeatability).
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

    # 4. Dict feature extraction: fit on the training rows only, then apply
    # the same mapping to the test rows.
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 5. Decision tree estimator.
    estimator = DecisionTreeClassifier(criterion="entropy")
    estimator.fit(x_train, y_train)

    # 6. Model evaluation.
    # (1) Compare predictions with the true labels element by element.
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("直接比对真实值和预测值:\n", y_test == y_predict)
    # (2) Compute the accuracy.
    score = estimator.score(x_test, y_test)
    print("准确率为:\n", score)

    # 7. Export the tree for visualization.
    # ``get_feature_names`` was removed from recent scikit-learn releases in
    # favour of ``get_feature_names_out``; fall back for old installations.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    export_graphviz(estimator, out_file=out_file, feature_names=feature_names)
结果为: