决策树分类鸢尾花数据demo

作者：黑色鲜花_866 | 来源：互联网 | 2023-09-17 11:43

code: import numpy as np; import pandas as pd; import matplotlib.pyplot as plt; import matplotlib ...

code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pydotplus

def load_iris(path="iris.data"):
    """Load the iris CSV and return ``(features, labels)``.

    The file must have no header row, four numeric feature columns and the
    class name in the fifth column.

    Args:
        path: File path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a float array with one
        row per sample and four columns; ``labels`` is a 1-D integer array in
        which each class name is replaced by a code numbered in order of first
        appearance (0, 1, 2, ...).
    """
    data = pd.read_csv(path, header=None)
    # Convert explicitly: ``data.values`` on the mixed frame yields an object
    # array, which scikit-learn rejects as labels.
    features = data.iloc[:, :4].to_numpy(dtype=float)
    # factorize() numbers the classes in order of appearance, the same
    # numbering the per-class enumerate(unique()) loop was meant to produce.
    labels, _ = pd.factorize(data[4])
    return features, labels


def main():
    """Train a decision tree on the iris data, export it, and plot the results."""
    iris_feature_E = "sepal lenght", "sepal width", "petal length", "petal width"
    iris_feature = "the length of sepal", "the width of sepal", "the length of petal", "the width of petal"
    iris_class = "Iris-setosa", "Iris-versicolor", "Iris-virginica"

    x, y = load_iris("iris.data")
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=1)
    print(y_test)

    model = DecisionTreeClassifier(criterion='entropy', max_depth=6)
    model.fit(x_train, y_train)
    y_test_hat = model.predict(x_test)

    # Export the fitted tree as a .dot file, a PDF and a PNG.
    with open('iris.dot', 'w') as f:
        tree.export_graphviz(model, out_file=f)
    dot_data = tree.export_graphviz(
        model, out_file=None, feature_names=iris_feature_E, class_names=iris_class,
        filled=True, rounded=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf('iris.pdf')
    with open('iris.png', 'wb') as f:
        f.write(graph.create_png())

    # Plot the decision regions over the first two features.
    # Number of sample points along each axis.
    N, M = 50, 50
    # Range of column 0.
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    # Range of column 1.
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    # Grid of sample points.
    x1, x2 = np.meshgrid(t1, t2)
    # The model was trained on all four features, so the two features that
    # are not plotted are held at their mean value for every grid point.
    x3 = np.full(x1.size, x[:, 2].mean())
    x4 = np.full(x1.size, x[:, 3].mean())
    # Points to classify.
    x_show = np.stack((x1.ravel(), x2.ravel(), x3, x4), axis=1)
    print("x_show_shape:\n", x_show.shape)

    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    # Predicted class for every grid point.
    y_show_hat = model.predict(x_show)
    print(y_show_hat.shape)
    print(y_show_hat)
    # Reshape to the grid so it can be drawn as a mesh.
    y_show_hat = y_show_hat.reshape(x1.shape)
    print(y_show_hat)
    plt.figure(figsize=(15, 15), facecolor='w')
    # Decision regions.
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)
    print(y_test)
    # Test samples (stars).
    plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test, edgecolors='k', s=120, cmap=cm_dark, marker='*')
    # All samples (dots).
    plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=40, cmap=cm_dark)
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title('yuanwei flowers regressiong with DecisionTree', fontsize=17)
    plt.show()

    # Predictions on the held-out test set.
    print(y_test_hat)
    print(y_test)
    # Element-wise: True where the prediction is correct.
    acc = np.mean(y_test_hat == y_test)
    print('accuracy: %.2f%%' % (100 * acc))

    # Overfitting check: test error rate as a function of tree depth.
    depth = np.arange(1, 15)
    err_list = []
    for d in depth:
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf.fit(x_train, y_train)
        err = 1 - np.mean(clf.predict(x_test) == y_test)
        err_list.append(err)
        print(d, 'error ratio: %.2f%%' % (100 * err))
    plt.figure(figsize=(15, 15), facecolor='w')
    plt.plot(depth, err_list, 'ro-', lw=2)
    plt.xlabel('DecisionTree Depth', fontsize=15)
    plt.ylabel('error ratio', fontsize=15)
    plt.title('DecisionTree Depth and Overfit', fontsize=17)
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    main()

生成的图文件：

鸢尾花数据一共有四个特征：花萼长度、花萼宽度、花瓣长度、花瓣宽度。下面每次取其中两个特征，用决策树进行分类：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pydotplus

def load_iris(path="iris.data"):
    """Load the iris CSV and return ``(features, labels)``.

    The file must have no header row, four numeric feature columns and the
    class name in the fifth column.

    Args:
        path: File path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a float array with one
        row per sample and four columns; ``labels`` is a 1-D integer array in
        which each class name is replaced by a code numbered in order of first
        appearance (0, 1, 2, ...).
    """
    data = pd.read_csv(path, header=None)
    # Convert explicitly: ``data.values`` on the mixed frame yields an object
    # array, which scikit-learn rejects as labels.
    features = data.iloc[:, :4].to_numpy(dtype=float)
    # factorize() numbers the classes in order of appearance, the same
    # numbering the per-class enumerate(unique()) loop was meant to produce.
    labels, _ = pd.factorize(data[4])
    return features, labels


def main():
    """Fit one decision tree per pair of iris features and plot all six results."""
    iris_feature = "the length of sepal", "the width of sepal", "the length of petal", "the width of petal"

    x_all, y = load_iris("iris.data")

    feature_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
    # The colormaps do not depend on the feature pair, so build them once.
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    plt.figure(figsize=(15, 15), facecolor='w')
    for i, pair in enumerate(feature_pairs):
        # Keep only the two feature columns of this pair.
        x = x_all[:, list(pair)]
        # Fit a decision tree on the two selected features.
        clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
        clf.fit(x, y)

        # Number of sample points along each axis of the plotting grid.
        N, M = 500, 500
        # Range of the first selected feature.
        x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
        # Range of the second selected feature.
        x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
        t1 = np.linspace(x1_min, x1_max, N)
        t2 = np.linspace(x2_min, x2_max, M)
        # Grid of sample points.
        x1, x2 = np.meshgrid(t1, t2)
        x_grid = np.stack((x1.ravel(), x2.ravel()), axis=1)

        # Accuracy on the data the tree was fitted on.
        y_hat = clf.predict(x)
        c = np.count_nonzero(y_hat == y)
        print("y_hat:\n", y_hat)
        print("y:\n", y)
        print('features:\t', iris_feature[pair[0]], ' + ', iris_feature[pair[1]])
        print('the number of true prediction:', c)
        print('acc:%.2f%%' % (100 * float(c) / float(len(y))))

        # Predict every grid point and reshape to the grid for drawing.
        y_grid_hat = clf.predict(x_grid).reshape(x1.shape)
        plt.subplot(2, 3, i + 1)
        plt.pcolormesh(x1, x2, y_grid_hat, cmap=cm_light)
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', cmap=cm_dark)
        plt.xlabel(iris_feature[pair[0]], fontsize=14)
        plt.ylabel(iris_feature[pair[1]], fontsize=14)
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.grid()
    plt.suptitle('the result of yuanwei flowers in each two features with dcisiontree', fontsize=20)
    # pad is passed by keyword; current matplotlib does not accept it positionally.
    plt.tight_layout(pad=2)
    plt.subplots_adjust(top=0.92)
    plt.show()


if __name__ == "__main__":
    main()

显然，第二种特征组合的分类效果还不错。

接着我们使用随机森林算法来分类看看效果：

只需要在上面的代码中修改：

# Learn with a decision tree (the line as it appears in the script above)
clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)

为：

# Learn with a random forest instead of a single decision tree.
# NOTE: the script's import list does not include this class; add
# `from sklearn.ensemble import RandomForestClassifier` at the top.
clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=6)

效果：

看得出来，随机森林的分类效果比单棵决策树更好。随机森林是把多个决策树弱分类器联合成一个强分类器，所以其分类边界处会呈现很多锯齿；分类的准确度也提高了很多：150 个数据中，最后只有一个被分错。

推荐阅读

import
利用决策树预测NBA比赛胜负的Python数据挖掘实践

本文通过使用2013-14赛季NBA赛程与结果数据集以及2013年NBA排名数据，结合《Python数据挖掘入门与实践》一书中的方法，展示如何应用决策树算法进行比赛胜负预测。我们将详细讲解数据预处理、特征工程及模型评估等关键步骤。 ... [详细]

蜡笔小新 2024-12-23 09:07:40
main
深入了解 Windows 窗体中的 SplitContainer 控件

SplitContainer 控件是 Windows 窗体中的一种复合控件，由两个可调整大小的面板和一个可移动的拆分条组成。本文将详细介绍其功能、属性以及如何通过编程方式创建复杂的用户界面。 ... [详细]

蜡笔小新 2024-12-25 17:20:08
import
毕业设计：基于机器学习与深度学习的垃圾邮件（短信）分类算法实现

本文详细介绍了如何使用机器学习和深度学习技术对垃圾邮件和短信进行分类。内容涵盖从数据集介绍、预处理、特征提取到模型训练与评估的完整流程，并提供了具体的代码示例和实验结果。 ... [详细]

蜡笔小新 2024-12-25 17:38:50
main
Flutter | Key 的深度解析

在 Flutter 开发过程中，开发者经常会遇到 Widget 构造函数中的可选参数 Key。对于初学者来说，理解 Key 的作用和使用场景可能是一个挑战。本文将详细探讨 Key 的概念及其应用场景，并通过实例帮助你更好地掌握这一重要工具。 ... [详细]

蜡笔小新 2024-12-25 08:05:15
main
采用IKE方式建立IPsec安全隧道

一、【组网和实验环境】按如上的接口ip先作配置，再作ipsec的相关配置，配置文本见文章最后本文实验采用的交换机是H3C模拟器，下载地址如 ... [详细]

蜡笔小新 2024-12-22 20:24:15
main
ImmutableX Poised to Pioneer Web3 Gaming Revolution

ImmutableX is set to spearhead the evolution of Web3 gaming, with its innovative technologies and strategic partnerships driving significant advancements in the industry. ... [详细]

蜡笔小新 2024-12-27 08:55:17
instance
解析Java中Text.splitText()方法及其应用场景

本文详细介绍了Java中org.w3c.dom.Text类的splitText()方法，通过多个代码示例展示了其实际应用。该方法用于将文本节点在指定位置拆分为两个节点，并保持在文档树中。 ... [详细]

蜡笔小新 2024-12-26 18:31:42
数组
Weight the Tree（树形dp）

题目Link题目学习link1题目学习link2题目学习link3%%%受益匪浅！－－－－－&# ... [详细]

蜡笔小新 2024-12-26 15:55:56
main
长春大学软件工程：二叉排序树实验报告

本实验主要探讨了二叉排序树（BST）的基本操作，包括创建、查找和删除节点。通过具体实例和代码实现，详细介绍了如何使用递归和非递归方法进行关键字查找，并展示了删除特定节点后的树结构变化。 ... [详细]

蜡笔小新 2024-12-26 15:32:56
jsp
android知识杂记（三）

andr ... [详细]

蜡笔小新 2024-12-26 13:29:32
main
计算机图形学实训：OpenGL入门与直线光栅化算法

本教程涵盖OpenGL基础操作及直线光栅化技术，包括点的绘制、简单图形绘制、直线绘制以及DDA和中点画线算法。通过逐步实践，帮助读者掌握OpenGL的基本使用方法。 ... [详细]

蜡笔小新 2024-12-26 12:24:25
数组
Codeforces Round #566 (Div. 2) A~F个人题解

Dashboard-CodeforcesRound#566(Div.2)-CodeforcesA.FillingShapes题意：给你一个的表格，你 ... [详细]

蜡笔小新 2024-12-25 18:41:21
import
实体映射最强工具类：MapStruct真香

实体映射最强工具类：MapStruct真香 ... [详细]

蜡笔小新 2024-12-25 16:22:17
main
Python 游戏开发实战：构建游戏主程序模块

在本教程中，我们将深入探讨如何使用 Python 构建游戏的主程序模块。通过逐步实现各个关键组件，最终完成一个功能完善的游戏界面。 ... [详细]

蜡笔小新 2024-12-23 09:29:59
import
TensorFlow 2.0 实战：多层感知机（MLP）网络入门

本教程详细介绍了如何使用 TensorFlow 2.0 构建和训练多层感知机（MLP）网络，涵盖回归和分类任务。通过具体示例和代码实现，帮助初学者快速掌握 TensorFlow 的核心概念和操作。 ... [详细]

蜡笔小新 2024-12-22 19:56:15

黑色鲜花_866

这个家伙很懒，什么也没留下！

Tags | 热门标签

RankList | 热门文章