import numpy as np
import pandas as pd


# Compute the information entropy of a dataset (the last column is the class label)
def cal_information_entropy(data):
    data_label = data.iloc[:, -1]
    label_class = data_label.value_counts()  # sample count per class
    Ent = 0
    for k in label_class.keys():
        p_k = label_class[k] / len(data_label)
        Ent -= p_k * np.log2(p_k)
    return Ent
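# Quick check (a minimal sketch; this toy DataFrame is an assumption for
# illustration, not the article's dataset): with the classes split 2/2,
# Ent = -0.5*log2(0.5) - 0.5*log2(0.5) = 1.0.
_toy = pd.DataFrame({
    "outlook": ["sunny", "sunny", "rain", "rain"],
    "wind":    ["weak", "strong", "weak", "strong"],
    "play":    ["yes", "yes", "no", "no"],
})
print(cal_information_entropy(_toy))  # 1.0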
# Compute the information gain of splitting on feature a:
# Gain(D, a) = Ent(D) - sum_v (|D_v| / |D|) * Ent(D_v)
def cal_information_gain(data, a):
    Ent = cal_information_entropy(data)
    feature_class = data[a].value_counts()  # sample count per value of a
    gain = 0  # accumulates the weighted conditional entropy
    for v in feature_class.keys():
        weight = feature_class[v] / data.shape[0]
        Ent_v = cal_information_entropy(data.loc[data[a] == v])
        gain += weight * Ent_v
    return Ent - gain
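# On the toy frame above, 'outlook' separates the classes perfectly
# (Gain = 1.0 - 0 = 1.0), while 'wind' carries no information (Gain = 0).
print(cal_information_gain(_toy, "outlook"))  # 1.0
print(cal_information_gain(_toy, "wind"))     # 0.0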
# Select the feature with the highest information gain
def get_best_feature(data):
    features = data.columns[:-1]
    res = {}
    for a in features:
        res[a] = cal_information_gain(data, a)
    # sort features by gain in descending order and return the top one
    res = sorted(res.items(), key=lambda x: x[1], reverse=True)
    return res[0][0]
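# Accordingly, 'outlook' is picked as the root split on the toy frame.
print(get_best_feature(_toy))  # outlook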
# Build the decision tree recursively (ID3)
def create_tree(data):
    data_label = data.iloc[:, -1]
    # all samples belong to one class: return that class as a leaf
    if len(data_label.value_counts()) == 1:
        return data_label.values[0]
    # every feature has only a single value left: return the majority class
    if all(len(data[i].value_counts()) == 1 for i in data.iloc[:, :-1].columns):
        return get_most_label(data)
    best_feature = get_best_feature(data)
    Tree = {best_feature: {}}
    exist_vals = pd.unique(data[best_feature])
    # feature values absent from this subset map to the majority class
    if len(exist_vals) != len(column_count[best_feature]):
        no_exist_attr = set(column_count[best_feature]) - set(exist_vals)
        for no_feat in no_exist_attr:
            Tree[best_feature][no_feat] = get_most_label(data)
    # recurse on each sub-dataset, one branch per value of best_feature
    for item in drop_exist_feature(data, best_feature):
        Tree[best_feature][item[0]] = create_tree(item[1])
    return Tree
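# create_tree relies on get_most_label, drop_exist_feature, and the global
# column_count, which are defined elsewhere in the article. A minimal sketch
# of the contract they appear to satisfy (an assumption, not the author's
# exact code):

# majority class of the current subset
def get_most_label(data):
    return data.iloc[:, -1].value_counts().idxmax()

# return (value, sub-DataFrame) pairs with the used feature column dropped
def drop_exist_feature(data, best_feature):
    attr = pd.unique(data[best_feature])
    return [(v, data[data[best_feature] == v].drop([best_feature], axis=1))
            for v in attr]

# column_count maps each feature to all values it takes in the full training
# set; here the toy frame stands in for that set
column_count = {col: list(pd.unique(_toy[col])) for col in _toy.columns[:-1]}

print(create_tree(_toy))  # {'outlook': {'sunny': 'yes', 'rain': 'no'}}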