# 1: 高斯模型下的朴素贝叶斯 (naive Bayes under a Gaussian model)
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import math


def create_data():
    """Build the feature matrix and label vector used by the examples.

    Loads the Iris dataset into a DataFrame, appends the target as a
    ``label`` column, renames the columns, and keeps only the first 100
    rows (presumably the first two classes, given the usual Iris row
    ordering -- confirm against the dataset documentation).

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays: every column except
        the last, and the last column, of the retained rows.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = [
        'sepal length', 'sepal width', 'petal length', 'petal width', 'label'
    ]
    data = np.array(df.iloc[:100, :])
    return data[:, :-1], data[:, -1]


X, y = create_data()
# Hold out 30% of the samples for evaluation; the split is random on each run
# because no random_state is passed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Notebook-style peek at the first held-out sample; as a plain script this
# expression is evaluated and discarded.
X_test[0], y_test[0]
class NaiveBayes:
    """Gaussian naive Bayes classifier built on per-class feature statistics.

    ``fit`` stores, for every label, one ``(mean, stdev)`` pair per feature.
    ``predict`` returns the label whose product of per-feature Gaussian
    densities is largest; no class prior is multiplied in.
    """

    # Floor applied to a standard deviation before it is used as a divisor,
    # so a feature that is constant within a class does not divide by zero.
    MIN_STDEV = 1e-9

    def __init__(self):
        # Mapping label -> list of (mean, stdev) per feature; set by fit().
        self.model = None

    @staticmethod
    def mean(X):
        """Return the arithmetic mean of the values in ``X``."""
        # Declared static because it takes no ``self`` yet is invoked as
        # ``self.mean(...)`` by the other methods.
        return sum(X) / float(len(X))

    def stdev(self, X):
        """Return the population standard deviation of the values in ``X``."""
        avg = self.mean(X)
        return math.sqrt(sum(pow(x - avg, 2) for x in X) / float(len(X)))

    def gaussian_probability(self, x, mean, stdev):
        """Return the Gaussian density of ``x`` for the given mean and stdev.

        ``stdev`` is raised to ``MIN_STDEV`` when smaller, which keeps the
        computation defined for zero-variance features.
        """
        stdev = max(stdev, self.MIN_STDEV)
        exponent = math.exp(
            -(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2)))
        )
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

    def summarize(self, train_data):
        """Return a list of ``(mean, stdev)`` pairs, one per feature column.

        ``train_data`` is an iterable of samples; ``zip(*train_data)``
        transposes it so each feature column is summarized in turn.
        """
        return [
            (self.mean(column), self.stdev(column))
            for column in zip(*train_data)
        ]

    def fit(self, X, y):
        """Group the samples by label and store per-feature statistics.

        Args:
            X: iterable of samples (each an indexable sequence of features).
            y: iterable of labels, parallel to ``X``.

        Returns:
            The status string ``'gaussianNB train done!'``.
        """
        grouped = {label: [] for label in set(y)}
        for features, label in zip(X, y):
            grouped[label].append(features)
        self.model = {
            label: self.summarize(samples)
            for label, samples in grouped.items()
        }
        return 'gaussianNB train done!'

    def calculate_probabilities(self, input_data):
        """Return a dict mapping each label to its density product.

        For every label, the Gaussian densities of the individual features
        of ``input_data`` are multiplied together.
        """
        probabilities = {}
        for label, stats in self.model.items():
            likelihood = 1
            for i, (mean, stdev) in enumerate(stats):
                likelihood *= self.gaussian_probability(
                    input_data[i], mean, stdev
                )
            probabilities[label] = likelihood
        return probabilities

    def predict(self, X_test):
        """Return the label with the highest score for one sample."""
        # Ascending sort by score; the last entry is the best label. The
        # sort is kept (rather than max()) so ties resolve as before.
        ranked = sorted(
            self.calculate_probabilities(X_test).items(),
            key=lambda item: item[-1],
        )
        return ranked[-1][0]

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly."""
        right = sum(
            1
            for sample, expected in zip(X_test, y_test)
            if self.predict(sample) == expected
        )
        return right / float(len(X_test))
# 主函数 (main routine)
# Train the hand-written classifier on the training split.
model = NaiveBayes()
model.fit(X_train, y_train)
# Classify one hand-picked sample and show the predicted label.
print(model.predict([4.4, 3.2, 1.3, 0.2]))
# Accuracy on the held-out split; notebook-style expression whose value is
# discarded when the file runs as a plain script.
model.score(X_test, y_test)
# 2: sklearn贝叶斯实现 (naive Bayes implementation with sklearn)
from sklearn.naive_bayes import GaussianNB
# Same experiment with scikit-learn's GaussianNB, for comparison with the
# hand-written NaiveBayes class.
clf = GaussianNB()
clf.fit(X_train, y_train)
# Notebook-style expressions: the accuracy and the prediction are evaluated
# but not printed when the file runs as a plain script.
clf.score(X_test, y_test)
# predict() is given a list containing a single sample.
clf.predict([[4.4, 3.2, 1.3, 0.2]])