作者:锋哥很好 | 来源:互联网 | 2023-09-23 00:21
本文是《Machine Learning in Action》的学习笔记,数据和源码可以在这里获取:https://www.manning.com/books/machine-learning-in-action
本文《machine learning in action》学习笔记
数据源码可以在这里获取 :https://www.manning.com/books/machine-learning-in-action
以下是 Python 3+ 版本的代码:
from numpy import *
import matplotlib.pyplot as plt
import operator


def kNNClassify(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: 1-D feature vector to classify.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes. On a tie, the label that was seen
        first while walking the neighbours from nearest to farthest wins.
    """
    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = dataSet - inX
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sorted() is stable, so equal counts keep their insertion order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Each non-blank line supplies one sample: its first three fields are the
    features and its last field is an integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        of features and a list of ``n`` integer labels.
    """
    # The context manager closes the file even if parsing raises.
    with open(filename) as fr:
        stripped = (line.strip() for line in fr)
        # Blank lines (e.g. a trailing newline) carry no sample; skip them.
        rows = [line.split('\t') for line in stripped if line]
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


def autonorm(dataset):
    """Min-max normalise every column of ``dataset``.

    Args:
        dataset: 2-D numeric array, one sample per row.

    Returns:
        A tuple ``(normdataset, ranges, minvalue)``: the normalised array,
        the per-column ``max - min`` and the per-column minimum.
    """
    minvalue = dataset.min(0)
    maxvalue = dataset.max(0)
    ranges = maxvalue - minvalue
    # A constant column has range 0; divide by 1 there so it normalises to 0
    # instead of nan. The returned ``ranges`` keeps the true value.
    safe_ranges = where(ranges == 0, 1.0, ranges)
    normdataset = (dataset - minvalue) / safe_ranges
    return normdataset, ranges, minvalue


def datingclasstest(horatio=0.1, filename="datingTestSet2.txt", k=3):
    """Run a hold-out evaluation of the kNN classifier on the dating data.

    The first ``horatio`` fraction of the samples is classified against the
    remaining samples. Every prediction and the final error rate are printed.

    Args:
        horatio: Fraction of the samples (taken from the start of the file)
            used as the test set.
        filename: Data file to load. The default uses the same casing as the
            script entry point below.
        k: Number of neighbours passed to ``kNNClassify``.

    Returns:
        The error rate as a float (0.0 when the test set is empty).
    """
    datingdatamat, datinglabel = file2matrix(filename)
    normat, ranges, minvalues = autonorm(datingdatamat)
    m = normat.shape[0]
    numtestvec = int(m * horatio)
    errorcount = 0.0
    for i in range(numtestvec):
        classifierresult = kNNClassify(normat[i, :], normat[numtestvec:m],
                                       datinglabel[numtestvec:m], k)
        print("No.%d test data, the classifier came back with : %d, the real answeris: %d" % (i, classifierresult, datinglabel[i]))
        if classifierresult != datinglabel[i]:
            errorcount += 1.0
    # Guard against an empty test set (tiny files or a very small ratio).
    errorrate = errorcount / float(numtestvec) if numtestvec else 0.0
    print("the total error rate is: %f" % errorrate)
    return errorrate


if __name__ == '__main__':
    datingdatamat, datinglabel = file2matrix('datingTestSet2.txt')
    normdataset, ranges, minvalue = autonorm(datingdatamat)
    print(normdataset)
    print("ranges = ", ranges)
    print("minvalue = ", minvalue)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Marker size and colour both scale with the class label.
    ax.scatter(datingdatamat[:, 1], datingdatamat[:, 2],
               15.0 * array(datinglabel), 15.0 * array(datinglabel))
    plt.show()
    datingclasstest()
读取TXT数据
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Each non-blank line supplies one sample: its first three fields are the
    features and its last field is an integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        of features and a list of ``n`` integer labels.
    """
    # The context manager closes the file even if parsing raises.
    with open(filename) as fr:
        stripped = (line.strip() for line in fr)
        # Blank lines (e.g. a trailing newline) carry no sample; skip them.
        rows = [line.split('\t') for line in stripped if line]
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # Index the row with ``[index, :]``. The slice ``[index:1]`` is empty
        # for every index >= 1, so it would fill only the first row.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
normalization result:
plot:
test result:
结果与书中的结果并不一致,kNN这么简单的算法,其结果应该一致才对。为什么?