#!/usr/bin/env python # encoding: utf-8 ''' Created on Sep 16, 2010 Update on 2017-05-18 @author: Peter Harrington/羊山 《机器学习实战》更新地址:https://github.com/apachecn/MachineLearning ''' from numpy import * # 导入科学计算包numpy和运算符模块operator import operator from os import listdir def createDataSet(): """ 创建数据集和标签 调用方式 import kNN group, labels = kNN.createDataSet() """ group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]) labels = ['A', 'A', 'B', 'B'] return group, labels def classify0(inX, dataSet, labels, k): """ inx[1,2,3] DS=[[1,2,3],[1,2,0]] inX: 用于分类的输入向量 dataSet: 输入的训练样本集 labels: 标签向量 k: 选择最近邻居的数目 注意:labels元素数目和dataSet行数相同;程序使用欧式距离公式. 预测数据所在分类可在输入下列命令 kNN.classify0([0,0], group, labels, 3) """ # 1. 距离计算 dataSetSize = dataSet.shape[0] # tile生成和训练样本对应的矩阵,并与训练样本求差 """ tile: 列-3表示复制的行数, 行-1/2表示对inx的重复的次数 In [8]: tile(inx, (3, 1)) Out[8]: array([[1, 2], [1, 2], [1, 2]]) In [9]: tile(inx, (3, 2)) Out[9]: array([[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]]) """ diffMat = tile(inX, (dataSetSize, 1)) - dataSet """ 欧氏距离: 点到点之间的距离 第一行: 同一个点 到 dataSet的第一个点的距离。 第二行: 同一个点 到 dataSet的第二个点的距离。 ... 第N行: 同一个点 到 dataSet的第N个点的距离。 [[1,2,3],[1,2,3]]-[[1,2,3],[1,2,0]] (A1-A2)^2+(B1-B2)^2+(c1-c2)^2 """ # 取平方 sqDiffMat = diffMat ** 2 # 将矩阵的每一行相加 sqDistances = sqDiffMat.sum(axis=1) # 开方 distances = sqDistances ** 0.5 # 根据距离排序从小到大的排序,返回对应的索引位置 # print 'distances=', distances sortedDistIndicies = distances.argsort() # print 'distances.argsort()=', sortedDistIndicies # 2. 选择距离最小的k个点 classCount = {} for i in range(k): # 找到该样本的类型 voteIlabel = labels[sortedDistIndicies[i]] # 在字典中将该类型加一 classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1 # 3. 排序并返回出现最多的那个类型 sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True) return sortedClassCount[0][0] def test1(): """ 第一个例子演示 """ group, labels = createDataSet() print str(group) print str(labels) print classify0([0.1, 0.1], group, labels, 3) # ---------------------------------------------------------------------------------------- def file2matrix(filename): """ 导入训练数据 :param filename: 数据文件路径 :return: 数据矩阵returnMat和对应的类别classLabelVector """ fr = open(filename) numberOfLines = len(fr.readlines()) # get the number of lines in the file # 生成对应的空矩阵 returnMat = zeros((numberOfLines, 3)) # prepare matrix to return classLabelVector = [] # prepare labels return fr = open(filename) index = 0 for line in fr.readlines(): line = line.strip() listFromLine = line.split('\t') # 每列的属性数据 returnMat[index, :] = listFromLine[0:3] # 每列的类别数据 classLabelVector.append(int(listFromLine[-1])) index += 1 # 返回数据矩阵returnMat和对应的类别classLabelVector return returnMat, classLabelVector def autoNorm(dataSet): """ 归一化特征值,消除属性之间量级不同导致的影响 :param dataSet: 数据集 :return: 归一化后的数据集normDataSet,ranges和minVals即最小值与范围,并没有用到 归一化公式: Y = (X-Xmin)/(Xmax-Xmin) """ # 计算每种属性的最大值、最小值、范围 minVals = dataSet.min(0) maxVals = dataSet.max(0) # 极差 ranges = maxVals - minVals normDataSet = zeros(shape(dataSet)) m = dataSet.shape[0] # 生成与最小值之差组成的矩阵 normDataSet = dataSet - tile(minVals, (m, 1)) # 将最小值之差除以范围组成矩阵 normDataSet = normDataSet / tile(ranges, (m, 1)) # element wise divide return normDataSet, ranges, minVals def datingClassTest(): """ 对约会网站的测试方法 :return: 错误数 """ # 设置测试数据的的一个比例(训练数据集比例=1-hoRatio) hoRatio = 0.1 # 测试范围,一部分测试一部分作为样本 # 从文件中加载数据 datingDataMat, datingLabels = file2matrix('input/2.KNN/datingTestSet2.txt') # load data setfrom file # 归一化数据 normMat, ranges, minVals = autoNorm(datingDataMat) m = normMat.shape[0] # 设置测试的样本数量, numTestVecs:m表示训练样本的数量 numTestVecs = int(m * hoRatio) print 'numTestVecs=', numTestVecs errorCount = 0.0 for i in range(numTestVecs): # 对数据测试 classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3) print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]) if (classifierResult != datingLabels[i]): errorCount += 1.0 print "the total error rate is: %f" % (errorCount / float(numTestVecs)) print errorCount def img2vector(filename): """ 将图像数据转换为向量 :param filename: 图片文件 :return: 一纬矩阵 """ returnVect = zeros((1, 1024)) fr = open(filename) for i in range(32): lineStr = fr.readline() for j in range(32): returnVect[0, 32 * i + j] = int(lineStr[j]) return returnVect def handwritingClassTest(): # 1. 导入数据 hwLabels = [] trainingFileList = listdir('input/2.KNN/trainingDigits') # load the training set m = len(trainingFileList) trainingMat = zeros((m, 1024)) # hwLabels存储0~9对应的index位置, trainingMat存放的每个位置对应的图片向量 for i in range(m): fileNameStr = trainingFileList[i] fileStr = fileNameStr.split('.')[0] # take off .txt classNumStr = int(fileStr.split('_')[0]) hwLabels.append(classNumStr) # 将 32*32的矩阵->1*1024的矩阵 trainingMat[i, :] = img2vector('input/2.KNN/trainingDigits/%s' % fileNameStr) # 2. 导入测试数据 testFileList = listdir('input/2.KNN/testDigits') # iterate through the test set errorCount = 0.0 mTest = len(testFileList) for i in range(mTest): fileNameStr = testFileList[i] fileStr = fileNameStr.split('.')[0] # take off .txt classNumStr = int(fileStr.split('_')[0]) vectorUnderTest = img2vector('input/2.KNN/testDigits/%s' % fileNameStr) classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr) if (classifierResult != classNumStr): errorCount += 1.0 print "\nthe total number of errors is: %d" % errorCount print "\nthe total error rate is: %f" % (errorCount / float(mTest)) if __name__ == '__main__': # test1() # datingClassTest() handwritingClassTest()