Files
ailearning/src/python/2.KNN/kNN.py
jiangzhonglian 62e8b3dee4 更新链接
2017-05-18 11:57:30 +08:00

227 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# encoding: utf-8
'''
Created on Sep 16, 2010
Update on 2017-05-18
@author: Peter Harrington/羊山
《机器学习实战》更新地址https://github.com/apachecn/MachineLearning
'''
from numpy import *
# 导入科学计算包numpy和运算符模块operator
import operator
from os import listdir
def createDataSet():
"""
创建数据集和标签
调用方式
import kNN
group, labels = kNN.createDataSet()
"""
group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
labels = ['A', 'A', 'B', 'B']
return group, labels
def classify0(inX, dataSet, labels, k):
"""
inx[1,2,3]
DS=[[1,2,3],[1,2,0]]
inX: 用于分类的输入向量
dataSet: 输入的训练样本集
labels: 标签向量
k: 选择最近邻居的数目
注意labels元素数目和dataSet行数相同程序使用欧式距离公式.
预测数据所在分类可在输入下列命令
kNN.classify0([0,0], group, labels, 3)
"""
# 1. 距离计算
dataSetSize = dataSet.shape[0]
# tile生成和训练样本对应的矩阵并与训练样本求差
"""
tile: 列-3表示复制的行数 行-12表示对inx的重复的次数
In [8]: tile(inx, (3, 1))
Out[8]:
array([[1, 2],
[1, 2],
[1, 2]])
In [9]: tile(inx, (3, 2))
Out[9]:
array([[1, 2, 1, 2],
[1, 2, 1, 2],
[1, 2, 1, 2]])
"""
diffMat = tile(inX, (dataSetSize, 1)) - dataSet
"""
欧氏距离: 点到点之间的距离
第一行: 同一个点 到 dataSet的第一个点的距离。
第二行: 同一个点 到 dataSet的第二个点的距离。
...
第N行 同一个点 到 dataSet的第N个点的距离。
[[1,2,3],[1,2,3]]-[[1,2,3],[1,2,0]]
(A1-A2)^2+(B1-B2)^2+(c1-c2)^2
"""
# 取平方
sqDiffMat = diffMat ** 2
# 将矩阵的每一行相加
sqDistances = sqDiffMat.sum(axis=1)
# 开方
distances = sqDistances ** 0.5
# 根据距离排序从小到大的排序,返回对应的索引位置
# print 'distances=', distances
sortedDistIndicies = distances.argsort()
# print 'distances.argsort()=', sortedDistIndicies
# 2. 选择距离最小的k个点
classCount = {}
for i in range(k):
# 找到该样本的类型
voteIlabel = labels[sortedDistIndicies[i]]
# 在字典中将该类型加一
classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
# 3. 排序并返回出现最多的那个类型
sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]
def test1():
"""
第一个例子演示
"""
group, labels = createDataSet()
print str(group)
print str(labels)
print classify0([0.1, 0.1], group, labels, 3)
# ----------------------------------------------------------------------------------------
def file2matrix(filename):
"""
导入训练数据
:param filename: 数据文件路径
:return: 数据矩阵returnMat和对应的类别classLabelVector
"""
fr = open(filename)
numberOfLines = len(fr.readlines()) # get the number of lines in the file
# 生成对应的空矩阵
returnMat = zeros((numberOfLines, 3)) # prepare matrix to return
classLabelVector = [] # prepare labels return
fr = open(filename)
index = 0
for line in fr.readlines():
line = line.strip()
listFromLine = line.split('\t')
# 每列的属性数据
returnMat[index, :] = listFromLine[0:3]
# 每列的类别数据
classLabelVector.append(int(listFromLine[-1]))
index += 1
# 返回数据矩阵returnMat和对应的类别classLabelVector
return returnMat, classLabelVector
def autoNorm(dataSet):
"""
归一化特征值,消除属性之间量级不同导致的影响
:param dataSet: 数据集
:return: 归一化后的数据集normDataSet,ranges和minVals即最小值与范围并没有用到
归一化公式:
Y = (X-Xmin)/(Xmax-Xmin)
"""
# 计算每种属性的最大值、最小值、范围
minVals = dataSet.min(0)
maxVals = dataSet.max(0)
# 极差
ranges = maxVals - minVals
normDataSet = zeros(shape(dataSet))
m = dataSet.shape[0]
# 生成与最小值之差组成的矩阵
normDataSet = dataSet - tile(minVals, (m, 1))
# 将最小值之差除以范围组成矩阵
normDataSet = normDataSet / tile(ranges, (m, 1)) # element wise divide
return normDataSet, ranges, minVals
def datingClassTest():
"""
对约会网站的测试方法
:return: 错误数
"""
# 设置测试数据的的一个比例(训练数据集比例=1-hoRatio
hoRatio = 0.1 # 测试范围,一部分测试一部分作为样本
# 从文件中加载数据
datingDataMat, datingLabels = file2matrix('input/2.KNN/datingTestSet2.txt') # load data setfrom file
# 归一化数据
normMat, ranges, minVals = autoNorm(datingDataMat)
m = normMat.shape[0]
# 设置测试的样本数量, numTestVecs:m表示训练样本的数量
numTestVecs = int(m * hoRatio)
print 'numTestVecs=', numTestVecs
errorCount = 0.0
for i in range(numTestVecs):
# 对数据测试
classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
if (classifierResult != datingLabels[i]): errorCount += 1.0
print "the total error rate is: %f" % (errorCount / float(numTestVecs))
print errorCount
def img2vector(filename):
"""
将图像数据转换为向量
:param filename: 图片文件
:return: 一纬矩阵
"""
returnVect = zeros((1, 1024))
fr = open(filename)
for i in range(32):
lineStr = fr.readline()
for j in range(32):
returnVect[0, 32 * i + j] = int(lineStr[j])
return returnVect
def handwritingClassTest():
# 1. 导入数据
hwLabels = []
trainingFileList = listdir('input/2.KNN/trainingDigits') # load the training set
m = len(trainingFileList)
trainingMat = zeros((m, 1024))
# hwLabels存储09对应的index位置 trainingMat存放的每个位置对应的图片向量
for i in range(m):
fileNameStr = trainingFileList[i]
fileStr = fileNameStr.split('.')[0] # take off .txt
classNumStr = int(fileStr.split('_')[0])
hwLabels.append(classNumStr)
# 将 32*32的矩阵->1*1024的矩阵
trainingMat[i, :] = img2vector('input/2.KNN/trainingDigits/%s' % fileNameStr)
# 2. 导入测试数据
testFileList = listdir('input/2.KNN/testDigits') # iterate through the test set
errorCount = 0.0
mTest = len(testFileList)
for i in range(mTest):
fileNameStr = testFileList[i]
fileStr = fileNameStr.split('.')[0] # take off .txt
classNumStr = int(fileStr.split('_')[0])
vectorUnderTest = img2vector('input/2.KNN/testDigits/%s' % fileNameStr)
classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
if (classifierResult != classNumStr): errorCount += 1.0
print "\nthe total number of errors is: %d" % errorCount
print "\nthe total error rate is: %f" % (errorCount / float(mTest))
if __name__ == '__main__':
# test1()
# datingClassTest()
handwritingClassTest()