mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-13 23:35:28 +08:00
227 lines
7.5 KiB
Python
227 lines
7.5 KiB
Python
#!/usr/bin/env python
|
||
# encoding: utf-8
|
||
'''
|
||
Created on Sep 16, 2010
|
||
Update on 2017-05-18
|
||
@author: Peter Harrington/羊山
|
||
《机器学习实战》更新地址:https://github.com/apachecn/MachineLearning
|
||
'''
|
||
from numpy import *
|
||
# 导入科学计算包numpy和运算符模块operator
|
||
import operator
|
||
from os import listdir
|
||
|
||
|
||
def createDataSet():
    """
    Build the tiny four-point demo data set used by the first kNN example.

    Usage:
        import kNN
        group, labels = kNN.createDataSet()

    :return: (group, labels) where group is a 4x2 numpy array of sample
             points and labels is the list of class names, one per row
    """
    labels = list('AABB')
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    return array(samples), labels
|
||
|
||
|
||
def classify0(inX, dataSet, labels, k):
    """
    Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean: for a sample (A1, B1, C1) and a training row
    (A2, B2, C2) the distance is sqrt((A1-A2)^2 + (B1-B2)^2 + (C1-C2)^2).

    Example:
        kNN.classify0([0, 0], group, labels, 3)

    :param inX: feature vector to classify; a sequence or array that
                broadcasts against the rows of dataSet (shape (d,) or (1, d))
    :param dataSet: training samples, one row per sample (array or nested list)
    :param labels: class label of each row of dataSet; must have as many
                   entries as dataSet has rows
    :param k: number of nearest neighbours that vote; values larger than the
              number of training samples are clamped to that number
    :return: the label that received the most votes
    :raises ValueError: if dataSet has no rows or k is smaller than 1
    """
    # Accept nested lists as well as numpy arrays.
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError('dataSet must contain at least one sample')
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))
    # Explicit comparison instead of min(): the module's
    # `from numpy import *` replaces the builtin min with numpy's.
    if k > dataSetSize:
        k = dataSetSize

    # 1. Distances. Broadcasting subtracts every training row from inX,
    #    giving the same matrix as tiling inX dataSetSize times.
    diffMat = asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # Indices of the training rows ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    # 2. Tally the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # 3. The label with the highest vote count wins.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
|
||
|
||
|
||
def test1():
    """
    Demo of the first example.

    Prints the demo samples, their labels, and the class predicted for the
    point [0.1, 0.1] using the 3 nearest neighbours.
    """
    samples, classes = createDataSet()
    for item in (samples, classes):
        print(str(item))
    prediction = classify0([0.1, 0.1], samples, classes, 3)
    print(prediction)
|
||
|
||
|
||
# ----------------------------------------------------------------------------------------
|
||
def file2matrix(filename):
    """
    Load the training data from a tab-separated text file.

    Every non-blank line supplies one sample: the first three columns are
    numeric features and the last column is an integer class label.
    Blank lines are ignored.

    :param filename: path of the data file
    :return: (returnMat, classLabelVector) where returnMat is an N x 3 float
             matrix of features and classLabelVector is the list of the N
             integer labels, in file order
    """
    # Read the file once and close it promptly.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # One matrix row per data line.
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # Feature columns.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # Class label column.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
|
||
|
||
|
||
def autoNorm(dataSet):
    """
    Rescale every feature column to the range [0, 1] so that features with
    large magnitudes do not dominate the distance computation.

    Normalisation formula:
        Y = (X - Xmin) / (Xmax - Xmin)

    A column whose values are all equal has a range of zero; its normalised
    values are set to 0 instead of dividing by zero.

    :param dataSet: samples, one row per sample (array or nested list)
    :return: (normDataSet, ranges, minVals) - the normalised data, the
             per-column range (max - min) and the per-column minimum
    """
    # Work in floating point so the division is a true division for
    # integer input too.
    dataSet = asarray(dataSet, dtype=float)

    # Per-column minimum, maximum and range.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # Where the range is zero the numerator is zero as well, so dividing
    # by 1 yields 0 rather than NaN.
    safeRanges = where(ranges == 0, 1.0, ranges)

    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
|
||
|
||
|
||
def datingClassTest(filename='input/2.KNN/datingTestSet2.txt', hoRatio=0.1, k=3):
    """
    Hold-out test of the kNN classifier on the dating-site data set.

    The first hoRatio fraction of the normalised rows is used as test
    samples; the remaining rows form the training set. Each prediction,
    the total error rate and the error count are printed.

    :param filename: path of the tab-separated data file
    :param hoRatio: fraction of the rows held out for testing
                    (training fraction = 1 - hoRatio)
    :param k: number of nearest neighbours used by the classifier
    :return: number of misclassified test samples (as a float)
    """
    # Load the data from the file and normalise it.
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]

    # Rows [0, numTestVecs) are test samples, rows [numTestVecs, m) train.
    numTestVecs = int(m * hoRatio)
    print('numTestVecs= %d' % numTestVecs)
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # With an empty hold-out set there is nothing to divide by; report 0.
    if numTestVecs:
        errorRate = errorCount / float(numTestVecs)
    else:
        errorRate = 0.0
    print("the total error rate is: %f" % errorRate)
    print(errorCount)
    return errorCount
|
||
|
||
|
||
def img2vector(filename):
    """
    Convert a 32x32 text image into a single row vector.

    The first 32 characters of each of the first 32 lines are read; each
    character is parsed as an integer and stored row by row.

    :param filename: path of the image text file
    :return: numpy array of shape (1, 1024) holding the parsed values
    :raises IndexError: if one of the first 32 lines has fewer than
                        32 characters (or the file has fewer than 32 lines)
    :raises ValueError: if a character cannot be parsed as an integer
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            # Image row i occupies positions [32*i, 32*i + 32) of the vector.
            returnVect[0, 32 * i:32 * (i + 1)] = [int(lineStr[j]) for j in range(32)]
    return returnVect
|
||
|
||
|
||
def handwritingClassTest():
    """
    Run the handwritten-digit kNN experiment and print the results.

    Every file in input/2.KNN/trainingDigits becomes one training row and
    every file in input/2.KNN/testDigits is classified with k = 3. Each
    prediction, the number of errors and the error rate are printed.
    """
    # 1. Load the training data: one 1x1024 vector and one label per file.
    trainNames = listdir('input/2.KNN/trainingDigits')
    trainingMat = zeros((len(trainNames), 1024))
    hwLabels = []
    for row, name in enumerate(trainNames):
        # The label is the integer before the first '_' of the file name
        # (extension removed first).
        stem = name.split('.')[0]
        hwLabels.append(int(stem.split('_')[0]))
        # 32*32 image -> 1*1024 row
        trainingMat[row, :] = img2vector('input/2.KNN/trainingDigits/%s' % name)

    # 2. Classify every test file and count the mismatches.
    testNames = listdir('input/2.KNN/testDigits')
    errorCount = 0.0
    for name in testNames:
        expected = int(name.split('.')[0].split('_')[0])
        sample = img2vector('input/2.KNN/testDigits/%s' % name)
        predicted = classify0(sample, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (predicted, expected))
        if predicted != expected:
            errorCount += 1.0

    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(len(testNames))))
|
||
|
||
|
||
if __name__ == '__main__':
    # Script entry point: runs the handwritten-digit experiment.
    # The other demos are disabled; uncomment one to run it instead.
    # test1()
    # datingClassTest()
    handwritingClassTest()
|