mirror of
https://github.com/apachecn/ailearning.git
synced 2026-05-08 23:12:06 +08:00
80
input/10.KMeans/testSet.xtx
Normal file
80
input/10.KMeans/testSet.xtx
Normal file
@@ -0,0 +1,80 @@
|
||||
1.658985 4.285136
|
||||
-3.453687 3.424321
|
||||
4.838138 -1.151539
|
||||
-5.379713 -3.362104
|
||||
0.972564 2.924086
|
||||
-3.567919 1.531611
|
||||
0.450614 -3.302219
|
||||
-3.487105 -1.724432
|
||||
2.668759 1.594842
|
||||
-3.156485 3.191137
|
||||
3.165506 -3.999838
|
||||
-2.786837 -3.099354
|
||||
4.208187 2.984927
|
||||
-2.123337 2.943366
|
||||
0.704199 -0.479481
|
||||
-0.392370 -3.963704
|
||||
2.831667 1.574018
|
||||
-0.790153 3.343144
|
||||
2.943496 -3.357075
|
||||
-3.195883 -2.283926
|
||||
2.336445 2.875106
|
||||
-1.786345 2.554248
|
||||
2.190101 -1.906020
|
||||
-3.403367 -2.778288
|
||||
1.778124 3.880832
|
||||
-1.688346 2.230267
|
||||
2.592976 -2.054368
|
||||
-4.007257 -3.207066
|
||||
2.257734 3.387564
|
||||
-2.679011 0.785119
|
||||
0.939512 -4.023563
|
||||
-3.674424 -2.261084
|
||||
2.046259 2.735279
|
||||
-3.189470 1.780269
|
||||
4.372646 -0.822248
|
||||
-2.579316 -3.497576
|
||||
1.889034 5.190400
|
||||
-0.798747 2.185588
|
||||
2.836520 -2.658556
|
||||
-3.837877 -3.253815
|
||||
2.096701 3.886007
|
||||
-2.709034 2.923887
|
||||
3.367037 -3.184789
|
||||
-2.121479 -4.232586
|
||||
2.329546 3.179764
|
||||
-3.284816 3.273099
|
||||
3.091414 -3.815232
|
||||
-3.762093 -2.432191
|
||||
3.542056 2.778832
|
||||
-1.736822 4.241041
|
||||
2.127073 -2.983680
|
||||
-4.323818 -3.938116
|
||||
3.792121 5.135768
|
||||
-4.786473 3.358547
|
||||
2.624081 -3.260715
|
||||
-4.009299 -2.978115
|
||||
2.493525 1.963710
|
||||
-2.513661 2.642162
|
||||
1.864375 -3.176309
|
||||
-3.171184 -3.572452
|
||||
2.894220 2.489128
|
||||
-2.562539 2.884438
|
||||
3.491078 -3.947487
|
||||
-2.565729 -2.012114
|
||||
3.332948 3.983102
|
||||
-1.616805 3.573188
|
||||
2.280615 -2.559444
|
||||
-2.651229 -3.103198
|
||||
2.321395 3.154987
|
||||
-1.685703 2.939697
|
||||
3.031012 -3.620252
|
||||
-4.599622 -2.185829
|
||||
4.196223 1.126677
|
||||
-2.133863 3.093686
|
||||
4.668892 -2.562705
|
||||
-2.793241 -2.149706
|
||||
2.884105 3.043438
|
||||
-2.967647 2.848696
|
||||
4.479332 -1.764772
|
||||
-4.905566 -2.911070
|
||||
@@ -4,12 +4,12 @@
|
||||
from numpy import *
|
||||
|
||||
# 从文本中构建矩阵,加载文本文件,然后处理
|
||||
def loadDataSet(fileName):
    """Parse a tab-delimited text file of numbers into a list of float rows.

    Args:
        fileName: Path of the text file to read. Each non-blank line is
            expected to hold tab-separated numeric fields.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats in the order the fields appear on that line.

    Raises:
        ValueError: If a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the file even if a conversion fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing on float('').
                continue
            # Build a real list: on Python 3 a bare map() is a one-shot
            # iterator, which cannot later be turned into a numeric matrix.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
|
||||
|
||||
@@ -19,32 +19,35 @@ def distEclud(vecA, vecB):
|
||||
|
||||
# 为给定数据集构建一个包含 k 个随机质心的集合。随机质心必须要在整个数据集的边界之内,这可以通过找到数据集每一维的最小和最大值来完成。然后生成 0~1.0 之间的随机数,并通过取值范围和最小值对其进行缩放和平移,以便确保随机点在数据的边界之内。
|
||||
def randCent(dataSet, k):
    """Build k random centroids that lie inside the bounding box of dataSet.

    For every column the minimum and the value range are computed, and k
    uniform random numbers in [0, 1) are scaled by the range and shifted by
    the minimum, so each coordinate stays within that column's bounds.

    Args:
        dataSet: 2-D numpy matrix (or array) indexed as dataSet[:, j], one
            row per data point and one column per dimension.
        k: Number of centroids to generate.

    Returns:
        A numpy matrix of shape (k, n), where n is the number of columns of
        dataSet; row i is the i-th random centroid.
    """
    n = shape(dataSet)[1]  # number of columns (dimensions)
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # The .min()/.max() methods return scalars; the built-in min()/max()
        # would iterate the column and hand back 1x1 matrices, whose float()
        # conversion is deprecated in recent NumPy releases.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # k random values in [minJ, minJ + rangeJ) for this dimension.
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids
|
||||
|
||||
# k-means 聚类算法
|
||||
# 该算法会创建k个质心,然后将每个点分配到最近的质心,再重新计算质心。
|
||||
# 这个过程重复数次,直到数据点的簇分配结果不再改变为止。
|
||||
# 运行结果(多次运行结果可能会不一样,可以试试,原因为随机质心的影响,但总的结果是对的, 因为数据足够相似,也可能会陷入局部最小值)
|
||||
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of dataSet into k groups with the k-means algorithm.

    Starting from the centroids returned by createCent, every point is
    assigned to its nearest centroid and each centroid is then moved to the
    mean of the points assigned to it. This repeats until a full pass
    changes no assignment. The current centroids are printed once per pass.

    Args:
        dataSet: 2-D numpy matrix, one row per data point.
        k: Number of clusters.
        distMeas: Callable (vecA, vecB) -> distance between two row vectors.
        createCent: Callable (dataSet, k) -> (k, n) matrix of start centroids.

    Returns:
        A tuple (centroids, clusterAssment):
        centroids is the (k, n) matrix of final centroids;
        clusterAssment is an (m, 2) matrix whose row i holds the index of the
        centroid assigned to point i and the squared distance to it.
    """
    m = shape(dataSet)[0]  # number of data points
    # Column 0: assigned centroid index; column 1: squared distance to it.
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    clusterAssment = asmatrix(zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            # Find the centroid closest to point i.
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Function-call form prints the same text on Python 2 and also
        # parses on Python 3 (the statement form is a SyntaxError there).
        print(centroids)
        for cent in range(k):
            # Rows of dataSet currently assigned to centroid `cent`.
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster has no mean (it would become NaN); keep the
            # previous centroid in that case.
            if shape(ptsInClust)[0] > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
|
||||
|
||||
43
src/python/10.kmeans/test.xtx
Normal file
43
src/python/10.kmeans/test.xtx
Normal file
@@ -0,0 +1,43 @@
|
||||
# import
|
||||
>>> import kMeans
|
||||
>>> from numpy import *
|
||||
|
||||
# 从文本中构建矩阵,加载测试数据集
|
||||
>>> datMat=mat(kMeans.loadDataSet('testSet.txt'))
|
||||
|
||||
# 测试 randCent() 函数是否正常运行。
|
||||
# 首先,先看一下矩阵中的最大值与最小值
|
||||
>>> min(datMat[:,0])
|
||||
matrix([[-5.379713]])
|
||||
>>> min(datMat[:,1])
|
||||
matrix([[-4.232586]])
|
||||
>>> max(datMat[:,1])
|
||||
matrix([[ 5.1904]])
|
||||
>>> max(datMat[:,0])
|
||||
matrix([[ 4.838138]])
|
||||
|
||||
# 然后看看 randCent() 函数能否生成 min 到 max 之间的值
|
||||
>>> kMeans.randCent(datMat, 2)
|
||||
matrix([[-3.59997714, -1.43558065],
|
||||
[-3.03744979, 4.35541488]])
|
||||
|
||||
# 最后测试一下距离计算方法
|
||||
>>> kMeans.distEclud(datMat[0], datMat[1])
|
||||
5.184632816681332
|
||||
|
||||
# 该算法会创建k个质心,然后将每个点分配到最近的质心,再重新计算质心。
|
||||
# 这个过程重复数次,直到数据点的簇分配结果不再改变为止。
|
||||
# 运行结果(多次运行结果可能会不一样,可以试试,原因为随机质心的影响,但总的结果是对的, 因为数据足够相似)
|
||||
>>> myCentroids, clustAssing = kMeans.kMeans(datMat, 4)
|
||||
[[ 0.15357605 -0.94962877]
|
||||
[ 3.3593825 1.05965957]
|
||||
[-2.41900657 3.30513371]
|
||||
[-2.80505526 -3.73280289]]
|
||||
[[ 2.35622556 -3.02056425]
|
||||
[ 2.95373358 2.32801413]
|
||||
[-2.46154315 2.78737555]
|
||||
[-3.38237045 -2.9473363 ]]
|
||||
[[ 2.65077367 -2.79019029]
|
||||
[ 2.6265299 3.10868015]
|
||||
[-2.46154315 2.78737555]
|
||||
[-3.53973889 -2.89384326]]
|
||||
Reference in New Issue
Block a user