Updated the PCA code and comments

This commit is contained in:
jiangzhonglian
2017-04-07 23:21:11 +08:00
parent 19dbf5bc35
commit 022e598e9a
3 changed files with 1703 additions and 20 deletions

input/13.PCA/secom.data  1567  Executable file

File diff suppressed because it is too large


@@ -20,15 +20,26 @@ randArray = random.rand(4, 4)
# Conversion: turn the array into a matrix
randMat = mat(randArray)
# .I returns the inverse of the matrix (computable via elementary row transformations)
# # Meaning: the inverse matrix is a tool for judging similarity. Multiplying the inverse of A by a column vector p yields a column vector q; the i-th component of q gives the similarity between p and the i-th column vector of A.
# # Reference links:
# # https://www.zhihu.com/question/33258489
# # http://blog.csdn.net/vernice/article/details/48506027
# .T returns the transpose of the matrix (rows and columns swapped)
'''
.I returns the inverse of the matrix (computable via elementary row transformations)
   Meaning: the inverse matrix is a tool for judging similarity. Multiplying the inverse of A by a column vector p
   yields a column vector q; the i-th component of q gives the similarity between p and the i-th column vector of A.
   Reference links:
   https://www.zhihu.com/question/33258489
   http://blog.csdn.net/vernice/article/details/48506027
.T returns the transpose of the matrix (rows and columns swapped)
.A returns the ndarray the matrix is based on
   Reference link:
   http://blog.csdn.net/qq403977698/article/details/47254539
'''
invRandMat = randMat.I
TraRandMat = randMat.T
ArrRandMat = randMat.A
# Print the results
print(randArray, '\n---\n', randMat, '\n+++\n', invRandMat)
print 'randArray=', randArray
print 'randMat=', randMat
print 'invRandMat=', invRandMat
print 'TraRandMat=', TraRandMat
print 'ArrRandMat=', ArrRandMat
# Multiply the matrix by its inverse (in theory this yields the 4*4 identity matrix: ones on the diagonal, zeros everywhere else)
myEye = randMat*invRandMat
# Error (how far the product deviates from the true identity matrix)
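The hunk ends at the error comment, so the check itself is outside the shown context. A minimal sketch of what the follow-up could look like, assuming NumPy's eye() is used to build the reference identity matrix:

# Hypothetical continuation, not part of this diff: the product of a matrix
# and its inverse should differ from the true identity only by float error.
print myEye - eye(4)   # every entry should be close to 0 (on the order of 1e-16)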


@@ -8,38 +8,143 @@ Update on 2017-04-06
'''
print(__doc__)
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
def loadDataSet(fileName, delim='\t'):
fr = open(fileName)
stringArr = [line.strip().split(delim) for line in fr.readlines()]
datArr = [map(float,line) for line in stringArr]
datArr = [map(float, line) for line in stringArr]
return mat(datArr)
def pca(dataMat, topNfeat=9999999):
"""pca
Args:
    dataMat   the original data set, as a NumPy matrix
    topNfeat  the number of top features (principal components) to keep
Returns:
    lowDDataMat  the data projected into the lower-dimensional space
    reconMat     the reconstructed data, mapped back into the original space
"""
# Compute the mean of each column
meanVals = mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals #remove mean
# print 'meanVals', meanVals
# Subtract the column mean from every sample (mean-centering)
meanRemoved = dataMat - meanVals
# print 'meanRemoved=', meanRemoved
# Covariance: cov = [(x1-xMean)*(y1-yMean) + (x2-xMean)*(y2-yMean) + ... + (xn-xMean)*(yn-yMean)] / (n-1)
'''
Variance:          (1-D) measures how much a single random variable deviates from its mean
Covariance:        (2-D) measures how two dimensions vary together around their means
Covariance matrix: (n-D) collects the covariances between every pair of dimensions
cov(X, Y) > 0  means X and Y are positively correlated (Y grows as X grows and shrinks as X shrinks)
cov(X, Y) < 0  means X and Y are negatively correlated
cov(X, Y) = 0  means X and Y are uncorrelated
(A tiny numeric check of these steps appears right after this function.)
'''
covMat = cov(meanRemoved, rowvar=0)
eigVals,eigVects = linalg.eig(mat(covMat))
eigValInd = argsort(eigVals) #sort, sort goes smallest to largest
eigValInd = eigValInd[:-(topNfeat+1):-1] #cut off unwanted dimensions
redEigVects = eigVects[:,eigValInd] #reorganize eig vects largest to smallest
lowDDataMat = meanRemoved * redEigVects#transform data into new dimensions
# eigVals are the eigenvalues, eigVects the corresponding eigenvectors
eigVals, eigVects = linalg.eig(mat(covMat))
# print 'eigVals=', eigVals
# print 'eigVects=', eigVects
# Sort the eigenvalues from smallest to largest and return the indices of that ordering
# Reading those indices in reverse yields the topNfeat largest eigenvectors
'''
>>> x = np.array([3, 1, 2])
>>> np.argsort(x)
array([1, 2, 0]) # x[1]=1 is the smallest, then x[2]=2, then x[0]=3
>>> y = np.argsort(x)
>>> y[::-1]
array([0, 2, 1])
>>> y[:-3:-1]
array([0, 2]) # picks the elements at indices -1 and -2
>>> y[:-6:-1]
array([0, 2, 1])
'''
eigValInd = argsort(eigVals)
# print 'eigValInd1=', eigValInd
# The -1 step walks backwards, returning the topNfeat largest eigenvalues [from index -1 down to -(topNfeat+1), not including -(topNfeat+1) itself]
eigValInd = eigValInd[:-(topNfeat+1):-1]
# print 'eigValInd2=', eigValInd
# Reorganize the eigenvectors from largest to smallest eigenvalue
redEigVects = eigVects[:, eigValInd]
# print 'redEigVects=', redEigVects.T
# Project the data into the new lower-dimensional space
lowDDataMat = meanRemoved * redEigVects
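# Map the reduced data back into the original space (useful for comparing against the input)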
reconMat = (lowDDataMat * redEigVects.T) + meanVals
# print 'lowDDataMat=', lowDDataMat
# print 'reconMat=', reconMat
return lowDDataMat, reconMat
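To make the steps in pca() concrete (mean-centering, covariance, eigendecomposition, argsort, projection, reconstruction), here is a small self-contained check on a made-up 2-D dataset; the data values are illustrative only, not from the repo:

from numpy import mat, mean, cov, argsort, linalg

dataMat = mat([[2.0, 1.9], [1.0, 1.1], [3.0, 3.2], [4.0, 3.9]])  # made-up points
meanVals = mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals                      # mean-centering
covMat = cov(meanRemoved, rowvar=0)                   # 2x2 covariance matrix
eigVals, eigVects = linalg.eig(mat(covMat))
eigValInd = argsort(eigVals)[:-2:-1]                  # index of the largest eigenvalue
redEigVects = eigVects[:, eigValInd]                  # 2x1: top principal direction
lowDDataMat = meanRemoved * redEigVects               # 4x1: projected coordinates
reconMat = (lowDDataMat * redEigVects.T) + meanVals   # 4x2: back in the original space
print lowDDataMat.shape, reconMat.shape               # (4, 1) (4, 2)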
def replaceNanWithMean():
datMat = loadDataSet('secom.data', ' ')
datMat = loadDataSet('input/13.PCA/secom.data', ' ')
numFeat = shape(datMat)[1]
for i in range(numFeat):
meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #values that are not NaN (a number)
datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal #set NaN values to mean
# Compute the mean over the values of column i that are not NaN
# (.A returns the ndarray the matrix is based on)
meanVal = mean(datMat[nonzero(~isnan(datMat[:, i].A))[0], i])
# Replace the NaN values in the column with that mean
datMat[nonzero(isnan(datMat[:, i].A))[0], i] = meanVal
return datMat
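A quick check that the isnan/nonzero/.A idiom in replaceNanWithMean() behaves as the comments say; the 3x2 matrix here is made up for illustration:

from numpy import mat, isnan, nonzero, mean, nan

demo = mat([[1.0, nan], [3.0, 4.0], [5.0, 6.0]])   # one NaN in column 1
col = 1
# mean over the non-NaN entries of the column
fillVal = mean(demo[nonzero(~isnan(demo[:, col].A))[0], col])
# overwrite the NaN entries with that mean
demo[nonzero(isnan(demo[:, col].A))[0], col] = fillVal
print demo   # the NaN becomes (4.0 + 6.0) / 2 = 5.0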
def show_picture(dataMat, reconMat):
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(dataMat[:, 0].flatten().A[0], dataMat[:, 1].flatten().A[0], marker='^', s=90)
ax.scatter(reconMat[:, 0].flatten().A[0], reconMat[:, 1].flatten().A[0], marker='o', s=50, c='red')
plt.show()
# def pca_semiconductor():
if __name__ == "__main__":
dataMat = loadDataSet('input/13.PCA/testSet.txt')
lowDmat, reconMat = pca(dataMat, 1)
print shape(lowDmat)
# # Load the data and convert its values to float
# dataMat = loadDataSet('input/13.PCA/testSet.txt')
# # Keep only 1 eigenvector (principal component)
# lowDmat, reconMat = pca(dataMat, 1)
# # Keeping 2 eigenvectors reproduces the original data exactly, with no change at all
# # lowDmat, reconMat = pca(dataMat, 2)
# # print shape(lowDmat)
# show_picture(dataMat, reconMat)
# Use PCA to reduce the dimensionality of the semiconductor manufacturing data
dataMat = replaceNanWithMean()
# print shape(dataMat)
# lowDmat, reconMat = pca(dataMat, 40)
# print shape(lowDmat)
# show_picture(dataMat, reconMat)
meanVals = mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals
covMat = cov(meanRemoved, rowvar=0)
eigvals, eigVects = linalg.eig(mat(covMat))
eigValInd = argsort(eigvals)
topNfeat = 20
eigValInd = eigValInd[:-(topNfeat+1):-1]
cov_all_score = sum(eigvals)
sum_cov_score = 0
for i in range(0, len(eigValInd)):
line_cov_score = eigvals[eigValInd[i]]
sum_cov_score += line_cov_score
'''
We find that more than 20% of the eigenvalues are 0.
That means these features are copies of other features: they can be expressed through
the remaining features and carry no extra information of their own.
The first 15 or so values have magnitudes above 10^5; everything after that becomes very small.
This tells us that only a handful of features are important, and the number of important features drops off quickly.
Finally, we may notice several small negative values; they mostly come from numerical error and should be rounded to 0.
'''
print 'principal component: %s, variance ratio: %s%%, cumulative variance ratio: %s%%' % (format(i+1, '2.0f'), format(line_cov_score/cov_all_score*100, '4.1f'), format(sum_cov_score/cov_all_score*100, '4.1f'))
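A natural extension of the loop above is to ask how many principal components are needed to cover a target share of the total variance. The helper below is a sketch under that assumption; the function name and the 0.9 threshold are illustrative, not part of this commit:

from numpy import sort

def num_components_for(eigvals, threshold=0.9):
    # Sort eigenvalues largest-first and accumulate until the cumulative
    # share of the total variance reaches the threshold.
    # (Assumes real eigenvalues; apply .real first if eig returned a complex dtype.)
    vals = sort(eigvals)[::-1]
    total = sum(vals)
    acc = 0.0
    for k, v in enumerate(vals, 1):
        acc += v
        if acc / total >= threshold:
            return k
    return len(vals)

With the secom data this would be called as num_components_for(eigvals), reusing the eigvals computed above.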