This commit is contained in:
hello19883
2017-04-06 20:05:34 +08:00
17 changed files with 1121 additions and 12 deletions

45
src/python/13.PCA/pca.py Normal file
View File

@@ -0,0 +1,45 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Jun 1, 2011
Update on 2017-04-06
@author: Peter Harrington/片刻
'''
# Echo the module docstring above to stdout; as a top-level statement this
# runs on import as well as when the file is executed directly.
print(__doc__)
from numpy import *
def loadDataSet(fileName, delim='\t'):
    '''Load a delimiter-separated text file of numbers into a NumPy matrix.

    Args:
        fileName: path of the text file to read; each non-blank line is one row.
        delim: separator between the fields of a line (a tab by default).

    Returns:
        A numpy matrix of floats with one row per non-blank line and one
        column per field.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to float.
    '''
    # The with-block guarantees the handle is closed even if parsing fails.
    with open(fileName) as fr:
        # Blank lines (typically a trailing newline at end of file) carry no
        # data and would otherwise fail in float('').
        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
    # Build real lists of floats: on Python 3 a bare map() is a lazy iterator
    # and would end up inside the matrix as an opaque object.
    datArr = [[float(field) for field in fields] for fields in stringArr]
    # asmatrix is the spelling that still exists in NumPy >= 2.0, where the
    # old `mat` alias was removed.
    return asmatrix(datArr)
def pca(dataMat, topNfeat=9999999):
    '''Reduce a data set with principal component analysis.

    Args:
        dataMat: 2-D numeric data, one sample per row and one feature per
            column (the covariance is computed with rowvar=0).
        topNfeat: number of principal components to keep; the default is
            large enough to keep every component.

    Returns:
        A tuple (lowDDataMat, reconMat):
        lowDDataMat -- the mean-centred data projected onto the kept
            components (one column per component, largest eigenvalue first).
        reconMat -- the data rebuilt from that projection, with the column
            means added back, in the original feature space.
    '''
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature on zero
    covMat = cov(meanRemoved, rowvar=0)
    # asmatrix (rather than the `mat` alias removed in NumPy 2.0) keeps the
    # eigenvectors as a matrix, so `*` below is a matrix product.
    eigVals, eigVects = linalg.eig(asmatrix(covMat))
    eigValInd = argsort(eigVals)  # ascending order of eigenvalue
    # Walk backwards from the end to take the topNfeat largest eigenvalues,
    # largest first.
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]
    lowDDataMat = meanRemoved * redEigVects  # project into the reduced space
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
def replaceNanWithMean(fileName='secom.data', delim=' '):
    '''Load a data file and replace every NaN with the mean of its column.

    Args:
        fileName: path of the data file, passed to loadDataSet; defaults to
            the previously hard-coded 'secom.data'.
        delim: field separator of the file; defaults to a single space.

    Returns:
        The loaded matrix in which each NaN entry has been overwritten with
        the mean of the non-NaN entries of the same column. A column that
        holds only NaN values stays NaN, since it has no values to average.
    '''
    datMat = loadDataSet(fileName, delim)
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        nanFlags = isnan(datMat[:, i].A)
        nanRows = nonzero(nanFlags)[0]
        if nanRows.size == 0:
            continue  # nothing to fill in this column
        # Mean over the rows of this column that hold a real number.
        validRows = nonzero(~nanFlags)[0]
        meanVal = mean(datMat[validRows, i])
        datMat[nanRows, i] = meanVal
    return datMat
if __name__ == "__main__":
    # Demo: project the sample data set onto its first principal component
    # and report the shape of the reduced data.
    dataMat = loadDataSet('data/13.PCA/testSet.txt')
    lowDmat, reconMat = pca(dataMat, 1)
    # Call form of print: valid on Python 3 and prints the same tuple on
    # Python 2, matching the print(__doc__) call at the top of the file.
    print(shape(lowDmat))