更新readme

This commit is contained in:
jiangzhonglian
2017-02-25 19:19:30 +08:00
parent d9d21f5258
commit f69881fcc3
8 changed files with 734 additions and 0 deletions

21
src/python/01.NumPy.py Normal file
View File

@@ -0,0 +1,21 @@
#!/usr/bin/python
# coding:utf8
"""NumPy: the difference between a matrix and an array.

NumPy has two different data types:
    1. matrix
    2. array
What they share:
    both can hold numeric elements laid out in rows and columns.
How they differ:
    1. the same arithmetic operation applied to the two types may give
       different results.
    2. NumPy's ``matrix`` is the equivalent of MATLAB's matrices.
"""
from numpy import random

# Generate a 4*4 array of random numbers and show it.
# print is called with one argument in parentheses so that it behaves the
# same whether print is a statement or a function.
print(random.rand(4, 4))

136
src/python/Logistic.py Normal file
View File

@@ -0,0 +1,136 @@
#!/usr/bin/python
# coding: utf8
'''
Created on Oct 27, 2010
Logistic Regression Working Module
@author: Peter
'''
import os
from numpy import *
import matplotlib.pyplot as plt
# Parse the data file
def loadDataSet(file_name):
    """Read a whitespace-delimited sample file.

    Every non-blank line must hold at least three fields: two feature values
    followed by an integer class label.

    Args:
        file_name: path of the text file to read.

    Returns:
        (dataMat, labelMat): ``dataMat`` is a list of ``[1.0, x1, x2]`` rows
        (the leading 1.0 is the constant term), ``labelMat`` the list of
        integer labels in the same order.

    Raises:
        IndexError / ValueError: if a non-blank line has fewer than three
        fields or a field cannot be converted.
    """
    dataMat = []
    labelMat = []
    # The context manager closes the file even if a line fails to parse.
    with open(file_name) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
# Sigmoid step function
def sigmoid(inX):
    """Return ``1.0 / (1 + exp(-inX))``.

    ``inX`` is handed to ``exp`` unchanged, so anything ``exp`` accepts
    (a scalar, or an array/matrix handled element-wise) works here.
    """
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
# Plain (batch) gradient ascent
def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights using every sample on each step.

    Args:
        dataMatIn: 2-D samples, one row per sample (m rows, n columns).
        classLabels: sequence of m labels.
        alpha: step size applied to every update.
        maxCycles: number of update steps.

    Returns:
        An (n, 1) NumPy array of weights.
    """
    # asmatrix is used instead of mat: it is the same conversion and still
    # exists in NumPy releases where the ``mat`` alias was removed.
    dataMatrix = asmatrix(dataMatIn)
    # Labels arrive as a row; transpose them into an m*1 column.
    labelMat = asmatrix(classLabels).transpose()
    n = shape(dataMatrix)[1]
    # One weight per column, all starting at 1.
    weights = ones((n, 1))
    for _ in range(maxCycles):
        # (m*n) * (n*1) = m*1: the predicted value for each sample.
        # Background on matrix products:
        # https://www.zhihu.com/question/21351965/answer/31050145
        h = sigmoid(dataMatrix * weights)
        # labelMat holds the actual values, so this is the per-sample error.
        error = labelMat - h
        # (n*m) * (m*1): the error accumulated per column, i.e. the shift
        # applied to each coefficient.
        weights = weights + alpha * dataMatrix.transpose() * error
    return array(weights)
# Stochastic gradient ascent (single pass, fixed step size)
def stocGradAscent0(dataMatrix, classLabels):
    """Update the weights once per sample, visiting samples in order.

    Args:
        dataMatrix: 2-D samples (m rows, n columns); each row is multiplied
            element-wise with the weight vector.
        classLabels: sequence of m labels.

    Returns:
        A length-n array of weights. The state before every update is
        printed as a trace.
    """
    m, n = shape(dataMatrix)
    alpha = 0.01
    # ones() creates an all-ones array: one starting weight per column.
    weights = ones(n)
    for i in range(m):
        # sum(row * weights) evaluates f(x) = w1*x1 + w2*x2 + ... + wn*xn.
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        # A single pre-formatted string keeps the output identical whether
        # print is a statement or a function.
        print("%s %s %s %s %s" % (weights, "*" * 10, dataMatrix[i], "*" * 10, error))
        # scalar step * scalar error * (1*n) row
        weights = weights + alpha * error * dataMatrix[i]
    return weights
# Stochastic gradient ascent (randomised sample order, decaying step)
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Run ``numIter`` passes, each visiting every sample once in random order.

    Args:
        dataMatrix: 2-D samples (m rows, n columns).
        classLabels: sequence of m labels.
        numIter: number of passes over the data.

    Returns:
        A length-n array of weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # Indices not yet used in this pass: [0, 1, ..., m-1].
        # A real list is needed because entries are deleted below.
        dataIndex = list(range(m))
        for i in range(m):
            # alpha shrinks as i and j grow, but the constant term keeps it
            # from ever reaching 0.
            alpha = 4 / (1.0 + j + i) + 0.0001
            # Pick a random position among the remaining indices ...
            randIndex = int(random.uniform(0, len(dataIndex)))
            # ... and translate it into the actual sample index. Using
            # randIndex directly on the data would re-draw early samples and
            # never reach late ones as dataIndex shrinks.
            sampleIndex = dataIndex[randIndex]
            # sum(row * weights) evaluates f(x) = w1*x1 + ... + wn*xn.
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            # Remove the sample so it is not drawn again in this pass.
            del dataIndex[randIndex]
    return weights
# Visualisation
def plotBestFit(dataArr, labelMat, weights):
    """Scatter the samples by class and draw the line defined by ``weights``.

    Args:
        dataArr: 2-D array indexed as ``dataArr[i, 1]`` (plotted as x) and
            ``dataArr[i, 2]`` (plotted as y).
        labelMat: per-sample labels; samples whose label equals 1 are drawn
            as red squares, all others in green.
        weights: indexable with 0, 1 and 2 (w0, w1, w2).
    """
    sampleCount = shape(dataArr)[0]
    labelIsOne = [int(labelMat[row]) == 1 for row in range(sampleCount)]
    onesRows = [row for row in range(sampleCount) if labelIsOne[row]]
    otherRows = [row for row in range(sampleCount) if not labelIsOne[row]]

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter([dataArr[row, 1] for row in onesRows],
                 [dataArr[row, 2] for row in onesRows],
                 s=30, c='red', marker='s')
    axes.scatter([dataArr[row, 1] for row in otherRows],
                 [dataArr[row, 2] for row in otherRows],
                 s=30, c='green')

    x = arange(-3.0, 3.0, 0.1)
    # Where y comes from: the model is w0*x0 + w1*x1 + w2*x2 = f(x).
    # x0 is the constant 1.0 the loader puts in front of each row, x1 is the
    # plotted x and x2 is the plotted y. Setting f(x) to 0 gives
    #   w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2
    y = (-weights[0] - weights[1] * x) / weights[2]
    axes.plot(x, y)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
def main():
    """Load the sample data, train a classifier and plot the result."""
    # The data directory is looked up two levels above the current working
    # directory.
    working_dir = os.getcwd()
    project_dir = os.path.dirname(os.path.dirname(working_dir))

    # 1. Collect and prepare the data.
    data_path = "%s/resources/testSet.txt" % project_dir
    dataMat, labelMat = loadDataSet(data_path)

    # 2. Train the model, i.e. find the coefficients of
    #    f(x) = a1*x1 + b2*x2 + ... + nn*xn.
    #    An array is used so that * is element-wise multiplication.
    dataArr = array(dataMat)
    # gradAscent(dataArr, labelMat) or stocGradAscent0(dataArr, labelMat)
    # can be swapped in here instead.
    weights = stocGradAscent1(dataArr, labelMat)

    # 3. Visualise the data and the fitted boundary.
    plotBestFit(dataArr, labelMat, weights)


if __name__ == "__main__":
    main()

206
src/python/apriori.py Normal file
View File

@@ -0,0 +1,206 @@
#!/usr/bin/python
# coding: utf8
'''
Created on Mar 24, 2011
Ch 11 code
@author: Peter
'''
from numpy import *
def loadDataSet():
    """Return a small hard-coded data set of four transactions."""
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
def createC1(dataSet):
    """Build the list of candidate 1-item sets.

    Args:
        dataSet: iterable of transactions, each an iterable of sortable,
            hashable items.

    Returns:
        A list with one ``frozenset([item])`` per distinct item, in ascending
        item order. frozensets are used so the candidates can serve as dict
        keys. A real list (not a lazy ``map`` object) is returned because the
        candidates are iterated once per transaction.
    """
    distinctItems = set()
    for transaction in dataSet:
        distinctItems.update(transaction)
    return [frozenset([item]) for item in sorted(distinctItems)]
def scanD(D, Ck, minSupport):
    """Count candidate occurrences and keep those meeting ``minSupport``.

    Args:
        D: sized collection of transactions (``len(D)`` is used); each
            transaction must be accepted by ``frozenset.issubset``.
        Ck: candidate itemsets (frozensets). Iterated once per transaction,
            so it must be re-iterable.
        minSupport: minimum fraction of transactions an itemset must occur in.

    Returns:
        (retList, supportData): ``retList`` holds the candidates whose
        support is at least ``minSupport``; ``supportData`` maps every
        candidate that occurred at least once to its support.
    """
    ssCnt = {}
    for tid in D:
        for can in Ck:
            # s.issubset(t): every element of s is also in t.
            if can.issubset(tid):
                # dict.get replaces has_key, which newer Pythons no longer have.
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Same ordering as repeatedly inserting at the front, without the
    # per-insert shifting.
    retList.reverse()
    return retList, supportData
def aprioriGen(Lk, k):
    """Create the candidate itemsets Ck by merging pairs from ``Lk``.

    Two itemsets are merged (set union) when their first ``k - 2`` items,
    taken in sorted order, are equal.

    Args:
        Lk: indexable sequence of frozensets with sortable items.
        k: size of the itemsets to generate.

    Returns:
        A list of the merged frozensets.
    """
    # Sort BEFORE slicing: slicing list(set) first would pick whichever
    # items the set happens to iterate first, not the smallest k-2 items.
    # Computing each prefix once also avoids redoing it for every pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
def apriori(dataSet, minSupport=0.5):
    """Find the frequent itemsets of ``dataSet``.

    Args:
        dataSet: list of transactions (each an iterable of items).
        minSupport: minimum support an itemset needs to be kept.

    Returns:
        (L, supportData): ``L[i]`` is the list of frequent itemsets of size
        ``i + 1`` (the last entry is the empty list that ended the search);
        ``supportData`` maps every counted itemset to its support.
    """
    # Both collections are materialised as lists: they are scanned once per
    # level, which a one-shot iterator would not survive.
    C1 = list(createC1(dataSet))
    D = [set(transaction) for transaction in dataSet]
    # Compute the support of the 1-item candidates.
    L1, supportData = scanD(D, C1, minSupport)
    print("outcome: ", supportData)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        # Scan the transactions to get Lk.
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
def main():
    """Run the Apriori analysis on the built-in sample transactions."""
    # 1. Load the data.
    transactions = loadDataSet()
    print(transactions)
    # 2. Call apriori for the market-basket analysis.
    apriori(transactions, minSupport=0.7)


if __name__ == "__main__":
    main()
def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: list of frequent-itemset lists; ``L[i]`` holds sets of size i + 1.
        supportData: dict mapping itemsets to their support (as produced by
            scanD).
        minConf: minimum confidence a rule needs to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    # Start at index 1: only sets with two or more items can form a rule.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            # Always evaluate the single-item consequents first. Handing H1
            # straight to rulesFromConseq would jump to two-item consequents
            # and never emit rules such as {a, b} --> {c}.
            H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # Larger consequents only make sense for sets of 3+ items, and
            # merging needs at least two surviving consequents.
            if i > 1 and len(H1) > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules ``freqSet - conseq --> conseq`` for each conseq in H.

    Args:
        freqSet: the frequent itemset the rules are built from.
        H: candidate consequents (frozensets).
        supportData: dict mapping itemsets to support; must contain
            ``freqSet`` and every ``freqSet - conseq``.
        brl: list that accepted ``(antecedent, consequent, confidence)``
            tuples are appended to (modified in place).
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        The consequents whose rule reached ``minConf``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # One pre-formatted string: same output whether print is a
            # statement or a function.
            print("%s --> %s conf: %s" % (antecedent, conseq, conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively try larger consequents built from ``H``.

    Args:
        freqSet: the frequent itemset the rules are built from.
        H: non-empty list of consequents, all assumed to have the size of
            ``H[0]``.
        supportData: dict mapping itemsets to support.
        brl: rule list passed through to calcConf, which appends to it.
        minConf: minimum confidence passed through to calcConf.
    """
    m = len(H[0])
    # Stop once the consequent cannot grow while leaving an antecedent.
    if len(freqSet) <= m + 1:
        return
    # Build the size m+1 consequents and keep those that yield a rule.
    candidates = aprioriGen(H, m + 1)
    survivors = calcConf(freqSet, candidates, supportData, brl, minConf)
    # At least two sets are needed for another round of merging.
    if len(survivors) > 1:
        rulesFromConseq(freqSet, survivors, supportData, brl, minConf)
def pntRules(ruleList, itemMeaning):
    """Print every rule using readable item names.

    Args:
        ruleList: iterable of ``(antecedent, consequent, confidence)`` tuples.
        itemMeaning: lookup (indexed by item) giving the text for each item.
    """
    # Every print receives exactly one string so the output is the same
    # whether print is a statement or a function.
    for ruleTup in ruleList:
        for item in ruleTup[0]:
            print(itemMeaning[item])
        print(" -------->")
        for item in ruleTup[1]:
            print(itemMeaning[item])
        print("confidence: %f" % ruleTup[2])
        print("")  # blank line between rules
# from time import sleep
# from votesmart import votesmart
# votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
# #votesmart.apikey = 'get your api key first'
# def getActionIds():
# actionIdList = []; billTitleList = []
# fr = open('recent20bills.txt')
# for line in fr.readlines():
# billNum = int(line.split('\t')[0])
# try:
# billDetail = votesmart.votes.getBill(billNum) #api call
# for action in billDetail.actions:
# if action.level == 'House' and \
# (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
# actionId = int(action.actionId)
# print 'bill: %d has actionId: %d' % (billNum, actionId)
# actionIdList.append(actionId)
# billTitleList.append(line.strip().split('\t')[1])
# except:
# print "problem getting bill %d" % billNum
# sleep(1) #delay to be polite
# return actionIdList, billTitleList
#
# def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
# itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
# for billTitle in billTitleList:#fill up itemMeaning list
# itemMeaning.append('%s -- Nay' % billTitle)
# itemMeaning.append('%s -- Yea' % billTitle)
# transDict = {}#list of items in each transaction (politician)
# voteCount = 2
# for actionId in actionIdList:
# sleep(3)
# print 'getting votes for actionId: %d' % actionId
# try:
# voteList = votesmart.votes.getBillActionVotes(actionId)
# for vote in voteList:
# if not transDict.has_key(vote.candidateName):
# transDict[vote.candidateName] = []
# if vote.officeParties == 'Democratic':
# transDict[vote.candidateName].append(1)
# elif vote.officeParties == 'Republican':
# transDict[vote.candidateName].append(0)
# if vote.action == 'Nay':
# transDict[vote.candidateName].append(voteCount)
# elif vote.action == 'Yea':
# transDict[vote.candidateName].append(voteCount + 1)
# except:
# print "problem getting actionId: %d" % actionId
# voteCount += 2
# return transDict, itemMeaning

298
src/python/regression.py Normal file
View File

@@ -0,0 +1,298 @@
#!/usr/bin/python
# coding: utf8
'''
Created on Jan 8, 2011
@author: Peter
'''
import os
from numpy import *
import matplotlib.pylab as plt
def loadDataSet(fileName):
    """Parse a tab-delimited file of floats.

    The number of feature columns is the number of tab-separated fields on
    the first line minus one; the last field of each line is the target.

    Args:
        fileName: path of the file to read.

    Returns:
        (dataMat, labelMat): list of feature rows and list of target values.
        Both are empty for an empty file.

    Raises:
        IndexError / ValueError: if a non-blank line has too few fields or a
        field is not a float.
    """
    dataMat = []
    labelMat = []
    # Read once and close the file straight away.
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t')) - 1
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Tolerate blank lines (e.g. a trailing newline at EOF).
            continue
        curLine = stripped.split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def standRegres(xArr, yArr):
    """Solve ordinary least squares: ws = (X^T X)^-1 X^T y.

    Args:
        xArr: 2-D samples, one row per sample.
        yArr: 1-D targets.

    Returns:
        The weights as an (n, 1) matrix, or None (after printing a message)
        when X^T X has a determinant of exactly 0.
    """
    # asmatrix is the same conversion as mat and is still available in NumPy
    # releases where the ``mat`` alias was removed. .T is the transpose.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    # transposed matrix * matrix
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    # .I is the inverse; this is the least-squares solution.
    ws = xTx.I * (xMat.T * yMat)
    return ws
def plotBestFit(xArr, yArr, ws):
    """Scatter the samples and draw the fitted line.

    Args:
        xArr: 2-D samples; column 1 is used as the plotted x value.
        yArr: 1-D targets.
        ws: weight column such that ``xMat * ws`` gives the predictions.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
    yHat = xMat * ws
    # Correlation between predictions and targets. The label string is
    # program output and is kept as-is; a single pre-formatted string gives
    # the same output whether print is a statement or a function.
    print("相关系数\n%s" % (corrcoef(yHat.T, yMat),))
    # Sort a copy: asmatrix does not copy array input, so sorting xMat in
    # place could reorder the caller's data.
    xSorted = xMat.copy()
    xSorted.sort(0)
    yHat = xSorted * ws
    ax.plot(xSorted[:, 1].flatten().A[0], yHat[:, 0].flatten().A[0], c='red')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
def main1():
    """Fit ordinary linear regression on ex0.txt and plot the result."""
    # Model: w0*x0 + w1*x1 + w2*x2 = f(x)
    # The data directory is looked up two levels above the current working
    # directory.
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. Collect and prepare the data.
    xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
    # 2. Train the model: find the coefficients of
    #    f(x) = a1*x1 + b2*x2 + ... + nn*xn.
    ws = standRegres(xArr, yArr)
    # One pre-formatted string: same output whether print is a statement or
    # a function.
    print("%s ---\n%s" % ('*' * 30, ws))
    # 3. Visualise the data.
    plotBestFit(xArr, yArr, ws)
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression prediction for one point.

    Each training sample is weighted by a Gaussian kernel of its distance to
    ``testPoint``, then a weighted least-squares fit is solved.

    Args:
        testPoint: the query sample (same number of columns as ``xArr``).
        xArr: 2-D training samples.
        yArr: 1-D training targets.
        k: kernel width; smaller values give far samples less weight.

    Returns:
        ``testPoint * ws`` (the prediction), or None (after printing a
        message) when the weighted X^T X has a determinant of exactly 0.
    """
    # asmatrix is the same conversion as mat and is still available in NumPy
    # releases where the ``mat`` alias was removed.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    m = shape(xMat)[0]
    # Diagonal weight matrix, one entry per training sample.
    weights = asmatrix(eye(m))
    for j in range(m):
        diffMat = testPoint - xMat[j, :]
        # Gaussian kernel weight. [0, 0] pulls the scalar out of the 1x1
        # product so a plain number is stored rather than a 1x1 matrix.
        weights[j, j] = exp((diffMat * diffMat.T)[0, 0] / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    # Weighted regression coefficients.
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Apply lwlr to every row of ``testArr``.

    Args:
        testArr: 2-D query samples.
        xArr: 2-D training samples.
        yArr: 1-D training targets.
        k: kernel width forwarded to lwlr.

    Returns:
        A 1-D array with one prediction per query row. A row for which lwlr
        returns None gets NaN instead of aborting the whole run.
    """
    m = shape(testArr)[0]
    # zeros() creates an all-zero array with one slot per query row.
    yHat = zeros(m)
    # One pre-formatted string: same output whether print is a statement or
    # a function.
    print("shape(yHat) %s" % (shape(yHat),))
    for i in range(m):
        prediction = lwlr(testArr[i], xArr, yArr, k)
        if prediction is None:
            yHat[i] = float('nan')
        else:
            # Store the plain scalar instead of relying on implicit
            # conversion of a 1x1 result.
            yHat[i] = asarray(prediction).ravel()[0]
    return yHat
def lwlrTestPlot(xArr, yArr, yHat):
    """Scatter the samples and draw the locally weighted predictions.

    Args:
        xArr: 2-D samples; column 1 is used as the plotted x value.
        yArr: 1-D targets.
        yHat: one prediction per sample, in the same order as ``xArr``.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
    # Correlation between predictions and targets. The label string is
    # program output and is kept as-is.
    print("相关系数\n%s" % (corrcoef(yHat.T, yMat),))
    xcord = xMat[:, 1].flatten().A[0]
    ycord = asarray(yHat).ravel()
    # Order the points by x and carry each prediction along with its x.
    # Sorting x and y independently would pair values that do not belong
    # together and always draw a monotonic curve.
    order = xcord.argsort()
    ax.plot(xcord[order], ycord[order], c='red')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
def main2():
    """Run locally weighted linear regression on ex0.txt and plot it."""
    # Model: w0*x0 + w1*x1 + w2*x2 = f(x)
    # The data directory is looked up two levels above the current working
    # directory.
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. Collect and prepare the data.
    xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
    # 2. Predict every training point from its neighbourhood (k = 0.003).
    yHat = lwlrTest(xArr, xArr, yArr, 0.003)
    # One pre-formatted string: same output whether print is a statement or
    # a function.
    print("%s ---\n%s" % (xArr, yHat[1]))
    # 3. Visualise the data.
    lwlrTestPlot(xArr, yArr, yHat)


if __name__ == "__main__":
    # Ordinary linear regression
    # main1()
    # Locally weighted linear regression
    main2()
def rssError(yArr, yHatArr):
    """Return the sum of squared differences between the two inputs.

    Both arguments need to be arrays: the result of the subtraction must
    support ``** 2`` and ``.sum()``.
    """
    residuals = yArr - yHatArr
    squared = residuals ** 2
    return squared.sum()
def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve ridge regression: ws = (X^T X + lam * I)^-1 X^T y.

    Args:
        xMat: sample matrix (``*`` must be matrix multiplication).
        yMat: target column matrix.
        lam: multiplier of the identity matrix added to X^T X.

    Returns:
        The weight column, or None (after printing a message) when the
        regularised matrix has a determinant of exactly 0.
    """
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        # Parenthesised single argument: valid whether print is a statement
        # or a function.
        print("This matrix is singular, cannot do inverse")
        return None
    ws = denom.I * (xMat.T * yMat)
    return ws
def ridgeTest(xArr, yArr, numTestPts=30):
    """Compute ridge weights for a range of lambda values.

    The targets are centred and each feature column is centred and divided
    by its variance before fitting. Lambda takes the values ``exp(i - 10)``
    for ``i`` in ``range(numTestPts)``.

    Args:
        xArr: 2-D samples.
        yArr: 1-D targets.
        numTestPts: how many lambda values to try.

    Returns:
        A (numTestPts, n) array; row i holds the weights for the i-th lambda.
    """
    # asmatrix is the same conversion as mat and is still available in NumPy
    # releases where the ``mat`` alias was removed.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    # Taking the mean off Y removes the need for a constant X0 column.
    yMean = mean(yMat, 0)
    yMat = yMat - yMean
    # Regularise X: subtract the column mean, divide by the column variance.
    xMeans = mean(xMat, 0)
    xVar = var(xMat, 0)
    xMat = (xMat - xMeans) / xVar
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
def regularize(xMat):
    """Regularise by columns: subtract the mean, divide by the variance.

    The input is copied first and is not modified.
    """
    working = xMat.copy()
    columnMeans = mean(working, 0)
    columnVars = var(working, 0)
    return (working - columnMeans) / columnVars
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise regression.

    On every iteration each weight is tried with a step of ``+eps`` and
    ``-eps``; the single change giving the lowest squared error is kept.

    Args:
        xArr: 2-D samples.
        yArr: 1-D targets.
        eps: size of the step tried on each weight.
        numIt: number of iterations.

    Returns:
        A (numIt, n) array whose row i holds the weights after iteration i.
        (The weights were previously computed but never returned.)
    """
    # asmatrix is the same conversion as mat and is still available in NumPy
    # releases where the ``mat`` alias was removed.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    yMean = mean(yMat, 0)
    # The ys could be regularised too, but that yields smaller coefficients.
    yMat = yMat - yMean
    xMat = regularize(xMat)
    n = shape(xMat)[1]
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
def scrapePage(inFile, outFile, yr, numPce, origPrc):
    """Extract sold-item prices from a saved HTML page and append them to a file.

    Rows are looked up as ``<table r="1">``, ``<table r="2">``, ... until no
    table with the next number exists.

    Args:
        inFile: path of the saved HTML page.
        outFile: path of the tab-separated output file (opened in append mode).
        yr, numPce, origPrc: values written unchanged on every output line.
    """
    from BeautifulSoup import BeautifulSoup
    # Context managers close both files even if parsing a row fails.
    with open(inFile) as fr:
        soup = BeautifulSoup(fr.read())
    with open(outFile, 'a') as fw:
        i = 1
        currentRow = soup.findAll('table', r="%d" % i)
        while len(currentRow) != 0:
            title = currentRow[0].findAll('a')[1].text
            lwrTitle = title.lower()
            # A title containing "new" or "nisb" sets the new-item flag.
            if ('new' in lwrTitle) or ('nisb' in lwrTitle):
                newFlag = 1.0
            else:
                newFlag = 0.0
            soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
            if len(soldUnicde) == 0:
                print("item #%d did not sell" % i)
            else:
                soldPrice = currentRow[0].findAll('td')[4]
                priceStr = soldPrice.text
                priceStr = priceStr.replace('$', '')  # strip out $
                priceStr = priceStr.replace(',', '')  # strip out ,
                if len(soldPrice) > 1:
                    # strip out the "Free shipping" text
                    priceStr = priceStr.replace('Free shipping', '')
                print("%s\t%d\t%s" % (priceStr, newFlag, title))
                fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr, numPce, newFlag, origPrc, priceStr))
            i += 1
            currentRow = soup.findAll('table', r="%d" % i)
from time import sleep
import json
import urllib2
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the shopping search URL for one set and collect its prices.

    For every inventory entry priced above half of ``origPrc`` a feature row
    ``[yr, numPce, newFlag, origPrc]`` is appended to ``retX`` and the
    selling price to ``retY`` (both lists are modified in place).

    Args:
        retX: list receiving the feature rows.
        retY: list receiving the selling prices.
        setNum: set number inserted into the query.
        yr, numPce, origPrc: values copied into each feature row.
    """
    from contextlib import closing
    try:
        from urllib2 import urlopen
    except ImportError:
        # The same function lives in urllib.request on Python 3.
        from urllib.request import urlopen

    # Pause before every request.
    sleep(10)
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration instead.
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    # closing() releases the connection once the body has been read.
    with closing(urlopen(searchURL)) as pg:
        retDict = json.loads(pg.read())
    for i, currItem in enumerate(retDict['items']):
        try:
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else:
                newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except Exception:
            # Best effort: a malformed item is reported and skipped.
            # Catching Exception (not everything) lets Ctrl-C still interrupt.
            print('problem with item %d' % i)
def setDataCollect(retX, retY):
    """Call searchForSet for six hard-coded sets, in a fixed order.

    ``retX`` and ``retY`` are passed through to every call.
    """
    # (setNum, yr, numPce, origPrc)
    setsToQuery = (
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
        (10196, 2009, 3263, 249.99),
    )
    for setNum, yr, numPce, origPrc in setsToQuery:
        searchForSet(retX, retY, setNum, yr, numPce, origPrc)
def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate the 30 ridge weight vectors and print the best model.

    On each of the ``numVal`` rounds the data is shuffled, the first 90% is
    used for training and the rest for testing, and the squared test error
    of every ridge weight vector is recorded.

    Args:
        xArr: 2-D samples.
        yArr: 1-D targets.
        numVal: number of shuffle/split rounds.
    """
    m = len(yArr)
    # A real list: shuffle reorders it in place.
    indexList = list(range(m))
    # numVal rows, 30 columns (one per ridge weight vector).
    errorMat = zeros((numVal, 30))
    for i in range(numVal):
        random.shuffle(indexList)
        # Training set: the first 90% of the shuffled indices.
        trainIdx = [indexList[j] for j in range(m) if j < m * 0.9]
        testIdx = [indexList[j] for j in range(m) if not j < m * 0.9]
        trainX = [xArr[idx] for idx in trainIdx]
        trainY = [yArr[idx] for idx in trainIdx]
        testX = [xArr[idx] for idx in testIdx]
        testY = [yArr[idx] for idx in testIdx]
        # 30 weight vectors from ridge regression.
        wMat = ridgeTest(trainX, trainY)
        # Regularise the test set with the TRAINING mean and variance. This
        # does not depend on k, so it is done once per round.
        matTrainX = asmatrix(trainX)
        meanTrain = mean(matTrainX, 0)
        varTrain = var(matTrainX, 0)
        matTestX = (asmatrix(testX) - meanTrain) / varTrain
        trainYMean = mean(trainY)
        testYArr = array(testY)
        for k in range(30):
            yEst = matTestX * asmatrix(wMat[k, :]).T + trainYMean
            errorMat[i, k] = rssError(yEst.T.A, testYArr)
    # Average performance of each ridge weight vector over all rounds.
    meanErrors = mean(errorMat, 0)
    minMean = float(min(meanErrors))
    # Note: wMat here is the one from the last round.
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # Un-regularise to express the model in terms of the raw x:
    # with Xreg = (x - meanX) / var(x) the model becomes
    # x*w/var(x) - meanX*w/var(x) + meanY.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX
    # Single pre-formatted strings: same output whether print is a statement
    # or a function.
    print("the best model from Ridge Regression is:\n%s" % (unReg,))
    print("with constant term:  %s" % (-1 * sum(multiply(meanX, unReg)) + mean(yMat),))