mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-09 13:25:39 +08:00
更新readme
This commit is contained in:
21
src/python/01.NumPy.py
Normal file
21
src/python/01.NumPy.py
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
from numpy import random
|
||||
|
||||
'''
# Difference between NumPy matrices and arrays
NumPy provides two different data types:
1. matrix
2. array
Similarities:
Both can hold numeric elements laid out in rows and columns.
Differences:
1. The same arithmetic operation applied to the two types may give different results.
2. The matrix type in the NumPy library is equivalent to matrices in MATLAB.
'''

# Generate a 4x4 random array and display it.
# The call form of print with a single argument produces the same output on
# Python 2 and Python 3, whereas the statement form only parses on Python 2.
print(random.rand(4, 4))
|
||||
|
||||
|
||||
136
src/python/Logistic.py
Normal file
136
src/python/Logistic.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/python
|
||||
# coding: utf8
|
||||
|
||||
'''
|
||||
Created on Oct 27, 2010
|
||||
Logistic Regression Working Module
|
||||
@author: Peter
|
||||
'''
|
||||
|
||||
import os
|
||||
from numpy import *
|
||||
import matplotlib.pyplot as plt
|
||||
# Parse the data file
def loadDataSet(file_name):
    """Load a whitespace-delimited, two-feature data set from a text file.

    Every non-blank line must contain at least three fields: two numeric
    features followed by an integer class label.

    Args:
        file_name: path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the constant x0 term) and
        ``labelMat`` is the list of integer labels in file order.
    """
    dataMat = []
    labelMat = []
    # The context manager closes the file even when a line fails to parse.
    with open(file_name) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
|
||||
|
||||
# Sigmoid step function
def sigmoid(inX):
    """Return ``1 / (1 + e**(-inX))``.

    ``inX`` may be a scalar or a NumPy array/matrix; ``exp`` is applied
    element-wise, so the result has the same shape as the input.
    """
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
|
||||
|
||||
# Plain (batch) gradient ascent
def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Args:
        dataMatIn: m x n samples (nested list or 2-D array), one row per
            sample.
        classLabels: sequence of m class labels; the sigmoid output is
            subtracted from them, so they are presumably 0/1.
        alpha: step size (defaults to the value that used to be hard-coded).
        maxCycles: number of passes over the whole data set (defaults to the
            value that used to be hard-coded).

    Returns:
        An n x 1 ``ndarray`` holding the learned weights.
    """
    # asmatrix is the spelling of the former ``mat`` alias that still exists
    # in NumPy 2; with matrices ``*`` means matrix multiplication.
    dataMatrix = asmatrix(dataMatIn)
    # Labels arrive as a row; transpose them into an m x 1 column.
    labelMat = asmatrix(classLabels).transpose()
    n = shape(dataMatrix)[1]
    weights = ones((n, 1))
    for _ in range(maxCycles):
        # (m x n) * (n x 1) -> m x 1 vector of predicted probabilities.
        h = sigmoid(dataMatrix * weights)
        # Difference between the actual labels and the predictions.
        error = labelMat - h
        # (n x m) * (m x 1): per-weight correction summed over all samples.
        weights = weights + alpha * dataMatrix.transpose() * error
    return array(weights)
|
||||
|
||||
# Stochastic gradient ascent (single sequential pass)
def stocGradAscent0(dataMatrix, classLabels, alpha=0.01):
    """Fit logistic-regression weights with one sequential stochastic pass.

    Args:
        dataMatrix: m x n NumPy array of samples (rows are multiplied
            element-wise with the weight vector).
        classLabels: sequence of m class labels.
        alpha: step size (defaults to the value that used to be hard-coded).

    Returns:
        1-D array of n weights.

    Side effects:
        Prints the current weights, the sample and its error before every
        update, exactly as the original debugging output did.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)  # start from all ones
    separator = "*" * 10
    for i in range(m):
        # sum(x_i * w) is the linear score f(x) = w1*x1 + ... + wn*xn.
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        # Call-form print with a pre-joined string gives the same text on
        # Python 2 and Python 3.
        print(" ".join(str(part) for part in
                       (weights, separator, dataMatrix[i], separator, error)))
        weights = weights + alpha * error * dataMatrix[i]
    return weights
|
||||
|
||||
# Stochastic gradient ascent (randomised sample order)
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights with randomised stochastic ascent.

    Each of the ``numIter`` passes visits every sample exactly once, in
    random order, with a step size that shrinks as the pass progresses.

    Args:
        dataMatrix: m x n NumPy array of samples.
        classLabels: sequence of m class labels.
        numIter: number of passes over the data set.

    Returns:
        1-D array of n weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)  # start from all ones
    for j in range(numIter):
        # A real list is required: a Python 3 range cannot have items deleted.
        dataIndex = list(range(m))
        for i in range(m):
            # Decreases as i and j grow but never reaches 0 thanks to the
            # constant term.
            alpha = 4 / (1.0 + j + i) + 0.0001
            # Random position among the samples not yet used in this pass.
            randPos = int(random.uniform(0, len(dataIndex)))
            # Map the position back to the real sample index; indexing the
            # data with the position itself would allow repeats and could
            # skip samples.
            sampleIdx = dataIndex[randPos]
            h = sigmoid(sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            del dataIndex[randPos]
    return weights
|
||||
|
||||
# Visualisation
def plotBestFit(dataArr, labelMat, weights):
    """Scatter-plot both classes and draw the fitted decision boundary.

    Args:
        dataArr: 2-D array indexed as ``dataArr[i, 1]`` / ``dataArr[i, 2]``
            for the two plotted features.
        labelMat: per-sample labels; label 1 is drawn as red squares, any
            other label as green dots.
        weights: sequence whose first three entries are w0, w1 and w2.
    """
    xcord1, ycord1 = [], []
    xcord2, ycord2 = [], []
    for i in range(shape(dataArr)[0]):
        if int(labelMat[i]) == 1:
            xs, ys = xcord1, ycord1
        else:
            xs, ys = xcord2, ycord2
        xs.append(dataArr[i, 1])
        ys.append(dataArr[i, 2])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')

    x = arange(-3.0, 3.0, 0.1)
    # Where y comes from: each sample was stored as [1.0, x1, x2], so the
    # model is w0*x0 + w1*x1 + w2*x2 = f(x) with x0 fixed to 1. x2 is the
    # plotted y value, and on the boundary f(x) = 0, hence
    #   w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
|
||||
|
||||
def main():
    """Train on ``resources/testSet.txt`` and plot the decision boundary.

    The data file is looked up relative to the directory two levels above
    the current working directory.
    """
    cwd = os.getcwd()
    project_dir = os.path.dirname(os.path.dirname(cwd))

    # 1. Collect and prepare the data.
    data_path = "%s/resources/testSet.txt" % project_dir
    dataMat, labelMat = loadDataSet(data_path)

    # 2. Train the model, i.e. find the coefficients of
    #    f(x) = a1*x1 + b2*x2 + ... + nn*xn.
    # An ndarray is used so that ``*`` is element-wise multiplication.
    # gradAscent and stocGradAscent0 are the alternative trainers.
    dataArr = array(dataMat)
    weights = stocGradAscent1(dataArr, labelMat)

    # 3. Visualise the data.
    plotBestFit(dataArr, labelMat, weights)


if __name__ == "__main__":
    main()
|
||||
206
src/python/apriori.py
Normal file
206
src/python/apriori.py
Normal file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/python
|
||||
# coding: utf8
|
||||
|
||||
'''
|
||||
Created on Mar 24, 2011
|
||||
Ch 11 code
|
||||
@author: Peter
|
||||
'''
|
||||
from numpy import *
|
||||
|
||||
def loadDataSet():
    """Return a small hard-coded list of transactions (lists of item ids)."""
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
|
||||
|
||||
def createC1(dataSet):
    """Build the candidate 1-itemsets from a collection of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one single-item ``frozenset`` per distinct item, in
        ascending item order. Frozensets are used so the candidates can
        serve as dictionary keys. A real list (not a ``map`` object) is
        returned because callers iterate the candidates once per
        transaction, which a one-shot Python 3 iterator cannot support.
    """
    uniqueItems = set()
    for transaction in dataSet:
        uniqueItems.update(transaction)
    return [frozenset([item]) for item in sorted(uniqueItems)]
|
||||
|
||||
def scanD(D, Ck, minSupport):
    """Count candidate itemsets and keep those meeting the minimum support.

    Args:
        D: iterable of transactions, each a set of items.
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        ``(retList, supportData)``: ``retList`` holds the candidates whose
        support is at least ``minSupport``; ``supportData`` maps every
        candidate found in at least one transaction to its support.
    """
    # Materialise both inputs: ``len`` is needed for D, and the candidates
    # are walked once per transaction, so one-shot iterators (e.g. a Python 3
    # ``map``) would otherwise break.
    transactions = list(D)
    candidates = list(Ck)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            # s.issubset(t): is every element of s also in t?
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData
|
||||
|
||||
def aprioriGen(Lk, k):
    """Create the candidate k-itemsets Ck from frequent (k-1)-itemsets.

    Two itemsets are merged when their first ``k - 2`` items, taken in
    sorted order, are identical.

    Args:
        Lk: list of frozensets, all of size ``k - 1``.
        k: size of the itemsets to generate.

    Returns:
        List of frozensets formed as unions of the qualifying pairs.
    """
    # Sort BEFORE slicing: slicing the raw set first would compare a prefix
    # that depends on arbitrary set iteration order. Computing the prefixes
    # once also avoids re-sorting inside the pair loop.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])  # set union
    return retList
|
||||
|
||||
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of ``dataSet`` with the Apriori algorithm.

    Args:
        dataSet: list of transactions (iterables of items).
        minSupport: minimum support an itemset needs to be kept.

    Returns:
        ``(L, supportData)``: ``L[i]`` is the list of frequent itemsets of
        size ``i + 1`` (the last entry is an empty list, which ends the
        search); ``supportData`` maps every counted itemset to its support.

    Side effects:
        Prints the support of the 1-itemsets.
    """
    # Candidate 1-itemsets. Both sequences are materialised as lists because
    # scanD needs len() and walks the candidates once per transaction.
    C1 = list(createC1(dataSet))
    D = [set(transaction) for transaction in dataSet]

    # Support of the single items.
    L1, supportData = scanD(D, C1, minSupport)
    print("outcome: ", supportData)

    L = [L1]
    k = 2
    # Grow the itemsets one item at a time until a level comes back empty.
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)  # scan the data to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
|
||||
|
||||
def main():
    """Run Apriori on the built-in demo transactions and print the input."""
    # 1. Load the data.
    transactions = loadDataSet()
    print(transactions)

    # 2. Call apriori to perform the market-basket analysis.
    apriori(transactions, minSupport=0.7)


if __name__ == "__main__":
    main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets.

    Args:
        L: list of lists of frequent itemsets, ``L[i]`` holding the sets of
            size ``i + 1``.
        supportData: dict of itemset supports, as produced by ``scanD``.
        minConf: minimum confidence a rule needs to be kept.

    Returns:
        List of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    # L[0] is skipped: a rule needs an itemset with at least two items.
    for i, freqSets in enumerate(L[1:], 1):
        # Two-item sets go straight to calcConf; larger sets go through
        # rulesFromConseq.
        handler = rulesFromConseq if i > 1 else calcConf
        for freqSet in freqSets:
            H1 = [frozenset([item]) for item in freqSet]
            handler(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
|
||||
|
||||
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules ``freqSet - conseq --> conseq`` for each ``conseq``.

    Args:
        freqSet: the frequent itemset (frozenset) the rules are built from.
        H: list of candidate consequents (frozensets).
        supportData: dict mapping itemsets to their support; must contain
            ``freqSet`` and every antecedent.
        brl: list that accepted ``(antecedent, consequent, confidence)``
            tuples are appended to (mutated in place).
        minConf: minimum confidence a rule needs to be accepted.

    Returns:
        The consequents whose rule met ``minConf``.

    Side effects:
        Prints every accepted rule.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        # confidence = support(whole set) / support(antecedent)
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # Pre-joined string: same text on Python 2 and Python 3.
            print(" ".join(str(part) for part in
                           (antecedent, '-->', conseq, 'conf:', conf)))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
|
||||
|
||||
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively generate rules from ``freqSet`` with growing consequents.

    The consequents in ``H`` are evaluated first, and only the ones that
    meet ``minConf`` are merged into the next, larger candidates. The
    earlier version merged ``H`` before evaluating it, so rules with a
    single-item consequent were never produced for itemsets of three or
    more items.

    Args:
        freqSet: the frequent itemset (frozenset) the rules are built from.
        H: list of candidate consequents, all of the same size, that have
            NOT yet been evaluated.
        supportData: dict mapping itemsets to their support.
        brl: list that accepted rules are appended to (mutated in place).
        minConf: minimum confidence a rule needs to be accepted.
    """
    if not H:
        return
    m = len(H[0])
    if len(freqSet) <= m:
        # The consequent would swallow the whole itemset, leaving no
        # antecedent to form a rule with.
        return
    keptConseq = calcConf(freqSet, H, supportData, brl, minConf)
    # At least two surviving consequents are needed to merge, and the merged
    # (m + 1)-item consequents must still leave a non-empty antecedent.
    if len(keptConseq) > 1 and len(freqSet) > m + 1:
        Hmp1 = aprioriGen(keptConseq, m + 1)  # candidates of size m + 1
        rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
|
||||
|
||||
def pntRules(ruleList, itemMeaning):
    """Print each rule using human-readable item descriptions.

    Args:
        ruleList: iterable of ``(antecedent, consequent, confidence)``
            tuples.
        itemMeaning: container mapping an item to its description
            (indexed as ``itemMeaning[item]``).
    """
    for ruleTup in ruleList:
        for item in ruleTup[0]:
            print(itemMeaning[item])
        print(" -------->")
        for item in ruleTup[1]:
            print(itemMeaning[item])
        print("confidence: %f" % ruleTup[2])
        print("")  # blank line between rules
|
||||
|
||||
|
||||
# from time import sleep
|
||||
# from votesmart import votesmart
|
||||
# votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
|
||||
# #votesmart.apikey = 'get your api key first'
|
||||
# def getActionIds():
|
||||
# actionIdList = []; billTitleList = []
|
||||
# fr = open('recent20bills.txt')
|
||||
# for line in fr.readlines():
|
||||
# billNum = int(line.split('\t')[0])
|
||||
# try:
|
||||
# billDetail = votesmart.votes.getBill(billNum) #api call
|
||||
# for action in billDetail.actions:
|
||||
# if action.level == 'House' and \
|
||||
# (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
|
||||
# actionId = int(action.actionId)
|
||||
# print 'bill: %d has actionId: %d' % (billNum, actionId)
|
||||
# actionIdList.append(actionId)
|
||||
# billTitleList.append(line.strip().split('\t')[1])
|
||||
# except:
|
||||
# print "problem getting bill %d" % billNum
|
||||
# sleep(1) #delay to be polite
|
||||
# return actionIdList, billTitleList
|
||||
#
|
||||
# def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
|
||||
# itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
|
||||
# for billTitle in billTitleList:#fill up itemMeaning list
|
||||
# itemMeaning.append('%s -- Nay' % billTitle)
|
||||
# itemMeaning.append('%s -- Yea' % billTitle)
|
||||
# transDict = {}#list of items in each transaction (politician)
|
||||
# voteCount = 2
|
||||
# for actionId in actionIdList:
|
||||
# sleep(3)
|
||||
# print 'getting votes for actionId: %d' % actionId
|
||||
# try:
|
||||
# voteList = votesmart.votes.getBillActionVotes(actionId)
|
||||
# for vote in voteList:
|
||||
# if not transDict.has_key(vote.candidateName):
|
||||
# transDict[vote.candidateName] = []
|
||||
# if vote.officeParties == 'Democratic':
|
||||
# transDict[vote.candidateName].append(1)
|
||||
# elif vote.officeParties == 'Republican':
|
||||
# transDict[vote.candidateName].append(0)
|
||||
# if vote.action == 'Nay':
|
||||
# transDict[vote.candidateName].append(voteCount)
|
||||
# elif vote.action == 'Yea':
|
||||
# transDict[vote.candidateName].append(voteCount + 1)
|
||||
# except:
|
||||
# print "problem getting actionId: %d" % actionId
|
||||
# voteCount += 2
|
||||
# return transDict, itemMeaning
|
||||
298
src/python/regression.py
Normal file
298
src/python/regression.py
Normal file
@@ -0,0 +1,298 @@
|
||||
#!/usr/bin/python
|
||||
# coding: utf8
|
||||
|
||||
'''
|
||||
Created on Jan 8, 2011
|
||||
|
||||
@author: Peter
|
||||
'''
|
||||
|
||||
import os
|
||||
from numpy import *
|
||||
import matplotlib.pylab as plt
|
||||
|
||||
def loadDataSet(fileName):
    """Parse a file of tab-delimited floats into features and targets.

    The number of feature columns is taken from the first line (all columns
    but the last); the last column of every line is the target value.

    Args:
        fileName: path of the text file to read.

    Returns:
        ``(dataMat, labelMat)``: a list of feature rows (lists of floats)
        and the list of float targets. Both are empty for an empty file.
    """
    dataMat = []
    labelMat = []
    # Read once and let the context manager close the handle; the file used
    # to be opened twice and never closed.
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return dataMat, labelMat

    numFeat = len(lines[0].split('\t')) - 1  # number of feature fields
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no sample.
            continue
        curLine = stripped.split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
|
||||
|
||||
def standRegres(xArr, yArr):
    """Solve ordinary least squares with the normal equations.

    Args:
        xArr: m x n samples (nested list or 2-D array).
        yArr: sequence of m target values.

    Returns:
        An n x 1 matrix of regression weights, or ``None`` (after printing a
        message) when ``X^T X`` has a determinant of exactly 0.
    """
    # asmatrix is the spelling of the former ``mat`` alias that still exists
    # in NumPy 2. ``.T`` is the transpose, ``.I`` the inverse.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    # Least-squares solution: w = (X^T X)^-1 X^T y
    ws = xTx.I * (xMat.T * yMat)
    return ws
|
||||
|
||||
def plotBestFit(xArr, yArr, ws):
    """Scatter-plot the data and draw the fitted regression line.

    Also prints the correlation coefficients between the predictions and
    the targets.

    Args:
        xArr: m x n samples; column 1 is used as the plotted x value.
        yArr: sequence of m target values.
        ws: n x 1 weights, e.g. as returned by ``standRegres``.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])

    yHat = xMat * ws
    # Correlation between predictions and targets. Concatenating keeps the
    # output identical to the Python 2 print statement (no space is emitted
    # after the newline).
    print("相关系数\n" + str(corrcoef(yHat.T, yMat)))

    # Sort a COPY: asmatrix does not copy when xArr already is a matrix, so
    # an in-place sort could reorder the caller's data.
    # NOTE(review): sort(0) orders every column independently; rows stay
    # consistent only if the other columns are constant (presumably x0 == 1)
    # - confirm before using this with several features.
    xSorted = xMat.copy()
    xSorted.sort(0)
    yHat = xSorted * ws
    n = shape(xSorted)[0]
    xcord = [xSorted[i, 1] for i in range(n)]
    ycord = [yHat[i, 0] for i in range(n)]

    ax.plot(xcord, ycord, c='red')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
|
||||
|
||||
def main1():
    """Fit ordinary linear regression on ``resources/ex0.txt`` and plot it.

    The model is w0*x0 + w1*x1 + ... = f(x). The data file is looked up
    relative to the directory two levels above the current working
    directory.
    """
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. Collect and prepare the data.
    xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
    # 2. Train the model: the weight vector of
    #    f(x) = a1*x1 + b2*x2 + ... + nn*xn.
    ws = standRegres(xArr, yArr)
    # Same text as the Python 2 statement ``print '*'*30, '---\n', ws``:
    # a space after the stars, none after the newline.
    print('*' * 30 + ' ' + '---\n' + str(ws))

    # 3. Visualise the data.
    plotBestFit(xArr, yArr, ws)
|
||||
|
||||
|
||||
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Predict one point with locally weighted linear regression.

    Every training sample is weighted by a Gaussian kernel of its distance
    to ``testPoint`` before a weighted least-squares fit is solved.

    Args:
        testPoint: the n feature values of the point to predict.
        xArr: m x n training samples.
        yArr: sequence of m training targets.
        k: kernel width; smaller values weight only nearby samples.

    Returns:
        ``testPoint * ws`` (the prediction), or ``None`` (after printing a
        message) when the weighted ``X^T W X`` has a determinant of exactly 0.
    """
    # asmatrix is the spelling of the former ``mat`` alias that still exists
    # in NumPy 2.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    m = shape(xMat)[0]
    weights = asmatrix(eye(m))
    for j in range(m):
        # Build the diagonal weight matrix one sample at a time.
        diffMat = testPoint - xMat[j, :]
        # Take the scalar out of the 1x1 product so a plain number, not a
        # matrix, is stored in the diagonal slot.
        sqDist = (diffMat * diffMat.T)[0, 0]
        # Gaussian-kernel weight.
        weights[j, j] = exp(sqDist / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return

    # Weighted regression coefficients: w = (X^T W X)^-1 X^T W y
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
|
||||
|
||||
def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Apply ``lwlr`` to every row of ``testArr``.

    Args:
        testArr: sequence of points to predict.
        xArr: training samples.
        yArr: training targets.
        k: kernel width forwarded to ``lwlr``.

    Returns:
        1-D array holding one prediction per test point.

    Side effects:
        Prints the shape of the result array.
    """
    m = shape(testArr)[0]
    # zeros(m) creates a 1-D array of m zeros to receive the predictions.
    yHat = zeros(m)
    # Pre-joined string: same text on Python 2 and Python 3.
    print(" ".join(["shape(yHat)", str(shape(yHat))]))
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat
|
||||
|
||||
def lwlrTestPlot(xArr, yArr, yHat):
    """Scatter-plot the data and draw the locally weighted fit.

    Also prints the correlation coefficients between the predictions and
    the targets.

    Args:
        xArr: m x n samples; column 1 is used as the plotted x value.
        yArr: sequence of m target values.
        yHat: 1-D array with the prediction for each row of ``xArr``.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])

    # Correlation between predictions and targets. Concatenating keeps the
    # output identical to the Python 2 print statement (no space is emitted
    # after the newline).
    print("相关系数\n" + str(corrcoef(yHat.T, yMat)))

    n = shape(xMat)[0]
    # Sort the (x, prediction) PAIRS by x. Sorting the two lists separately
    # would pair each x with the wrong prediction whenever the fitted curve
    # is not monotonically increasing.
    points = sorted((xMat[i, 1], yHat[i]) for i in range(n))
    xcord = [point[0] for point in points]
    ycord = [point[1] for point in points]

    ax.plot(xcord, ycord, c='red')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
|
||||
|
||||
def main2():
    """Fit locally weighted regression on ``resources/ex0.txt`` and plot it.

    The model is w0*x0 + w1*x1 + ... = f(x). The data file is looked up
    relative to the directory two levels above the current working
    directory.
    """
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. Collect and prepare the data.
    xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
    # 2. Train the model: predict every training point with kernel width
    #    0.003.
    yHat = lwlrTest(xArr, xArr, yArr, 0.003)
    # Same text as the Python 2 statement ``print xArr, '---\n', yHat[1]``:
    # a space before the dashes, none after the newline.
    print(str(xArr) + ' ' + '---\n' + str(yHat[1]))

    # 3. Visualise the data.
    lwlrTestPlot(xArr, yArr, yHat)


if __name__ == "__main__":
    # Linear regression
    # main1()
    # Locally weighted linear regression
    main2()
|
||||
|
||||
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between two arrays.

    Both arguments need to be NumPy arrays so that the subtraction and the
    power are applied element-wise.
    """
    residuals = yArr - yHatArr
    squared = residuals ** 2
    return squared.sum()
|
||||
|
||||
def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve ridge regression for one regularisation strength.

    Args:
        xMat: m x n NumPy matrix of samples.
        yMat: m x 1 NumPy matrix of targets.
        lam: value added to the diagonal of ``X^T X``.

    Returns:
        An n x 1 matrix of weights, or ``None`` (after printing a message)
        when ``X^T X + lam * I`` has a determinant of exactly 0.
    """
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    # w = (X^T X + lam * I)^-1 X^T y
    ws = denom.I * (xMat.T * yMat)
    return ws
|
||||
|
||||
def ridgeTest(xArr, yArr):
    """Fit ridge regression for 30 regularisation strengths.

    The targets are centred and every feature column is centred and divided
    by its variance before fitting. The strengths tried are ``exp(i - 10)``
    for ``i`` in 0..29.

    Args:
        xArr: m x n samples.
        yArr: sequence of m target values.

    Returns:
        A 30 x n array; row ``i`` holds the weights for ``exp(i - 10)``.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    # Taking the mean off Y removes the need for a constant X0 term.
    yMat = yMat - mean(yMat, 0)
    # Regularise the X's: subtract the column mean, divide by the column
    # variance (both taken from the raw data).
    xMat = (xMat - mean(xMat, 0)) / var(xMat, 0)

    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        wMat[i, :] = ridgeRegres(xMat, yMat, exp(i - 10)).T
    return wMat
|
||||
|
||||
def regularize(xMat):
    """Return a column-wise regularised copy of ``xMat``.

    Each column has its mean subtracted and is then divided by its
    variance. The input is left untouched.
    """
    work = xMat.copy()
    colMeans = mean(work, 0)  # per-column mean, to subtract off
    colVars = var(work, 0)    # per-column variance, to divide by
    return (work - colMeans) / colVars
|
||||
|
||||
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Run forward stagewise linear regression.

    On every iteration each weight is tentatively moved by ``+eps`` and
    ``-eps``; the single move that gives the lowest residual sum of squares
    is kept.

    Args:
        xArr: m x n samples.
        yArr: sequence of m target values.
        eps: size of the step applied to one weight per iteration.
        numIt: number of iterations.

    Returns:
        A ``numIt`` x n array whose row ``i`` holds the weights after
        iteration ``i``. (The function used to return ``None`` because the
        bookkeeping was commented out; callers that ignored the result are
        unaffected.)

    Side effects:
        Prints the current weights at the start of every iteration.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    # Centre the targets; the ys could also be regularised but that gives
    # smaller coefficients.
    yMat = yMat - mean(yMat, 0)
    xMat = regularize(xMat)
    n = shape(xMat)[1]
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
|
||||
|
||||
def scrapePage(inFile, outFile, yr, numPce, origPrc):
    """Extract sold-item prices from a saved HTML page and append them.

    For every ``<table r="i">`` (i = 1, 2, ...) found in ``inFile`` that has
    a sold price, one tab-separated line
    ``yr, numPce, newFlag, origPrc, price`` is appended to ``outFile``.

    Args:
        inFile: path of the saved HTML page.
        outFile: path of the file the rows are appended to.
        yr, numPce, origPrc: values copied unchanged into every output row.

    Side effects:
        Prints one line per table row (sold or not).
    """
    from BeautifulSoup import BeautifulSoup

    # Context managers close both files even if parsing raises.
    with open(inFile) as fr:
        soup = BeautifulSoup(fr.read())

    with open(outFile, 'a') as fw:  # 'a' appends to the existing file
        i = 1
        currentRow = soup.findAll('table', r="%d" % i)
        while len(currentRow) != 0:
            title = currentRow[0].findAll('a')[1].text
            lwrTitle = title.lower()
            # Titles mentioning 'new' or 'nisb' are flagged as new items.
            if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
                newFlag = 1.0
            else:
                newFlag = 0.0
            soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
            if len(soldUnicde) == 0:
                print("item #%d did not sell" % i)
            else:
                soldPrice = currentRow[0].findAll('td')[4]
                priceStr = soldPrice.text
                priceStr = priceStr.replace('$', '')  # strip the $ sign
                priceStr = priceStr.replace(',', '')  # strip thousands commas
                if len(soldPrice) > 1:
                    # strip the 'Free shipping' note
                    priceStr = priceStr.replace('Free shipping', '')
                print("%s\t%d\t%s" % (priceStr, newFlag, title))
                fw.write("%d\t%d\t%d\t%f\t%s\n"
                         % (yr, numPce, newFlag, origPrc, priceStr))
            i += 1
            currentRow = soup.findAll('table', r="%d" % i)
|
||||
|
||||
from time import sleep
|
||||
import json
|
||||
import urllib2
|
||||
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the shopping API for one LEGO set and collect its prices.

    For every inventory entry priced above half of ``origPrc``, the row
    ``[yr, numPce, newFlag, origPrc]`` is appended to ``retX`` and the
    selling price to ``retY``.

    Args:
        retX: list that feature rows are appended to (mutated in place).
        retY: list that selling prices are appended to (mutated in place).
        setNum: LEGO set number used in the search query.
        yr, numPce, origPrc: values copied into each feature row.

    Side effects:
        Sleeps 10 seconds before the request, performs an HTTP request and
        prints one line per accepted price or per failed item.
    """
    sleep(10)
    # NOTE(review): an API key is hard-coded here; it presumably should be
    # loaded from configuration instead of living in source control.
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    # NOTE(review): urllib2 exists only on Python 2.
    pg = urllib2.urlopen(searchURL)
    try:
        retDict = json.loads(pg.read())
    finally:
        # Release the connection even when the payload is not valid JSON.
        pg.close()
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else:
                newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f"
                          % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except Exception:
            # Best effort: skip malformed items, but no longer swallow
            # KeyboardInterrupt / SystemExit as the bare ``except`` did.
            print('problem with item %d' % i)
|
||||
|
||||
def setDataCollect(retX, retY):
    """Collect price data for six hard-coded LEGO sets.

    Calls ``searchForSet`` once per set, in the listed order, appending the
    results to ``retX`` and ``retY``.
    """
    # (set number, release year, number of pieces, original price)
    legoSets = (
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
        (10196, 2009, 3263, 249.99),
    )
    for setNum, yr, numPce, origPrc in legoSets:
        searchForSet(retX, retY, setNum, yr, numPce, origPrc)
|
||||
|
||||
def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate ridge regression and print the best model.

    Each of the ``numVal`` rounds shuffles the samples, trains the 30 ridge
    models of ``ridgeTest`` on the first 90% and measures the residual sum
    of squares of each on the remaining 10%.

    Args:
        xArr: sequence of m feature rows.
        yArr: sequence of m target values.
        numVal: number of random train/test rounds.

    Side effects:
        Prints the un-regularised weights of the best model and its
        constant term. Nothing is returned.
    """
    m = len(yArr)
    # A real list is required: a Python 3 range cannot be shuffled in place.
    indexList = list(range(m))
    numModels = 30  # must match the number of rows ridgeTest returns
    errorMat = zeros((numVal, numModels))  # one row per round
    for i in range(numVal):
        trainX = []
        trainY = []
        testX = []
        testY = []
        random.shuffle(indexList)
        # The first 90% of the shuffled indices form the training set.
        for j in range(m):
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)  # 30 weight vectors from ridge
        # Regularise the test set with the TRAINING mean and variance; this
        # does not depend on the model, so it is computed once per round.
        matTrainX = asmatrix(trainX)
        meanTrain = mean(matTrainX, 0)
        varTrain = var(matTrainX, 0)
        matTestX = (asmatrix(testX) - meanTrain) / varTrain
        for k in range(numModels):
            # Predict with ridge model k and store its test error.
            yEst = matTestX * asmatrix(wMat[k, :]).T + mean(trainY)
            errorMat[i, k] = rssError(yEst.T.A, array(testY))

    # Average performance of each ridge weight vector over all rounds.
    meanErrors = mean(errorMat, 0)
    minMean = float(min(meanErrors))
    # NOTE(review): the weights are taken from the wMat of the LAST round
    # only - confirm that this is intended.
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # Un-regularise to express the model in terms of the raw features:
    # with Xreg = (x - meanX) / var(x) the model becomes
    #   x*w/var(x) - meanX*w/var(x) + meanY
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX
    # Both prints reproduce the exact text of the Python 2 statements.
    print("the best model from Ridge Regression is:\n" + str(unReg))
    print(" ".join(["with constant term: ",
                    str(-1 * sum(multiply(meanX, unReg)) + mean(yMat))]))
|
||||
Reference in New Issue
Block a user