diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..fe715984
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.linting.pylintEnabled": false
+}
\ No newline at end of file
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 00000000..66d0b4ec
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,9 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "0.1.0",
+    "command": "/usr/bin/python",
+    "isShellCommand": true,
+    "args": ["${file}"],
+    "showOutput": "always"
+}
diff --git a/README.md b/README.md
index 711e5ced..4de2019f 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,28 @@
 ## Part 1  Classification
 
 * 1) Machine learning basics
+    * What is machine learning?
+        * Turning unordered data into useful information.
+    * Why machine learning matters
+        * We use computers to reveal the real meaning hidden behind data.
+    * Supervised learning
+        * Sample set: training data + test data
+        * Features (watch for missing values) + target variable (classification: discrete values / regression: continuous values, e.g. 0~100 or -999~999)
+        * `Knowledge representation`: what the machine has learned, e.g. how it recognizes birds
+            * 1. can take the form of a rule set
+            * 2. can take the form of a probability distribution
+            * 3. can be an instance from the training sample set
+    * Unsupervised learning
+    * Development steps
+        * 1. Collect the data
+        * 2. Prepare the input data
+        * 3. Analyze the input data
+        * 4. Train the algorithm
+        * 5. Test the algorithm
+        * 6. Use the algorithm
+    * Related Python libraries
+        * Scientific computing: SciPy, `NumPy` (implemented in C and Fortran)
+        * Plotting: `Matplotlib`
 * 2) Classifying with k-Nearest Neighbors
 * 3) Decision trees
 * 4) Classifying with probability theory: naive Bayes
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 00000000..711e5ced
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,39 @@
+# MachineLearning
+
+**Machine Learning in Action (Python)**
+
+## Part 1  Classification
+
+* 1) Machine learning basics
+* 2) Classifying with k-Nearest Neighbors
+* 3) Decision trees
+* 4) Classifying with probability theory: naive Bayes
+* 5) Logistic regression
+* 6) Support vector machines
+* 7) Improving classification with the AdaBoost meta-algorithm
+
+## Part 2  Forecasting numeric values with regression
+
+* 8) Predicting numeric values: regression
+* 9) Tree-based regression
+
+## Part 3  Unsupervised learning
+
+* 10) Grouping unlabeled items using k-means clustering
+* 11) Association analysis with the Apriori algorithm
+* 12) Efficiently finding frequent itemsets with FP-growth
+
+## Part 4  Additional tools
+
+* 13) Using principal component analysis (PCA) to simplify data
+* 14) Simplifying data with the singular value decomposition (SVD)
+* 15) Big data and MapReduce
+
+* * *
+
+* Appendix A  Getting started with Python
+* Appendix B  Linear algebra
+* Appendix C  Probability refresher
+* Appendix D  Resources
+* Index
+* Copyright notice
\ No newline at end of file
diff --git a/src/python/01.NumPy.py b/src/python/01.NumPy.py
new file mode 100644
index 00000000..8425a6ce
--- /dev/null
+++ b/src/python/01.NumPy.py
@@ -0,0 +1,21 @@
+#!/usr/bin/python
+# coding:utf8
+
+from numpy import random
+
+'''
+# The difference between NumPy matrices and arrays
+NumPy provides two different data types:
+    1. matrix
+    2. array
+Similarities:
+    Both handle numeric elements laid out in rows and columns.
+Differences:
+    1. The same operation applied to the two types may give different results.
+    2. The NumPy matrix type is equivalent to MATLAB's matrices.
+'''
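+
+# --- A minimal sketch (added for illustration; not part of the book code) ---
+# It demonstrates difference 1 above: `*` on array is element-wise, while
+# `*` on matrix is the matrix product.
+from numpy import array, mat
+
+arr = array([[1, 2], [3, 4]])
+print arr * arr            # element-wise:   [[ 1  4] [ 9 16]]
+print mat(arr) * mat(arr)  # matrix product: [[ 7 10] [15 22]]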
+
+# Generate a 4x4 random array
+print random.rand(4, 4)
+
diff --git a/src/python/Logistic.py b/src/python/Logistic.py
new file mode 100644
index 00000000..82ca465c
--- /dev/null
+++ b/src/python/Logistic.py
@@ -0,0 +1,136 @@
+#!/usr/bin/python
+# coding: utf8
+
+'''
+Created on Oct 27, 2010
+Logistic Regression Working Module
+@author: Peter
+'''
+
+import os
+from numpy import *
+import matplotlib.pyplot as plt
+
+# Parse the data file into a feature matrix and a label list
+def loadDataSet(file_name):
+    # dataMat holds the raw samples, labelMat their labels
+    dataMat = []; labelMat = []
+    fr = open(file_name)
+    for line in fr.readlines():
+        lineArr = line.strip().split()
+        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
+        labelMat.append(int(lineArr[2]))
+    return dataMat, labelMat
+
+# Sigmoid function
+def sigmoid(inX):
+    return 1.0/(1+exp(-inX))
+
+# Batch gradient ascent
+def gradAscent(dataMatIn, classLabels):
+    dataMatrix = mat(dataMatIn)             # convert to NumPy matrix
+    # transpose() turns the label row matrix into a column matrix
+    labelMat = mat(classLabels).transpose() # convert to NumPy matrix
+    m, n = shape(dataMatrix)
+    # print m, n, '__'*10, shape(dataMatrix.transpose()), '__'*100
+    alpha = 0.001
+    maxCycles = 500
+    # the weight (coefficient) column vector
+    weights = ones((n, 1))
+    for k in range(maxCycles):              # heavy on matrix operations
+        # (m*n data matrix) * (n*1 weight vector) = m*1 vector, i.e. the
+        # model's predicted value for every sample.
+        # On what matrix multiplication means, see:
+        # https://www.zhihu.com/question/21351965/answer/31050145
+        h = sigmoid(dataMatrix*weights)     # matrix mult
+        # labelMat holds the actual values
+        error = (labelMat - h)              # vector subtraction
+        # alpha * (n*m matrix) * (m*1 error vector) gives the offset to apply
+        # to each of the coefficients w1, w2, .., wn
+        weights = weights + alpha * dataMatrix.transpose() * error # matrix mult
+    return array(weights)
+
+# Stochastic gradient ascent: one sample per update
+def stocGradAscent0(dataMatrix, classLabels):
+    m, n = shape(dataMatrix)
+    alpha = 0.01
+    # ones creates an all-ones array of length n
+    weights = ones(n)   # initialize to all ones
+    for i in range(m):
+        # sum(dataMatrix[i]*weights) computes f(x) = w1*x1 + w2*x2 + .. + wn*xn
+        h = sigmoid(sum(dataMatrix[i]*weights))
+        error = classLabels[i] - h
+        # 0.01 * (1*1) * (1*n)
+        print weights, "*"*10, dataMatrix[i], "*"*10, error
+        weights = weights + alpha * error * dataMatrix[i]
+    return weights
+
+# Improved stochastic gradient ascent: random sample order, decaying alpha
+def stocGradAscent1(dataMatrix, classLabels, numIter=150):
+    m, n = shape(dataMatrix)
+    weights = ones(n)   # initialize to all ones
+    # loop numIter times and watch whether the weights converge
+    for j in range(numIter):
+        # [0, 1, 2 .. m-1]
+        dataIndex = range(m)
+        for i in range(m):
+            # alpha shrinks as i and j grow, but the constant term keeps it
+            # from ever reaching 0
+            alpha = 4/(1.0+j+i)+0.0001
+            # pick a random position in 0..len(dataIndex)-1, then draw that
+            # sample without replacement
+            randIndex = int(random.uniform(0, len(dataIndex)))
+            sampleIndex = dataIndex[randIndex]
+            # sum(...) computes f(x) = w1*x1 + w2*x2 + .. + wn*xn
+            h = sigmoid(sum(dataMatrix[sampleIndex]*weights))
+            error = classLabels[sampleIndex] - h
+            weights = weights + alpha * error * dataMatrix[sampleIndex]
+            del(dataIndex[randIndex])
+    return weights
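+
+# --- A minimal usage sketch (added for illustration; not book code) ---
+# On a tiny hand-made sample set, each trainer above should push the weights
+# so that sigmoid(x . w) moves toward the 0/1 labels:
+# toyData = array([[1.0, 0.5, 1.2], [1.0, 0.3, 0.7], [1.0, 2.1, 2.9], [1.0, 1.8, 2.2]])
+# toyLabels = [0, 0, 1, 1]
+# print gradAscent(toyData, toyLabels)        # batch version
+# print stocGradAscent1(toyData, toyLabels)   # stochastic version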
+
+# Visualization
+def plotBestFit(dataArr, labelMat, weights):
+    n = shape(dataArr)[0]
+    xcord1 = []; ycord1 = []
+    xcord2 = []; ycord2 = []
+    for i in range(n):
+        if int(labelMat[i]) == 1:
+            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
+        else:
+            xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
+    ax.scatter(xcord2, ycord2, s=30, c='green')
+    x = arange(-3.0, 3.0, 0.1)
+    """
+    Where does y come from?
+    Recall how each sample was loaded:
+        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
+    and the model is w0*x0 + w1*x1 + w2*x2 = f(x).
+    x0 is fixed at 1.0 and x2 is the y value we plot, while f(x) has been
+    absorbed into w0, w1, w2 by the fitting. The decision boundary is thus
+        w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x)/w2
+    """
+    y = (-weights[0]-weights[1]*x)/weights[2]
+    ax.plot(x, y)
+    plt.xlabel('X'); plt.ylabel('Y')
+    plt.show()
+
+def main():
+    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # 1. Collect and prepare the data
+    dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
+    # print dataMat, '---\n', labelMat
+
+    # 2. Train the model: the column vector (w1, w2, .., wn).T in
+    #    f(x) = w1*x1 + w2*x2 + .. + wn*xn
+    # convert once to an array; array multiplication is element-wise
+    dataArr = array(dataMat)
+    # print dataArr
+    # weights = gradAscent(dataArr, labelMat)
+    # weights = stocGradAscent0(dataArr, labelMat)
+    weights = stocGradAscent1(dataArr, labelMat)
+    # print '*'*30, weights
+
+    # 3. Visualize the data and the decision boundary
+    plotBestFit(dataArr, labelMat, weights)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/python/apriori.py b/src/python/apriori.py
new file mode 100644
index 00000000..ee6af908
--- /dev/null
+++ b/src/python/apriori.py
@@ -0,0 +1,206 @@
+#!/usr/bin/python
+# coding: utf8
+
+'''
+Created on Mar 24, 2011
+Ch 11 code
+@author: Peter
+'''
+from numpy import *
+
+def loadDataSet():
+    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
+
+def createC1(dataSet):
+    C1 = []
+    for transaction in dataSet:
+        for item in transaction:
+            if not [item] in C1:
+                C1.append([item])
+    C1.sort()
+    return map(frozenset, C1)  # use frozensets so the candidates
+                               # can be used as dict keys
+
+def scanD(D, Ck, minSupport):
+    ssCnt = {}
+    for tid in D:
+        for can in Ck:
+            # s.issubset(t) tests whether every element of s is in t
+            if can.issubset(tid):
+                if not ssCnt.has_key(can): ssCnt[can] = 1
+                else: ssCnt[can] += 1
+    numItems = float(len(D))
+    retList = []
+    supportData = {}
+    for key in ssCnt:
+        support = ssCnt[key]/numItems
+        if support >= minSupport:
+            retList.insert(0, key)
+        supportData[key] = support
+    return retList, supportData
+
+def aprioriGen(Lk, k):  # creates Ck
+    retList = []
+    lenLk = len(Lk)
+    for i in range(lenLk):
+        for j in range(i+1, lenLk):
+            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
+            L1.sort(); L2.sort()
+            if L1 == L2:  # if the first k-2 elements are equal
+                retList.append(Lk[i] | Lk[j])  # set union
+    return retList
+
+def apriori(dataSet, minSupport=0.5):
+    # freeze every transaction
+    C1 = createC1(dataSet)
+    D = map(set, dataSet)
+
+    # compute the support of the 1-itemsets
+    L1, supportData = scanD(D, C1, minSupport)
+    print "outcome: ", supportData
+
+    L = [L1]
+    k = 2
+    while (len(L[k-2]) > 0):
+        Ck = aprioriGen(L[k-2], k)
+        Lk, supK = scanD(D, Ck, minSupport)  # scan the dataset to get Lk
+        supportData.update(supK)
+        L.append(Lk)
+        k += 1
+    return L, supportData
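+
+# --- A minimal walk-through (added for illustration; not book code) ---
+# With the toy transactions above and minSupport=0.5, scanD keeps the
+# 1-itemsets {1}, {2}, {3}, {5} (support >= 0.5) and drops {4} (support
+# 0.25); aprioriGen then builds the 2-item candidates from the survivors.
+# D = map(set, loadDataSet())
+# L1, supp1 = scanD(D, createC1(loadDataSet()), 0.5)
+# print L1                  # e.g. [frozenset([1]), frozenset([3]), ...]
+# print aprioriGen(L1, 2)   # all 2-item unions of the frequent 1-itemsets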
+
+def main():
+    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
+
+    # 1. Load the data
+    dataSet = loadDataSet()
+    print dataSet
+    # 2. Run apriori for a market-basket analysis
+    apriori(dataSet, minSupport=0.7)
+
+def generateRules(L, supportData, minConf=0.7):  # supportData is a dict coming from scanD
+    bigRuleList = []
+    for i in range(1, len(L)):  # only get the sets with two or more items
+        for freqSet in L[i]:
+            H1 = [frozenset([item]) for item in freqSet]
+            if (i > 1):
+                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
+            else:
+                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
+    return bigRuleList
+
+def calcConf(freqSet, H, supportData, brl, minConf=0.7):
+    prunedH = []  # create a new list to return
+    for conseq in H:
+        conf = supportData[freqSet]/supportData[freqSet-conseq]  # calc confidence
+        if conf >= minConf:
+            print freqSet-conseq, '-->', conseq, 'conf:', conf
+            brl.append((freqSet-conseq, conseq, conf))
+            prunedH.append(conseq)
+    return prunedH
+
+def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
+    m = len(H[0])
+    if (len(freqSet) > (m + 1)):  # try further merging
+        Hmp1 = aprioriGen(H, m+1)  # create the Hm+1 new candidates
+        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
+        if (len(Hmp1) > 1):  # need at least two sets to merge
+            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
+
+def pntRules(ruleList, itemMeaning):
+    for ruleTup in ruleList:
+        for item in ruleTup[0]:
+            print itemMeaning[item]
+        print "           -------->"
+        for item in ruleTup[1]:
+            print itemMeaning[item]
+        print "confidence: %f" % ruleTup[2]
+        print  # print a blank line
+
+# from time import sleep
+# from votesmart import votesmart
+# votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
+# #votesmart.apikey = 'get your api key first'
+# def getActionIds():
+#     actionIdList = []; billTitleList = []
+#     fr = open('recent20bills.txt')
+#     for line in fr.readlines():
+#         billNum = int(line.split('\t')[0])
+#         try:
+#             billDetail = votesmart.votes.getBill(billNum)  # api call
+#             for action in billDetail.actions:
+#                 if action.level == 'House' and \
+#                 (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
+#                     actionId = int(action.actionId)
+#                     print 'bill: %d has actionId: %d' % (billNum, actionId)
+#                     actionIdList.append(actionId)
+#                     billTitleList.append(line.strip().split('\t')[1])
+#         except:
+#             print "problem getting bill %d" % billNum
+#         sleep(1)  # delay to be polite
+#     return actionIdList, billTitleList
+#
+# def getTransList(actionIdList, billTitleList):  # this will return a list of lists containing ints
+#     itemMeaning = ['Republican', 'Democratic']  # list of what each item stands for
+#     for billTitle in billTitleList:  # fill up itemMeaning list
+#         itemMeaning.append('%s -- Nay' % billTitle)
+#         itemMeaning.append('%s -- Yea' % billTitle)
+#     transDict = {}  # list of items in each transaction (politician)
+#     voteCount = 2
+#     for actionId in actionIdList:
+#         sleep(3)
+#         print 'getting votes for actionId: %d' % actionId
+#         try:
+#             voteList = votesmart.votes.getBillActionVotes(actionId)
+#             for vote in voteList:
+#                 if not transDict.has_key(vote.candidateName):
+#                     transDict[vote.candidateName] = []
+#                     if vote.officeParties == 'Democratic':
+#                         transDict[vote.candidateName].append(1)
+#                     elif vote.officeParties == 'Republican':
+#                         transDict[vote.candidateName].append(0)
+#                 if vote.action == 'Nay':
+#                     transDict[vote.candidateName].append(voteCount)
+#                 elif vote.action == 'Yea':
+#                     transDict[vote.candidateName].append(voteCount + 1)
+#         except:
+#             print "problem getting actionId: %d" % actionId
+#         voteCount += 2
+#     return transDict, itemMeaning
+
+if __name__ == "__main__":
+    main()
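+
+# --- A minimal usage sketch for the rule mining above (illustrative; added) ---
+# generateRules keeps every consequent whose confidence
+# support(freqSet)/support(freqSet - conseq) clears minConf:
+# L, supportData = apriori(loadDataSet(), minSupport=0.5)
+# for antecedent, consequent, conf in generateRules(L, supportData, minConf=0.7):
+#     print antecedent, '-->', consequent, 'conf:', conf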
diff --git a/src/python/regression.py b/src/python/regression.py
new file mode 100644
index 00000000..de328368
--- /dev/null
+++ b/src/python/regression.py
@@ -0,0 +1,298 @@
+#!/usr/bin/python
+# coding: utf8
+
+'''
+Created on Jan 8, 2011
+
+@author: Peter
+'''
+
+import os
+from numpy import *
+import matplotlib.pylab as plt
+
+def loadDataSet(fileName):  # general function to parse tab-delimited floats
+    numFeat = len(open(fileName).readline().split('\t')) - 1  # get the number of fields
+    dataMat = []; labelMat = []
+    fr = open(fileName)
+    for line in fr.readlines():
+        lineArr = []
+        curLine = line.strip().split('\t')
+        for i in range(numFeat):
+            lineArr.append(float(curLine[i]))
+        dataMat.append(lineArr)
+        labelMat.append(float(curLine[-1]))
+    return dataMat, labelMat
+
+def standRegres(xArr, yArr):
+    # A.T is the transpose of A
+    xMat = mat(xArr); yMat = mat(yArr).T
+    xTx = xMat.T*xMat
+    if linalg.det(xTx) == 0.0:
+        print "This matrix is singular, cannot do inverse"
+        return
+    # A.I is the inverse of A
+    # print xTx.I, "*"*10, xMat.T, "*"*10, yMat
+    ws = xTx.I * (xMat.T*yMat)  # the ordinary least squares solution
+    return ws
+
+def plotBestFit(xArr, yArr, ws):
+    xMat = mat(xArr)
+    yMat = mat(yArr)
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0])
+
+    yHat = xMat*ws
+    # correlation between the fitted values and the truth
+    print "correlation coefficients\n", corrcoef(yHat.T, yMat)
+
+    xMat.sort(0)
+    yHat = xMat*ws
+    n = shape(xMat)[0]
+    xcord = []; ycord = []
+    for i in range(n):
+        xcord.append(xMat[i, 1]); ycord.append(yHat[i, 0])
+    ax.plot(xcord, ycord, c='red')
+    plt.xlabel('X'); plt.ylabel('Y')
+    plt.show()
+
+def main1():
+    # model: w0*x0 + w1*x1 + w2*x2 = f(x)
+    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # 1. Collect and prepare the data
+    xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
+    # print xArr, '---\n', yArr
+    # 2. Train the model: the column vector (w1, .., wn).T in f(x) = w1*x1 + .. + wn*xn
+    ws = standRegres(xArr, yArr)
+    print '*'*30, '---\n', ws
+
+    # 3. Visualize the data
+    plotBestFit(xArr, yArr, ws)
+
+def lwlr(testPoint, xArr, yArr, k=1.0):
+    xMat = mat(xArr); yMat = mat(yArr).T
+    m = shape(xMat)[0]
+    weights = mat(eye((m)))
+    for j in range(m):  # the next two lines fill the weights matrix
+        diffMat = testPoint - xMat[j,:]
+        # Gaussian kernel weight for sample j
+        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
+    xTx = xMat.T * (weights * xMat)
+    if linalg.det(xTx) == 0.0:
+        print "This matrix is singular, cannot do inverse"
+        return
+    # solve the weighted least squares problem
+    ws = xTx.I * (xMat.T * (weights * yMat))
+    return testPoint * ws
+
+def lwlrTest(testArr, xArr, yArr, k=1.0):  # loops over all the data points and applies lwlr to each one
+    m = shape(testArr)[0]
+    # zeros creates an all-zeros array of length m
+    yHat = zeros(m)
+    print "shape(yHat)", shape(yHat)
+    for i in range(m):
+        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
+    return yHat
+
+def lwlrTestPlot(xArr, yArr, yHat):
+    xMat = mat(xArr)
+    yMat = mat(yArr)
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0])
+
+    # correlation between the fitted values and the truth
+    print "correlation coefficients\n", corrcoef(yHat.T, yMat)
+
+    n = shape(xMat)[0]
+    xcord = []; ycord = []
+    for i in range(n):
+        xcord.append(xMat[i, 1]), ycord.append(yHat[i])
+    # sorting both lists independently assumes the fitted curve is monotone
+    xcord.sort(), ycord.sort()
+    # print xcord, "------\n", ycord
+    ax.plot(xcord, ycord, c='red')
+    plt.xlabel('X'); plt.ylabel('Y')
+    plt.show()
+
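+# --- A minimal sketch of the kernel width (illustrative; not book code) ---
+# The Gaussian weight w_j = exp(-|testPoint - x_j|^2 / (2*k**2)) decays with
+# distance: a small k fits very locally (and can overfit), a large k
+# approaches plain least squares.
+# for k in [1.0, 0.01, 0.003]:
+#     yHat = lwlrTest(xArr, xArr, yArr, k)
+#     print k, rssError(array(yArr), yHat)   # rssError is defined below
+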
+def main2():
+    # model: w0*x0 + w1*x1 + w2*x2 = f(x)
+    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # 1. Collect and prepare the data
+    xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
+    # print xArr, '---\n', yArr
+    # 2. Train the model: the column vector (w1, .., wn).T in f(x) = w1*x1 + .. + wn*xn
+    yHat = lwlrTest(xArr, xArr, yArr, 0.003)
+    print xArr, '---\n', yHat[1]
+
+    # 3. Visualize the data
+    lwlrTestPlot(xArr, yArr, yHat)
+
+def rssError(yArr, yHatArr):  # yArr and yHatArr both need to be arrays
+    return ((yArr-yHatArr)**2).sum()
+
+def ridgeRegres(xMat, yMat, lam=0.2):
+    xTx = xMat.T*xMat
+    denom = xTx + eye(shape(xMat)[1])*lam
+    if linalg.det(denom) == 0.0:
+        print "This matrix is singular, cannot do inverse"
+        return
+    ws = denom.I * (xMat.T*yMat)
+    return ws
+
+def ridgeTest(xArr, yArr):
+    xMat = mat(xArr); yMat = mat(yArr).T
+    yMean = mean(yMat, 0)
+    yMat = yMat - yMean  # to eliminate X0, take the mean off of Y
+    # regularize the X's
+    xMeans = mean(xMat, 0)  # calc the mean, then subtract it off
+    xVar = var(xMat, 0)     # calc the variance of Xi, then divide by it
+    xMat = (xMat - xMeans)/xVar
+    numTestPts = 30
+    wMat = zeros((numTestPts, shape(xMat)[1]))
+    for i in range(numTestPts):
+        ws = ridgeRegres(xMat, yMat, exp(i-10))
+        wMat[i,:] = ws.T
+    return wMat
+
+def regularize(xMat):  # regularize by columns
+    inMat = xMat.copy()
+    inMeans = mean(inMat, 0)  # calc the mean, then subtract it off
+    inVar = var(inMat, 0)     # calc the variance of Xi, then divide by it
+    inMat = (inMat - inMeans)/inVar
+    return inMat
+
+def stageWise(xArr, yArr, eps=0.01, numIt=100):
+    xMat = mat(xArr); yMat = mat(yArr).T
+    yMean = mean(yMat, 0)
+    yMat = yMat - yMean  # can also regularize ys but will get a smaller coef
+    xMat = regularize(xMat)
+    m, n = shape(xMat)
+    returnMat = zeros((numIt, n))  # records the weights of every iteration
+    ws = zeros((n, 1)); wsTest = ws.copy(); wsMax = ws.copy()
+    for i in range(numIt):
+        print ws.T
+        lowestError = inf
+        for j in range(n):
+            for sign in [-1, 1]:
+                wsTest = ws.copy()
+                wsTest[j] += eps*sign
+                yTest = xMat*wsTest
+                rssE = rssError(yMat.A, yTest.A)
+                if rssE < lowestError:
+                    lowestError = rssE
+                    wsMax = wsTest
+        ws = wsMax.copy()
+        returnMat[i,:] = ws.T
+    return returnMat
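+
+# --- A minimal sketch of the lambda sweep above (illustrative; not book code) ---
+# ridgeTest tries lambda = exp(i - 10) for i in 0..29: tiny lambdas leave the
+# coefficients near the least-squares solution, huge ones shrink them toward 0.
+# xArr, yArr = loadDataSet('abalone.txt')   # hypothetical data file
+# wMat = ridgeTest(xArr, yArr)
+# print wMat[0, :]    # nearly unregularized weights
+# print wMat[29, :]   # heavily shrunk weights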
+
+def scrapePage(inFile, outFile, yr, numPce, origPrc):
+    from BeautifulSoup import BeautifulSoup
+    fr = open(inFile); fw = open(outFile, 'a')  # 'a' is append-mode writing
+    soup = BeautifulSoup(fr.read())
+    i = 1
+    currentRow = soup.findAll('table', r="%d" % i)
+    while(len(currentRow) != 0):
+        title = currentRow[0].findAll('a')[1].text
+        lwrTitle = title.lower()
+        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
+            newFlag = 1.0
+        else:
+            newFlag = 0.0
+        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
+        if len(soldUnicde) == 0:
+            print "item #%d did not sell" % i
+        else:
+            soldPrice = currentRow[0].findAll('td')[4]
+            priceStr = soldPrice.text
+            priceStr = priceStr.replace('$', '')  # strips out $
+            priceStr = priceStr.replace(',', '')  # strips out ,
+            if len(soldPrice) > 1:
+                priceStr = priceStr.replace('Free shipping', '')  # strips out Free Shipping
+            print "%s\t%d\t%s" % (priceStr, newFlag, title)
+            fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr, numPce, newFlag, origPrc, priceStr))
+        i += 1
+        currentRow = soup.findAll('table', r="%d" % i)
+    fw.close()
+
+from time import sleep
+import json
+import urllib2
+
+def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
+    sleep(10)
+    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
+    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
+    pg = urllib2.urlopen(searchURL)
+    retDict = json.loads(pg.read())
+    for i in range(len(retDict['items'])):
+        try:
+            currItem = retDict['items'][i]
+            if currItem['product']['condition'] == 'new':
+                newFlag = 1
+            else: newFlag = 0
+            listOfInv = currItem['product']['inventories']
+            for item in listOfInv:
+                sellingPrice = item['price']
+                if sellingPrice > origPrc * 0.5:
+                    print "%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice)
+                    retX.append([yr, numPce, newFlag, origPrc])
+                    retY.append(sellingPrice)
+        except: print 'problem with item %d' % i
+
+def setDataCollect(retX, retY):
+    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
+    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
+    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
+    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
+    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
+    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)
+
+def crossValidation(xArr, yArr, numVal=10):
+    m = len(yArr)
+    indexList = range(m)
+    errorMat = zeros((numVal, 30))  # create an error mat: 30 columns, numVal rows
+    for i in range(numVal):
+        trainX = []; trainY = []
+        testX = []; testY = []
+        random.shuffle(indexList)
+        for j in range(m):  # create a training set from the first 90% of values in indexList
+            if j < m*0.9:
+                trainX.append(xArr[indexList[j]])
+                trainY.append(yArr[indexList[j]])
+            else:
+                testX.append(xArr[indexList[j]])
+                testY.append(yArr[indexList[j]])
+        wMat = ridgeTest(trainX, trainY)  # get 30 weight vectors from ridge
+        for k in range(30):  # loop over all of the ridge estimates
+            matTestX = mat(testX); matTrainX = mat(trainX)
+            meanTrain = mean(matTrainX, 0)
+            varTrain = var(matTrainX, 0)
+            matTestX = (matTestX - meanTrain)/varTrain  # regularize the test set with the training params
+            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)  # test the ridge results and store them
+            errorMat[i,k] = rssError(yEst.T.A, array(testY))
+            # print errorMat[i,k]
+    meanErrors = mean(errorMat, 0)  # calc the avg performance of the different ridge weight vectors
+    minMean = float(min(meanErrors))
+    bestWeights = wMat[nonzero(meanErrors == minMean)]
+    # can unregularize to get the model:
+    # when we regularized we wrote Xreg = (x - meanX)/var(x), so in terms of
+    # x (not Xreg): x*w/var(x) - meanX*w/var(x) + meanY
+    xMat = mat(xArr); yMat = mat(yArr).T
+    meanX = mean(xMat, 0); varX = var(xMat, 0)
+    unReg = bestWeights/varX
+    print "the best model from Ridge Regression is:\n", unReg
+    print "with constant term: ", -1*sum(multiply(meanX, unReg)) + mean(yMat)
+
+if __name__ == "__main__":
+    # linear regression
+    # main1()
+    # locally weighted linear regression
+    main2()
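+
+# --- A minimal usage sketch for crossValidation (illustrative; not book code) ---
+# Each round holds out a random 10% of the rows, fits the 30 ridge models on
+# the rest, and the lambda with the lowest mean held-out error wins; the best
+# weights are then mapped back to the unregularized scale.
+# lgX = []; lgY = []
+# setDataCollect(lgX, lgY)   # relies on the long-retired Google Shopping API
+# crossValidation(lgX, lgY, 10)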