更新readme

2026-02-09 13:25:39 +08:00 · 2017-02-25 19:19:30 +08:00
parent d9d21f5258
commit f69881fcc3
8 changed files with 734 additions and 0 deletions
--- a/src/python/01.NumPy.py
+++ b/src/python/01.NumPy.py
@@ -0,0 +1,21 @@
+#!/usr/bin/python
+# coding:utf8
+
+from numpy import random
+
+'''
+# NumPy 矩阵和数组的区别
+NumPy存在2种不同的数据类型:
+    1. 矩阵 matrix
+    2. 数组 array
+相似点：
+    都可以处理行列表示的数字元素
+不同点：
+    1. 2个数据类型上执行相同的数据运算可能得到不同的结果。
+    2. NumPy函数库中的 matrix 与 MATLAB中 matrices 等价。
+'''
+
+# Generate a 4x4 array of random numbers
+print random.rand(4, 4)
+
+
--- a/src/python/Logistic.py
+++ b/src/python/Logistic.py
@@ -0,0 +1,136 @@
+#!/usr/bin/python
+# coding: utf8
+
+'''
+Created on Oct 27, 2010
+Logistic Regression Working Module
+@author: Peter
+'''
+
+import os
+from numpy import *
+import matplotlib.pyplot as plt
+# 解析数据
+def loadDataSet(file_name):
+    # dataMat为原始数据， labelMat为原始数据的标签
+    dataMat = []; labelMat = []
+    fr = open(file_name)
+    for line in fr.readlines():
+        lineArr = line.strip().split()
+        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
+        labelMat.append(int(lineArr[2]))
+    return dataMat,labelMat
+
+# sigmoid跳跃函数
+def sigmoid(inX):
+    return 1.0/(1+exp(-inX))
+
+# 正常的处理方案
+def gradAscent(dataMatIn, classLabels):
+    dataMatrix = mat(dataMatIn)             #convert to NumPy matrix
+    # transpose() 行列转制函数
+    # 将行矩阵转化为列矩阵    =>  矩阵的转置
+    labelMat = mat(classLabels).transpose() #convert to NumPy matrix
+    m,n = shape(dataMatrix)
+    # print m, n, '__'*10, shape(dataMatrix.transpose()), '__'*100
+    alpha = 0.001
+    maxCycles = 500
+    # 权重
+    weights = ones((n,1))
+    for k in range(maxCycles):              #heavy on matrix operations
+        # m*3的矩阵 * 3*1的单位矩阵 ＝ m*1的矩阵
+        # 那么乘上单位矩阵的意义，就代表：通过公式得到的理论值
+        # 参考地址： 矩阵乘法的本质是什么？ https://www.zhihu.com/question/21351965/answer/31050145
+        # n*3   *  3*1  = n*1
+        h = sigmoid(dataMatrix*weights)     #matrix mult
+        # labelMat是实际值
+        error = (labelMat - h)              #vector subtraction
+        # 0.001* (3*m)*(m*1) 表示在每一个列上的一个误差情况，最后得出 x1,x2,xn的系数的偏移量
+        weights = weights + alpha * dataMatrix.transpose() * error #matrix mult
+    return array(weights)
+
+# 梯度上升算法
+def stocGradAscent0(dataMatrix, classLabels):
+    m,n = shape(dataMatrix)
+    alpha = 0.01
+    # n*1的矩阵
+    # 函数ones创建一个全1的数组
+    weights = ones(n)   #initialize to all ones
+    for i in range(m):
+        # sum(dataMatrix[i]*weights)为了求 f(x)的值， f(x)=a1*x1+b2*x2+..+nn*xn
+        h = sigmoid(sum(dataMatrix[i]*weights))
+        error = classLabels[i] - h
+        # 0.01*(1*1)*(1*n)
+        print weights, "*"*10 , dataMatrix[i], "*"*10 , error
+        weights = weights + alpha * error * dataMatrix[i]
+    return weights
+
+# 随机梯度上升算法（随机化）
+def stocGradAscent1(dataMatrix, classLabels, numIter=150):
+    m,n = shape(dataMatrix)
+    weights = ones(n)   #initialize to all ones
+    # 随机剃度, 循环150,观察是否收敛
+    for j in range(numIter):
+        # [0, 1, 2 .. m-1]
+        dataIndex = range(m)
+        for i in range(m):
+            # i和j的不断增大，导致alpha的值不断减少，但是不为0
+            alpha = 4/(1.0+j+i)+0.0001    #apha decreases with iteration, does not
+            # 随机产生一个 0～len()之间的一个值
+            randIndex = int(random.uniform(0,len(dataIndex)))#go to 0 because of the constant
+            # sum(dataMatrix[i]*weights)为了求 f(x)的值， f(x)=a1*x1+b2*x2+..+nn*xn
+            h = sigmoid(sum(dataMatrix[randIndex]*weights))
+            error = classLabels[randIndex] - h
+            # print weights, '__h=%s' % h, '__'*20, alpha, '__'*20, error, '__'*20, dataMatrix[randIndex]
+            weights = weights + alpha * error * dataMatrix[randIndex]
+            del(dataIndex[randIndex])
+    return weights
+
+# 可视化展示
+def plotBestFit(dataArr, labelMat, weights):
+    n = shape(dataArr)[0]
+    xcord1 = []; ycord1 = []
+    xcord2 = []; ycord2 = []
+    for i in range(n):
+        if int(labelMat[i])== 1:
+            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
+        else:
+            xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
+    ax.scatter(xcord2, ycord2, s=30, c='green')
+    x = arange(-3.0, 3.0, 0.1)
+    """
+    y的由来，卧槽，是不是没看懂？
+    首先理论上是这个样子的。
+    dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
+    w0*x0+w1*x1+w2*x2=f(x)
+    x0最开始就设置为1叻， x2就是我们画图的y值，而f(x)被我们磨合误差给算到w0,w1,w2身上去了
+    所以： w0+w1*x+w2*y=0 => y = (-w0-w1*x)/w2   
+    """
+    y = (-weights[0]-weights[1]*x)/weights[2]
+    ax.plot(x, y)
+    plt.xlabel('X'); plt.ylabel('Y')
+    plt.show()
+
+def main():
+    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # 1.收集并准备数据
+    dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
+
+    # print dataMat, '---\n', labelMat
+    # 2.训练模型，  f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值
+    # 因为数组没有是复制n份， array的乘法就是乘法
+    dataArr = array(dataMat)
+    # print dataArr
+    # weights = gradAscent(dataArr, labelMat)
+    # weights = stocGradAscent0(dataArr, labelMat)
+    weights = stocGradAscent1(dataArr, labelMat)
+    # print '*'*30, weights
+
+    # 数据可视化
+    plotBestFit(dataArr, labelMat, weights)
+
+if __name__=="__main__":
+    main()
--- a/src/python/apriori.py
+++ b/src/python/apriori.py
@@ -0,0 +1,206 @@
+#!/usr/bin/python
+# coding: utf8
+
+'''
+Created on Mar 24, 2011
+Ch 11 code
+@author: Peter
+'''
+from numpy import *
+
+def loadDataSet():
+    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
+
+def createC1(dataSet):
+    C1 = []
+    for transaction in dataSet:
+        for item in transaction:
+            if not [item] in C1:
+                C1.append([item])
+                
+    C1.sort()
+    return map(frozenset, C1) # use frozen set so we
+                              # can use it as a key in a dict
+
+def scanD(D, Ck, minSupport):
+    ssCnt = {}
+    for tid in D:
+        for can in Ck:
+            # s.issubset(t)  测试是否 s 中的每一个元素都在 t 中
+            if can.issubset(tid):
+                if not ssCnt.has_key(can): ssCnt[can]=1
+                else: ssCnt[can] += 1
+    numItems = float(len(D))
+    retList = []
+    supportData = {}
+    for key in ssCnt:
+        support = ssCnt[key]/numItems
+        if support >= minSupport:
+            retList.insert(0, key)
+        supportData[key] = support
+    return retList, supportData
+
+def aprioriGen(Lk, k): #creates Ck
+    retList = []
+    lenLk = len(Lk)
+    for i in range(lenLk):
+        for j in range(i+1, lenLk):
+            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
+            L1.sort(); L2.sort()
+            if L1==L2: #if first k-2 elements are equal
+                retList.append(Lk[i] | Lk[j]) #set union
+    return retList
+
+def apriori(dataSet, minSupport = 0.5):
+    # 冻结每一行数据
+    C1 = createC1(dataSet)
+    D = map(set, dataSet)
+
+    # 计算支持support
+    L1, supportData = scanD(D, C1, minSupport)
+    print("outcome: ", supportData)
+
+    L = [L1]
+    k = 2
+    while (len(L[k-2]) > 0):
+        Ck = aprioriGen(L[k-2], k)
+        Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk
+        supportData.update(supK)
+        L.append(Lk)
+        k += 1
+    return L, supportData
+
+def main():
+    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # 1.收集并准备数据
+    # dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
+
+
+    # 1. 加载数据
+    dataSet = loadDataSet()
+    print(dataSet)
+    # 调用 apriori 做购物篮分析
+    apriori(dataSet, minSupport = 0.7)
+
+if __name__=="__main__":
+    main()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
+    bigRuleList = []
+    for i in range(1, len(L)):#only get the sets with two or more items
+        for freqSet in L[i]:
+            H1 = [frozenset([item]) for item in freqSet]
+            if (i > 1):
+                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
+            else:
+                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
+    return bigRuleList         
+
+def calcConf(freqSet, H, supportData, brl, minConf=0.7):
+    prunedH = [] #create new list to return
+    for conseq in H:
+        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
+        if conf >= minConf: 
+            print freqSet-conseq,'-->',conseq,'conf:',conf
+            brl.append((freqSet-conseq, conseq, conf))
+            prunedH.append(conseq)
+    return prunedH
+
+def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
+    m = len(H[0])
+    if (len(freqSet) > (m + 1)): #try further merging
+        Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates
+        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
+        if (len(Hmp1) > 1):    #need at least two sets to merge
+            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
+            
+def pntRules(ruleList, itemMeaning):
+    for ruleTup in ruleList:
+        for item in ruleTup[0]:
+            print itemMeaning[item]
+        print "           -------->"
+        for item in ruleTup[1]:
+            print itemMeaning[item]
+        print "confidence: %f" % ruleTup[2]
+        print       #print a blank line
+        
+            
+# from time import sleep
+# from votesmart import votesmart
+# votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
+# #votesmart.apikey = 'get your api key first'
+# def getActionIds():
+#     actionIdList = []; billTitleList = []
+#     fr = open('recent20bills.txt')
+#     for line in fr.readlines():
+#         billNum = int(line.split('\t')[0])
+#         try:
+#             billDetail = votesmart.votes.getBill(billNum) #api call
+#             for action in billDetail.actions:
+#                 if action.level == 'House' and \
+#                 (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
+#                     actionId = int(action.actionId)
+#                     print 'bill: %d has actionId: %d' % (billNum, actionId)
+#                     actionIdList.append(actionId)
+#                     billTitleList.append(line.strip().split('\t')[1])
+#         except:
+#             print "problem getting bill %d" % billNum
+#         sleep(1)                                      #delay to be polite
+#     return actionIdList, billTitleList
+#
+# def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
+#     itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
+#     for billTitle in billTitleList:#fill up itemMeaning list
+#         itemMeaning.append('%s -- Nay' % billTitle)
+#         itemMeaning.append('%s -- Yea' % billTitle)
+#     transDict = {}#list of items in each transaction (politician)
+#     voteCount = 2
+#     for actionId in actionIdList:
+#         sleep(3)
+#         print 'getting votes for actionId: %d' % actionId
+#         try:
+#             voteList = votesmart.votes.getBillActionVotes(actionId)
+#             for vote in voteList:
+#                 if not transDict.has_key(vote.candidateName):
+#                     transDict[vote.candidateName] = []
+#                     if vote.officeParties == 'Democratic':
+#                         transDict[vote.candidateName].append(1)
+#                     elif vote.officeParties == 'Republican':
+#                         transDict[vote.candidateName].append(0)
+#                 if vote.action == 'Nay':
+#                     transDict[vote.candidateName].append(voteCount)
+#                 elif vote.action == 'Yea':
+#                     transDict[vote.candidateName].append(voteCount + 1)
+#         except:
+#             print "problem getting actionId: %d" % actionId
+#         voteCount += 2
+#     return transDict, itemMeaning
--- a/src/python/regression.py
+++ b/src/python/regression.py
@@ -0,0 +1,298 @@
+#!/usr/bin/python
+# coding: utf8
+
+'''
+Created on Jan 8, 2011
+
+@author: Peter
+'''
+
+import os
+from numpy import *
+import matplotlib.pylab as plt
+
+def loadDataSet(fileName):      #general function to parse tab -delimited floats
+    numFeat = len(open(fileName).readline().split('\t')) - 1 #get number of fields
+    dataMat = []; labelMat = []
+    fr = open(fileName)
+    for line in fr.readlines():
+        lineArr =[]
+        curLine = line.strip().split('\t')
+        for i in range(numFeat):
+            lineArr.append(float(curLine[i]))
+        dataMat.append(lineArr)
+        labelMat.append(float(curLine[-1]))
+    return dataMat,labelMat
+
+def standRegres(xArr,yArr):
+    # >>> A.T  # transpose, 转置
+    xMat = mat(xArr); yMat = mat(yArr).T
+    # 转置矩阵*矩阵
+    xTx = xMat.T*xMat
+    if linalg.det(xTx) == 0.0:
+        print "This matrix is singular, cannot do inverse"
+        return
+    # >>> print A.I  # inverse, 逆矩阵
+    # print xTx.I, "*"*10, xMat.T, "*"*10, yMat
+    ws = xTx.I * (xMat.T*yMat)  # 最小二乘法求最优解
+    return ws
+
+def plotBestFit(xArr, yArr, ws):
+    """Scatter the samples and draw the regression line given by ws.
+
+    Column 1 of xArr is used as the x axis.  The correlation between the
+    predictions and yArr is printed before plotting.
+    """
+
+    xMat = mat(xArr)
+    yMat = mat(yArr)
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0])
+
+    yHat = xMat*ws
+    # Correlation coefficients between predictions and actual values
+    print "相关系数\n", corrcoef(yHat.T, yMat)
+
+    # Sort each column in place (after the correlation was computed on the
+    # unsorted data) and recompute the predictions for the sorted rows.
+    xMat.sort(0)
+    yHat = xMat*ws
+    n = shape(xMat)[0]
+    xcord = []; ycord = []
+    for i in range(n):
+        xcord.append(xMat[i, 1]); ycord.append(yHat[i, 0])
+
+    ax.plot(xcord, ycord, c='red')
+    plt.xlabel('X'); plt.ylabel('Y')
+    plt.show()
+
+def main1():
+    # w0*x0+w1*x1+w2*x2=f(x)
+    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # 1.收集并准备数据
+    xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
+    # print xArr, '---\n', yArr
+    # 2.训练模型，  f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值
+    ws = standRegres(xArr, yArr)
+    print '*'*30, '---\n', ws
+
+    # 数据可视化
+    plotBestFit(xArr, yArr, ws)
+
+
+def lwlr(testPoint, xArr, yArr,k=1.0):
+    xMat = mat(xArr); yMat = mat(yArr).T
+    m = shape(xMat)[0]
+    weights = mat(eye((m)))
+    for j in range(m):                      #next 2 lines create weights matrix
+        diffMat = testPoint - xMat[j,:]
+        # 高斯核对应的加权
+        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
+    xTx = xMat.T * (weights * xMat)
+    if linalg.det(xTx) == 0.0:
+        print "This matrix is singular, cannot do inverse"
+        return
+
+    # 加权的回归系数求解
+    ws = xTx.I * (xMat.T * (weights * yMat))
+    return testPoint * ws
+
+def lwlrTest(testArr,xArr,yArr,k=1.0):  #loops over all the data points and applies lwlr to each one
+    m = shape(testArr)[0]
+    # m*1的矩阵
+    # 函数 zeros 创建一个全0的数组
+    yHat = zeros(m)
+    print "shape(yHat)", shape(yHat)
+    for i in range(m):
+        yHat[i] = lwlr(testArr[i],xArr,yArr,k)
+    return yHat
+
+def lwlrTestPlot(xArr, yArr, yHat):
+
+    xMat = mat(xArr)
+    yMat = mat(yArr)
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0])
+
+    # 再计算相关系数
+    print "相关系数\n", corrcoef(yHat.T, yMat)
+
+    n = shape(xMat)[0]
+    xcord = []; ycord = []
+    for i in range(n):
+        xcord.append(xMat[i, 1]), ycord.append(yHat[i])
+
+    xcord.sort(), ycord.sort()
+    # print xcord, "------\n", ycord
+    ax.plot(xcord, ycord, c='red')
+    plt.xlabel('X'); plt.ylabel('Y')
+    plt.show()
+
+def main2():
+    # w0*x0+w1*x1+w2*x2=f(x)
+    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # 1.收集并准备数据
+    xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
+    # print xArr, '---\n', yArr
+    # 2.训练模型，  f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值
+    yHat = lwlrTest(xArr, xArr, yArr, 0.003)
+    print xArr, '---\n', yHat[1]
+
+    # 数据可视化
+    lwlrTestPlot(xArr, yArr, yHat)
+
+if __name__=="__main__":
+    # 线性回归
+    # main1()
+    # 局部加权线性回归
+    main2()
+
+def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays
+    return ((yArr-yHatArr)**2).sum()
+
+def ridgeRegres(xMat,yMat,lam=0.2):
+    xTx = xMat.T*xMat
+    denom = xTx + eye(shape(xMat)[1])*lam
+    if linalg.det(denom) == 0.0:
+        print "This matrix is singular, cannot do inverse"
+        return
+    ws = denom.I * (xMat.T*yMat)
+    return ws
+
+def ridgeTest(xArr,yArr):
+    xMat = mat(xArr); yMat=mat(yArr).T
+    yMean = mean(yMat,0)
+    yMat = yMat - yMean     #to eliminate X0 take mean off of Y
+    #regularize X's
+    xMeans = mean(xMat,0)   #calc mean then subtract it off
+    xVar = var(xMat,0)      #calc variance of Xi then divide by it
+    xMat = (xMat - xMeans)/xVar
+    numTestPts = 30
+    wMat = zeros((numTestPts,shape(xMat)[1]))
+    for i in range(numTestPts):
+        ws = ridgeRegres(xMat,yMat,exp(i-10))
+        wMat[i,:]=ws.T
+    return wMat
+
+def regularize(xMat):#regularize by columns
+    inMat = xMat.copy()
+    inMeans = mean(inMat,0)   #calc mean then subtract it off
+    inVar = var(inMat,0)      #calc variance of Xi then divide by it
+    inMat = (inMat - inMeans)/inVar
+    return inMat
+
+def stageWise(xArr,yArr,eps=0.01,numIt=100):
+    xMat = mat(xArr); yMat=mat(yArr).T
+    yMean = mean(yMat,0)
+    yMat = yMat - yMean     #can also regularize ys but will get smaller coef
+    xMat = regularize(xMat)
+    m,n=shape(xMat)
+    #returnMat = zeros((numIt,n)) #testing code remove
+    ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
+    for i in range(numIt):
+        print ws.T
+        lowestError = inf;
+        for j in range(n):
+            for sign in [-1,1]:
+                wsTest = ws.copy()
+                wsTest[j] += eps*sign
+                yTest = xMat*wsTest
+                rssE = rssError(yMat.A,yTest.A)
+                if rssE < lowestError:
+                    lowestError = rssE
+                    wsMax = wsTest
+        ws = wsMax.copy()
+        #returnMat[i,:]=ws.T
+        #return returnMat
+
+def scrapePage(inFile,outFile,yr,numPce,origPrc):
+   from BeautifulSoup import BeautifulSoup
+   fr = open(inFile); fw=open(outFile,'a') #a is append mode writing
+   soup = BeautifulSoup(fr.read())
+   i=1
+   currentRow = soup.findAll('table', r="%d" % i)
+   while(len(currentRow)!=0):
+       title = currentRow[0].findAll('a')[1].text
+       lwrTitle = title.lower()
+       if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
+           newFlag = 1.0
+       else:
+           newFlag = 0.0
+       soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
+       if len(soldUnicde)==0:
+           print "item #%d did not sell" % i
+       else:
+           soldPrice = currentRow[0].findAll('td')[4]
+           priceStr = soldPrice.text
+           priceStr = priceStr.replace('$','') #strips out $
+           priceStr = priceStr.replace(',','') #strips out ,
+           if len(soldPrice)>1:
+               priceStr = priceStr.replace('Free shipping', '') #strips out Free Shipping
+           print "%s\t%d\t%s" % (priceStr,newFlag,title)
+           fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr))
+       i += 1
+       currentRow = soup.findAll('table', r="%d" % i)
+   fw.close()
+
+from time import sleep
+import json
+import urllib2
+def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
+    """Query the shopping search API for one set number and collect prices.
+
+    For every returned inventory entry whose price is above half of
+    origPrc, [yr, numPce, newFlag, origPrc] is appended to retX and the
+    selling price to retY (both lists are modified in place).
+    """
+    # Pause for 10 seconds before each request
+    sleep(10)
+    # NOTE(review): an API key is hard-coded here; consider loading it from
+    # configuration instead of keeping it in source control.
+    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
+    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
+    pg = urllib2.urlopen(searchURL)
+    retDict = json.loads(pg.read())
+    for i in range(len(retDict['items'])):
+        try:
+            currItem = retDict['items'][i]
+            # newFlag is 1 for items whose condition is 'new', otherwise 0
+            if currItem['product']['condition'] == 'new':
+                newFlag = 1
+            else: newFlag = 0
+            listOfInv = currItem['product']['inventories']
+            for item in listOfInv:
+                sellingPrice = item['price']
+                # Keep only prices above half of the original price
+                if  sellingPrice > origPrc * 0.5:
+                    print "%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice)
+                    retX.append([yr, numPce, newFlag, origPrc])
+                    retY.append(sellingPrice)
+        # NOTE(review): the bare except also catches KeyboardInterrupt and
+        # SystemExit; a narrower exception type would be safer.
+        except: print 'problem with item %d' % i
+
+def setDataCollect(retX, retY):
+    searchForSet(retX, retY, 8288, 2006, 800, 49.99)
+    searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
+    searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
+    searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
+    searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
+    searchForSet(retX, retY, 10196, 2009, 3263, 249.99)
+
+def crossValidation(xArr,yArr,numVal=10):
+    """Cross-validate the 30 ridge-regression weight vectors and print the best.
+
+    For each of numVal rounds the data is shuffled and split 90% / 10%
+    into training and test rows, ridgeTest is run on the training part,
+    and the test error of each of its 30 weight vectors is recorded.  The
+    weights with the lowest mean error are printed in un-regularised form
+    together with the constant term.  Nothing is returned.
+    """
+    m = len(yArr)
+    # NOTE(review): shuffled in place below, which needs a real list; on
+    # Python 3 range(m) would have to be wrapped in list().
+    indexList = range(m)
+    errorMat = zeros((numVal,30))#create error mat 30columns numVal rows
+    for i in range(numVal):
+        trainX=[]; trainY=[]
+        testX = []; testY = []
+        # presumably numpy.random.shuffle via `from numpy import *` - confirm
+        random.shuffle(indexList)
+        for j in range(m):#create training set based on first 90% of values in indexList
+            if j < m*0.9:
+                trainX.append(xArr[indexList[j]])
+                trainY.append(yArr[indexList[j]])
+            else:
+                testX.append(xArr[indexList[j]])
+                testY.append(yArr[indexList[j]])
+        wMat = ridgeTest(trainX,trainY)    #get 30 weight vectors from ridge
+        for k in range(30):#loop over all of the ridge estimates
+            # NOTE(review): these four values do not depend on k and are
+            # recomputed on every pass.
+            matTestX = mat(testX); matTrainX=mat(trainX)
+            meanTrain = mean(matTrainX,0)
+            varTrain = var(matTrainX,0)
+            matTestX = (matTestX-meanTrain)/varTrain #regularize test with training params
+            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)#test ridge results and store
+            errorMat[i,k]=rssError(yEst.T.A,array(testY))
+            #print errorMat[i,k]
+    meanErrors = mean(errorMat,0)#calc avg performance of the different ridge weight vectors
+    minMean = float(min(meanErrors))
+    # NOTE(review): wMat here is the matrix from the final round only, since
+    # it is reassigned in every round above.
+    bestWeights = wMat[nonzero(meanErrors==minMean)]
+    #can unregularize to get model
+    #when we regularized we wrote Xreg = (x-meanX)/var(x)
+    #we can now write in terms of x not Xreg:  x*w/var(x) - meanX/var(x) +meanY
+    xMat = mat(xArr); yMat=mat(yArr).T
+    meanX = mean(xMat,0); varX = var(xMat,0)
+    unReg = bestWeights/varX
+    print "the best model from Ridge Regression is:\n",unReg
+    print "with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat)