"""Regression helpers: ordinary least squares, locally weighted linear
regression (LWLR), ridge regression and forward stagewise regression.

Create by ApacheCN-xy
Date from 2017-02-27

Chapter notes -- "8) Predicting numeric values: regression".
NOTE(review): the markdown notes that accompanied this module were mis-encoded
in the extract this file was recovered from; the text below is a best-effort
English reconstruction and should be verified against the original notes.

* Linear regression
    * Pros: results are easy to interpret; computationally inexpensive.
    * Cons: fits non-linear data poorly.
    * Works with: numeric and nominal values.
* Principle
    * The goal of regression is to predict a numeric target value. The most
      direct way is to write a formula for the target in terms of the inputs.
* General approach to regression
    * Collect: any method.
    * Prepare: regression needs numeric values; nominal values are mapped to
      binary values.
    * Analyze: a 2-D plot of the data helps; after a shrinkage method yields
      new coefficients, the new fitted line can be plotted for comparison.
    * Train: find the regression weights.
    * Test: use R^2, or the agreement between predictions and data, to judge
      the model.
    * Use: predict a numeric value for a given input; unlike classification
      this predicts continuous values rather than discrete labels.
* Summary
    * The process of finding the regression coefficients is regression.
"""

# The star import is kept because the rest of the file uses bare NumPy names
# (mat, zeros, mean, var, array, nonzero, multiply, random, ...).
from numpy import *  # noqa: F401,F403
from numpy import asmatrix, exp, eye, inf, linalg, mean, nan, shape, var, zeros

# NumPy 2.0 removed the ``mat`` alias; define it explicitly so the name is
# available under both NumPy 1.x and 2.x.
mat = asmatrix


def loadDataSet(fileName):
    """Parse a tab-delimited text file of floating point numbers.

    The number of feature columns is taken from the first line (field count
    minus one). For every line the first ``numFeat`` fields become one feature
    row and the last field becomes the target value.

    Args:
        fileName: path of the text file to read.

    Returns:
        ``(dataMat, labelMat)``: a list of feature rows (lists of floats) and
        a list of target floats. Both are empty for an empty file.
    """
    # Read once inside a context manager so the handle is always closed.
    with open(fileName) as fr:
        lines = fr.readlines()
    dataMat = []
    labelMat = []
    if not lines:
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t')) - 1
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline) carry no data.
            continue
        curLine = stripped.split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def standRegres(xArr, yArr):
    """Ordinary least squares via the normal equations.

    Args:
        xArr: m x n feature rows.
        yArr: m target values.

    Returns:
        The n x 1 weight matrix ``(X^T X)^-1 X^T y``, or ``None`` (after
        printing a message) when the determinant of ``X^T X`` is exactly zero.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws


def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression estimate for a single point.

    Each training row j gets the weight
    ``exp(|testPoint - x_j|^2 / (-2 k^2))``; a weighted least squares fit is
    then solved and evaluated at ``testPoint``.

    Args:
        testPoint: one feature row (sequence or 1 x n matrix).
        xArr: m x n training feature rows.
        yArr: m training targets.
        k: kernel width; smaller values weight nearby rows more heavily.

    Returns:
        A 1 x 1 matrix holding the estimate, or ``None`` (after printing a
        message) when the weighted ``X^T W X`` has a determinant of exactly
        zero.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    point = asmatrix(testPoint)
    m = shape(xMat)[0]
    weights = asmatrix(eye(m))
    for j in range(m):
        diffMat = point - xMat[j, :]
        # Pull the scalar out of the 1x1 product: assigning a 1x1 matrix into
        # a single element is deprecated array-to-scalar conversion.
        sqDist = (diffMat * diffMat.T)[0, 0]
        weights[j, j] = exp(sqDist / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return point * ws


def _lwlrScalar(estimate):
    """Convert an lwlr() result to a float; NaN when lwlr() returned None."""
    return nan if estimate is None else estimate[0, 0]


def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Apply lwlr() to every row of ``testArr``.

    Returns:
        A 1-D array with one estimate per test row. Rows for which lwlr()
        could not solve the system (singular matrix) are NaN.
    """
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = _lwlrScalar(lwlr(testArr[i], xArr, yArr, k))
    return yHat


def lwlrTestPlot(xArr, yArr, k=1.0):
    """Like lwlrTest(), but evaluates on a sorted copy of X for plotting.

    Returns:
        ``(yHat, xCopy)``: the estimates and the sorted copy of the features.
    """
    yHat = zeros(shape(yArr))
    # Copy before sorting: asmatrix() does not copy an existing matrix, so an
    # in-place sort would otherwise reorder the caller's data (and the
    # training set passed to lwlr below).
    xCopy = asmatrix(xArr).copy()
    # NOTE: sort(0) sorts every column independently; rows stay intact only
    # when at most one column is non-constant.
    xCopy.sort(0)
    for i in range(shape(xArr)[0]):
        yHat[i] = _lwlrScalar(lwlr(xCopy[i], xArr, yArr, k))
    return yHat, xCopy


def rssError(yArr, yHatArr):
    """Return the residual sum of squares; both arguments must be arrays."""
    return ((yArr - yHatArr) ** 2).sum()


def ridgeRegres(xMat, yMat, lam=0.2):
    """Ridge regression weights ``(X^T X + lam * I)^-1 X^T y``.

    Args:
        xMat: m x n feature matrix.
        yMat: m x 1 target matrix.
        lam: ridge penalty added to the diagonal.

    Returns:
        The n x 1 weight matrix, or ``None`` (after printing a message) when
        the penalised matrix has a determinant of exactly zero.
    """
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws


def ridgeTest(xArr, yArr, numTestPts=30):
    """Ridge weights for a sweep of penalties ``exp(i - 10)``.

    The targets are centred and the features are normalised with
    regularize() before fitting.

    Args:
        xArr: m x n feature rows.
        yArr: m target values.
        numTestPts: number of penalties to try (i = 0 .. numTestPts - 1).

    Returns:
        A ``numTestPts x n`` array; row i holds the weights for
        ``lam = exp(i - 10)``.
    """
    xMat = regularize(asmatrix(xArr))
    yMat = asmatrix(yArr).T
    # Centre y so that no constant (X0) column is needed.
    yMat = yMat - mean(yMat, 0)
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat


def regularize(xMat):
    """Normalise the columns of ``xMat``: subtract the mean, divide by the variance.

    NOTE: the divisor is the column variance, not the standard deviation;
    crossValidation() undoes the scaling with the same variance, so the two
    must stay in sync. A constant column (zero variance) produces inf/NaN.

    Returns:
        A new matrix; the argument is not modified.
    """
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)
    inVar = var(inMat, 0)
    return (inMat - inMeans) / inVar


def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise regression.

    On every iteration each weight is tried with a step of +eps and -eps, and
    the single change with the lowest residual sum of squares is kept.

    Args:
        xArr: m x n feature rows.
        yArr: m target values.
        eps: step size applied to one weight per iteration.
        numIt: number of iterations.

    Returns:
        A ``numIt x n`` array whose row i holds the weights after iteration i.
        (Earlier revisions computed this history but returned ``None``.)
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    # y is only centred; normalising it as well would just shrink the weights.
    yMat = yMat - mean(yMat, 0)
    xMat = regularize(xMat)
    n = shape(xMat)[1]
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
# NOTE: a commented-out, Python 2 only ``scrapePage`` helper (BeautifulSoup
# based HTML scraping) used to sit here. It was dead code and has been removed.

import json
from contextlib import closing
from time import sleep

import numpy as np

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2, matching the original ``import urllib2``
    from urllib2 import urlopen


def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the shopping search API for one LEGO set and collect prices.

    For every returned item, each inventory price above half of ``origPrc``
    is printed and appended to the output lists.

    Args:
        retX: list extended in place with ``[yr, numPce, newFlag, origPrc]``.
        retY: list extended in place with the matching selling prices.
        setNum: LEGO set number used in the search query.
        yr: release year of the set.
        numPce: number of pieces in the set.
        origPrc: original retail price.

    Raises:
        Whatever ``urlopen``/``json.loads`` raise when the request or the
        decoding of the response fails; only per-item errors are swallowed.
    """
    # Pause 10 seconds before every request (presumably to stay under the
    # API's rate limit).
    sleep(10)
    # NOTE(review): the API key is hard-coded in source control; consider
    # loading it from configuration instead.
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    # closing() guarantees the HTTP response is released on both Python 2 and 3.
    with closing(urlopen(searchURL)) as pg:
        retDict = json.loads(pg.read().decode('utf-8'))
    for i, currItem in enumerate(retDict['items']):
        try:
            newFlag = 1 if currItem['product']['condition'] == 'new' else 0
            for item in currItem['product']['inventories']:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except Exception:
            # Best effort: a malformed item is reported and skipped. Unlike a
            # bare ``except:`` this lets KeyboardInterrupt/SystemExit through.
            print('problem with item %d' % i)


def setDataCollect(retX, retY):
    """Collect price data for six fixed LEGO sets into ``retX`` / ``retY``."""
    # (set number, release year, piece count, original price)
    legoSets = [
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
        (10196, 2009, 3263, 249.99),
    ]
    for setNum, yr, numPce, origPrc in legoSets:
        searchForSet(retX, retY, setNum, yr, numPce, origPrc)


def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate ridge regression and print the best un-normalised model.

    For each of ``numVal`` rounds the data is shuffled, the first 90% is used
    to fit ridgeTest() and the rest to score each of its 30 weight vectors
    with rssError(). The penalty with the lowest mean error is selected.

    NOTE: the reported weights come from the ``wMat`` of the final round only
    (not an average over rounds). The function prints its result and returns
    ``None``.

    Args:
        xArr: m x n feature rows.
        yArr: m target values.
        numVal: number of shuffle/split rounds.
    """
    numLambdas = 30  # ridgeTest() produces 30 weight vectors by default
    m = len(yArr)
    # A real list is required: a Python 3 range object cannot be shuffled.
    indexList = list(range(m))
    errorMat = np.zeros((numVal, numLambdas))
    for i in range(numVal):
        trainX = []
        trainY = []
        testX = []
        testY = []
        np.random.shuffle(indexList)
        for j in range(m):
            # The first 90% of the shuffled indices form the training set.
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)
        # Normalise the test rows with the *training* mean/variance. These do
        # not depend on the penalty, so compute them once per round.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / varTrain
        trainYMean = np.mean(trainY)
        testYArr = np.array(testY)
        for k in range(numLambdas):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + trainYMean
            errorMat[i, k] = rssError(yEst.T.A, testYArr)
    meanErrors = np.mean(errorMat, 0)
    minMean = float(min(meanErrors))
    bestWeights = wMat[np.nonzero(meanErrors == minMean)]
    # Undo the normalisation Xreg = (x - meanX) / var(x), so the model reads
    # x * w / var(x) - meanX * w / var(x) + meanY in terms of the raw x.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    unReg = bestWeights / varX
    constTerm = -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat)
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("the best model from Ridge Regression is:\n%s" % (unReg,))
    print("%s %s" % ("with constant term: ", constTerm))


def standRegresPlot():
    """Demo: fit ordinary least squares on ``ex0.txt`` and plot the line.

    This was the first of two functions both named ``regression1``; it has its
    own name now so the second definition no longer shadows it.
    """
    # Local import: matplotlib is needed only by the plotting demos.
    import matplotlib.pyplot as plt

    xArr, yArr = loadDataSet("ex0.txt")
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr)
    ws = standRegres(xArr, yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Flatten both coordinates to plain 1-D arrays for matplotlib.
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
    xCopy = xMat.copy()
    xCopy.sort(0)
    yHat = xCopy * ws
    ax.plot(xCopy[:, 1], yHat)
    plt.show()


def regression1():
    """Demo: locally weighted linear regression (k=0.003) on ``ex0.txt``."""
    # Local import: matplotlib is needed only by the plotting demos.
    import matplotlib.pyplot as plt

    xArr, yArr = loadDataSet("ex0.txt")
    yHat = lwlrTest(xArr, xArr, yArr, 0.003)
    xMat = np.asmatrix(xArr)
    # Sort by the feature column so the fitted curve is drawn left to right.
    srtInd = xMat[:, 1].argsort(0)
    xSort = xMat[srtInd][:, 0, :]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(xSort[:, 1], yHat[srtInd])
    ax.scatter(xMat[:, 1].flatten().A[0], np.asmatrix(yArr).T.flatten().A[0], s=2, c='red')
    plt.show()


if __name__ == "__main__":
    # Same order as the two original entry points: linear fit, then LWLR.
    standRegresPlot()
    regression1()