From f41ba460006d1ee491c598656a8adffb39984492 Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Fri, 3 Mar 2017 15:17:56 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E6=A0=91=E5=9B=9E=E5=BD=92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 +- docs/9.树回归.md | 13 +++ src/python/09.RegTrees/regTrees.py | 140 +++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 docs/9.树回归.md create mode 100644 src/python/09.RegTrees/regTrees.py diff --git a/README.md b/README.md index fc123341..0d56cfa5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # MachineLearning -**Mahchine Leaning in Action (python)** +**Machine Learning in Action (python) | ApacheCN(apache中文网)** ## 第一部分 分类 @@ -19,6 +19,7 @@ * 8) 预测数值型数据:回归 * 9) 数回归 + * [树回归](./docs/9.树回归.md) ## 第三部分 无监督学习 @@ -41,3 +42,4 @@ * 附录D 资源 * 索引 * 版权声明 +* [ApacheCN(apache中文网) 更新](http://www.apache.wiki) \ No newline at end of file diff --git a/docs/9.树回归.md b/docs/9.树回归.md new file mode 100644 index 00000000..bd68557c --- /dev/null +++ b/docs/9.树回归.md @@ -0,0 +1,13 @@ + +# 9) 树回归 + +* 树回归是什么? + * 分类回归树(Classification and Regression Tree,CART)是一种典型的决策树算法,CART算法不仅可以应用于分类问题,而且可以用于回归问题。 + * 本章介绍CART算法构建的回归树,并介绍其中的树剪枝技术(该技术的主要目的是防止树的过拟合) +* 树回归的构建 + * 优点:可以对复杂和非线性的数据建模。 + * 缺点:结果不易理解。 + * 适用数据类型:数值型和标称型数据。 + + + \ No newline at end of file diff --git a/src/python/09.RegTrees/regTrees.py b/src/python/09.RegTrees/regTrees.py new file mode 100644 index 00000000..c1436e0e --- /dev/null +++ b/src/python/09.RegTrees/regTrees.py @@ -0,0 +1,140 @@ +#!/usr/bin/python +# coding:utf8 + +''' +Created on Feb 4, 2011 +Update on 2017-03-02 +Tree-Based Regression Methods Source Code for Machine Learning in Action Ch.
# Tree-based regression helpers (CART regression trees and model trees), Ch. 9.
# @author: Peter Harrington/jiangzhonglian
#
# Data sets are 2-D NumPy matrices (or 2-D arrays) whose last column is the
# target value.  A tree is either a leaf (a number for regression trees, a
# coefficient matrix for model trees) or a dict with the keys
# 'spInd', 'spVal', 'left' and 'right'.

from numpy import *  # kept: names this module has always re-exported
import numpy as np


def _column_values(dataSet, index):
    """Return column ``index`` of a 2-D matrix/array as a flat Python list."""
    return np.asarray(dataSet[:, index]).ravel().tolist()


def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats.

    Args:
        fileName: path of the file to read.

    Returns:
        A list with one list of floats per non-blank line.

    Raises:
        ValueError: if a field cannot be converted by ``float``.
    """
    dataMat = []
    # ``with`` closes the handle even if float() raises part-way through.
    with open(fileName) as fr:
        for line in fr:
            record = line.strip()
            if not record:
                # A blank line would otherwise become float('') -> ValueError.
                continue
            # list(...) so rows are real lists on Python 3 as well as Python 2.
            dataMat.append(list(map(float, record.split('\t'))))
    return dataMat


def binSplitDataSet(dataSet, feature, value):
    """Split the rows of ``dataSet`` on one feature.

    Args:
        dataSet: 2-D matrix/array.
        feature: column index to test.
        value: threshold.

    Returns:
        ``(mat0, mat1)``: rows with ``dataSet[:, feature] > value`` and rows
        with ``dataSet[:, feature] <= value``.
    """
    # No trailing ``[0]``: that kept only the first matching row instead of
    # the whole subset.
    mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1


def regLeaf(dataSet):
    """Leaf value of a regression tree: the mean of the last column."""
    return np.mean(dataSet[:, -1])


def regErr(dataSet):
    """Total squared error of the last column about its mean (var * row count)."""
    return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]


def linearSolve(dataSet):
    """Fit ordinary least squares to ``dataSet`` (last column is the target).

    Returns:
        ``(ws, X, Y)``: the coefficient column matrix, the design matrix with
        a leading column of ones, and the target column.

    Raises:
        NameError: if ``X.T * X`` has a determinant of exactly 0.0
            (exception type kept for existing callers).
    """
    dataSet = np.asmatrix(dataSet)
    m, n = np.shape(dataSet)
    X = np.asmatrix(np.ones((m, n)))   # column 0 stays 1.0 (intercept term)
    X[:, 1:n] = dataSet[:, 0:n - 1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n'
                        ' try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y


def modelLeaf(dataSet):
    """Leaf value of a model tree: the linear-model coefficients."""
    ws, X, Y = linearSolve(dataSet)
    return ws


def modelErr(dataSet):
    """Sum of squared residuals of the linear model fitted to ``dataSet``."""
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return np.sum(np.power(Y - yHat, 2))


def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the binary split that minimises the summed error of both halves.

    Args:
        dataSet: 2-D matrix/array, last column is the target.
        leafType: callable building a leaf value from a data set.
        errType: callable returning the error of a data set.
        ops: ``(tolS, tolN)`` - minimum error reduction required to split and
            minimum number of rows allowed on each side.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leafValue)`` when the node should become a leaf.
    """
    tolS, tolN = ops[0], ops[1]
    # Exit 1: every target value is identical.
    if len(set(_column_values(dataSet, -1))) == 1:
        return None, leafType(dataSet)
    n = np.shape(dataSet)[1]
    S = errType(dataSet)
    bestS, bestIndex, bestValue = np.inf, 0, 0
    for featIndex in range(n - 1):
        # Plain Python floats: iterating a matrix column yields 1x1 matrices,
        # which are unhashable and cannot be put in a set.
        for splitVal in sorted(set(_column_values(dataSet, featIndex))):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if np.shape(mat0)[0] < tolN or np.shape(mat1)[0] < tolN:
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # Exit 2: no admissible split, or the error reduction is below tolS.
    # Any split recorded above already satisfied tolN, so no further
    # size check is needed here.
    if bestS == np.inf or (S - bestS) < tolS:
        return None, leafType(dataSet)
    return bestIndex, bestValue


def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a tree; see ``chooseBestSplit`` for the arguments.

    Returns:
        A leaf value, or a dict with keys 'spInd', 'spVal', 'left' (rows
        greater than the split value) and 'right' (the remaining rows).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }


def isTree(obj):
    """Return True when ``obj`` is an internal node (a dict), not a leaf."""
    return isinstance(obj, dict)


def getMean(tree):
    """Collapse ``tree`` to the average of its leaves.

    Sub-trees are replaced in place by their collapsed value.
    """
    if isTree(tree['right']):
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):
        tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right']) / 2.0


def prune(tree, testData):
    """Post-prune a regression tree against held-out ``testData``.

    Two sibling leaves are merged into their average when that lowers the
    squared error on ``testData``; "merging" is printed for every merge.
    ``tree`` is modified in place.

    Returns:
        The pruned tree, or a single number if everything was merged.
    """
    if not isTree(tree):
        return tree                      # a bare leaf has nothing to prune
    if np.shape(testData)[0] == 0:
        return getMean(tree)             # no test data: collapse the subtree
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)
    if isTree(tree['left']) or isTree(tree['right']):
        return tree
    # Both children are leaves now: compare error with and without merging.
    errorNoMerge = (np.sum(np.power(lSet[:, -1] - tree['left'], 2)) +
                    np.sum(np.power(rSet[:, -1] - tree['right'], 2)))
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree


def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf: the leaf value itself, as a float."""
    return float(model)


def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf on one sample.

    Args:
        model: coefficient column (intercept first).
        inDat: 1 x n row of feature values.
    """
    n = np.shape(inDat)[1]
    X = np.asmatrix(np.ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    # Index the single element; float() on a 1x1 matrix is deprecated.
    return float((X * model)[0, 0])


def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target for one sample by walking ``tree``.

    Args:
        tree: tree or leaf.
        inData: one sample (1 x n matrix, 1-D array or sequence).
        modelEval: callable ``(leaf, inData)`` producing the prediction.
    """
    # Flatten so feature k is addressed directly; indexing a 1 x n matrix
    # with a single integer selects a row, not a feature.
    features = np.asarray(inData).ravel()
    node = tree
    while isTree(node):
        goLeft = features[node['spInd']] > node['spVal']
        node = node['left'] if goLeft else node['right']
    return modelEval(node, inData)


def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict every row of ``testData``; returns an m x 1 matrix."""
    m = len(testData)
    yHat = np.asmatrix(np.zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, np.asmatrix(testData[i]), modelEval)
    return yHat