mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-08 21:04:33 +08:00
更新树回归
This commit is contained in:
140
src/python/09.RegTrees/regTrees.py
Normal file
140
src/python/09.RegTrees/regTrees.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
'''
|
||||
Created on Feb 4, 2011
|
||||
Update on 2017-03-02
|
||||
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
|
||||
@author: Peter Harrington/jiangzhonglian
|
||||
'''
|
||||
from numpy import *
|
||||
|
||||
def loadDataSet(fileName): #general function to parse tab -delimited floats
|
||||
dataMat = [] #assume last column is target value
|
||||
fr = open(fileName)
|
||||
for line in fr.readlines():
|
||||
curLine = line.strip().split('\t')
|
||||
fltLine = map(float,curLine) #map all elements to float()
|
||||
dataMat.append(fltLine)
|
||||
return dataMat
|
||||
|
||||
def binSplitDataSet(dataSet, feature, value):
|
||||
mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:][0]
|
||||
mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:][0]
|
||||
return mat0,mat1
|
||||
|
||||
def regLeaf(dataSet):#returns the value used for each leaf
|
||||
return mean(dataSet[:,-1])
|
||||
|
||||
def regErr(dataSet):
|
||||
return var(dataSet[:,-1]) * shape(dataSet)[0]
|
||||
|
||||
def linearSolve(dataSet): #helper function used in two places
|
||||
m,n = shape(dataSet)
|
||||
X = mat(ones((m,n))); Y = mat(ones((m,1)))#create a copy of data with 1 in 0th postion
|
||||
X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]#and strip out Y
|
||||
xTx = X.T*X
|
||||
if linalg.det(xTx) == 0.0:
|
||||
raise NameError('This matrix is singular, cannot do inverse,\n\
|
||||
try increasing the second value of ops')
|
||||
ws = xTx.I * (X.T * Y)
|
||||
return ws,X,Y
|
||||
|
||||
def modelLeaf(dataSet):#create linear model and return coeficients
|
||||
ws,X,Y = linearSolve(dataSet)
|
||||
return ws
|
||||
|
||||
def modelErr(dataSet):
|
||||
ws,X,Y = linearSolve(dataSet)
|
||||
yHat = X * ws
|
||||
return sum(power(Y - yHat,2))
|
||||
|
||||
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
|
||||
tolS = ops[0]; tolN = ops[1]
|
||||
#if all the target variables are the same value: quit and return value
|
||||
if len(set(dataSet[:,-1].T.tolist()[0])) == 1: #exit cond 1
|
||||
return None, leafType(dataSet)
|
||||
m,n = shape(dataSet)
|
||||
#the choice of the best feature is driven by Reduction in RSS error from mean
|
||||
S = errType(dataSet)
|
||||
bestS = inf; bestIndex = 0; bestValue = 0
|
||||
for featIndex in range(n-1):
|
||||
for splitVal in set(dataSet[:,featIndex]):
|
||||
mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
|
||||
if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
|
||||
newS = errType(mat0) + errType(mat1)
|
||||
if newS < bestS:
|
||||
bestIndex = featIndex
|
||||
bestValue = splitVal
|
||||
bestS = newS
|
||||
#if the decrease (S-bestS) is less than a threshold don't do the split
|
||||
if (S - bestS) < tolS:
|
||||
return None, leafType(dataSet) #exit cond 2
|
||||
mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
|
||||
if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): #exit cond 3
|
||||
return None, leafType(dataSet)
|
||||
return bestIndex,bestValue#returns the best feature to split on
|
||||
#and the value used for that split
|
||||
|
||||
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):#assume dataSet is NumPy Mat so we can array filtering
|
||||
feat, val = chooseBestSplit(dataSet, leafType, errType, ops)#choose the best split
|
||||
if feat == None: return val #if the splitting hit a stop condition return val
|
||||
retTree = {}
|
||||
retTree['spInd'] = feat
|
||||
retTree['spVal'] = val
|
||||
lSet, rSet = binSplitDataSet(dataSet, feat, val)
|
||||
retTree['left'] = createTree(lSet, leafType, errType, ops)
|
||||
retTree['right'] = createTree(rSet, leafType, errType, ops)
|
||||
return retTree
|
||||
|
||||
def isTree(obj):
|
||||
return (type(obj).__name__=='dict')
|
||||
|
||||
def getMean(tree):
|
||||
if isTree(tree['right']): tree['right'] = getMean(tree['right'])
|
||||
if isTree(tree['left']): tree['left'] = getMean(tree['left'])
|
||||
return (tree['left']+tree['right'])/2.0
|
||||
|
||||
def prune(tree, testData):
|
||||
if shape(testData)[0] == 0: return getMean(tree) #if we have no test data collapse the tree
|
||||
if (isTree(tree['right']) or isTree(tree['left'])):#if the branches are not trees try to prune them
|
||||
lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
|
||||
if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
|
||||
if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
|
||||
#if they are now both leafs, see if we can merge them
|
||||
if not isTree(tree['left']) and not isTree(tree['right']):
|
||||
lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
|
||||
errorNoMerge = sum(power(lSet[:,-1] - tree['left'],2)) +\
|
||||
sum(power(rSet[:,-1] - tree['right'],2))
|
||||
treeMean = (tree['left']+tree['right'])/2.0
|
||||
errorMerge = sum(power(testData[:,-1] - treeMean,2))
|
||||
if errorMerge < errorNoMerge:
|
||||
print "merging"
|
||||
return treeMean
|
||||
else: return tree
|
||||
else: return tree
|
||||
|
||||
def regTreeEval(model, inDat):
|
||||
return float(model)
|
||||
|
||||
def modelTreeEval(model, inDat):
|
||||
n = shape(inDat)[1]
|
||||
X = mat(ones((1,n+1)))
|
||||
X[:,1:n+1]=inDat
|
||||
return float(X*model)
|
||||
|
||||
def treeForeCast(tree, inData, modelEval=regTreeEval):
|
||||
if not isTree(tree): return modelEval(tree, inData)
|
||||
if inData[tree['spInd']] > tree['spVal']:
|
||||
if isTree(tree['left']): return treeForeCast(tree['left'], inData, modelEval)
|
||||
else: return modelEval(tree['left'], inData)
|
||||
else:
|
||||
if isTree(tree['right']): return treeForeCast(tree['right'], inData, modelEval)
|
||||
else: return modelEval(tree['right'], inData)
|
||||
|
||||
def createForeCast(tree, testData, modelEval=regTreeEval):
|
||||
m=len(testData)
|
||||
yHat = mat(zeros((m,1)))
|
||||
for i in range(m):
|
||||
yHat[i,0] = treeForeCast(tree, mat(testData[i]), modelEval)
|
||||
return yHat
|
||||
Reference in New Issue
Block a user