更新构建树的Coding

This commit is contained in:
jiangzhonglian
2017-03-06 21:13:32 +08:00
parent a778798dd8
commit 10e1c2b577
9 changed files with 569 additions and 55 deletions

View File

@@ -31,7 +31,7 @@
## 第四部分 其他工具
* 13) 使用PCA来简化数据
*[利用PCA来简化数据](./docs/13.利用PCA来简化数据.md)
* [利用PCA来简化数据](./docs/13.利用PCA来简化数据.md)
* 14) 使用SVD简化数据
* 15) 大数据与MapReduce

View File

@@ -8,6 +8,6 @@
* 优点:可以对复杂和非线性的数据建模。
* 缺点:结果不易理解。
* 适用数据类型:数值型和标称型数据。
* 那么问题来了,如何计算连续型数值的混乱度呢?
* `误差`:也就是计算平均差的总值(总方差=方差*样本数)
* 二元切分方式

View File

@@ -20,10 +20,15 @@ randArray = random.rand(4, 4)
# 转化关系, 数组转化为矩阵
randMat = mat(randArray)
# .I表示对矩阵求逆
# .I表示对矩阵求逆(可以利用矩阵的初等变换)
# # 意义:逆矩阵是一个判断相似性的工具。逆矩阵A与列向量p相乘后,将得到列向量q,q的第i个分量表示p与A的第i个列向量的相似度。
# # 参考案例链接:
# # https://www.zhihu.com/question/33258489
# # http://blog.csdn.net/vernice/article/details/48506027
# .T表示对矩阵转置(行列颠倒)
invRandMat = randMat.I
# 输出结果
print randArray, '\n', randMat, '\n', invRandMat
print randArray, '\n---\n', randMat, '\n+++\n', invRandMat
# 矩阵和逆矩阵 进行求积 (单位矩阵对角线都为1嘛,理论上4*4的矩阵其他的都为0)
myEye = randMat*invRandMat
# 误差

View File

@@ -98,6 +98,7 @@ def show_pdf(clf):
# from IPython.display import Image
# Image(graph.create_png())
if __name__ == '__main__':
x, y = createDataSet()

View File

@@ -77,9 +77,9 @@ def plotTree(myTree, parentPt, nodeTxt):
plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
for key in secondDict.keys():
# 判断该节点是否是Node节点
if type(secondDict[key]).__name__=='dict':
if type(secondDict[key]).__name__ == 'dict':
# 如果是就递归调用[recursion]
plotTree(secondDict[key],cntrPt,str(key))
plotTree(secondDict[key], cntrPt, str(key))
else:
# 如果不是,就在原来节点一半的地方找到节点的坐标
plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
@@ -121,7 +121,7 @@ def createPlot(inTree):
# 测试数据集
def retrieveTree(i):
listOfTrees =[
listOfTrees = [
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
{'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
]

View File

@@ -0,0 +1,16 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-03-06
Update on 2017-03-06
@author: jiangzhonglian
'''
class treeNode(object):
    """Plain record describing one split node of a tree.

    Attributes:
        featureToSplitOn: the ``feat`` constructor argument.
        valueOfSplit: the ``val`` constructor argument.
        rightBranch: the ``right`` constructor argument.
        leftBranch: the ``left`` constructor argument.
    """

    def __init__(self, feat, val, right, left):
        # NOTE: the positional order is (feat, val, right, left) -- right
        # comes before left; callers depend on that order.
        self.featureToSplitOn = feat
        self.valueOfSplit = val
        self.rightBranch = right
        self.leftBranch = left

View File

@@ -9,25 +9,136 @@ Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
'''
from numpy import *
# General-purpose parser for tab-delimited files that contain only numbers.
def loadDataSet(fileName):
    """Parse every line of a tab-delimited file into a list of floats.

    The last column of each row is assumed to be the target value.

    Args:
        fileName: path of the text file to read.
    Returns:
        dataMat: a list holding one list of floats per non-blank line.
    Raises:
        IOError / OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the file even if a line fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of crashing on float('').
            if not stripped:
                continue
            # Build a real list: map() would be a lazy iterator on Python 3.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def binSplitDataSet(dataSet, feature, value):
    """Split a data set in two by comparing one feature column to a value.

    Args:
        dataSet: 2-D NumPy matrix/array with one sample per row.
        feature: index of the column used for the comparison.
        value: threshold the column is compared against.
    Returns:
        mat0: every row whose ``feature`` column is > value.
        mat1: every row whose ``feature`` column is <= value.
    """
    column = dataSet[:, feature]
    # nonzero(...)[0] holds the row indices where the comparison is True.
    # All matching rows are kept; a trailing [0] here would keep only the
    # first one.
    mat0 = dataSet[nonzero(column > value)[0], :]
    mat1 = dataSet[nonzero(column <= value)[0], :]
    return mat0, mat1
# Leaf model: the value stored in a leaf is the mean of the rows' last column.
def regLeaf(dataSet):
    """Return the mean of the last column of ``dataSet``."""
    targets = dataSet[:, -1]
    return mean(targets)
# Total squared error = variance of the last column * number of rows.
def regErr(dataSet):
    """Return the variance of the last column multiplied by the row count."""
    # shape(dataSet)[0] is the number of rows.
    return var(dataSet[:, -1]) * shape(dataSet)[0]
# 1. Find the best binary split of the data set, or
# 2. decide that the data set should become a leaf.
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Pick the (feature, value) pair whose binary split has the lowest error.

    Args:
        dataSet: NumPy matrix, one sample per row, target in the last
            column (the ``.T.tolist()[0]`` column extraction below relies
            on matrix semantics).
        leafType: function that builds the leaf value from a data set.
        errType: function that returns the total error of a data set.
        ops: (tolS, tolN) -- the minimum error reduction required to
            accept a split, and the minimum number of rows allowed on
            each side of a split.
    Returns:
        (bestIndex, bestValue): column index and threshold of the best
            split, or
        (None, leafType(dataSet)): when a stop condition is hit and the
            data set should become a leaf.
    """
    tolS = ops[0]
    tolN = ops[1]
    # Stop condition 1: all target values are identical, nothing to split.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    # Number of columns; the last one is the target, not a feature.
    n = shape(dataSet)[1]
    # Error of the unsplit data set; a candidate split is judged by how
    # much it reduces this value.
    S = errType(dataSet)
    bestS, bestIndex, bestValue = inf, 0, 0
    for featIndex in range(n - 1):
        # Try every distinct value of the column as a threshold.
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # Reject splits that leave too few rows on either side.
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # Stop condition 2: the best split does not reduce the error enough
    # (also taken when no admissible split exists, since bestS stays inf).
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    # Stop condition 3: the chosen split leaves a side that is too small.
    # NOTE(review): looks redundant after the in-loop size check; kept as
    # a safety net.
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue
# Assumes dataSet is a NumPy matrix so that array filtering works.
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a tree as nested dicts.

    Args:
        dataSet: NumPy matrix, one sample per row.
        leafType: function that builds a leaf value from a data set.
        errType: function that returns the total error of a data set.
        ops: tuple of tuning values forwarded to chooseBestSplit.
    Returns:
        The leaf value when chooseBestSplit reports a stop condition,
        otherwise a dict with the keys:
            'spInd': index of the feature that is split on,
            'spVal': threshold of the split,
            'left':  subtree built from the rows with feature > spVal,
            'right': subtree built from the rows with feature <= spVal.
    """
    # Choose the best split: feature index and threshold.
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # A stop condition was hit: val is the leaf value.
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    # binSplitDataSet returns (> val rows, <= val rows); the first set
    # belongs under 'left' and the second under 'right'.
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
def linearSolve(dataSet): #helper function used in two places
m,n = shape(dataSet)
@@ -49,43 +160,7 @@ def modelErr(dataSet):
yHat = X * ws
return sum(power(Y - yHat,2))
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Return the best (feature index, split value) for a binary split.

    Args:
        dataSet: NumPy matrix, one sample per row, target in the last column.
        leafType: function that builds the leaf value from a data set.
        errType: function that returns the total error of a data set.
        ops: (tolS, tolN) -- minimum error reduction to accept a split and
            minimum number of rows allowed on each side.
    Returns:
        (bestIndex, bestValue) for the chosen split, or
        (None, leafType(dataSet)) when one of the exit conditions is met.
    """
    tolS = ops[0]
    tolN = ops[1]
    # Exit condition 1: all target values are the same, return a leaf.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    n = shape(dataSet)[1]
    # The best feature is the one with the largest error reduction
    # compared with the unsplit data set.
    S = errType(dataSet)
    bestS = inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):
        # Convert the column to plain floats first: iterating a matrix
        # column yields 1x1 matrices, which are unhashable and cannot be
        # put in a set.
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # Exit condition 2: the decrease (S - bestS) is below the threshold,
    # so the split is not worth making.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    # Exit condition 3: the chosen split leaves a side that is too small.
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    # The best feature to split on and the value used for that split.
    return bestIndex, bestValue
# Assumes dataSet is a NumPy matrix so that array filtering works.
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a tree of nested dicts from ``dataSet``.

    Args:
        dataSet: NumPy matrix, one sample per row.
        leafType: function that builds a leaf value from a data set.
        errType: function that returns the total error of a data set.
        ops: tuple of tuning values forwarded to chooseBestSplit.
    Returns:
        The leaf value if chooseBestSplit hit a stop condition, otherwise
        a dict with keys 'spInd', 'spVal', 'left' and 'right'; 'left' is
        built from the first set returned by binSplitDataSet and 'right'
        from the second.
    """
    # Choose the best split.
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: None is a singleton, and `== None` can be hijacked by
    # objects that overload equality.
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
def isTree(obj):
return (type(obj).__name__=='dict')
@@ -137,4 +212,21 @@ def createForeCast(tree, testData, modelEval=regTreeEval):
yHat = mat(zeros((m,1)))
for i in range(m):
yHat[i,0] = treeForeCast(tree, mat(testData[i]), modelEval)
return yHat
return yHat
if __name__ == "__main__":
    # Disabled smoke test for binSplitDataSet:
    # testMat = mat(eye(4))
    # print(testMat)
    # print(type(testMat))
    # mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
    # print(mat0, '\n-----------\n', mat1)

    # Load the data set (swap the file name to try the other sample file).
    # myDat = loadDataSet('testData/RT_data1.txt')
    myDat = loadDataSet('testData/RT_data2.txt')
    myMat = mat(myDat)
    myTree = createTree(myMat)
    # Parenthesised single-argument print behaves the same on Python 2
    # and is valid syntax on Python 3.
    print(myTree)

200
testData/RT_data1.txt Executable file
View File

@@ -0,0 +1,200 @@
0.036098 0.155096
0.993349 1.077553
0.530897 0.893462
0.712386 0.564858
0.343554 -0.371700
0.098016 -0.332760
0.691115 0.834391
0.091358 0.099935
0.727098 1.000567
0.951949 0.945255
0.768596 0.760219
0.541314 0.893748
0.146366 0.034283
0.673195 0.915077
0.183510 0.184843
0.339563 0.206783
0.517921 1.493586
0.703755 1.101678
0.008307 0.069976
0.243909 -0.029467
0.306964 -0.177321
0.036492 0.408155
0.295511 0.002882
0.837522 1.229373
0.202054 -0.087744
0.919384 1.029889
0.377201 -0.243550
0.814825 1.095206
0.611270 0.982036
0.072243 -0.420983
0.410230 0.331722
0.869077 1.114825
0.620599 1.334421
0.101149 0.068834
0.820802 1.325907
0.520044 0.961983
0.488130 -0.097791
0.819823 0.835264
0.975022 0.673579
0.953112 1.064690
0.475976 -0.163707
0.273147 -0.455219
0.804586 0.924033
0.074795 -0.349692
0.625336 0.623696
0.656218 0.958506
0.834078 1.010580
0.781930 1.074488
0.009849 0.056594
0.302217 -0.148650
0.678287 0.907727
0.180506 0.103676
0.193641 -0.327589
0.343479 0.175264
0.145809 0.136979
0.996757 1.035533
0.590210 1.336661
0.238070 -0.358459
0.561362 1.070529
0.377597 0.088505
0.099142 0.025280
0.539558 1.053846
0.790240 0.533214
0.242204 0.209359
0.152324 0.132858
0.252649 -0.055613
0.895930 1.077275
0.133300 -0.223143
0.559763 1.253151
0.643665 1.024241
0.877241 0.797005
0.613765 1.621091
0.645762 1.026886
0.651376 1.315384
0.697718 1.212434
0.742527 1.087056
0.901056 1.055900
0.362314 -0.556464
0.948268 0.631862
0.000234 0.060903
0.750078 0.906291
0.325412 -0.219245
0.726828 1.017112
0.348013 0.048939
0.458121 -0.061456
0.280738 -0.228880
0.567704 0.969058
0.750918 0.748104
0.575805 0.899090
0.507940 1.107265
0.071769 -0.110946
0.553520 1.391273
0.401152 -0.121640
0.406649 -0.366317
0.652121 1.004346
0.347837 -0.153405
0.081931 -0.269756
0.821648 1.280895
0.048014 0.064496
0.130962 0.184241
0.773422 1.125943
0.789625 0.552614
0.096994 0.227167
0.625791 1.244731
0.589575 1.185812
0.323181 0.180811
0.822443 1.086648
0.360323 -0.204830
0.950153 1.022906
0.527505 0.879560
0.860049 0.717490
0.007044 0.094150
0.438367 0.034014
0.574573 1.066130
0.536689 0.867284
0.782167 0.886049
0.989888 0.744207
0.761474 1.058262
0.985425 1.227946
0.132543 -0.329372
0.346986 -0.150389
0.768784 0.899705
0.848921 1.170959
0.449280 0.069098
0.066172 0.052439
0.813719 0.706601
0.661923 0.767040
0.529491 1.022206
0.846455 0.720030
0.448656 0.026974
0.795072 0.965721
0.118156 -0.077409
0.084248 -0.019547
0.845815 0.952617
0.576946 1.234129
0.772083 1.299018
0.696648 0.845423
0.595012 1.213435
0.648675 1.287407
0.897094 1.240209
0.552990 1.036158
0.332982 0.210084
0.065615 -0.306970
0.278661 0.253628
0.773168 1.140917
0.203693 -0.064036
0.355688 -0.119399
0.988852 1.069062
0.518735 1.037179
0.514563 1.156648
0.976414 0.862911
0.919074 1.123413
0.697777 0.827805
0.928097 0.883225
0.900272 0.996871
0.344102 -0.061539
0.148049 0.204298
0.130052 -0.026167
0.302001 0.317135
0.337100 0.026332
0.314924 -0.001952
0.269681 -0.165971
0.196005 -0.048847
0.129061 0.305107
0.936783 1.026258
0.305540 -0.115991
0.683921 1.414382
0.622398 0.766330
0.902532 0.861601
0.712503 0.933490
0.590062 0.705531
0.723120 1.307248
0.188218 0.113685
0.643601 0.782552
0.520207 1.209557
0.233115 -0.348147
0.465625 -0.152940
0.884512 1.117833
0.663200 0.701634
0.268857 0.073447
0.729234 0.931956
0.429664 -0.188659
0.737189 1.200781
0.378595 -0.296094
0.930173 1.035645
0.774301 0.836763
0.273940 -0.085713
0.824442 1.082153
0.626011 0.840544
0.679390 1.307217
0.578252 0.921885
0.785541 1.165296
0.597409 0.974770
0.014083 -0.132525
0.663870 1.187129
0.552381 1.369630
0.683886 0.999985
0.210334 -0.006899
0.604529 1.212685
0.250744 0.046297

200
testData/RT_data2.txt Executable file
View File

@@ -0,0 +1,200 @@
1.000000 0.409175 1.883180
1.000000 0.182603 0.063908
1.000000 0.663687 3.042257
1.000000 0.517395 2.305004
1.000000 0.013643 -0.067698
1.000000 0.469643 1.662809
1.000000 0.725426 3.275749
1.000000 0.394350 1.118077
1.000000 0.507760 2.095059
1.000000 0.237395 1.181912
1.000000 0.057534 0.221663
1.000000 0.369820 0.938453
1.000000 0.976819 4.149409
1.000000 0.616051 3.105444
1.000000 0.413700 1.896278
1.000000 0.105279 -0.121345
1.000000 0.670273 3.161652
1.000000 0.952758 4.135358
1.000000 0.272316 0.859063
1.000000 0.303697 1.170272
1.000000 0.486698 1.687960
1.000000 0.511810 1.979745
1.000000 0.195865 0.068690
1.000000 0.986769 4.052137
1.000000 0.785623 3.156316
1.000000 0.797583 2.950630
1.000000 0.081306 0.068935
1.000000 0.659753 2.854020
1.000000 0.375270 0.999743
1.000000 0.819136 4.048082
1.000000 0.142432 0.230923
1.000000 0.215112 0.816693
1.000000 0.041270 0.130713
1.000000 0.044136 -0.537706
1.000000 0.131337 -0.339109
1.000000 0.463444 2.124538
1.000000 0.671905 2.708292
1.000000 0.946559 4.017390
1.000000 0.904176 4.004021
1.000000 0.306674 1.022555
1.000000 0.819006 3.657442
1.000000 0.845472 4.073619
1.000000 0.156258 0.011994
1.000000 0.857185 3.640429
1.000000 0.400158 1.808497
1.000000 0.375395 1.431404
1.000000 0.885807 3.935544
1.000000 0.239960 1.162152
1.000000 0.148640 -0.227330
1.000000 0.143143 -0.068728
1.000000 0.321582 0.825051
1.000000 0.509393 2.008645
1.000000 0.355891 0.664566
1.000000 0.938633 4.180202
1.000000 0.348057 0.864845
1.000000 0.438898 1.851174
1.000000 0.781419 2.761993
1.000000 0.911333 4.075914
1.000000 0.032469 0.110229
1.000000 0.499985 2.181987
1.000000 0.771663 3.152528
1.000000 0.670361 3.046564
1.000000 0.176202 0.128954
1.000000 0.392170 1.062726
1.000000 0.911188 3.651742
1.000000 0.872288 4.401950
1.000000 0.733107 3.022888
1.000000 0.610239 2.874917
1.000000 0.732739 2.946801
1.000000 0.714825 2.893644
1.000000 0.076386 0.072131
1.000000 0.559009 1.748275
1.000000 0.427258 1.912047
1.000000 0.841875 3.710686
1.000000 0.558918 1.719148
1.000000 0.533241 2.174090
1.000000 0.956665 3.656357
1.000000 0.620393 3.522504
1.000000 0.566120 2.234126
1.000000 0.523258 1.859772
1.000000 0.476884 2.097017
1.000000 0.176408 0.001794
1.000000 0.303094 1.231928
1.000000 0.609731 2.953862
1.000000 0.017774 -0.116803
1.000000 0.622616 2.638864
1.000000 0.886539 3.943428
1.000000 0.148654 -0.328513
1.000000 0.104350 -0.099866
1.000000 0.116868 -0.030836
1.000000 0.516514 2.359786
1.000000 0.664896 3.212581
1.000000 0.004327 0.188975
1.000000 0.425559 1.904109
1.000000 0.743671 3.007114
1.000000 0.935185 3.845834
1.000000 0.697300 3.079411
1.000000 0.444551 1.939739
1.000000 0.683753 2.880078
1.000000 0.755993 3.063577
1.000000 0.902690 4.116296
1.000000 0.094491 -0.240963
1.000000 0.873831 4.066299
1.000000 0.991810 4.011834
1.000000 0.185611 0.077710
1.000000 0.694551 3.103069
1.000000 0.657275 2.811897
1.000000 0.118746 -0.104630
1.000000 0.084302 0.025216
1.000000 0.945341 4.330063
1.000000 0.785827 3.087091
1.000000 0.530933 2.269988
1.000000 0.879594 4.010701
1.000000 0.652770 3.119542
1.000000 0.879338 3.723411
1.000000 0.764739 2.792078
1.000000 0.504884 2.192787
1.000000 0.554203 2.081305
1.000000 0.493209 1.714463
1.000000 0.363783 0.885854
1.000000 0.316465 1.028187
1.000000 0.580283 1.951497
1.000000 0.542898 1.709427
1.000000 0.112661 0.144068
1.000000 0.816742 3.880240
1.000000 0.234175 0.921876
1.000000 0.402804 1.979316
1.000000 0.709423 3.085768
1.000000 0.867298 3.476122
1.000000 0.993392 3.993679
1.000000 0.711580 3.077880
1.000000 0.133643 -0.105365
1.000000 0.052031 -0.164703
1.000000 0.366806 1.096814
1.000000 0.697521 3.092879
1.000000 0.787262 2.987926
1.000000 0.476710 2.061264
1.000000 0.721417 2.746854
1.000000 0.230376 0.716710
1.000000 0.104397 0.103831
1.000000 0.197834 0.023776
1.000000 0.129291 -0.033299
1.000000 0.528528 1.942286
1.000000 0.009493 -0.006338
1.000000 0.998533 3.808753
1.000000 0.363522 0.652799
1.000000 0.901386 4.053747
1.000000 0.832693 4.569290
1.000000 0.119002 -0.032773
1.000000 0.487638 2.066236
1.000000 0.153667 0.222785
1.000000 0.238619 1.089268
1.000000 0.208197 1.487788
1.000000 0.750921 2.852033
1.000000 0.183403 0.024486
1.000000 0.995608 3.737750
1.000000 0.151311 0.045017
1.000000 0.126804 0.001238
1.000000 0.983153 3.892763
1.000000 0.772495 2.819376
1.000000 0.784133 2.830665
1.000000 0.056934 0.234633
1.000000 0.425584 1.810782
1.000000 0.998709 4.237235
1.000000 0.707815 3.034768
1.000000 0.413816 1.742106
1.000000 0.217152 1.169250
1.000000 0.360503 0.831165
1.000000 0.977989 3.729376
1.000000 0.507953 1.823205
1.000000 0.920771 4.021970
1.000000 0.210542 1.262939
1.000000 0.928611 4.159518
1.000000 0.580373 2.039114
1.000000 0.841390 4.101837
1.000000 0.681530 2.778672
1.000000 0.292795 1.228284
1.000000 0.456918 1.736620
1.000000 0.134128 -0.195046
1.000000 0.016241 -0.063215
1.000000 0.691214 3.305268
1.000000 0.582002 2.063627
1.000000 0.303102 0.898840
1.000000 0.622598 2.701692
1.000000 0.525024 1.992909
1.000000 0.996775 3.811393
1.000000 0.881025 4.353857
1.000000 0.723457 2.635641
1.000000 0.676346 2.856311
1.000000 0.254625 1.352682
1.000000 0.488632 2.336459
1.000000 0.519875 2.111651
1.000000 0.160176 0.121726
1.000000 0.609483 3.264605
1.000000 0.531881 2.103446
1.000000 0.321632 0.896855
1.000000 0.845148 4.220850
1.000000 0.012003 -0.217283
1.000000 0.018883 -0.300577
1.000000 0.071476 0.006014