修改树回归的原理和后剪枝的描述

2026-02-08 12:55:26 +08:00 · 2017-09-21 12:49:03 +08:00
parent 627ec87a1b
commit 73fa4dbb92
2 changed files with 31 additions and 33 deletions
--- a/docs/9.树回归.md
+++ b/docs/9.树回归.md
@@ -56,7 +56,7 @@ CART 和 C4.5 之间主要差异在于分类结果上，CART 可以回归分析
 ```
 对每个特征:
    对每个特征值: 
-        将数据集切分成两份
+        将数据集切分成两份（小于该特征值的数据样本放在左子树，否则放在右子树）
        计算切分的误差
        如果当前误差小于当前最小误差，那么将当前切分设定为最佳切分并更新最小误差
 返回最佳切分的特征和阈值
@@ -272,8 +272,6 @@ def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):

 决策树构造完成后进行剪枝。剪枝的过程是对拥有同样父节点的一组节点进行检查，判断如果将其合并，熵的增加量是否小于某一阈值。如果确实小，则这一组节点可以合并为一个节点，其中包含了所有可能的结果。合并也被称作 `塌陷处理` ，在回归树中一般取需要合并的所有子树的平均值。后剪枝是目前最普遍的做法。

-后剪枝的剪枝过程是删除一些子树，然后用其叶子节点代替，这个叶子节点所标识的类别通过大多数原则(majority class criterion)确定。所谓大多数原则，是指剪枝过程中, 将一些子树删除而用叶节点代替,这个叶节点所标识的类别用这棵子树中大多数训练样本所属的类别来标识,所标识的类 称为majority class ，（majority class 在很多英文文献中也多次出现）。
-
 后剪枝 prune() 的伪代码如下:

 ```
--- a/src/python/9.RegTrees/regTrees.py
+++ b/src/python/9.RegTrees/regTrees.py
@@ -395,12 +395,12 @@ def createForeCast(tree, testData, modelEval=regTreeEval):


 if __name__ == "__main__":
-    # 测试数据集
-    testMat = mat(eye(4))
-    print testMat
-    print type(testMat)
-    mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
-    print mat0, '\n-----------\n', mat1
+    # # 测试数据集
+    # testMat = mat(eye(4))
+    # print testMat
+    # print type(testMat)
+    # mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
+    # print mat0, '\n-----------\n', mat1

    # # 回归树
    # myDat = loadDataSet('input/9.RegTrees/data1.txt')
@@ -431,29 +431,29 @@ if __name__ == "__main__":
    # myTree = createTree(myMat, modelLeaf, modelErr)
    # print myTree

-    # # # 回归树 VS 模型树 VS 线性回归
-    # trainMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_train.txt'))
-    # testMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_test.txt'))
-    # # # 回归树
-    # myTree1 = createTree(trainMat, ops=(1, 20))
-    # print myTree1
-    # yHat1 = createForeCast(myTree1, testMat[:, 0])
-    # print "--------------\n"
-    # # print yHat1
-    # # print "ssss==>", testMat[:, 1]
-    # print "回归树:", corrcoef(yHat1, testMat[:, 1],rowvar=0)[0, 1]
+    # # 回归树 VS 模型树 VS 线性回归
+    trainMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_train.txt'))
+    testMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_test.txt'))
+    # # 回归树
+    myTree1 = createTree(trainMat, ops=(1, 20))
+    print myTree1
+    yHat1 = createForeCast(myTree1, testMat[:, 0])
+    print "--------------\n"
+    # print yHat1
+    # print "ssss==>", testMat[:, 1]
+    print "回归树:", corrcoef(yHat1, testMat[:, 1],rowvar=0)[0, 1]

-    # # 模型树
-    # myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
-    # yHat2 = createForeCast(myTree2, testMat[:, 0], modelTreeEval)
-    # print myTree2
-    # print "模型树:", corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1]
+    # 模型树
+    myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
+    yHat2 = createForeCast(myTree2, testMat[:, 0], modelTreeEval)
+    print myTree2
+    print "模型树:", corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1]

-    # # 线性回归
-    # ws, X, Y = linearSolve(trainMat)
-    # print ws
-    # m = len(testMat[:, 0])
-    # yHat3 = mat(zeros((m, 1)))
-    # for i in range(shape(testMat)[0]):
-    #     yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
-    # print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]
+    # 线性回归
+    ws, X, Y = linearSolve(trainMat)
+    print ws
+    m = len(testMat[:, 0])
+    yHat3 = mat(zeros((m, 1)))
+    for i in range(shape(testMat)[0]):
+        yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
+    print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]