Mirror of https://github.com/apachecn/ailearning.git
@@ -9,8 +9,10 @@
* 2) k-Nearest Neighbors
    * [k-Nearest Neighbors](./docs/2.k-近邻算法.md)
* 3) Decision Trees
    * [Decision Trees](./docs/3.决策树.md)
* 4) Classifying with Probability Theory: Naive Bayes
* 5) Logistic Regression
    * [Logistic Regression](./docs/5.Logistic回归.md)
* 6) Support Vector Machines
* 7) Improving Classification with the AdaBoost Meta-Algorithm
@@ -28,6 +30,7 @@
## Part 4: Additional Tools

* 13) Simplifying Data with PCA
    * [Simplifying Data with PCA](./docs/13.利用PCA来简化数据.md)
* 14) Simplifying Data with SVD
* 15) Big Data and MapReduce
6 docs/13.利用PCA来简化数据.md Normal file
@@ -0,0 +1,6 @@
# 13) Simplifying Data with PCA
<script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=default"></script>

* To be continued
25 docs/3.决策树.md Normal file
@@ -0,0 +1,25 @@
# 3) Decision Trees
<script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=default"></script>

* What is a decision tree?
    * As the name suggests, it is a tree: one built from a sequence of decision rules.
    * The machine learning technique that produces a decision tree from data is called decision tree learning, or colloquially just "decision trees".
* Decision trees today:
    * 1. One of the most frequently used data mining algorithms. (Why they are popular: you do not need any machine learning background to understand how a decision tree works.)
    * 2. The data format [each decision step is a simple yes/no test] and the meaning encoded in the data are very easy to understand.
    * 3. The results a decision tree produces can often rival those of a human expert with decades of experience in the field.
* Constructing a decision tree:
    * Pros: low computational complexity, output that is easy to interpret, insensitivity to missing intermediate values, and the ability to handle irrelevant features.
    * Cons: prone to overfitting.
    * Applicable data types: numeric and nominal [nominal: essentially discrete data, where a variable takes values only from a `finite` target set (e.g. a categorical feature that is one of classes A/B/C)].
* How do we find the first split point?
    * Information gain:
        * The guiding principle for splitting a dataset: make disordered data more ordered.
        * The measure of information in a set is called `Shannon entropy`, or simply `entropy` (named after `Claude Shannon`, the father of information theory).
        * Formulas (see the worked example below):
            * \\(p(x_i)\\) is the probability of class label \\(x_i\\)
            * \\(l(x_i) = -\log_2 p(x_i)\\) is the information of the symbol \\(x_i\\)
            * \\(H = -\sum_{i=1}^{n} p(x_i)\log_2 p(x_i)\\) is the Shannon entropy: the expected information over all class labels
    * Gini impurity [not covered further in this book]:
        * In short: pick an item from the dataset at random; the Gini impurity is the probability that it would be misclassified if labeled randomly according to the distribution of labels in the dataset.
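To make the entropy and information-gain formulas concrete, here is a small worked sketch (not part of the original notes; `shannon_entropy` and `info_gain` are illustrative helper names) computed on the fish dataset used later in `DecisionTree.py`:

```python
from math import log

def shannon_entropy(rows):
    # H = -sum(p(x_i) * log2(p(x_i))) over the class labels in the last column
    counts = {}
    for row in rows:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    return -sum(float(c) / len(rows) * log(float(c) / len(rows), 2)
                for c in counts.values())

def info_gain(rows, col):
    # gain = H(labels) - weighted sum of H(labels | col == value)
    split = {}
    for row in rows:
        split.setdefault(row[col], []).append(row)
    remainder = sum(float(len(s)) / len(rows) * shannon_entropy(s)
                    for s in split.values())
    return shannon_entropy(rows) - remainder

data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
print(shannon_entropy(data))   # H(2/5, 3/5) ~ 0.9710
print(info_gain(data, 0))      # ~ 0.4200: column 0 is the better first split
print(info_gain(data, 1))      # ~ 0.1710
```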
11 docs/5.Logistic回归.md Normal file
@@ -0,0 +1,11 @@
# 1) Logistic Regression Basics

* Logistic Regression
    * 1.1 The classification problem
    * 1.2 Hypothesis representation
    * 1.3 Decision boundary
    * 1.4 Cost function (1.2 and 1.4 are sketched below)
    * 1.5 Simplified cost function and gradient descent
    * 1.6 Advanced optimization
    * 1.7 Multiclass classification: one-vs-all
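Since these notes are still only an outline, here is a minimal sketch of what items 1.2 and 1.4 refer to: the sigmoid hypothesis and the cross-entropy cost. It is an illustration under assumed names (`sigmoid`, `hypothesis`, `cost`), not the chapter's final code:

```python
import numpy as np

def sigmoid(z):
    # 1.2 hypothesis representation: h(x) = 1 / (1 + e^(-theta . x))
    return 1.0 / (1.0 + np.exp(-z))

def hypothesis(theta, X):
    # predicted probability that y = 1 for each row of X
    return sigmoid(X.dot(theta))

def cost(theta, X, y):
    # 1.4 cost function: J = -mean(y*log(h) + (1-y)*log(1-h))
    h = hypothesis(theta, X)
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

# toy usage: one feature plus a bias column
X = np.array([[1.0, 0.5], [1.0, 2.0], [1.0, -1.0]])
y = np.array([1.0, 1.0, 0.0])
print(cost(np.zeros(2), X, y))  # log(2) ~ 0.6931 for all-zero weights
```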
115 src/python/03.DecisionTree/DTSklearn.py Normal file
@@ -0,0 +1,115 @@
#!/usr/bin/python
# coding: utf8
# Original source: http://blog.csdn.net/lsldd/article/details/41223147
import numpy as np
from sklearn import tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
# sklearn < 0.18; newer versions moved this to sklearn.model_selection
from sklearn.cross_validation import train_test_split


def createDataSet():
    ''' Read in the data '''
    data = []
    labels = []
    with open("testData/DT_data.txt") as ifile:
        for line in ifile:
            # features: height, weight; label: fat/thin
            tokens = line.strip().split(' ')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    # feature matrix
    x = np.array(data)
    # class label strings
    labels = np.array(labels)
    # numeric target labels
    y = np.zeros(labels.shape)

    ''' Convert the labels to 0/1 '''
    y[labels == 'fat'] = 1
    print data, '-------', x, '-------', labels, '-------', y
    return x, y


def predict_train(x_train, y_train):
    '''
    Train the decision tree using information entropy as the split criterion.
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
    '''
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    # print(clf)
    clf.fit(x_train, y_train)
    ''' The importances reflect each feature's influence; larger values mean the feature matters more for classification. '''
    print 'feature_importances_: %s' % clf.feature_importances_

    '''Print the results on the training set'''
    y_pre = clf.predict(x_train)
    # print(x_train)
    print(y_pre)
    print(y_train)
    print(np.mean(y_pre == y_train))
    return y_pre, clf


def show_precision_recall(x, y, clf, y_train, y_pre):
    '''
    Precision and recall. (y is now passed in explicitly; the original relied on the global y.)
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html#sklearn.metrics.precision_recall_curve
    '''
    precision, recall, thresholds = precision_recall_curve(y_train, y_pre)
    # predicted probability of the positive class for the full dataset
    answer = clf.predict_proba(x)[:, 1]

    '''
    Show precision and recall:
    precision
    recall
    f1-score  a combined score of precision and recall
    support   the number of samples in each class
    Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    '''
    # target_names follows the label classes in y
    target_names = ['thin', 'fat']
    # classification_report expects class labels, so threshold the probabilities at 0.5
    print(classification_report(y, answer > 0.5, target_names=target_names))
    print(answer)
    print(y)


def show_pdf(clf):
    '''
    Visualize the tree.
    Write the tree structure to a file: http://sklearn.lzjqsdd.com/modules/tree.html

    Error on Mac: pydotplus.graphviz.InvocationException: GraphViz's executables not found
    Fix: brew install graphviz
    Reference: http://www.jianshu.com/p/59b510bafb4d
    '''
    # with open("testResult/tree.dot", 'w') as f:
    #     from sklearn.externals.six import StringIO
    #     tree.export_graphviz(clf, out_file=f)

    import pydotplus
    from sklearn.externals.six import StringIO
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("testResult/tree.pdf")

    # from IPython.display import Image
    # Image(graph.create_png())


if __name__ == '__main__':
    x, y = createDataSet()

    ''' Split into training and test data: 80% train, 20% test '''
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    print 'split data:', x_train, x_test, y_train, y_test

    # train and get predictions on the training set
    y_pre, clf = predict_train(x_train, y_train)

    # show precision and recall
    show_precision_recall(x, y, clf, y_train, y_pre)

    # visualize
    show_pdf(clf)
243 src/python/03.DecisionTree/DecisionTree.py Normal file
@@ -0,0 +1,243 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Oct 12, 2010
Update on 2017-02-27
Decision Tree Source Code for Machine Learning in Action Ch. 3
@author: Peter Harrington/jiangzhonglian
'''
from math import log
import operator
import DecisionTreePlot as dtPlot


def createDataSet():
    """DataSet: basic dataset

    Args:
        none
    Returns:
        the dataset and the corresponding feature labels
    Raises:

    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    # dataSet = [['yes'],
    #            ['yes'],
    #            ['no'],
    #            ['no'],
    #            ['no']]
    labels = ['no surfacing', 'flippers']
    # change to discrete values
    return dataSet, labels


def calcShannonEnt(dataSet):
    """calcShannonEnt (calculate the Shannon entropy of the class labels)

    Args:
        dataSet: the dataset
    Returns:
        the Shannon entropy
    Raises:

    """
    # the length of the list, i.e. the number of training samples
    numEntries = len(dataSet)
    # print type(dataSet), 'numEntries: ', numEntries

    # count how often each class label occurs
    labelCounts = {}
    # count the unique elements and their occurrences
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
        # print '-----', featVec, labelCounts

    # compute the Shannon entropy from the label proportions
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        # log base 2
        shannonEnt -= prob * log(prob, 2)
        # print '---', prob, prob * log(prob, 2), shannonEnt
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """splitDataSet (iterate over dataSet and collect the rows whose column axis equals value)

    Args:
        dataSet: the dataset
        axis: the column index to test in each row
        value: the value that column axis must equal
    Returns:
        the rows whose column axis equals value [with column axis removed]
    Raises:

    """
    retDataSet = []
    for featVec in dataSet:
        # keep the rows whose column axis equals value [minus that column]
        if featVec[axis] == value:
            # chop out axis used for splitting
            reducedFeatVec = featVec[:axis]
            '''
            Note the difference between extend and append:
            extend adds the elements of a list one by one; append adds the list as a single element.
            '''
            reducedFeatVec.extend(featVec[axis+1:])
            # collect the row [with column axis removed]
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """chooseBestFeatureToSplit (choose the best feature to split on)

    Args:
        dataSet: the dataset
    Returns:
        bestFeature: the column index of the best feature
    Raises:

    """
    # number of feature columns in the first row (the last column is the label)
    numFeatures = len(dataSet[0]) - 1
    # entropy of the labels
    baseEntropy = calcShannonEnt(dataSet)
    # best information gain so far, and the index of the best feature
    bestInfoGain, bestFeature = 0.0, -1
    # iterate over all the features
    for i in range(numFeatures):
        # create a list of all the examples of this feature
        featList = [example[i] for example in dataSet]
        # get a set of unique values
        uniqueVals = set(featList)
        # entropy after splitting on this feature
        newEntropy = 0.0
        # iterate over the values of this column and accumulate its entropy
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        # the larger the information gain, the more the feature reduces uncertainty about the class
        infoGain = baseEntropy - newEntropy
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """majorityCnt (pick the label that occurs most often)

    Args:
        classList: the list of class labels
    Returns:
        the most frequent label
    Raises:

    """
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    # sort classCount in descending order of count; the first entry is the answer (yes/no)
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    # print 'sortedClassCount:', sortedClassCount
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # if the first label accounts for every row, there is only one class: return it directly
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # if only the label column is left, return the most frequent label
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    # choose the best column and look up the feature label it corresponds to
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    # initialize myTree
    myTree = {bestFeatLabel: {}}
    # Note: labels is a mutable object, passed into this Python function by reference,
    # so this line also removes the element from the caller's variable; calling
    # classify() with the same list afterwards fails with "'no surfacing' is not in list".
    del(labels[bestFeat])
    # take the values of the best column and build one branch per value
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # the remaining feature labels
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
        # print 'myTree', value, myTree
    return myTree


def classify(inputTree, featLabels, testVec):
    # the key of the root node of the tree
    firstStr = inputTree.keys()[0]
    # the value under the root node
    secondDict = inputTree[firstStr]
    # find the index of the root feature, then follow the branch that matches testVec
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    print '+++', firstStr, 'xxx', secondDict, '---', key, '>>>', valueOfFeat
    # recurse while the branch is still a subtree
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat
    return classLabel


def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'w')
    pickle.dump(inputTree, fw)
    fw.close()


def grabTree(filename):
    import pickle
    fr = open(filename)
    return pickle.load(fr)


if __name__ == "__main__":

    # 1. create the dataset and its labels
    myDat, labels = createDataSet()
    # print myDat, labels

    # # compute the Shannon entropy of the class labels
    # calcShannonEnt(myDat)

    # # rows where column 0 is 1/0, with column 0 removed
    # print '1---', splitDataSet(myDat, 0, 1)
    # print '0---', splitDataSet(myDat, 0, 0)

    # # the column with the best information gain
    # print chooseBestFeatureToSplit(myDat)

    import copy
    myTree = createTree(myDat, copy.deepcopy(labels))
    print myTree
    # [1, 1] are the branch values to follow; prints the corresponding result
    # print classify(myTree, labels, [1, 1])

    # plot the tree
    dtPlot.createPlot(myTree)
132 src/python/03.DecisionTree/DecisionTreePlot.py Normal file
@@ -0,0 +1,132 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Oct 14, 2010
Update on 2017-02-27
Decision Tree Source Code for Machine Learning in Action Ch. 3
@author: Peter Harrington/jiangzhonglian
'''
import matplotlib.pyplot as plt


# define the box and arrow styles [sawtooth: wavy box, round4: rounded box; fc is the gray fill level, 0.1-0.9, lighter as it increases]
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    # walk the tree from the root
    for key in secondDict.keys():
        # if the child is a dict, recurse; otherwise count it as a leaf
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = myTree.keys()[0]
    secondDict = myTree[firstStr]
    # walk the tree from the root
    for key in secondDict.keys():
        # if the child is a dict, measure the depth of that branch
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        # keep the maximum branch depth
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', xytext=centerPt, textcoords='axes fraction', va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)


def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)


def plotTree(myTree, parentPt, nodeTxt):
    # number of leaves
    numLeafs = getNumLeafs(myTree)
    # depth of the tree
    # depth = getTreeDepth(myTree)

    # find the center point of this subtree, then draw a line to the anchor parentPt
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    # print cntrPt
    # and label the edge with the branch value
    plotMidText(cntrPt, parentPt, nodeTxt)

    firstStr = myTree.keys()[0]
    # draw the decision node
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    # the children of the root
    secondDict = myTree[firstStr]
    # y = top - one level's height [the position of the next level]
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        # is this child an internal node?
        if type(secondDict[key]).__name__ == 'dict':
            # if so, recurse
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # otherwise advance one leaf slot and place the leaf there
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            # draw the leaf node
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            # and label the edge with the branch value
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    # plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD


def createPlot(inTree):
    # create a figure
    fig = plt.figure(1, facecolor='green')
    fig.clf()

    axprops = dict(xticks=[], yticks=[])
    # create a 1x1 grid of subplots; createPlot.ax1 is the first subplot
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)

    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # start half a node's width to the left
    plotTree.xOff = -0.5/plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


# # plotting test
# def createPlot():
#     fig = plt.figure(1, facecolor='white')
#     fig.clf()
#     # ticks for demo purposes
#     createPlot.ax1 = plt.subplot(111, frameon=False)
#     plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
#     plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
#     plt.show()


# predefined test trees
def retrieveTree(i):
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
    ]
    return listOfTrees[i]


# guard the demo so it does not run when this module is imported by DecisionTree.py
if __name__ == '__main__':
    myTree = retrieveTree(0)
    createPlot(myTree)
124 src/python/tools/DecisionTree_getInfoGain.py Normal file
@@ -0,0 +1,124 @@
#!/usr/bin/python
# coding: utf8

from math import log


def calcShannonEnt(dataSet):
    """calcShannonEnt (calculate the Shannon entropy of the class labels)

    Args:
        dataSet: the dataset
    Returns:
        the Shannon entropy
    Raises:

    """
    # the length of the list, i.e. the number of training samples
    numEntries = len(dataSet)
    # print type(dataSet), 'numEntries: ', numEntries

    # count how often each class label occurs
    labelCounts = {}
    # count the unique elements and their occurrences
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
        # print '-----', featVec, labelCounts

    # compute the Shannon entropy from the label proportions
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        # log base 2
        shannonEnt -= prob * log(prob, 2)
        # print '---', prob, prob * log(prob, 2), shannonEnt
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """splitDataSet (iterate over dataSet and collect the rows whose column axis equals value)

    Args:
        dataSet: the dataset
        axis: the column index to test in each row
        value: the value that column axis must equal
    Returns:
        the rows whose column axis equals value [with column axis removed]
    Raises:

    """
    retDataSet = []
    for featVec in dataSet:
        # keep the rows whose column axis equals value [minus that column]
        if featVec[axis] == value:
            # chop out axis used for splitting
            reducedFeatVec = featVec[:axis]
            '''
            Note the difference between extend and append:
            extend adds the elements of a list one by one; append adds the list as a single element.
            '''
            reducedFeatVec.extend(featVec[axis+1:])
            # collect the row [with column axis removed]
            retDataSet.append(reducedFeatVec)
    return retDataSet


def getFeatureShannonEnt(dataSet, labels):
    """getFeatureShannonEnt (choose the best feature to split on and report its entropies)

    Args:
        dataSet: the dataset
        labels: the feature labels
    Returns:
        the label of the best feature, the base entropy, the entropy after the split, and the information gain
    Raises:

    """
    # number of feature columns in the first row (the last column is the label)
    numFeatures = len(dataSet[0]) - 1
    # entropy of the labels
    baseEntropy = calcShannonEnt(dataSet)
    # best information gain so far, index of the best feature, and the entropy after the best split
    bestInfoGain, bestFeature, endEntropy = 0.0, -1, 0.0
    # iterate over all the features
    for i in range(numFeatures):
        # create a list of all the examples of this feature
        featList = [example[i] for example in dataSet]
        # get a set of unique values
        uniqueVals = set(featList)
        # entropy after splitting on this feature
        newEntropy = 0.0
        # iterate over the values of this column and accumulate its entropy
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        # the larger the gain, the more the feature reduces uncertainty about the class
        # gain = baseEntropy means the feature determines the class completely
        # gain = 0 means splitting on the feature is no better than not splitting
        infoGain = baseEntropy - newEntropy
        # print infoGain
        if (infoGain > bestInfoGain):
            endEntropy = newEntropy
            bestInfoGain = infoGain
            bestFeature = i

    # if the dataset has no feature columns (e.g. dataSet1), bestFeature stays -1 and labels[-1] is returned
    return labels[bestFeature], baseEntropy, endEntropy, bestInfoGain


if __name__ == '__main__':
    labels = ['no surfacing', 'flippers']
    dataSet1 = [['yes'], ['yes'], ['no'], ['no'], ['no']]
    dataSet2 = [['a', 1, 'yes'], ['a', 2, 'yes'], ['b', 3, 'no'], ['c', 4, 'no'], ['c', 5, 'no']]
    dataSet3 = [[1, 'yes'], [1, 'yes'], [1, 'no'], [3, 'no'], [3, 'no']]
    infoGain1 = getFeatureShannonEnt(dataSet1, labels)
    infoGain2 = getFeatureShannonEnt(dataSet2, labels)
    infoGain3 = getFeatureShannonEnt(dataSet3, labels)
    print 'information gain: \n\t%s, \n\t%s, \n\t%s' % (infoGain1, infoGain2, infoGain3)
10 testData/DT_data.txt Normal file
@@ -0,0 +1,10 @@
1.5 50 thin
1.5 60 fat
1.6 40 thin
1.6 60 fat
1.7 60 thin
1.7 80 fat
1.8 60 thin
1.8 90 fat
1.9 70 thin
1.9 80 thin
BIN testResult/tree.pdf Normal file
Binary file not shown.