格式数学公式

2026-02-09 05:15:28 +08:00 · 2017-02-27 16:33:37 +08:00
parent 186f4dbaba
commit 7da5afed05
3 changed files with 196 additions and 0 deletions
--- a/src/python/03.DecisionTree.py
+++ b/src/python/03.DecisionTree.py
@@ -0,0 +1,172 @@
+#!/usr/bin/python
+# coding:utf8
+
+'''
+Created on Oct 12, 2010
+Update on 2017-02-27
+Decision Tree Source Code for Machine Learning in Action Ch. 3
+@author: Peter Harrington/jiangzhonglian
+'''
+from math import log
+import operator
+
+
+def createDataSet():
+    """Build the small toy data set used by the examples in this module.
+
+    Every sample is a list of two 0/1 feature values followed by its class
+    label ('yes' or 'no') in the last position.
+
+    Args:
+        None.
+    Returns:
+        A (dataSet, labels) pair: the list of five samples and the list of
+        the two feature names, given in the same order as the feature columns.
+    """
+    featureNames = ['no surfacing', 'flippers']
+    samples = [
+        [1, 1, 'yes'],
+        [1, 1, 'yes'],
+        [1, 0, 'no'],
+        [0, 1, 'no'],
+        [0, 1, 'no'],
+    ]
+    return samples, featureNames
+
+
+def calcShannonEnt(dataSet):
+    """Calculate the Shannon entropy of the class labels in a data set.
+
+    The class label of a sample is its last element. With p(c) the fraction
+    of samples labelled c, the result is  H = -sum(p(c) * log2(p(c))).
+
+    Args:
+        dataSet: list of samples; only the last item of each sample is read.
+    Returns:
+        The entropy as a float; 0.0 when dataSet is empty.
+    Side effects:
+        Prints one trace line per distinct label showing the probability,
+        its p * log2(p) term and the running entropy.
+    """
+    total = len(dataSet)
+
+    # Tally how many samples carry each class label.
+    tally = {}
+    for sample in dataSet:
+        label = sample[-1]
+        tally[label] = tally.get(label, 0) + 1
+
+    # Accumulate -p * log2(p) over the labels, tracing every step.
+    entropy = 0.0
+    for label in tally:
+        prob = float(tally[label]) / total
+        entropy -= prob * log(prob, 2)
+        print '---', prob, prob * log(prob, 2), entropy
+    return entropy
+
+
+def splitDataSet(dataSet, axis, value):
+    """Pick the samples whose column `axis` equals `value`, minus that column.
+
+    Args:
+        dataSet: list of samples, each one a list.
+        axis: index of the column to test and then remove.
+        value: the value a sample must have in that column to be kept.
+    Returns:
+        A new list holding, for every matching sample and in the original
+        order, a new list of its items before and after column `axis`.
+        The input samples are not modified.
+    """
+    matching = (sample for sample in dataSet if sample[axis] == value)
+    reduced = []
+    for sample in matching:
+        # Rebuild the sample without the column that was used for the split.
+        trimmed = sample[:axis]
+        trimmed.extend(sample[axis + 1:])
+        reduced.append(trimmed)
+    return reduced
+
+
+def chooseBestFeatureToSplit(dataSet):
+    """Find the feature column whose split yields the largest information gain.
+
+    The last column of every sample is treated as the class label; all other
+    columns are candidate features. For each candidate the data set is split
+    by every distinct value of that column, and the entropies of the subsets
+    (from calcShannonEnt), weighted by subset size, are summed. The gain is
+    the entropy of the whole set minus that weighted sum.
+
+    Args:
+        dataSet: non-empty list of samples (lists) of equal length.
+    Returns:
+        The integer index of the first feature with the highest gain, or -1
+        when no feature has a gain strictly greater than 0.0.
+    """
+    sampleCount = float(len(dataSet))
+    # Every column except the last one (the class label) is a feature.
+    featureCount = len(dataSet[0]) - 1
+    entropyBefore = calcShannonEnt(dataSet)
+    winner, winnerGain = -1, 0.0
+    for col in range(featureCount):
+        entropyAfter = 0.0
+        # One subset per distinct value observed in this column.
+        for val in set(row[col] for row in dataSet):
+            subset = splitDataSet(dataSet, col, val)
+            weight = len(subset) / sampleCount
+            entropyAfter += weight * calcShannonEnt(subset)
+        # Information gain = reduction in entropy achieved by the split.
+        gain = entropyBefore - entropyAfter
+        if gain > winnerGain:
+            winner, winnerGain = col, gain
+    return winner
+
+
+def majorityCnt(classList):
+    """Return the value that occurs most often in classList.
+
+    Args:
+        classList: non-empty list of hashable class labels.
+    Returns:
+        The label with the highest count. When several labels share the
+        highest count, the one that comes first in the dict's iteration
+        order is returned.
+    Raises:
+        IndexError: if classList is empty.
+    """
+    votes = {}
+    for label in classList:
+        votes[label] = votes.get(label, 0) + 1
+    # Stable in-place sort by count, highest first.
+    ranking = list(votes.items())
+    ranking.sort(key=operator.itemgetter(1), reverse=True)
+    return ranking[0][0]
+
+
+def createTree(dataSet, labels):
+    """Recursively build a decision tree as nested dicts.
+
+    An inner node has the form {featureName: {featureValue: subtree, ...}};
+    a leaf is a bare class label.
+
+    Args:
+        dataSet: non-empty list of samples (lists) whose last item is the
+            class label.
+        labels: list of feature names, one per feature column of dataSet.
+            It is only read; the caller's list is left untouched.
+    Returns:
+        A class label if the recursion stops at this node, otherwise a dict
+        holding the subtree rooted at the chosen feature.
+    """
+    classList = [example[-1] for example in dataSet]
+    # Stop splitting when all of the classes are equal.
+    if classList.count(classList[0]) == len(classList):
+        return classList[0]
+    # Stop splitting when only the class column is left: take a vote.
+    if len(dataSet[0]) == 1:
+        return majorityCnt(classList)
+    bestFeat = chooseBestFeatureToSplit(dataSet)
+    # chooseBestFeatureToSplit returns -1 when no feature has positive gain.
+    # Using -1 as an index would split on the class column itself, so fall
+    # back to a majority vote instead.
+    if bestFeat == -1:
+        return majorityCnt(classList)
+    bestFeatLabel = labels[bestFeat]
+    # Build the reduced label list as a new object rather than deleting from
+    # `labels`: the list is shared with the caller, and deleting from it made
+    # later lookups fail with "'no surfacing' is not in list".
+    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
+    myTree = {bestFeatLabel: {}}
+    for value in set(example[bestFeat] for example in dataSet):
+        myTree[bestFeatLabel][value] = createTree(
+            splitDataSet(dataSet, bestFeat, value), subLabels)
+    return myTree
+
+
+def classify(inputTree, featLabels, testVec):
+    """Walk a decision tree and return the class label for one sample.
+
+    Args:
+        inputTree: tree of nested dicts shaped like
+            {featureName: {featureValue: subtree_or_label}}.
+        featLabels: list of feature names; the position of a name is the
+            index of that feature's value inside testVec.
+        testVec: feature values of the sample to classify.
+    Returns:
+        The first non-dict value reached while following testVec's values.
+    Raises:
+        ValueError: if a feature name in the tree is missing from featLabels.
+        KeyError: if the tree has no branch for the sample's feature value.
+    Side effects:
+        Prints a trace line with the keys of every node visited.
+    """
+    node = inputTree
+    while True:
+        print '1111', node.keys()
+        # A node dict has a single key: the name of the feature it tests.
+        featureName = node.keys()[0]
+        branches = node[featureName]
+        outcome = branches[testVec[featLabels.index(featureName)]]
+        if not isinstance(outcome, dict):
+            return outcome
+        # Still an inner node: descend one level.
+        node = outcome
+
+
+def storeTree(inputTree, filename):
+    """Serialize a decision tree to a file with pickle.
+
+    Args:
+        inputTree: the tree (nested dicts) to save.
+        filename: path of the file to write; an existing file is overwritten.
+    Returns:
+        None.
+    """
+    import pickle
+    # The `with` block closes the file even when pickle.dump raises.
+    # The mode stays 'w' so that it matches how grabTree opens the file.
+    with open(filename, 'w') as fw:
+        pickle.dump(inputTree, fw)
+
+
+def grabTree(filename):
+    """Load a decision tree that was previously saved with pickle.
+
+    Args:
+        filename: path of the pickle file to read.
+    Returns:
+        The object stored in the file.
+    """
+    import pickle
+    # NOTE: unpickling can execute arbitrary code; only load trusted files.
+    # The `with` block closes the file once loading is done (the handle used
+    # to be left open).
+    with open(filename) as fr:
+        return pickle.load(fr)
+
+
+if __name__ == "__main__":
+
+    # 1. Build the sample data set and the feature names that go with it.
+    samples, featureNames = createDataSet()
+    print samples, featureNames
+
+    # Entropy of the whole data set; calcShannonEnt prints its own trace.
+    calcShannonEnt(samples)
+
+    # Example of building and using a tree. A copy of the feature names is
+    # handed to createTree so the original list stays available to classify.
+    # import copy
+    # myTree = createTree(samples, copy.deepcopy(featureNames))
+    # print myTree
+    # print classify(myTree, featureNames, [1, 1])