diff --git a/README.md b/README.md
index c7544159..b87dcabc 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@
 * 2) k-Nearest Neighbors
     * [k-Nearest Neighbors](./docs/2.k-近邻算法.md)
 * 3) Decision Trees
+    * [Decision Trees](./docs/3.决策树.md)
 * 4) Classifying with probability theory: naive Bayes
 * 5) Logistic regression
 * 6) Support vector machines
diff --git a/docs/3.决策树.md b/docs/3.决策树.md
new file mode 100644
index 00000000..f91b40bf
--- /dev/null
+++ b/docs/3.决策树.md
@@ -0,0 +1,38 @@
+
+# 3) Decision Trees
+
+* What is a decision tree?
+    * As the name suggests, it is a tree: one built up from a series of strategic choices.
+    * The machine learning technique that produces a decision tree from data is called decision tree learning, or, informally, just "decision trees".
+* Where decision trees stand today:
+    * 1. They are among the most frequently used data mining algorithms (popular because you can work out how a decision tree operates without knowing any machine learning).
+    * 2. Both the data representation (every decision in the process is a plain yes/no test) and the meaning the data encodes are very easy to understand.
+    * 3. The results a decision tree produces can often rival those of a human expert with decades of experience in the field.
+* Constructing a decision tree:
+    * Pros: low computational cost, easy-to-interpret output, insensitive to missing intermediate values, and able to handle irrelevant features.
+    * Cons: prone to overfitting.
+    * Applicable data types: numeric and nominal (nominal simply means discrete: the variable takes values only from a `finite` target set, e.g. a categorical feature that is one of classes A/B/C).
+* How do we find the first split point?
+    * Information gain:
+        * The overriding principle when splitting a dataset: make disordered data more ordered.
+        * The measure of the information in a set is called `Shannon entropy`, or simply `entropy` (named after `Claude Shannon`, the father of information theory).
+        * Formulas:
+            * the information of the value x_i: l(x_i) = -log_2 P(x_i)
+            * the entropy of the whole set is the expected information over all n classes: H = -∑_{i=1}^{n} P(x_i) log_2 P(x_i)
+    * Gini impurity [not covered in depth in this book; a minimal sketch follows below]:
+        * In short: randomly pick an item from the dataset and measure the probability that it is misclassified into one of the other groups.
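+
+Gini impurity is not used elsewhere in this chapter, so what follows is only a minimal sketch of the idea, assuming (as in this chapter's example code) that the class label sits in the last column of each record; the helper name `calcGini` is ours, not the book's:
+
+```python
+def calcGini(dataSet):
+    """Gini impurity of a dataset whose last column is the class label:
+    the chance that a randomly drawn item would be misclassified if it
+    were labeled at random according to the observed label frequencies."""
+    labelCounts = {}
+    for featVec in dataSet:
+        labelCounts[featVec[-1]] = labelCounts.get(featVec[-1], 0) + 1
+    total = float(len(dataSet))
+    # 1 minus the sum of squared class probabilities
+    return 1.0 - sum((count / total) ** 2 for count in labelCounts.values())
+```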
diff --git a/src/python/03.DecisionTree.py b/src/python/03.DecisionTree.py
new file mode 100644
index 00000000..e958eef8
--- /dev/null
+++ b/src/python/03.DecisionTree.py
@@ -0,0 +1,199 @@
+#!/usr/bin/python
+# coding:utf8
+
+'''
+Created on Oct 12, 2010
+Update on 2017-02-27
+Decision Tree Source Code for Machine Learning in Action Ch. 3
+@author: Peter Harrington/jiangzhonglian
+'''
+from math import log
+import operator
+
+
+def createDataSet():
+    """createDataSet (basic dataset)
+
+    Args:
+        none
+    Returns:
+        the dataset and the corresponding feature labels
+    Raises:
+
+    """
+    dataSet = [[1, 1, 'yes'],
+               [1, 1, 'yes'],
+               [1, 0, 'no'],
+               [0, 1, 'no'],
+               [0, 1, 'no']]
+    labels = ['no surfacing', 'flippers']
+    # change to discrete values
+    return dataSet, labels
+
+
+def calcShannonEnt(dataSet):
+    """calcShannonEnt (calculate Shannon entropy)
+
+    Args:
+        dataSet the dataset
+    Returns:
+        the Shannon entropy of the dataset's class labels
+    Raises:
+
+    """
+    # length of the list, i.e. the number of training entries
+    numEntries = len(dataSet)
+    # print type(dataSet), 'numEntries: ', numEntries
+
+    # count how often each class label occurs
+    labelCounts = {}
+    # count the number of unique elements and their occurrence
+    for featVec in dataSet:
+        currentLabel = featVec[-1]
+        if currentLabel not in labelCounts.keys():
+            labelCounts[currentLabel] = 0
+        labelCounts[currentLabel] += 1
+        # print '-----', featVec, labelCounts
+
+    # turn the label frequencies into the Shannon entropy
+    shannonEnt = 0.0
+    for key in labelCounts:
+        prob = float(labelCounts[key])/numEntries
+        # log base 2
+        shannonEnt -= prob * log(prob, 2)
+        # print '---', prob, prob * log(prob, 2), shannonEnt
+    return shannonEnt
+
+
+def splitDataSet(dataSet, axis, value):
+    """splitDataSet: return the entries whose feature `axis` equals `value`,
+    with that feature removed."""
+    retDataSet = []
+    for featVec in dataSet:
+        if featVec[axis] == value:
+            # chop out axis used for splitting
+            reducedFeatVec = featVec[:axis]
+            reducedFeatVec.extend(featVec[axis+1:])
+            retDataSet.append(reducedFeatVec)
+    return retDataSet
+
+
+def chooseBestFeatureToSplit(dataSet):
+    """chooseBestFeatureToSplit: pick the feature whose split yields the
+    largest information gain."""
+    # the last column is used for the labels
+    numFeatures = len(dataSet[0]) - 1
+    baseEntropy = calcShannonEnt(dataSet)
+    bestInfoGain = 0.0
+    bestFeature = -1
+    # iterate over all the features
+    for i in range(numFeatures):
+        # create a list of all the examples of this feature
+        featList = [example[i] for example in dataSet]
+        # get a set of unique values
+        uniqueVals = set(featList)
+        newEntropy = 0.0
+        for value in uniqueVals:
+            subDataSet = splitDataSet(dataSet, i, value)
+            prob = len(subDataSet)/float(len(dataSet))
+            newEntropy += prob * calcShannonEnt(subDataSet)
+        # calculate the info gain, i.e. the reduction in entropy
+        infoGain = baseEntropy - newEntropy
+        # compare this to the best gain so far
+        if (infoGain > bestInfoGain):
+            # if better than current best, set to best
+            bestInfoGain = infoGain
+            bestFeature = i
+    # returns an integer
+    return bestFeature
+
+
+def majorityCnt(classList):
+    """majorityCnt: return the class label that occurs most often in classList."""
+    classCount = {}
+    for vote in classList:
+        if vote not in classCount.keys():
+            classCount[vote] = 0
+        classCount[vote] += 1
+    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
+    return sortedClassCount[0][0]
+
+
+def createTree(dataSet, labels):
+    """createTree: recursively build the decision tree as nested dicts."""
+    classList = [example[-1] for example in dataSet]
+    # stop splitting when all of the classes are equal
+    if classList.count(classList[0]) == len(classList):
+        return classList[0]
+    # stop splitting when there are no more features in dataSet
+    if len(dataSet[0]) == 1:
+        return majorityCnt(classList)
+    bestFeat = chooseBestFeatureToSplit(dataSet)
+    bestFeatLabel = labels[bestFeat]
+    myTree = {bestFeatLabel: {}}
+    # Note: labels is a mutable list and Python passes it by reference,
+    # so this del() also removes the element from the caller's variable.
+    # Calling classify() with that same list afterwards then fails with
+    # "'no surfacing' is not in list"; pass a copy in (see __main__).
+    del(labels[bestFeat])
+    featValues = [example[bestFeat] for example in dataSet]
+    uniqueVals = set(featValues)
+    for value in uniqueVals:
+        # copy all of labels, so trees don't mess up existing labels
+        subLabels = labels[:]
+        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
+    return myTree
+
+
+def classify(inputTree, featLabels, testVec):
+    """classify: walk the tree with testVec and return the predicted label."""
+    # the first node of the tree is the feature to test first
+    firstStr = inputTree.keys()[0]
+    secondDict = inputTree[firstStr]
+    featIndex = featLabels.index(firstStr)
+    key = testVec[featIndex]
+    valueOfFeat = secondDict[key]
+    if isinstance(valueOfFeat, dict):
+        # an inner node: keep descending
+        classLabel = classify(valueOfFeat, featLabels, testVec)
+    else:
+        # a leaf: this is the answer
+        classLabel = valueOfFeat
+    return classLabel
+
+
+def storeTree(inputTree, filename):
+    """storeTree: pickle the tree to disk so it can be reused later."""
+    import pickle
+    fw = open(filename, 'wb')
+    pickle.dump(inputTree, fw)
+    fw.close()
+
+
+def grabTree(filename):
+    """grabTree: load a tree pickled by storeTree."""
+    import pickle
+    fr = open(filename, 'rb')
+    return pickle.load(fr)
+
+
+if __name__ == "__main__":
+
+    # 1. create the dataset and the corresponding feature labels
+    myDat, labels = createDataSet()
+    print myDat, labels
+
+    print calcShannonEnt(myDat)
+
+    # createTree() deletes entries from the labels list it is given (see the
+    # note inside createTree), so hand it a copy and keep the original for classify()
+    import copy
+    myTree = createTree(myDat, copy.deepcopy(labels))
+    print myTree
+    print classify(myTree, labels, [1, 1])
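+
+    # A minimal persistence sketch (our addition, not part of the book's listing;
+    # the filename is arbitrary): storeTree/grabTree pickle the finished tree to
+    # disk and load it back, so a tree that was expensive to build can be reused.
+    storeTree(myTree, 'classifierStorage.txt')
+    print grabTree('classifierStorage.txt')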