diff --git a/src/python/03.DecisionTree/DTSklearn.py b/src/python/03.DecisionTree/DTSklearn.py
new file mode 100644
index 00000000..5155b214
--- /dev/null
+++ b/src/python/03.DecisionTree/DTSklearn.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+# coding: utf8
+# original article: http://blog.csdn.net/lsldd/article/details/41223147
+import numpy as np
+from sklearn import tree
+from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import classification_report
+from sklearn.cross_validation import train_test_split
+
+
+def createDataSet():
+    ''' load the dataset '''
+    data = []
+    labels = []
+    with open("testData/DT_data.txt") as ifile:
+        for line in ifile:
+            # features: height, weight; label: fat or thin
+            tokens = line.strip().split(' ')
+            data.append([float(tk) for tk in tokens[:-1]])
+            labels.append(tokens[-1])
+    # feature matrix
+    x = np.array(data)
+    # raw class labels
+    labels = np.array(labels)
+    # numeric target vector
+    y = np.zeros(labels.shape)
+
+    ''' convert the labels to 0/1 '''
+    y[labels == 'fat'] = 1
+    print data, '-------', x, '-------', labels, '-------', y
+    return x, y
+
+
+def predict_train(x_train, y_train):
+    '''
+    train a decision tree, using information entropy as the split criterion
+    reference: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
+    '''
+    clf = tree.DecisionTreeClassifier(criterion='entropy')
+    # print(clf)
+    clf.fit(x_train, y_train)
+    ''' the importance scores reflect each feature's influence: the larger the value, the bigger that feature's role in the classification '''
+    print 'feature_importances_: %s' % clf.feature_importances_
+
+    ''' print the predictions on the training set '''
+    y_pre = clf.predict(x_train)
+    # print(x_train)
+    print(y_pre)
+    print(y_train)
+    print(np.mean(y_pre == y_train))
+    return y_pre, clf
+
+
+def show_precision_recall(x, y, clf, y_train, y_pre):
+    '''
+    precision and recall
+    reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html#sklearn.metrics.precision_recall_curve
+    '''
+    precision, recall, thresholds = precision_recall_curve(y_train, y_pre)
+    # predicted probability of class 1 for the full dataset (the tree's leaves are pure on this tiny dataset, so these come out as 0/1)
+    answer = clf.predict_proba(x)[:, 1]
+
+    '''
+    report precision and recall
+    precision   the fraction of predicted positives that are correct
+    recall      the fraction of actual positives that are found
+    f1-score    the harmonic mean of precision and recall
+    support     the number of samples of each class taking part in the comparison
+    reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
+    '''
+    # target_names must follow the 0/1 coding of y: 0 -> thin, 1 -> fat
+    target_names = ['thin', 'fat']
+    print(classification_report(y, answer, target_names=target_names))
+    print(answer)
+    print(y)
+
+
+def show_pdf(clf):
+    '''
+    visualize the tree
+    write the decision tree structure to a file: http://sklearn.lzjqsdd.com/modules/tree.html
+
+    on a Mac this may raise: pydotplus.graphviz.InvocationException: GraphViz's executables not found
+    fix: brew install graphviz (Homebrew refuses to run under sudo)
+    reference: http://www.jianshu.com/p/59b510bafb4d
+    '''
+    # with open("testResult/tree.dot", 'w') as f:
+    #     from sklearn.externals.six import StringIO
+    #     tree.export_graphviz(clf, out_file=f)
+
+    import pydotplus
+    from sklearn.externals.six import StringIO
+    dot_data = StringIO()
+    tree.export_graphviz(clf, out_file=dot_data)
+    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
+    graph.write_pdf("testResult/tree.pdf")
+
+    # from IPython.display import Image
+    # Image(graph.create_png())
+
+if __name__ == '__main__':
+    x, y = createDataSet()
+
+    ''' split into training and test data: 80% for training, 20% for testing '''
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
+    print 'split result:', x_train, x_test, y_train, y_test
+
+    # get the predictions on the training set
+    y_pre, clf = predict_train(x_train, y_train)
+
+    # show precision and recall
+    show_precision_recall(x, y, clf, y_train, y_pre)
+
+    # visualize the tree
+    show_pdf(clf)
diff --git a/src/python/03.DecisionTree/DecisionTree.py b/src/python/03.DecisionTree/DecisionTree.py
index c7920bd1..03a4d25f 100644
--- a/src/python/03.DecisionTree/DecisionTree.py
+++ b/src/python/03.DecisionTree/DecisionTree.py
@@ -129,7 +129,7 @@ def chooseBestFeatureToSplit(dataSet):
             subDataSet = splitDataSet(dataSet, i, value)
             prob = len(subDataSet)/float(len(dataSet))
             newEntropy += prob * calcShannonEnt(subDataSet)
-        # compute the gain of the label entropy over each feature's split entropy; the bigger the gain, the better the split
+        # the larger the gain (information gain), the more information the split provides and the less uncertainty the feature leaves about the class
         infoGain = baseEntropy - newEntropy
         if (infoGain > bestInfoGain):
             bestInfoGain = infoGain
diff --git a/src/python/tools/DecisionTree.py b/src/python/tools/DecisionTree.py
new file mode 100644
index 00000000..a2974d91
--- /dev/null
+++ b/src/python/tools/DecisionTree.py
@@ -0,0 +1,124 @@
+#!/usr/bin/python
+# coding: utf8
+
+from math import log
+
+
+def calcShannonEnt(dataSet):
+    """calcShannonEnt(calculate the Shannon entropy of the class labels)
+
+    Args:
+        dataSet the dataset
+    Returns:
+        the computed Shannon entropy
+    Raises:
+
+    """
+    # length of the list, i.e. the number of samples taking part in training
+    numEntries = len(dataSet)
+    # print type(dataSet), 'numEntries: ', numEntries
+
+    # count how many times each class label occurs
+    labelCounts = {}
+    # the number of unique elements and their occurrences
+    for featVec in dataSet:
+        currentLabel = featVec[-1]
+        if currentLabel not in labelCounts:
+            labelCounts[currentLabel] = 0
+        labelCounts[currentLabel] += 1
+        # print '-----', featVec, labelCounts
+
+    # from the label frequencies, compute the Shannon entropy of the labels
+    shannonEnt = 0.0
+    for key in labelCounts:
+        prob = float(labelCounts[key])/numEntries
+        # log base 2
+        shannonEnt -= prob * log(prob, 2)
+        # print '---', prob, prob * log(prob, 2), shannonEnt
+    return shannonEnt
+
+
+def splitDataSet(dataSet, axis, value):
+    """splitDataSet(iterate over dataSet and select the rows whose column axis equals value)
+
+    Args:
+        dataSet the dataset
+        axis    the column index to test in each row
+        value   the value that column axis must match
+    Returns:
+        the rows whose column axis equals value, with column axis removed
+    Raises:
+
+    """
+    retDataSet = []
+    for featVec in dataSet:
+        # keep the rows whose column axis equals value (the column itself is dropped)
+        if featVec[axis] == value:
+            # chop out axis used for splitting
+            reducedFeatVec = featVec[:axis]
+            '''
+            note: extend flattens its argument into the list, while append would add it as a single element
+            '''
+            reducedFeatVec.extend(featVec[axis+1:])
+            # collect the rows with featVec[axis] == value (minus column axis)
+            retDataSet.append(reducedFeatVec)
+    return retDataSet
+
+
+def getFeatureShannonEnt(dataSet, labels):
+    """getFeatureShannonEnt(choose the best feature to split on and report its entropy figures)
+
+    Args:
+        dataSet the dataset
+        labels  the feature names, one per column
+    Returns:
+        the best feature's name, the base entropy, the entropy after the split and the information gain
+    Raises:
+
+    """
+    # number of feature columns in the first row (the last column is the label)
+    numFeatures = len(dataSet[0]) - 1
+    # entropy of the labels
+    baseEntropy = calcShannonEnt(dataSet)
+    # best information gain, best feature index and the entropy after that split
+    bestInfoGain, bestFeature, endEntropy = 0.0, -1, 0.0
+    # iterate over all the features
+    for i in range(numFeatures):
+        # create a list of all the examples of this feature
+        featList = [example[i] for example in dataSet]
+        # get a set of unique values
+        uniqueVals = set(featList)
+        # running entropy for this candidate split
+        newEntropy = 0.0
+        # iterate over the unique values of this column and accumulate the split entropy
+        for value in uniqueVals:
+            subDataSet = splitDataSet(dataSet, i, value)
+            prob = len(subDataSet)/float(len(dataSet))
+            newEntropy += prob * calcShannonEnt(subDataSet)
+        # the larger the gain (information gain), the less uncertainty the feature leaves about the class
+        # gain == 0 means the split tells us nothing new (no better than not splitting at all)
+        # gain == baseEntropy means the feature determines the class completely
+        infoGain = baseEntropy - newEntropy
+        # print infoGain
+        if (infoGain > bestInfoGain):
+            endEntropy = newEntropy
+            bestInfoGain = infoGain
+            bestFeature = i
+
+    # bestFeature stays -1 when there is no feature column to split on; return a placeholder name
+    if bestFeature == -1:
+        return 'null', baseEntropy, endEntropy, bestInfoGain
+    return labels[bestFeature], baseEntropy, endEntropy, bestInfoGain
+
+
+if __name__ == '__main__':
+    labels = ['no surfacing', 'flippers']
+    dataSet1 = [['yes'], ['yes'], ['no'], ['no'], ['no']]
+    dataSet2 = [['a', 1, 'yes'], ['a', 2, 'yes'], ['b', 3, 'no'], ['c', 4, 'no'], ['c', 5, 'no']]
+    dataSet3 = [[1, 'yes'], [1, 'yes'], [1, 'no'], [3, 'no'], [3, 'no']]
+    infoGain1 = getFeatureShannonEnt(dataSet1, labels)
+    infoGain2 = getFeatureShannonEnt(dataSet2, labels)
+    infoGain3 = getFeatureShannonEnt(dataSet3, labels)
+    print 'Shannon entropy: \n\t%s, \n\t%s, \n\t%s' % (infoGain1, infoGain2, infoGain3)
+
diff --git a/testData/DT_data.txt b/testData/DT_data.txt
new file mode 100644
index 00000000..3bf55b88
--- /dev/null
+++ b/testData/DT_data.txt
@@ -0,0 +1,10 @@
+1.5 50 thin
+1.5 60 fat
+1.6 40 thin
+1.6 60 fat
+1.7 60 thin
+1.7 80 fat
+1.8 60 thin
+1.8 90 fat
+1.9 70 thin
+1.9 80 thin
diff --git a/testResult/tree.pdf b/testResult/tree.pdf
new file mode 100644
index 00000000..f2b72829
Binary files /dev/null and b/testResult/tree.pdf differ
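
For reference, a minimal, self-contained sanity check of the entropy math used by calcShannonEnt (not part of the patch; the expected value here is computed by hand, not taken from the code):

#!/usr/bin/python
# coding: utf8
# the label column ['yes', 'yes', 'no', 'no', 'no'] should give
# H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ~= 0.971 bits
from math import log

labels = ['yes', 'yes', 'no', 'no', 'no']
counts = {}
for l in labels:
    counts[l] = counts.get(l, 0) + 1
expected = -sum(float(c) / len(labels) * log(float(c) / len(labels), 2)
                for c in counts.values())
print expected  # ~0.9709, which calcShannonEnt([[l] for l in labels]) should reproduce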