2017-03-18_添加交流的课程注释

This commit is contained in:
jiangzhonglian
2017-03-18 20:55:22 +08:00
parent 096dd4c516
commit fc481871c4
4 changed files with 74 additions and 35 deletions

View File

@@ -31,6 +31,7 @@ def createDataSet():
# ['no'],
# ['no'],
# ['no']]
# labels 露出水面 脚蹼
labels = ['no surfacing', 'flippers']
# change to discrete values
return dataSet, labels
@@ -122,10 +123,9 @@ def chooseBestFeatureToSplit(dataSet):
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet)/float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
# gain[信息增益] 值越大,意味着该分类提供的信息量越大,该特征对分类的不确定程度越小
# 也就是说: 列进行group分组后, 对应的类别越多, 信息量越大, 那么香农熵越小, 那么信息增益就越大, 所以gain越大
# gain[信息增益]: 划分数据集前后的信息变化, 获取信息增益最大的值
infoGain = baseEntropy - newEntropy
# print 'infoGain=', infoGain, 'bestFeature=', i
print 'infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeature = i
@@ -133,7 +133,7 @@ def chooseBestFeatureToSplit(dataSet):
def majorityCnt(classList):
"""majorityCnt(选择出线次数最多的一个结果)
"""majorityCnt(选择出现次数最多的一个结果)
Args:
classList label列的集合