From adf961c5db7764c54f60a6fcfdc3315c8c6cacf7 Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Sun, 2 Apr 2017 16:03:48 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=209.=E6=A0=91=E5=9B=9E?= =?UTF-8?q?=E5=BD=92=E7=9A=84=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/09.RegTrees/regTrees.py | 23 +++++++++++++---------- src/python/09.RegTrees/treeExplore.py | 1 - src/python/11.Apriori/apriori.py | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/python/09.RegTrees/regTrees.py b/src/python/09.RegTrees/regTrees.py index c6077e0f..b621c1cb 100644 --- a/src/python/09.RegTrees/regTrees.py +++ b/src/python/09.RegTrees/regTrees.py @@ -103,6 +103,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): bestS, bestIndex, bestValue = inf, 0, 0 # 循环处理每一列对应的feature值 for featIndex in range(n-1): + # [0]表示这一列的[所有行],不要[0]就是一个array[[所有行]] for splitVal in set(dataSet[:, featIndex].T.tolist()[0]): # 对该列进行分组,然后组内的成员的val值进行 二元切分 mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) @@ -236,7 +237,7 @@ def linearSolve(dataSet): # 如果矩阵的逆不存在,会造成程序异常 if linalg.det(xTx) == 0.0: raise NameError('This matrix is singular, cannot do inverse,\ntry increasing the second value of ops') - # 最小二乘法求最优解 + # 最小二乘法求最优解: w0*1+w1*x1=y ws = xTx.I * (X.T * Y) return ws, X, Y @@ -291,7 +292,9 @@ if __name__ == "__main__": # # 回归树 # myDat = loadDataSet('testData/RT_data1.txt') # # myDat = loadDataSet('testData/RT_data2.txt') + # # print 'myDat=', myDat # myMat = mat(myDat) + # # print 'myMat=', myMat # myTree = createTree(myMat) # print myTree @@ -301,7 +304,7 @@ if __name__ == "__main__": # myTree = createTree(myMat, ops=(0, 1)) # print myTree - # # 2.后剪枝就是:通过测试数据,对预测模型进行合并判断 + # # 2. 后剪枝就是:通过测试数据,对预测模型进行合并判断 # myDatTest = loadDataSet('testData/RT_data3test.txt') # myMat2Test = mat(myDatTest) # myFinalTree = prune(myTree, myMat2Test) @@ -330,11 +333,11 @@ if __name__ == "__main__": print myTree2 print "模型树:", corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1] - # # 线性回归 - # ws, X, Y = linearSolve(trainMat) - # print ws - # m = len(testMat[:, 0]) - # yHat3 = mat(zeros((m, 1))) - # for i in range(shape(testMat)[0]): - # yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0] - # print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1] + # 线性回归 + ws, X, Y = linearSolve(trainMat) + print ws + m = len(testMat[:, 0]) + yHat3 = mat(zeros((m, 1))) + for i in range(shape(testMat)[0]): + yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0] + print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1] diff --git a/src/python/09.RegTrees/treeExplore.py b/src/python/09.RegTrees/treeExplore.py index a426a342..33665a6a 100644 --- a/src/python/09.RegTrees/treeExplore.py +++ b/src/python/09.RegTrees/treeExplore.py @@ -100,7 +100,6 @@ def main(root): # 退出按钮 Button(root, text="退出", fg="black", command=quit).grid(row=1, column=2) - # 创建一个画板 canvas reDraw.f = Figure(figsize=(5, 4), dpi=100) reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root) diff --git a/src/python/11.Apriori/apriori.py b/src/python/11.Apriori/apriori.py index 3ba163d0..0babab1f 100644 --- a/src/python/11.Apriori/apriori.py +++ b/src/python/11.Apriori/apriori.py @@ -188,7 +188,7 @@ def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7): """ # H[0]是freqSet的元素组合的第一个元素 m = len(H[0]) - # 判断,freqSet的长度是否>组合的长度+1, 避免过度匹配 例如:计算过一边{1,2,3} 和 {1, 2} {1, 3},就没必要再计算了 {1,2,3}和{1,2,3}的组合关系 + # 判断,freqSet的长度是否>组合的长度+1, 避免过度匹配 例如:计算过一边{1,2,3} 和 {1, 2} {1, 3},就没必要再计算了进一步合并来计算 {1,2,3}和{1,2,3}的组合关系 if (len(freqSet) > (m + 1)): print 'freqSet******************', len(freqSet), m + 1, freqSet, H, H[0] # 合并数据集集合,组合为2/3/..n的集合 From 1c766d615f2f9d26925fcf325e9a47725403450d Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Sun, 2 Apr 2017 19:35:00 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2011.apriori=E7=AE=97?= =?UTF-8?q?=E6=B3=95=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/11.Apriori/apriori.py | 58 +++++++++++++++++--------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/python/11.Apriori/apriori.py b/src/python/11.Apriori/apriori.py index 0babab1f..ac13a171 100644 --- a/src/python/11.Apriori/apriori.py +++ b/src/python/11.Apriori/apriori.py @@ -43,7 +43,7 @@ def scanD(D, Ck, minSupport): Args: D 原始数据集, D用来判断,CK中的元素,是否存在于原数据D中 - Ck 合并后的数据集 + Ck 所有key的元素集合 Returns: retList 支持度大于阈值的集合 supportData 全量key的字典集合 @@ -141,6 +141,8 @@ def apriori(dataSet, minSupport=0.5): if len(Lk) == 0: break # Lk表示满足频繁子项的集合,L元素在增加 + # l=[[set(1), set(2), set(3)]] + # l=[[set(1), set(2), set(3)] [set(1, 2), set(2, 3)]] L.append(Lk) k += 1 # print 'k=', k, len(L[k-2]) @@ -157,7 +159,7 @@ def calcConf(freqSet, H, supportData, brl, minConf=0.7): brl bigRuleList的空数组 minConf 置信度的阈值 Returns: - prunedH 记录 可信度大于阈值的集合 + prunedH 记录 置信度大于阈值的集合 """ # 记录 可信度大于阈值的集合 prunedH = [] @@ -209,7 +211,7 @@ def generateRules(L, supportData, minConf=0.7): Args: L 频繁项集的全集 supportData 所有元素和支持度的全集 - minConf 可信度的阈值 + minConf 置信度的阈值 Returns: bigRuleList 关于 (A->B+置信度) 3个字段的组合 """ @@ -217,7 +219,9 @@ def generateRules(L, supportData, minConf=0.7): # 循环L频繁项集,所有的统一大小组合(2/../n个的组合,从第2组开始) for i in range(1, len(L)): # 获取频繁项集中每个组合的所有元素 + # [[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])]] for freqSet in L[i]: + # 假设:freqSet=frozenset([1, 3]) H1=[1, 3] # 组合总的元素并遍历子元素,并转化为冻结的set集合,再存放到list列表中 H1 = [frozenset([item]) for item in freqSet] # 2个的组合,走else, 2个以上的组合,走if @@ -299,17 +303,17 @@ def main(): # # 收集并准备数据 # dataMat, labelMat = loadDataSet("%s/resources/Apriori_testdata.txt" % project_dir) - # 现在的的测试 - # 1. 加载数据 - dataSet = loadDataSet() - print(dataSet) - # 调用 apriori 做购物篮分析 - # 支持度满足阈值的key集合L,和所有元素和支持度的全集suppoerData - L, supportData = apriori(dataSet, minSupport=0.5) - print L, '\n', supportData - print '\ngenerateRules\n' - rules = generateRules(L, supportData, minConf=0.25) - print rules + # # 现在的的测试 + # # 1. 加载数据 + # dataSet = loadDataSet() + # print(dataSet) + # # 调用 apriori 做购物篮分析 + # # 支持度满足阈值的key集合L,和所有元素和支持度的全集suppoerData + # L, supportData = apriori(dataSet, minSupport=0.5) + # print L, '\n', supportData + # print '\ngenerateRules\n' + # rules = generateRules(L, supportData, minConf=0.25) + # print rules # # 项目实战 # # 构建美国国会投票记录的事务数据集 @@ -324,20 +328,20 @@ def main(): # rules = generateRules(L, supportData, minConf=0.95) # print rules - # # 项目实战 - # # 发现毒蘑菇的相似特性 - # # 得到全集的数据 - # dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()] - # L, supportData = apriori(dataSet, minSupport=0.3) - # # 2表示毒蘑菇,1表示可食用的蘑菇 - # # 找出关于2的频繁子项出来,就知道如果是毒蘑菇,那么出现频繁的也可能是毒蘑菇 - # for item in L[1]: - # if item.intersection('2'): - # print item + # 项目实战 + # 发现毒蘑菇的相似特性 + # 得到全集的数据 + dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()] + L, supportData = apriori(dataSet, minSupport=0.3) + # 2表示毒蘑菇,1表示可食用的蘑菇 + # 找出关于2的频繁子项出来,就知道如果是毒蘑菇,那么出现频繁的也可能是毒蘑菇 + for item in L[1]: + if item.intersection('2'): + print item - # for item in L[2]: - # if item.intersection('2'): - # print item + for item in L[2]: + if item.intersection('2'): + print item if __name__ == "__main__": From 57af8aca11fdd946ce911374a45b4bd3febe14df Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Sun, 2 Apr 2017 19:35:23 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2011.apriori=E7=AE=97?= =?UTF-8?q?=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/11.Apriori/apriori.py | 48 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/python/11.Apriori/apriori.py b/src/python/11.Apriori/apriori.py index ac13a171..b4daef2b 100644 --- a/src/python/11.Apriori/apriori.py +++ b/src/python/11.Apriori/apriori.py @@ -303,17 +303,17 @@ def main(): # # 收集并准备数据 # dataMat, labelMat = loadDataSet("%s/resources/Apriori_testdata.txt" % project_dir) - # # 现在的的测试 - # # 1. 加载数据 - # dataSet = loadDataSet() - # print(dataSet) - # # 调用 apriori 做购物篮分析 - # # 支持度满足阈值的key集合L,和所有元素和支持度的全集suppoerData - # L, supportData = apriori(dataSet, minSupport=0.5) - # print L, '\n', supportData - # print '\ngenerateRules\n' - # rules = generateRules(L, supportData, minConf=0.25) - # print rules + # 现在的的测试 + # 1. 加载数据 + dataSet = loadDataSet() + print(dataSet) + # 调用 apriori 做购物篮分析 + # 支持度满足阈值的key集合L,和所有元素和支持度的全集suppoerData + L, supportData = apriori(dataSet, minSupport=0.5) + print L, '\n', supportData + print '\ngenerateRules\n' + rules = generateRules(L, supportData, minConf=0.25) + print rules # # 项目实战 # # 构建美国国会投票记录的事务数据集 @@ -328,20 +328,20 @@ def main(): # rules = generateRules(L, supportData, minConf=0.95) # print rules - # 项目实战 - # 发现毒蘑菇的相似特性 - # 得到全集的数据 - dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()] - L, supportData = apriori(dataSet, minSupport=0.3) - # 2表示毒蘑菇,1表示可食用的蘑菇 - # 找出关于2的频繁子项出来,就知道如果是毒蘑菇,那么出现频繁的也可能是毒蘑菇 - for item in L[1]: - if item.intersection('2'): - print item + # # 项目实战 + # # 发现毒蘑菇的相似特性 + # # 得到全集的数据 + # dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()] + # L, supportData = apriori(dataSet, minSupport=0.3) + # # 2表示毒蘑菇,1表示可食用的蘑菇 + # # 找出关于2的频繁子项出来,就知道如果是毒蘑菇,那么出现频繁的也可能是毒蘑菇 + # for item in L[1]: + # if item.intersection('2'): + # print item - for item in L[2]: - if item.intersection('2'): - print item + # for item in L[2]: + # if item.intersection('2'): + # print item if __name__ == "__main__": From 19b8689d2e3621e9f3b638d755edbf0aedf2469e Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Sun, 2 Apr 2017 23:12:36 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2012.fpGrowth=E6=B3=A8?= =?UTF-8?q?=E9=87=8A=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/12.FrequentPattemTree/fpGrowth.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/python/12.FrequentPattemTree/fpGrowth.py b/src/python/12.FrequentPattemTree/fpGrowth.py index 46639ca0..42a9f3f7 100644 --- a/src/python/12.FrequentPattemTree/fpGrowth.py +++ b/src/python/12.FrequentPattemTree/fpGrowth.py @@ -41,6 +41,7 @@ def loadSimpDat(): ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], ['z'], ['r', 'x', 'n', 'o', 's'], + # ['r', 'x', 'n', 'o', 's'], ['y', 'r', 'x', 'z', 'q', 't', 'p'], ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']] return simpDat @@ -49,7 +50,10 @@ def loadSimpDat(): def createInitSet(dataSet): retDict = {} for trans in dataSet: - retDict[frozenset(trans)] = 1 + if not retDict.has_key(frozenset(trans)): + retDict[frozenset(trans)] = 1 + else: + retDict[frozenset(trans)] += 1 return retDict @@ -193,7 +197,7 @@ def findPrefixPath(basePat, treeNode): # prefixPath[1:] 变frozenset后,字母就变无序了 # condPats[frozenset(prefixPath)] = treeNode.count condPats[frozenset(prefixPath[1:])] = treeNode.count - # 递归,寻找改节点的上一个 相同值的链接节点 + # 递归,寻找改节点的下一个 相同值的链接节点 treeNode = treeNode.nodeLink # print treeNode return condPats