From adf961c5db7764c54f60a6fcfdc3315c8c6cacf7 Mon Sep 17 00:00:00 2001
From: jiangzhonglian <jiang-s@163.com>
Date: Sun, 2 Apr 2017 16:03:48 +0800
Subject: [PATCH 1/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=209.=E6=A0=91=E5=9B=9E?=
 =?UTF-8?q?=E5=BD=92=E7=9A=84=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/python/09.RegTrees/regTrees.py    | 23 +++++++++++++----------
 src/python/09.RegTrees/treeExplore.py |  1 -
 src/python/11.Apriori/apriori.py      |  2 +-
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/python/09.RegTrees/regTrees.py b/src/python/09.RegTrees/regTrees.py
index c6077e0f..b621c1cb 100644
--- a/src/python/09.RegTrees/regTrees.py
+++ b/src/python/09.RegTrees/regTrees.py
@@ -103,6 +103,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
     bestS, bestIndex, bestValue = inf, 0, 0
     # 循环处理每一列对应的feature值
     for featIndex in range(n-1):
+        # [0]表示这一列的[所有行]，不要[0]就是一个array[[所有行]]
         for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
             # 对该列进行分组，然后组内的成员的val值进行 二元切分
             mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
@@ -236,7 +237,7 @@ def linearSolve(dataSet):
     # 如果矩阵的逆不存在，会造成程序异常
     if linalg.det(xTx) == 0.0:
         raise NameError('This matrix is singular, cannot do inverse,\ntry increasing the second value of ops')
-    # 最小二乘法求最优解
+    # 最小二乘法求最优解:  w0*1+w1*x1=y
     ws = xTx.I * (X.T * Y)
     return ws, X, Y
 
@@ -291,7 +292,9 @@ if __name__ == "__main__":
     # # 回归树
     # myDat = loadDataSet('testData/RT_data1.txt')
     # # myDat = loadDataSet('testData/RT_data2.txt')
+    # # print 'myDat=', myDat
     # myMat = mat(myDat)
+    # # print 'myMat=',  myMat
     # myTree = createTree(myMat)
     # print myTree
 
@@ -301,7 +304,7 @@ if __name__ == "__main__":
     # myTree = createTree(myMat, ops=(0, 1))
     # print myTree
 
-    # # 2.后剪枝就是：通过测试数据，对预测模型进行合并判断
+    # # 2. 后剪枝就是：通过测试数据，对预测模型进行合并判断
     # myDatTest = loadDataSet('testData/RT_data3test.txt')
     # myMat2Test = mat(myDatTest)
     # myFinalTree = prune(myTree, myMat2Test)
@@ -330,11 +333,11 @@ if __name__ == "__main__":
     print myTree2
     print "模型树:", corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1]
 
-    # # 线性回归
-    # ws, X, Y = linearSolve(trainMat)
-    # print ws
-    # m = len(testMat[:, 0])
-    # yHat3 = mat(zeros((m, 1)))
-    # for i in range(shape(testMat)[0]):
-    #     yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
-    # print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]
+    # 线性回归
+    ws, X, Y = linearSolve(trainMat)
+    print ws
+    m = len(testMat[:, 0])
+    yHat3 = mat(zeros((m, 1)))
+    for i in range(shape(testMat)[0]):
+        yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
+    print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]
diff --git a/src/python/09.RegTrees/treeExplore.py b/src/python/09.RegTrees/treeExplore.py
index a426a342..33665a6a 100644
--- a/src/python/09.RegTrees/treeExplore.py
+++ b/src/python/09.RegTrees/treeExplore.py
@@ -100,7 +100,6 @@ def main(root):
     # 退出按钮
     Button(root, text="退出", fg="black", command=quit).grid(row=1, column=2)
 
-
     # 创建一个画板 canvas
     reDraw.f = Figure(figsize=(5, 4), dpi=100)
     reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)
diff --git a/src/python/11.Apriori/apriori.py b/src/python/11.Apriori/apriori.py
index 3ba163d0..0babab1f 100644
--- a/src/python/11.Apriori/apriori.py
+++ b/src/python/11.Apriori/apriori.py
@@ -188,7 +188,7 @@ def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
     """
     # H[0]是freqSet的元素组合的第一个元素
     m = len(H[0])
-    # 判断，freqSet的长度是否>组合的长度+1, 避免过度匹配 例如：计算过一边{1,2,3} 和 {1, 2} {1, 3}，就没必要再计算了 {1,2,3}和{1,2,3}的组合关系
+    # 判断，freqSet的长度是否>组合的长度+1, 避免过度匹配 例如：计算过一遍{1,2,3} 和 {1, 2} {1, 3}，就没必要再进一步合并来计算 {1,2,3}和{1,2,3}的组合关系
     if (len(freqSet) > (m + 1)):
         print 'freqSet******************', len(freqSet), m + 1, freqSet, H, H[0]
         # 合并数据集集合，组合为2/3/..n的集合

From 1c766d615f2f9d26925fcf325e9a47725403450d Mon Sep 17 00:00:00 2001
From: jiangzhonglian <jiang-s@163.com>
Date: Sun, 2 Apr 2017 19:35:00 +0800
Subject: [PATCH 2/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2011.apriori=E7=AE=97?=
 =?UTF-8?q?=E6=B3=95=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/python/11.Apriori/apriori.py | 58 +++++++++++++++++---------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/src/python/11.Apriori/apriori.py b/src/python/11.Apriori/apriori.py
index 0babab1f..ac13a171 100644
--- a/src/python/11.Apriori/apriori.py
+++ b/src/python/11.Apriori/apriori.py
@@ -43,7 +43,7 @@ def scanD(D, Ck, minSupport):
 
     Args:
         D    原始数据集, D用来判断，CK中的元素，是否存在于原数据D中
-        Ck   合并后的数据集
+        Ck   所有key的元素集合
     Returns:
         retList      支持度大于阈值的集合
         supportData  全量key的字典集合
@@ -141,6 +141,8 @@ def apriori(dataSet, minSupport=0.5):
         if len(Lk) == 0:
             break
         # Lk表示满足频繁子项的集合，L元素在增加
+        # L=[[set(1), set(2), set(3)]]
+        # L=[[set(1), set(2), set(3)], [set(1, 2), set(2, 3)]]
         L.append(Lk)
         k += 1
         # print 'k=', k, len(L[k-2])
@@ -157,7 +159,7 @@ def calcConf(freqSet, H, supportData, brl, minConf=0.7):
         brl bigRuleList的空数组
         minConf 置信度的阈值
     Returns:
-        prunedH 记录 可信度大于阈值的集合
+        prunedH 记录 置信度大于阈值的集合
     """
     # 记录 可信度大于阈值的集合
     prunedH = []
@@ -209,7 +211,7 @@ def generateRules(L, supportData, minConf=0.7):
     Args:
         L 频繁项集的全集
         supportData 所有元素和支持度的全集
-        minConf 可信度的阈值
+        minConf 置信度的阈值
     Returns:
         bigRuleList 关于 (A->B+置信度) 3个字段的组合
     """
@@ -217,7 +219,9 @@ def generateRules(L, supportData, minConf=0.7):
     # 循环L频繁项集，所有的统一大小组合（2/../n个的组合，从第2组开始）
     for i in range(1, len(L)):
         # 获取频繁项集中每个组合的所有元素
+        # [[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])]]
         for freqSet in L[i]:
+            # 假设：freqSet=frozenset([1, 3])，则 H1=[frozenset([1]), frozenset([3])]
             # 组合总的元素并遍历子元素，并转化为冻结的set集合，再存放到list列表中
             H1 = [frozenset([item]) for item in freqSet]
             # 2个的组合，走else, 2个以上的组合，走if
@@ -299,17 +303,17 @@ def main():
     # # 收集并准备数据
     # dataMat, labelMat = loadDataSet("%s/resources/Apriori_testdata.txt" % project_dir)
 
-    # 现在的的测试
-    # 1. 加载数据
-    dataSet = loadDataSet()
-    print(dataSet)
-    # 调用 apriori 做购物篮分析
-    # 支持度满足阈值的key集合L，和所有元素和支持度的全集suppoerData
-    L, supportData = apriori(dataSet, minSupport=0.5)
-    print L, '\n', supportData
-    print '\ngenerateRules\n'
-    rules = generateRules(L, supportData, minConf=0.25)
-    print rules
+    # # 现在的的测试
+    # # 1. 加载数据
+    # dataSet = loadDataSet()
+    # print(dataSet)
+    # # 调用 apriori 做购物篮分析
+    # # 支持度满足阈值的key集合L，和所有元素和支持度的全集suppoerData
+    # L, supportData = apriori(dataSet, minSupport=0.5)
+    # print L, '\n', supportData
+    # print '\ngenerateRules\n'
+    # rules = generateRules(L, supportData, minConf=0.25)
+    # print rules
 
     # # 项目实战
     # # 构建美国国会投票记录的事务数据集
@@ -324,20 +328,20 @@ def main():
     # rules = generateRules(L, supportData, minConf=0.95)
     # print rules
 
-    # # 项目实战
-    # # 发现毒蘑菇的相似特性
-    # # 得到全集的数据
-    # dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()]
-    # L, supportData = apriori(dataSet, minSupport=0.3)
-    # # 2表示毒蘑菇，1表示可食用的蘑菇
-    # # 找出关于2的频繁子项出来，就知道如果是毒蘑菇，那么出现频繁的也可能是毒蘑菇
-    # for item in L[1]:
-    #     if item.intersection('2'):
-    #         print item
+    # 项目实战
+    # 发现毒蘑菇的相似特性
+    # 得到全集的数据
+    dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()]
+    L, supportData = apriori(dataSet, minSupport=0.3)
+    # 2表示毒蘑菇，1表示可食用的蘑菇
+    # 找出关于2的频繁子项出来，就知道如果是毒蘑菇，那么出现频繁的也可能是毒蘑菇
+    for item in L[1]:
+        if item.intersection('2'):
+            print item
 
-    # for item in L[2]:
-    #     if item.intersection('2'):
-    #         print item
+    for item in L[2]:
+        if item.intersection('2'):
+            print item
 
 
 if __name__ == "__main__":

From 57af8aca11fdd946ce911374a45b4bd3febe14df Mon Sep 17 00:00:00 2001
From: jiangzhonglian <jiang-s@163.com>
Date: Sun, 2 Apr 2017 19:35:23 +0800
Subject: [PATCH 3/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2011.apriori=E7=AE=97?=
 =?UTF-8?q?=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/python/11.Apriori/apriori.py | 48 ++++++++++++++++----------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/python/11.Apriori/apriori.py b/src/python/11.Apriori/apriori.py
index ac13a171..b4daef2b 100644
--- a/src/python/11.Apriori/apriori.py
+++ b/src/python/11.Apriori/apriori.py
@@ -303,17 +303,17 @@ def main():
     # # 收集并准备数据
     # dataMat, labelMat = loadDataSet("%s/resources/Apriori_testdata.txt" % project_dir)
 
-    # # 现在的的测试
-    # # 1. 加载数据
-    # dataSet = loadDataSet()
-    # print(dataSet)
-    # # 调用 apriori 做购物篮分析
-    # # 支持度满足阈值的key集合L，和所有元素和支持度的全集suppoerData
-    # L, supportData = apriori(dataSet, minSupport=0.5)
-    # print L, '\n', supportData
-    # print '\ngenerateRules\n'
-    # rules = generateRules(L, supportData, minConf=0.25)
-    # print rules
+    # 现在的的测试
+    # 1. 加载数据
+    dataSet = loadDataSet()
+    print(dataSet)
+    # 调用 apriori 做购物篮分析
+    # 支持度满足阈值的key集合L，和所有元素和支持度的全集suppoerData
+    L, supportData = apriori(dataSet, minSupport=0.5)
+    print L, '\n', supportData
+    print '\ngenerateRules\n'
+    rules = generateRules(L, supportData, minConf=0.25)
+    print rules
 
     # # 项目实战
     # # 构建美国国会投票记录的事务数据集
@@ -328,20 +328,20 @@ def main():
     # rules = generateRules(L, supportData, minConf=0.95)
     # print rules
 
-    # 项目实战
-    # 发现毒蘑菇的相似特性
-    # 得到全集的数据
-    dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()]
-    L, supportData = apriori(dataSet, minSupport=0.3)
-    # 2表示毒蘑菇，1表示可食用的蘑菇
-    # 找出关于2的频繁子项出来，就知道如果是毒蘑菇，那么出现频繁的也可能是毒蘑菇
-    for item in L[1]:
-        if item.intersection('2'):
-            print item
+    # # 项目实战
+    # # 发现毒蘑菇的相似特性
+    # # 得到全集的数据
+    # dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()]
+    # L, supportData = apriori(dataSet, minSupport=0.3)
+    # # 2表示毒蘑菇，1表示可食用的蘑菇
+    # # 找出关于2的频繁子项出来，就知道如果是毒蘑菇，那么出现频繁的也可能是毒蘑菇
+    # for item in L[1]:
+    #     if item.intersection('2'):
+    #         print item
 
-    for item in L[2]:
-        if item.intersection('2'):
-            print item
+    # for item in L[2]:
+    #     if item.intersection('2'):
+    #         print item
 
 
 if __name__ == "__main__":

From 19b8689d2e3621e9f3b638d755edbf0aedf2469e Mon Sep 17 00:00:00 2001
From: jiangzhonglian <jiang-s@163.com>
Date: Sun, 2 Apr 2017 23:12:36 +0800
Subject: [PATCH 4/4] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2012.fpGrowth=E6=B3=A8?=
 =?UTF-8?q?=E9=87=8A=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/python/12.FrequentPattemTree/fpGrowth.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/python/12.FrequentPattemTree/fpGrowth.py b/src/python/12.FrequentPattemTree/fpGrowth.py
index 46639ca0..42a9f3f7 100644
--- a/src/python/12.FrequentPattemTree/fpGrowth.py
+++ b/src/python/12.FrequentPattemTree/fpGrowth.py
@@ -41,6 +41,7 @@ def loadSimpDat():
                ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
                ['z'],
                ['r', 'x', 'n', 'o', 's'],
+            #    ['r', 'x', 'n', 'o', 's'],
                ['y', 'r', 'x', 'z', 'q', 't', 'p'],
                ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
     return simpDat
@@ -49,7 +50,10 @@ def loadSimpDat():
 def createInitSet(dataSet):
     retDict = {}
     for trans in dataSet:
-        retDict[frozenset(trans)] = 1
+        if not retDict.has_key(frozenset(trans)):
+            retDict[frozenset(trans)] = 1
+        else:
+            retDict[frozenset(trans)] += 1
     return retDict
 
 
@@ -193,7 +197,7 @@ def findPrefixPath(basePat, treeNode):
             # prefixPath[1:] 变frozenset后，字母就变无序了
             # condPats[frozenset(prefixPath)] = treeNode.count
             condPats[frozenset(prefixPath[1:])] = treeNode.count
-        # 递归，寻找改节点的上一个 相同值的链接节点
+        # 递归，寻找该节点的下一个 相同值的链接节点
         treeNode = treeNode.nodeLink
         # print treeNode
     return condPats