更新完2个关于Apriori所有的项目案例

2026-05-07 22:24:18 +08:00 · 2017-03-17 12:53:24 +08:00
parent cade394e0e
commit 3f92837e56
4 changed files with 8254 additions and 102 deletions
--- a/docs/11.使用Apriori算法进行关联分析.md
+++ b/docs/11.使用Apriori算法进行关联分析.md
@@ -33,5 +33,4 @@
    * 2.接下来合并所有剩余规则来创建一个新的规则列表，其中规则右部包含两个元素。
    * 如下图：
    * ![所有可能的项集组合](./11.所有可能的项集组合.png)
-
-
+* 最后： 每次增加频繁项集的大小，Apriori算法都会重新扫描整个数据集，是否有优化空间呢？ 下一章：FP-growth算法等着你的到来
--- a/src/python/11.Apriori/apriori.py
+++ b/src/python/11.Apriori/apriori.py
@@ -87,7 +87,7 @@ def aprioriGen(Lk, k):
            # if first k-2 elements are equal
            if L1 == L2:
                # set union
-                print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
+                # print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
                retList.append(Lk[i] | Lk[j])
    return retList

@@ -108,7 +108,7 @@ def apriori(dataSet, minSupport=0.5):

    # 计算支持support， L1表示满足support的key, supportData表示全集的集合
    L1, supportData = scanD(D, C1, minSupport)
-    print "L1=", L1, "\n", "outcome: ", supportData
+    # print "L1=", L1, "\n", "outcome: ", supportData

    L = [L1]
    k = 2
@@ -176,7 +176,7 @@ def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
        Hmp1 = aprioriGen(H, m+1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        # 如果有2个结果都可以，直接返回结果就行，下面这个判断是多余，我个人觉得
-        print 'Hmp1=', Hmp1
+        # print 'Hmp1=', Hmp1
        if (len(Hmp1) > 1):
            # print '-------'
            # print len(freqSet),  len(Hmp1[0]) + 1
@@ -208,108 +208,117 @@ def generateRules(L, supportData, minConf=0.7):
    return bigRuleList


+def getActionIds():
+    from time import sleep
+    from votesmart import votesmart
+    # votesmart.apikey = 'get your api key first'
+    votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
+    actionIdList = []
+    billTitleList = []
+    fr = open('testData/Apriori_recent20bills.txt')
+    for line in fr.readlines():
+        billNum = int(line.split('\t')[0])
+        try:
+            billDetail = votesmart.votes.getBill(billNum) # api call
+            for action in billDetail.actions:
+                if action.level == 'House' and (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
+                    actionId = int(action.actionId)
+                    print 'bill: %d has actionId: %d' % (billNum, actionId)
+                    actionIdList.append(actionId)
+                    billTitleList.append(line.strip().split('\t')[1])
+        except:
+            print "problem getting bill %d" % billNum
+        sleep(1)                                      # delay to be polite
+    return actionIdList, billTitleList
+
+
+def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
+    itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
+    for billTitle in billTitleList:#fill up itemMeaning list
+        itemMeaning.append('%s -- Nay' % billTitle)
+        itemMeaning.append('%s -- Yea' % billTitle)
+    transDict = {}#list of items in each transaction (politician)
+    voteCount = 2
+    for actionId in actionIdList:
+        sleep(3)
+        print 'getting votes for actionId: %d' % actionId
+        try:
+            voteList = votesmart.votes.getBillActionVotes(actionId)
+            for vote in voteList:
+                if not transDict.has_key(vote.candidateName):
+                    transDict[vote.candidateName] = []
+                    if vote.officeParties == 'Democratic':
+                        transDict[vote.candidateName].append(1)
+                    elif vote.officeParties == 'Republican':
+                        transDict[vote.candidateName].append(0)
+                if vote.action == 'Nay':
+                    transDict[vote.candidateName].append(voteCount)
+                elif vote.action == 'Yea':
+                    transDict[vote.candidateName].append(voteCount + 1)
+        except:
+            print "problem getting actionId: %d" % actionId
+        voteCount += 2
+    return transDict, itemMeaning
+
+
+# 暂时没用上
+# def pntRules(ruleList, itemMeaning):
+#     for ruleTup in ruleList:
+#         for item in ruleTup[0]:
+#             print itemMeaning[item]
+#         print "           -------->"
+#         for item in ruleTup[1]:
+#             print itemMeaning[item]
+#         print "confidence: %f" % ruleTup[2]
+#         print       #print a blank line
+
+
 def main():
+    # 以前的测试
    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
-    # 1.收集并准备数据
+    # 收集并准备数据
    # dataMat, labelMat = loadDataSet("%s/resources/Apriori_testdata.txt" % project_dir)

-    # 1. 加载数据
-    dataSet = loadDataSet()
-    print(dataSet)
-    # 调用 apriori 做购物篮分析
-    # 支持度满足阈值的key集合L，和所有key的全集suppoerData
-    L, supportData = apriori(dataSet, minSupport=0.5)
-    # print L, supportData
-    print '\ngenerateRules\n'
-    generateRules(L, supportData, minConf=0.05)
+    # 现在的测试
+    # # 1. 加载数据
+    # dataSet = loadDataSet()
+    # print(dataSet)
+    # # 调用 apriori 做购物篮分析
+    # # 支持度满足阈值的key集合L，和所有key的全集supportData
+    # L, supportData = apriori(dataSet, minSupport=0.5)
+    # # print L, supportData
+    # print '\ngenerateRules\n'
+    # rules = generateRules(L, supportData, minConf=0.05)
+    # print rules
+
+    # 项目实战
+    # 构建美国国会投票记录的事务数据集
+    # actionIdList, billTitleList = getActionIds()
+    # # 测试前2个
+    # # transDict, itemMeaning = getTransList(actionIdList[: 2], billTitleList[: 2])
+    # # transDict 的 key 是议员姓名（candidateName），transDict[key] 是该议员的项列表：党派编号（0/1）加上各次投票对应的编号，例如 [1, 2, 5]
+    # transDict, itemMeaning = getTransList(actionIdList, billTitleList)
+    # # 得到全集的数据
+    # dataSet = [transDict[key] for key in transDict.keys()]
+    # L, supportData = apriori(dataSet, minSupport=0.3)
+    # rules = generateRules(L, supportData, minConf=0.95)
+    # print rules
+
+    # 项目实战
+    # 发现毒蘑菇的相似特性
+    # 得到全集的数据
+    dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()]
+    L, supportData = apriori(dataSet, minSupport=0.3)
+    # 2表示毒蘑菇，1表示可食用的蘑菇
+    # 找出包含 2（有毒）的频繁项集，从而了解哪些特征经常与毒蘑菇同时出现
+    for item in L[1]:
+        if item.intersection('2'):
+            print item
+
+    for item in L[2]:
+        if item.intersection('2'):
+            print item


 if __name__ == "__main__":
    main()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def pntRules(ruleList, itemMeaning):
-    for ruleTup in ruleList:
-        for item in ruleTup[0]:
-            print itemMeaning[item]
-        print "           -------->"
-        for item in ruleTup[1]:
-            print itemMeaning[item]
-        print "confidence: %f" % ruleTup[2]
-        print       #print a blank line
-        
-            
-# from time import sleep
-# from votesmart import votesmart
-# votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
-# #votesmart.apikey = 'get your api key first'
-# def getActionIds():
-#     actionIdList = []; billTitleList = []
-#     fr = open('recent20bills.txt')
-#     for line in fr.readlines():
-#         billNum = int(line.split('\t')[0])
-#         try:
-#             billDetail = votesmart.votes.getBill(billNum) #api call
-#             for action in billDetail.actions:
-#                 if action.level == 'House' and \
-#                 (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
-#                     actionId = int(action.actionId)
-#                     print 'bill: %d has actionId: %d' % (billNum, actionId)
-#                     actionIdList.append(actionId)
-#                     billTitleList.append(line.strip().split('\t')[1])
-#         except:
-#             print "problem getting bill %d" % billNum
-#         sleep(1)                                      #delay to be polite
-#     return actionIdList, billTitleList
-#
-# def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
-#     itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
-#     for billTitle in billTitleList:#fill up itemMeaning list
-#         itemMeaning.append('%s -- Nay' % billTitle)
-#         itemMeaning.append('%s -- Yea' % billTitle)
-#     transDict = {}#list of items in each transaction (politician)
-#     voteCount = 2
-#     for actionId in actionIdList:
-#         sleep(3)
-#         print 'getting votes for actionId: %d' % actionId
-#         try:
-#             voteList = votesmart.votes.getBillActionVotes(actionId)
-#             for vote in voteList:
-#                 if not transDict.has_key(vote.candidateName):
-#                     transDict[vote.candidateName] = []
-#                     if vote.officeParties == 'Democratic':
-#                         transDict[vote.candidateName].append(1)
-#                     elif vote.officeParties == 'Republican':
-#                         transDict[vote.candidateName].append(0)
-#                 if vote.action == 'Nay':
-#                     transDict[vote.candidateName].append(voteCount)
-#                 elif vote.action == 'Yea':
-#                     transDict[vote.candidateName].append(voteCount + 1)
-#         except:
-#             print "problem getting actionId: %d" % actionId
-#         voteCount += 2
-#     return transDict, itemMeaning
--- a/testData/Apriori_mushroom.dat
+++ b/testData/Apriori_mushroom.dat
--- a/testData/Apriori_recent20bills.txt
+++ b/testData/Apriori_recent20bills.txt
@@ -0,0 +1,20 @@
+12939	Prohibiting Federal Funding of National Public Radio
+12940	Removing Troops from Afghanistan
+12830	Prioritizing Payment of Public Debt
+12857	Calling for a Balanced Budget Constitutional Amendment
+12988	Terminating the Home Affordable Modification Program
+12040	Repealing Business Transaction Reporting Requirements
+12465	Repealing the Health Care Bill
+11451	Science and Technology Funding
+11364	Credit Default Swap Regulations
+11820	"Whistleblower Protection" for Offshore Oil Workers
+12452	Treaty with Russia to Reduce and Limit Offensive Arms
+11318	Derivatives Regulation Modifications
+11414	Repealing "Don't Ask, Don't Tell" After Military Review and Certification
+11719	Unemployment Benefits Extension
+11205	Prohibiting 2010- 2011 Congressional Cost-of-Living Pay Increase
+12747	Prohibiting Use of Federal Funds For Planned Parenthood
+12792	Reducing Federal Funding of the US Institute of Peace
+12827	Prohibiting the Use of Federal Funds for NASCAR Sponsorships
+12445	Mine Safety Act
+12049	2010-2011 Defense Authorizations