更新完关于 Apriori 的全部 2 个项目案例

This commit is contained in:
jiangzhonglian
2017-03-17 12:53:24 +08:00
parent cade394e0e
commit 3f92837e56
4 changed files with 8254 additions and 102 deletions

View File

@@ -33,5 +33,4 @@
* 2.接下来合并所有剩余规则来创建一个新的规则列表,其中规则右部包含两个元素。
* 如下图:
* ![所有可能的项集组合](./11.所有可能的项集组合.png)
* 最后: 每次增加频繁项集的大小, Apriori 算法都会重新扫描整个数据集, 是否有优化空间呢? 下一章: FP-growth 算法等着你的到来。

View File

@@ -87,7 +87,7 @@ def aprioriGen(Lk, k):
# if first k-2 elements are equal
if L1 == L2:
# set union
print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
# print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
retList.append(Lk[i] | Lk[j])
return retList
@@ -108,7 +108,7 @@ def apriori(dataSet, minSupport=0.5):
# 计算支持support L1表示满足support的key, supportData表示全集的集合
L1, supportData = scanD(D, C1, minSupport)
print "L1=", L1, "\n", "outcome: ", supportData
# print "L1=", L1, "\n", "outcome: ", supportData
L = [L1]
k = 2
@@ -176,7 +176,7 @@ def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
Hmp1 = aprioriGen(H, m+1)
Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
# 如果有2个结果都可以直接返回结果就行下面这个判断是多余我个人觉得
print 'Hmp1=', Hmp1
# print 'Hmp1=', Hmp1
if (len(Hmp1) > 1):
# print '-------'
# print len(freqSet), len(Hmp1[0]) + 1
@@ -208,108 +208,117 @@ def generateRules(L, supportData, minConf=0.7):
return bigRuleList
def getActionIds():
    """Collect the House vote action ids for the bills in the sample file.

    Reads ``testData/Apriori_recent20bills.txt`` (one bill per line,
    tab-separated: bill number, bill title), asks the votesmart API for the
    details of every bill and keeps each action whose level is ``'House'``
    and whose stage is ``'Passage'`` or ``'Amendment Vote'``.

    Returns:
        (actionIdList, billTitleList): two parallel lists -- the integer
        action ids that were kept, and the title of the bill each one
        belongs to.

    Bills for which the lookup fails are reported on stdout and skipped
    (best effort); the function pauses one second after every bill.
    """
    import os
    from time import sleep
    from votesmart import votesmart

    # NOTE(review): a real API key is committed here. It is kept only as a
    # backward-compatible fallback; prefer supplying your own key through the
    # VOTESMART_API_KEY environment variable and revoke the committed one.
    votesmart.apikey = os.environ.get('VOTESMART_API_KEY',
                                      'a7fa40adec6f4a77178799fae4441030')

    actionIdList = []
    billTitleList = []
    wantedStages = ('Passage', 'Amendment Vote')

    # 'with' guarantees the file is closed even if a line fails to parse.
    with open('testData/Apriori_recent20bills.txt') as fr:
        for line in fr:
            fields = line.strip().split('\t')
            if not fields[0]:
                # Blank line (e.g. trailing newline at end of file): nothing to look up.
                continue
            billNum = int(fields[0])
            try:
                billDetail = votesmart.votes.getBill(billNum)  # api call
                for action in billDetail.actions:
                    if action.level == 'House' and action.stage in wantedStages:
                        actionId = int(action.actionId)
                        # Resolve the title before touching either list so the
                        # two lists cannot get out of step on a malformed line.
                        billTitle = fields[1]
                        print('bill: %d has actionId: %d' % (billNum, actionId))
                        actionIdList.append(actionId)
                        billTitleList.append(billTitle)
            except Exception:
                # Best effort: report and move on. 'Exception' (rather than a
                # bare except) lets Ctrl-C / SystemExit stop the run.
                print("problem getting bill %d" % billNum)
            sleep(1)  # delay to be polite to the API
    return actionIdList, billTitleList
def getTransList(actionIdList, billTitleList):
    """Build the voting transaction data set for the given vote actions.

    Item encoding (index into ``itemMeaning``):
        0 -> 'Republican', 1 -> 'Democratic',
        2 + 2*i -> '<i-th bill title> -- Nay',
        3 + 2*i -> '<i-th bill title> -- Yea'.

    Args:
        actionIdList: vote action ids to fetch, in the same order as
            ``billTitleList``.
        billTitleList: bill titles used to label the vote items.

    Returns:
        (transDict, itemMeaning): ``transDict`` maps a candidate name to the
        list of integer items recorded for that candidate; ``itemMeaning``
        is the list of labels described above.

    Actions whose votes cannot be fetched are reported on stdout and skipped
    (best effort); the function pauses three seconds before every request.
    """
    # list of what each item stands for
    itemMeaning = ['Republican', 'Democratic']
    for billTitle in billTitleList:  # fill up itemMeaning list
        itemMeaning.append('%s -- Nay' % billTitle)
        itemMeaning.append('%s -- Yea' % billTitle)

    transDict = {}  # items of each transaction, keyed by politician
    if not actionIdList:
        # Nothing to fetch, so the network client is not needed at all.
        return transDict, itemMeaning

    # These names were previously imported only inside getActionIds(), which
    # made the sleep() call below fail with NameError; import them here.
    # NOTE(review): votesmart.apikey is set by getActionIds() -- presumably it
    # must have been called (or the key set) before this function; confirm.
    from time import sleep
    from votesmart import votesmart

    voteCount = 2  # item index of '<current bill> -- Nay'
    for actionId in actionIdList:
        sleep(3)
        print('getting votes for actionId: %d' % actionId)
        try:
            voteList = votesmart.votes.getBillActionVotes(actionId)
            for vote in voteList:
                name = vote.candidateName
                # NOTE(review): SOURCE carries no indentation; the party item
                # is assumed to be recorded once, when the candidate is first
                # seen -- confirm against the original file.
                if name not in transDict:
                    transDict[name] = []
                    if vote.officeParties == 'Democratic':
                        transDict[name].append(1)
                    elif vote.officeParties == 'Republican':
                        transDict[name].append(0)
                if vote.action == 'Nay':
                    transDict[name].append(voteCount)
                elif vote.action == 'Yea':
                    transDict[name].append(voteCount + 1)
        except Exception:
            # Best effort: report and move on. 'Exception' (rather than a
            # bare except) lets Ctrl-C / SystemExit stop the run.
            print("problem getting actionId: %d" % actionId)
        voteCount += 2  # advance to the next bill's Nay/Yea pair
    return transDict, itemMeaning
# 暂时没用上
# def pntRules(ruleList, itemMeaning):
# for ruleTup in ruleList:
# for item in ruleTup[0]:
# print itemMeaning[item]
# print " -------->"
# for item in ruleTup[1]:
# print itemMeaning[item]
# print "confidence: %f" % ruleTup[2]
# print #print a blank line
def main():
    """Run the Apriori demos: the toy data set, then the mushroom project.

    Prints the toy data set and the association-rule output for it, then
    mines ``testData/Apriori_mushroom.dat`` and prints every frequent item
    set of the second and third level that contains the item '2'.
    """
    # Earlier experiment, kept for reference:
    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # dataMat, labelMat = loadDataSet("%s/resources/Apriori_testdata.txt" % project_dir)

    # 1. Load the sample data.
    dataSet = loadDataSet()
    print(dataSet)
    # Run apriori for market-basket analysis.
    # L: the item sets whose support meets the threshold;
    # supportData: the support information for all item sets.
    L, supportData = apriori(dataSet, minSupport=0.5)
    # print L, supportData
    print('\ngenerateRules\n')
    # To inspect the rules, keep the return value:
    #     rules = generateRules(L, supportData, minConf=0.05); print rules
    generateRules(L, supportData, minConf=0.05)

    # Project: build a transaction data set from U.S. congressional voting
    # records (disabled; needs network access and the votesmart package).
    # actionIdList, billTitleList = getActionIds()
    # # try only the first two:
    # # transDict, itemMeaning = getTransList(actionIdList[: 2], billTitleList[: 2])
    # # transDict[key] is the list of items recorded for that key, e.g. [1, 2, 3]
    # transDict, itemMeaning = getTransList(actionIdList, billTitleList)
    # # collect the full data set
    # dataSet = [transDict[key] for key in transDict.keys()]
    # L, supportData = apriori(dataSet, minSupport=0.3)
    # rules = generateRules(L, supportData, minConf=0.95)
    # print rules

    # Project: find features that poisonous mushrooms have in common.
    # Load the full data set; 'with' closes the file once it has been read.
    with open("testData/Apriori_mushroom.dat") as fr:
        dataSet = [line.split() for line in fr]
    L, supportData = apriori(dataSet, minSupport=0.3)
    # '2' marks a poisonous mushroom, '1' an edible one.
    # Listing the frequent item sets that contain '2' shows which features
    # frequently occur together with "poisonous".
    # The slice covers L[1] and L[2] and tolerates a shorter L.
    for itemSets in L[1:3]:
        for item in itemSets:
            # A str argument is iterated character by character, so '2'
            # here is equivalent to the one-element collection ['2'].
            if item.intersection('2'):
                print(item)


if __name__ == "__main__":
    main()
def pntRules(ruleList, itemMeaning):
    """Print association rules using human-readable item labels.

    Args:
        ruleList: iterable of rule tuples; element 0 and element 1 are
            iterables of items (printed before and after the arrow --
            presumably the rule's antecedent and consequent), element 2 is
            the confidence as a number.
        itemMeaning: lookup from an item to its label, indexed as
            ``itemMeaning[item]``.

    Output per rule: one label per line for element 0, an arrow line, one
    label per line for element 1, the confidence, then a blank line.
    """
    for ruleTup in ruleList:
        for item in ruleTup[0]:
            print(itemMeaning[item])
        print(" -------->")
        for item in ruleTup[1]:
            print(itemMeaning[item])
        print("confidence: %f" % ruleTup[2])
        # Blank separator line. print('') rather than print(): without the
        # print_function import, Python 2 would render print() as "()".
        print('')
# from time import sleep
# from votesmart import votesmart
# votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
# #votesmart.apikey = 'get your api key first'
# def getActionIds():
# actionIdList = []; billTitleList = []
# fr = open('recent20bills.txt')
# for line in fr.readlines():
# billNum = int(line.split('\t')[0])
# try:
# billDetail = votesmart.votes.getBill(billNum) #api call
# for action in billDetail.actions:
# if action.level == 'House' and \
# (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
# actionId = int(action.actionId)
# print 'bill: %d has actionId: %d' % (billNum, actionId)
# actionIdList.append(actionId)
# billTitleList.append(line.strip().split('\t')[1])
# except:
# print "problem getting bill %d" % billNum
# sleep(1) #delay to be polite
# return actionIdList, billTitleList
#
# def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
# itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
# for billTitle in billTitleList:#fill up itemMeaning list
# itemMeaning.append('%s -- Nay' % billTitle)
# itemMeaning.append('%s -- Yea' % billTitle)
# transDict = {}#list of items in each transaction (politician)
# voteCount = 2
# for actionId in actionIdList:
# sleep(3)
# print 'getting votes for actionId: %d' % actionId
# try:
# voteList = votesmart.votes.getBillActionVotes(actionId)
# for vote in voteList:
# if not transDict.has_key(vote.candidateName):
# transDict[vote.candidateName] = []
# if vote.officeParties == 'Democratic':
# transDict[vote.candidateName].append(1)
# elif vote.officeParties == 'Republican':
# transDict[vote.candidateName].append(0)
# if vote.action == 'Nay':
# transDict[vote.candidateName].append(voteCount)
# elif vote.action == 'Yea':
# transDict[vote.candidateName].append(voteCount + 1)
# except:
# print "problem getting actionId: %d" % actionId
# voteCount += 2
# return transDict, itemMeaning

8124
testData/Apriori_mushroom.dat Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,20 @@
12939 Prohibiting Federal Funding of National Public Radio
12940 Removing Troops from Afghanistan
12830 Prioritizing Payment of Public Debt
12857 Calling for a Balanced Budget Constitutional Amendment
12988 Terminating the Home Affordable Modification Program
12040 Repealing Business Transaction Reporting Requirements
12465 Repealing the Health Care Bill
11451 Science and Technology Funding
11364 Credit Default Swap Regulations
11820 "Whistleblower Protection" for Offshore Oil Workers
12452 Treaty with Russia to Reduce and Limit Offensive Arms
11318 Derivatives Regulation Modifications
11414 Repealing "Don't Ask, Don't Tell" After Military Review and Certification
11719 Unemployment Benefits Extension
11205 Prohibiting 2010- 2011 Congressional Cost-of-Living Pay Increase
12747 Prohibiting Use of Federal Funds For Planned Parenthood
12792 Reducing Federal Funding of the US Institute of Peace
12827 Prohibiting the Use of Federal Funds for NASCAR Sponsorships
12445 Mine Safety Act
12049 2010-2011 Defense Authorizations