频繁项集->关联规则的代码测试完毕

This commit is contained in:
jiangzhonglian
2017-03-17 00:08:16 +08:00
parent 8f25688d54
commit cade394e0e
3 changed files with 92 additions and 37 deletions

View File

@@ -87,12 +87,13 @@ def aprioriGen(Lk, k):
# if first k-2 elements are equal
if L1 == L2:
# set union
print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
retList.append(Lk[i] | Lk[j])
return retList
def apriori(dataSet, minSupport=0.5):
"""aprioriGen(循环数据集,然后进行两两合并)
"""apriori
Args:
dataSet 原始数据集
@@ -107,7 +108,7 @@ def apriori(dataSet, minSupport=0.5):
# 计算支持support L1表示满足support的key, supportData表示全集的集合
L1, supportData = scanD(D, C1, minSupport)
# print "L1=", L1, "\n", "outcome: ", supportData
print "L1=", L1, "\n", "outcome: ", supportData
L = [L1]
k = 2
@@ -117,6 +118,7 @@ def apriori(dataSet, minSupport=0.5):
# print '-----------', D, Ck
# 计算合并后的数据集的支持度
# Lk满足支持度的key的list supK表示key全集
# print 'Ck', Ck
Lk, supK = scanD(D, Ck, minSupport)
# 如果字典没有,就追加元素,如果有,就更新元素
supportData.update(supK)
@@ -129,6 +131,83 @@ def apriori(dataSet, minSupport=0.5):
return L, supportData
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Score candidate consequents of one frequent itemset and keep the confident rules.

    For every consequent ``conseq`` in ``H`` the rule
    ``(freqSet - conseq) --> conseq`` is scored as
    ``support(freqSet) / support(freqSet - conseq)``.

    Args:
        freqSet: frozenset, the frequent itemset the rules are derived from.
        H: iterable of frozensets, the candidate consequents to test.
        supportData: mapping from itemset (frozenset) to its support value.
        brl: list collecting accepted rules; each accepted rule is appended
            in place as an ``(antecedent, consequent, confidence)`` tuple.
        minConf: minimum confidence a rule needs to be accepted.
    Returns:
        list of the consequents from ``H`` whose rule reached ``minConf``,
        in the order they were visited.
    Raises:
        KeyError: if ``freqSet`` or an antecedent is missing from ``supportData``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        # float() forces true division: under Python 2, int / int would truncate
        # the confidence to 0 or 1 if the supports were stored as integers.
        conf = float(supportData[freqSet]) / supportData[antecedent]
        if conf >= minConf:
            # Single-argument print: same text under Python 2 and Python 3.
            print('{0} --> {1} conf: {2}'.format(antecedent, conseq, conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively derive rules from one frequent itemset with growing consequents.

    The consequents in ``H`` are scored first; the ones that pass are merged
    into consequents one item larger and the process repeats. Scoring the
    current level before merging matters: merging first would skip the
    1-item consequents whenever ``freqSet`` has three or more items.

    Args:
        freqSet: frozenset, the frequent itemset the rules are derived from.
        H: list of frozensets, candidate consequents, all of the same size.
        supportData: mapping from itemset (frozenset) to its support value.
        brl: list collecting accepted rules; filled in place through calcConf.
        minConf: minimum confidence a rule needs to be accepted.
    Returns:
        None. Accepted rules are appended to ``brl``.
    """
    # Nothing to evaluate (also protects the H[0] lookup below).
    if not H:
        return
    m = len(H[0])
    # A consequent must leave at least one item for the antecedent.
    if len(freqSet) > m:
        Hmp1 = calcConf(freqSet, H, supportData, brl, minConf)
        print('Hmp1= {0}'.format(Hmp1))
        # At least two surviving consequents are needed to merge into larger ones.
        if len(Hmp1) > 1:
            merged = aprioriGen(Hmp1, m + 1)
            # The merge may produce no candidates; stop instead of recursing on [].
            if merged:
                rulesFromConseq(freqSet, merged, supportData, brl, minConf)
def generateRules(L, supportData, minConf=0.7):
    """Build association rules from the frequent itemsets.

    Args:
        L: list of lists of frozensets; ``L[i]`` is one group of frequent
            itemsets. Group 0 is skipped and only groups from index 1 on
            are turned into rules.
        supportData: mapping from itemset (frozenset) to its support value.
        minConf: minimum confidence a rule needs to be accepted.
    Returns:
        list of ``(antecedent, consequent, confidence)`` tuples.
    """
    rules = []
    for level, itemsets in enumerate(L):
        # The first group is never used for rule generation.
        if level == 0:
            continue
        for freqSet in itemsets:
            # Start from every single item of the itemset as a consequent.
            singles = [frozenset([elem]) for elem in freqSet]
            if level <= 1:
                # Group 1: score the single-item consequents directly.
                calcConf(freqSet, singles, supportData, rules, minConf)
            else:
                # Later groups: let rulesFromConseq grow the consequents.
                rulesFromConseq(freqSet, singles, supportData, rules, minConf)
    return rules
def main():
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# 1.收集并准备数据
@@ -139,8 +218,10 @@ def main():
print(dataSet)
# 调用 apriori 做购物篮分析
# 支持度满足阈值的key集合L和所有key的全集suppoerData
L, supportData = apriori(dataSet, minSupport=0.7)
print L, supportData
L, supportData = apriori(dataSet, minSupport=0.5)
# print L, supportData
print '\ngenerateRules\n'
generateRules(L, supportData, minConf=0.05)
if __name__ == "__main__":
@@ -171,39 +252,6 @@ if __name__ == "__main__":
def generateRules(L, supportData, minConf=0.7):
    """Build association rules from the frequent itemsets.

    Args:
        L: list of lists of frozensets; only the groups from index 1 on
            (sets with two or more items) are turned into rules.
        supportData: mapping from itemset (frozenset) to its support value,
            as produced by scanD.
        minConf: minimum confidence a rule needs to be accepted.
    Returns:
        list of ``(antecedent, consequent, confidence)`` tuples.
    """
    collected = []
    level = 1
    while level < len(L):
        for freqSet in L[level]:
            # Every single item of the itemset is an initial consequent.
            consequents = [frozenset([member]) for member in freqSet]
            if level > 1:
                rulesFromConseq(freqSet, consequents, supportData, collected, minConf)
            else:
                calcConf(freqSet, consequents, supportData, collected, minConf)
        level += 1
    return collected
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Score candidate consequents of one frequent itemset and keep the confident rules.

    Each consequent ``conseq`` in ``H`` defines the rule
    ``(freqSet - conseq) --> conseq`` with confidence
    ``support(freqSet) / support(freqSet - conseq)``.

    Args:
        freqSet: frozenset, the frequent itemset the rules are derived from.
        H: iterable of frozensets, the candidate consequents to test.
        supportData: mapping from itemset (frozenset) to its support value.
        brl: list collecting accepted rules; each accepted rule is appended
            in place as an ``(antecedent, consequent, confidence)`` tuple.
        minConf: minimum confidence a rule needs to be accepted.
    Returns:
        A new list with the consequents whose rule reached ``minConf``.
    Raises:
        KeyError: if ``freqSet`` or an antecedent is missing from ``supportData``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        # float() forces true division: under Python 2, int / int would truncate
        # the confidence if the supports were stored as integers.
        conf = float(supportData[freqSet]) / supportData[antecedent]
        if conf >= minConf:
            # Single-argument print: same text under Python 2 and Python 3.
            print('{0} --> {1} conf: {2}'.format(antecedent, conseq, conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively derive rules from one frequent itemset with growing consequents.

    The consequents in ``H`` are scored first; the survivors are merged into
    consequents one item larger and the process repeats. Scoring before
    merging keeps the 1-item consequents, which would otherwise be skipped
    for itemsets with three or more items.

    Args:
        freqSet: frozenset, the frequent itemset the rules are derived from.
        H: list of frozensets, candidate consequents, all of the same size.
        supportData: mapping from itemset (frozenset) to its support value.
        brl: list collecting accepted rules; filled in place through calcConf.
        minConf: minimum confidence a rule needs to be accepted.
    Returns:
        None. Accepted rules are appended to ``brl``.
    """
    # Nothing to evaluate (also protects the H[0] lookup below).
    if not H:
        return
    m = len(H[0])
    # A consequent must leave at least one item for the antecedent.
    if len(freqSet) <= m:
        return
    survivors = calcConf(freqSet, H, supportData, brl, minConf)
    # Need at least two sets to merge into larger consequents.
    if len(survivors) > 1:
        Hmp1 = aprioriGen(survivors, m + 1)
        # The merge may produce no candidates; stop instead of recursing on [].
        if Hmp1:
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
def pntRules(ruleList, itemMeaning):
for ruleTup in ruleList:
for item in ruleTup[0]: