diff --git a/docs/12.使用FP-growth算法来高效发现频繁项集.md b/docs/12.使用FP-growth算法来高效发现频繁项集.md index adcb7cb3..437fc99a 100644 --- a/docs/12.使用FP-growth算法来高效发现频繁项集.md +++ b/docs/12.使用FP-growth算法来高效发现频繁项集.md @@ -1,7 +1,7 @@ -# 12.使用FP-growth算法来高效发现频繁项集 # +# 12) 使用FP-growth算法来高效发现频繁项集 -**- 基本过程** +## 基本过程 - 构建FP树 * 对原始数据集扫描两遍 @@ -9,13 +9,21 @@ * 第二遍只扫描频繁元素。 - 从FP树种挖掘频繁项集 -**FP树介绍** - - 是一种紧凑的数据结构,FP代表频繁模式(Frequent Pattem)每个项集以路径的方式存储在树中。 - 包含:项集【集合中的单个元素+出现次数+父节点】 +## FP树介绍 +* FP-growth算法是将数据存储在一种称为FP树的紧凑的数据结构中,FP代表频繁模式(Frequent Pattem)每个项集以路径的方式存储在树中。 +* 包含:项集【集合中的单个元素+出现次数+父节点】 * 与其他树结构相比 * 它通过链接(link)来连接相似元素,被连起来的元素项可以看成一个链表。 * 一个元素项可以出现多次 + * 相似项之间的链接即`节点链接`(node link), 用于快速发现相似项的位置。 +## FP-growth算法 特点 +* 优点: 一般要快于Apriori。(通常性能要好两个数量级以上) +* 缺点: 实现比较困难,在某些数据集上性能会下降。 +* 适用数据类型:标称型数据(离散型数据)。 + +## 项目实战 +* 1.从Twitter文本流中挖掘常用词。 +* 2.从网民页面浏览行为中挖掘常见模式。 diff --git a/src/python/03.DecisionTree/DecisionTree.py b/src/python/03.DecisionTree/DecisionTree.py index 2425803d..dce6eb25 100644 --- a/src/python/03.DecisionTree/DecisionTree.py +++ b/src/python/03.DecisionTree/DecisionTree.py @@ -20,8 +20,6 @@ def createDataSet(): 无需传入参数 Returns: 返回数据集和对应的label标签 - Raises: - """ dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], @@ -44,9 +42,7 @@ def calcShannonEnt(dataSet): Args: dataSet 数据集 Returns: - 返回香农熵的计算值 - Raises: - + 返回 每一组feature下的某个分类下,香农熵的信息期望 """ # 求list的长度,表示计算参与训练的数据量 numEntries = len(dataSet) @@ -81,8 +77,6 @@ def splitDataSet(dataSet, axis, value): value 表示axis列对应的value值 Returns: axis列为value的数据集【该数据集需要排除axis列】 - Raises: - """ retDataSet = [] for featVec in dataSet: @@ -106,10 +100,8 @@ def chooseBestFeatureToSplit(dataSet): dataSet 数据集 Returns: bestFeature 最优的特征列 - Raises: - """ - # 求第一行有多少列的 Feature + # 求第一行有多少列的 Feature, 最后一列是label列嘛 numFeatures = len(dataSet[0]) - 1 # label的信息熵 baseEntropy = calcShannonEnt(dataSet) @@ -147,8 +139,6 @@ def majorityCnt(classList): classList label列的集合 Returns: bestFeature 最优的特征列 - Raises: - """ classCount = {} for vote in classList: @@ -172,6 +162,7 @@ def createTree(dataSet, labels): # 选择最优的列,得到最有列对应的label含义 bestFeat = chooseBestFeatureToSplit(dataSet) + # 获取label的名称 bestFeatLabel = labels[bestFeat] # 初始化myTree myTree = {bestFeatLabel: {}} @@ -190,16 +181,26 @@ def createTree(dataSet, labels): def classify(inputTree, featLabels, testVec): - # 获取tree的第一个节点对应的key值 + """classify(给输入的节点,进行分类) + + Args: + inputTree 决策树模型 + featLabels label标签对应的名称 + testVec 测试输入的数据 + Returns: + classLabel 分类的结果值,需要映射label才能知道名称 + """ + # 获取tree的根节点对于的key值 firstStr = inputTree.keys()[0] - # 获取第一个节点对应的value值 + # 通过key得到根节点对应的value secondDict = inputTree[firstStr] - # 判断根节点的索引值,然后根据testVec来获取对应的树分枝位置 + # 判断根节点名称获取根节点在label中的先后顺序,这样就知道输入的testVec怎么开始对照树来做分类 featIndex = featLabels.index(firstStr) + # 测试数据,找到根节点对应的label位置,也就知道从输入的数据的第几位来开始分类 key = testVec[featIndex] valueOfFeat = secondDict[key] print '+++', firstStr, 'xxx', secondDict, '---', key, '>>>', valueOfFeat - # 判断分枝是否结束 + # 判断分枝是否结束: 判断valueOfFeat是否是dict类型 if isinstance(valueOfFeat, dict): classLabel = classify(valueOfFeat, featLabels, testVec) else: @@ -240,7 +241,7 @@ if __name__ == "__main__": myTree = createTree(myDat, copy.deepcopy(labels)) print myTree # [1, 1]表示要取的分支上的节点位置,对应的结果值 - # print classify(myTree, labels, [1, 1]) + print classify(myTree, labels, [1, 1]) # 画图可视化展现 dtPlot.createPlot(myTree) diff --git a/src/python/03.DecisionTree/DecisionTreePlot.py b/src/python/03.DecisionTree/DecisionTreePlot.py index 3c5c4d31..59b97751 100644 --- a/src/python/03.DecisionTree/DecisionTreePlot.py +++ b/src/python/03.DecisionTree/DecisionTreePlot.py @@ -128,5 +128,5 @@ def retrieveTree(i): return listOfTrees[i] -myTree = retrieveTree(1) -createPlot(myTree) +# myTree = retrieveTree(1) +# createPlot(myTree) diff --git a/src/python/07.AdaBoost/adaboost.py b/src/python/07.AdaBoost/adaboost.py index bf144378..b256f07a 100644 --- a/src/python/07.AdaBoost/adaboost.py +++ b/src/python/07.AdaBoost/adaboost.py @@ -201,7 +201,7 @@ def plotROC(predStrengths, classLabels): ySum += cur[1] # draw line from cur to (cur[0]-delX, cur[1]-delY) # 画点连线 (x1, x2, y1, y2) - print cur[0], cur[0]-delX, cur[1], cur[1]-delY + # print cur[0], cur[0]-delX, cur[1], cur[1]-delY ax.plot([cur[0], cur[0]-delX], [cur[1], cur[1]-delY], c='b') cur = (cur[0]-delX, cur[1]-delY) # 画对角的虚线线 diff --git a/src/python/12.FrequentPattemTree/apriori.py b/src/python/12.FrequentPattemTree/apriori.py deleted file mode 100644 index e172b575..00000000 --- a/src/python/12.FrequentPattemTree/apriori.py +++ /dev/null @@ -1,12 +0,0 @@ -def loadDataSet(): - return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]] -def createC1(dataSet): - c1=[] - for transaction in dataSet: - for item in transaction: - if not [item] in c1: - c1.append([item]) - c1.sort() - return map(frozenset,c1) -def scanD(D,ck,minSupport): - ssCnt = {} \ No newline at end of file diff --git a/src/python/12.FrequentPattemTree/fpGrowth.py b/src/python/12.FrequentPattemTree/fpGrowth.py index 5e3898cc..8f992385 100644 --- a/src/python/12.FrequentPattemTree/fpGrowth.py +++ b/src/python/12.FrequentPattemTree/fpGrowth.py @@ -1,19 +1,178 @@ +''' +Created on Jun 14, 2011 +FP-Growth FP means frequent pattern +the FP-Growth algorithm needs: +1. FP-tree (class treeNode) +2. header table (use dict) + +This finds frequent itemsets similar to apriori but does not find association rules. + +@author: Peter/片刻 +''' +print(__doc__) + + class treeNode: - def __init__(self,nameValue,numOccur,parentNode): + def __init__(self, nameValue, numOccur, parentNode): self.name = nameValue self.count = numOccur self.nodeLink = None + # needs to be updated self.parent = parentNode self.children = {} - def inc(self,numOccur): + + def inc(self, numOccur): self.count += numOccur - def disp(self,ind=1): - print(' '*ind,self.name,' ',self.count) + + def disp(self, ind=1): + print ' '*ind, self.name, ' ', self.count for child in self.children.values(): child.disp(ind+1) - if __name__ == "__main__": - import fpGrowth - rootNode = fpGrowth.treeNode('pyramid',9,None) - rootNode.children['eye']=fpGrowth.treeNode('eye',13,None) - rootNode.disp() \ No newline at end of file + +def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine + headerTable = {} + #go over dataSet twice + for trans in dataSet:#first pass counts frequency of occurance + for item in trans: + headerTable[item] = headerTable.get(item, 0) + dataSet[trans] + for k in headerTable.keys(): #remove items not meeting minSup + if headerTable[k] < minSup: + del(headerTable[k]) + freqItemSet = set(headerTable.keys()) + #print 'freqItemSet: ',freqItemSet + if len(freqItemSet) == 0: return None, None #if no items meet min support -->get out + for k in headerTable: + headerTable[k] = [headerTable[k], None] #reformat headerTable to use Node link + #print 'headerTable: ',headerTable + retTree = treeNode('Null Set', 1, None) #create tree + for tranSet, count in dataSet.items(): #go through dataset 2nd time + localD = {} + for item in tranSet: #put transaction items in order + if item in freqItemSet: + localD[item] = headerTable[item][0] + if len(localD) > 0: + orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)] + updateTree(orderedItems, retTree, headerTable, count)#populate tree with ordered freq itemset + return retTree, headerTable #return tree and header table + + +def updateTree(items, inTree, headerTable, count): + if items[0] in inTree.children:#check if orderedItems[0] in retTree.children + inTree.children[items[0]].inc(count) #incrament count + else: #add items[0] to inTree.children + inTree.children[items[0]] = treeNode(items[0], count, inTree) + if headerTable[items[0]][1] == None: #update header table + headerTable[items[0]][1] = inTree.children[items[0]] + else: + updateHeader(headerTable[items[0]][1], inTree.children[items[0]]) + if len(items) > 1:#call updateTree() with remaining ordered items + updateTree(items[1::], inTree.children[items[0]], headerTable, count) + + +def updateHeader(nodeToTest, targetNode): #this version does not use recursion + while (nodeToTest.nodeLink != None): #Do not use recursion to traverse a linked list! + nodeToTest = nodeToTest.nodeLink + nodeToTest.nodeLink = targetNode + + +def ascendTree(leafNode, prefixPath): #ascends from leaf node to root + if leafNode.parent != None: + prefixPath.append(leafNode.name) + ascendTree(leafNode.parent, prefixPath) + + +def findPrefixPath(basePat, treeNode): #treeNode comes from header table + condPats = {} + while treeNode != None: + prefixPath = [] + ascendTree(treeNode, prefixPath) + if len(prefixPath) > 1: + condPats[frozenset(prefixPath[1:])] = treeNode.count + treeNode = treeNode.nodeLink + return condPats + + +def mineTree(inTree, headerTable, minSup, preFix, freqItemList): + bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]#(sort header table) + for basePat in bigL: #start from bottom of header table + newFreqSet = preFix.copy() + newFreqSet.add(basePat) + #print 'finalFrequent Item: ',newFreqSet #append to set + freqItemList.append(newFreqSet) + condPattBases = findPrefixPath(basePat, headerTable[basePat][1]) + #print 'condPattBases :',basePat, condPattBases + #2. construct cond FP-tree from cond. pattern base + myCondTree, myHead = createTree(condPattBases, minSup) + #print 'head from conditional tree: ', myHead + if myHead != None: #3. mine cond. FP-tree + #print 'conditional tree for: ',newFreqSet + #myCondTree.disp(1) + mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList) + + +def loadSimpDat(): + simpDat = [['r', 'z', 'h', 'j', 'p'], + ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], + ['z'], + ['r', 'x', 'n', 'o', 's'], + ['y', 'r', 'x', 'z', 'q', 't', 'p'], + ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']] + return simpDat + + +def createInitSet(dataSet): + retDict = {} + for trans in dataSet: + retDict[frozenset(trans)] = 1 + return retDict + + +import twitter +from time import sleep +import re + + +def textParse(bigString): + urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString) + listOfTokens = re.split(r'\W*', urlsRemoved) + return [tok.lower() for tok in listOfTokens if len(tok) > 2] + + +def getLotsOfTweets(searchStr): + CONSUMER_KEY = '' + CONSUMER_SECRET = '' + ACCESS_TOKEN_KEY = '' + ACCESS_TOKEN_SECRET = '' + api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, + access_token_key=ACCESS_TOKEN_KEY, + access_token_secret=ACCESS_TOKEN_SECRET) + #you can get 1500 results 15 pages * 100 per page + resultsPages = [] + for i in range(1,15): + print "fetching page %d" % i + searchResults = api.GetSearch(searchStr, per_page=100, page=i) + resultsPages.append(searchResults) + sleep(6) + return resultsPages + + +def mineTweets(tweetArr, minSup=5): + parsedList = [] + for i in range(14): + for j in range(100): + parsedList.append(textParse(tweetArr[i][j].text)) + initSet = createInitSet(parsedList) + myFPtree, myHeaderTab = createTree(initSet, minSup) + myFreqList = [] + mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList) + return myFreqList + + +#minSup = 3 +#simpDat = loadSimpDat() +#initSet = createInitSet(simpDat) +#myFPtree, myHeaderTab = createTree(initSet, minSup) +#myFPtree.disp() +#myFreqList = [] +#mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList) diff --git a/testResult/tree.pdf b/testResult/tree.pdf index 6cb99e20..2a7e0a8d 100644 Binary files a/testResult/tree.pdf and b/testResult/tree.pdf differ