From fc481871c4571a2e5142e203487bbc44d91daf80 Mon Sep 17 00:00:00 2001
From: jiangzhonglian
Date: Sat, 18 Mar 2017 20:55:22 +0800
Subject: [PATCH] 2017-03-18_Add course annotations for discussion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/python/02.kNN/kNN.py                     | 18 ++++--
 src/python/03.DecisionTree/DecisionTree.py   |  8 +--
 src/python/04.NaiveBayes/bayes.py            | 24 ++++----
 src/python/12.FrequentPattemTree/fpGrowth.py | 59 +++++++++++++++-----
 4 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/src/python/02.kNN/kNN.py b/src/python/02.kNN/kNN.py
index 6cdd9695..271aa621 100644
--- a/src/python/02.kNN/kNN.py
+++ b/src/python/02.kNN/kNN.py
@@ -23,6 +23,8 @@ def createDataSet():
 def classify0(inX, dataSet, labels, k):
     """
+    e.g. inX = [1,2,3]
+         dataSet = [[1,2,3],[1,2,0]]
     inX: the input vector to classify
     dataSet: the training sample set
     labels: the label vector
@@ -36,6 +38,10 @@ def classify0(inX, dataSet, labels, k):
     dataSetSize = dataSet.shape[0]
     # tile repeats inX into a matrix with one row per training sample, then subtracts the training samples
     diffMat = tile(inX, (dataSetSize, 1)) - dataSet
+    """
+    e.g. [[1,2,3],[1,2,3]] - [[1,2,3],[1,2,0]]
+    squared distance = (A1-A2)^2 + (B1-B2)^2 + (C1-C2)^2
+    """
     # square the differences
     sqDiffMat = diffMat ** 2
     # sum each row of the matrix
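As a quick check of the distance step annotated above, here is a minimal standalone NumPy sketch using the toy vectors from the docstring (made-up inputs, for illustration only):

    # Sketch of classify0's distance computation with toy inputs.
    from numpy import array, tile

    inX = array([1, 2, 3])
    dataSet = array([[1, 2, 3], [1, 2, 0]])
    # repeat inX once per training row, then subtract: [[0,0,0],[0,0,3]]
    diffMat = tile(inX, (dataSet.shape[0], 1)) - dataSet
    # row-wise (A1-A2)^2 + (B1-B2)^2 + (C1-C2)^2: [0, 9]
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5  # Euclidean distances: [0., 3.]
    print(distances)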
@@ -64,7 +70,7 @@ def test1():
     group, labels = createDataSet()
     print str(group)
     print str(labels)
-    print classify0([0, 0], group, labels, 3)
+    print classify0([0.1, 0.1], group, labels, 3)
 
 
 # ----------------------------------------------------------------------------------------
@@ -119,7 +125,7 @@ def datingClassTest():
     """
     hoRatio = 0.9  # hold-out ratio: this share of the data is used for testing, the rest as training samples
     # load the data from file
-    datingDataMat, datingLabels = file2matrix('../../../testData/datingTestSet2.txt')  # load data set from file
+    datingDataMat, datingLabels = file2matrix('testData/datingTestSet2.txt')  # load data set from file
     # normalize the data
     normMat, ranges, minVals = autoNorm(datingDataMat)
     m = normMat.shape[0]
@@ -153,7 +159,7 @@ def img2vector(filename):
 def handwritingClassTest():
     # 1. load the training data
     hwLabels = []
-    trainingFileList = listdir('../../../testData/trainingDigits')  # load the training set
+    trainingFileList = listdir('testData/trainingDigits')  # load the training set
     m = len(trainingFileList)
     trainingMat = zeros((m, 1024))
     for i in range(m):
@@ -161,17 +167,17 @@ def handwritingClassTest():
         fileStr = fileNameStr.split('.')[0]  # take off .txt
         classNumStr = int(fileStr.split('_')[0])
         hwLabels.append(classNumStr)
-        trainingMat[i, :] = img2vector('../../../testData/trainingDigits/%s' % fileNameStr)
+        trainingMat[i, :] = img2vector('testData/trainingDigits/%s' % fileNameStr)
     # 2. load the test data
-    testFileList = listdir('../../../testData/testDigits')  # iterate through the test set
+    testFileList = listdir('testData/testDigits')  # iterate through the test set
     errorCount = 0.0
     mTest = len(testFileList)
     for i in range(mTest):
         fileNameStr = testFileList[i]
         fileStr = fileNameStr.split('.')[0]  # take off .txt
         classNumStr = int(fileStr.split('_')[0])
-        vectorUnderTest = img2vector('../../../testData/testDigits/%s' % fileNameStr)
+        vectorUnderTest = img2vector('testData/testDigits/%s' % fileNameStr)
         classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
         print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
         if (classifierResult != classNumStr): errorCount += 1.0
diff --git a/src/python/03.DecisionTree/DecisionTree.py b/src/python/03.DecisionTree/DecisionTree.py
index dce6eb25..ed266377 100644
--- a/src/python/03.DecisionTree/DecisionTree.py
+++ b/src/python/03.DecisionTree/DecisionTree.py
@@ -31,6 +31,7 @@ def createDataSet():
     #            ['no'],
     #            ['no'],
     #            ['no']]
+    # labels: "no surfacing" (can it survive without coming to the surface), "flippers" (does it have flippers)
     labels = ['no surfacing', 'flippers']
     # change to discrete values
     return dataSet, labels
@@ -122,10 +123,9 @@ def chooseBestFeatureToSplit(dataSet):
             subDataSet = splitDataSet(dataSet, i, value)
             prob = len(subDataSet)/float(len(dataSet))
             newEntropy += prob * calcShannonEnt(subDataSet)
-        # the larger the gain (information gain), the more information the split provides and the less uncertain the class becomes
-        # that is: the more groups the column splits into, the more information, so the smaller the Shannon entropy and the larger the gain
+        # infoGain (information gain): the change in entropy before and after the split; keep the feature with the largest gain
         infoGain = baseEntropy - newEntropy
-        # print 'infoGain=', infoGain, 'bestFeature=', i
+        print 'infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy
         if (infoGain > bestInfoGain):
             bestInfoGain = infoGain
             bestFeature = i
@@ -133,7 +133,7 @@
 
 
 def majorityCnt(classList):
-    """majorityCnt(pick the one result that apears most often)
+    """majorityCnt(pick the one result that appears most often)
 
     Args:
         classList: the list of class labels
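A note on the infoGain annotation above: information gain is the base entropy minus the weighted entropy after the split. A small self-contained illustration (the toy dataset mirrors createDataSet; shannon_ent is a local stand-in for calcShannonEnt, not the repo's version):

    from math import log

    def shannon_ent(rows):
        # H = -sum(p * log2(p)) over the class labels in the last column
        counts = {}
        for row in rows:
            counts[row[-1]] = counts.get(row[-1], 0) + 1
        n = float(len(rows))
        return -sum(c / n * log(c / n, 2) for c in counts.values())

    data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'],
            [0, 1, 'no'], [0, 1, 'no']]
    gain = shannon_ent(data)  # base entropy, ~0.971
    for value in set(row[0] for row in data):  # split on feature 0
        subset = [row for row in data if row[0] == value]
        # subtract this branch's weighted entropy
        gain -= len(subset) / float(len(data)) * shannon_ent(subset)
    print(gain)  # information gain of feature 0, ~0.420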
diff --git a/src/python/04.NaiveBayes/bayes.py b/src/python/04.NaiveBayes/bayes.py
index a4d10d95..138e7bb1 100755
--- a/src/python/04.NaiveBayes/bayes.py
+++ b/src/python/04.NaiveBayes/bayes.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
 from numpy import *
+"""
+Bayes' rule: p(xy) = p(x|y)p(y) = p(y|x)p(x)
+hence p(x|y) = p(y|x)p(x) / p(y)
+"""
 
 
 def loadDataSet():
@@ -8,7 +12,7 @@ def loadDataSet():
     create the data set
     :return: word lists postingList, class labels classVec
     """
-    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
+    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],  # each post later becomes a word vector like [0,0,1,1,1...]
                    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
@@ -37,7 +41,7 @@ def setOfWords2Vec(vocabList, inputSet):
     :param inputSet: the input data set
     :return: a match list such as [0,1,0,1...]
     """
-    returnVec = [0] * len(vocabList)
+    returnVec = [0] * len(vocabList)  # [0,0,...]
     for word in inputSet:
         if word in vocabList:
             returnVec[vocabList.index(word)] = 1
@@ -49,8 +53,8 @@ def setOfWords2Vec(vocabList, inputSet):
 def _trainNB0(trainMatrix, trainCategory):
     """
     the original training function
-    :param trainMatrix: the document-word matrix
-    :param trainCategory: the class of each document
+    :param trainMatrix: the document-word matrix, e.g. [[1,0,1,1,1...],[],[]...]
+    :param trainCategory: the class of each document, e.g. [0,1,1,0...]
     :return:
     """
     # number of documents
     numTrainDocs = len(trainMatrix)
     # number of words
     numWords = len(trainMatrix[0])
     # probability that a document is abusive
     pAbusive = sum(trainCategory) / float(numTrainDocs)
     # build the word-occurrence count vectors
-    p0Num = zeros(numWords)
-    p1Num = zeros(numWords)
+    p0Num = zeros(numWords)  # [0,0,0,...]
+    p1Num = zeros(numWords)  # [0,0,0,...]
     # total word count over the whole data set
     p0Denom = 0.0
     p1Denom = 0.0
     for i in range(numTrainDocs):
         if trainCategory[i] == 1:
-            p1Num += trainMatrix[i]
+            p1Num += trainMatrix[i]  # accumulate per-word counts, e.g. [0,1,1,...]
             p1Denom += sum(trainMatrix[i])
         else:
             p0Num += trainMatrix[i]
             p0Denom += sum(trainMatrix[i])
     # class 1, i.e. the abusive documents' list [P(F1|C1),P(F2|C1),P(F3|C1),P(F4|C1),P(F5|C1)....]
-    p1Vect = p1Num / p1Denom
+    p1Vect = p1Num / p1Denom  # e.g. [1,2,3,5]/90 -> [1/90, 2/90, ...]
     # class 0, i.e. the normal documents' list [P(F1|C0),P(F2|C0),P(F3|C0),P(F4|C0),P(F5|C0)....]
     p0Vect = p0Num / p0Denom
     return p0Vect, p1Vect, pAbusive
@@ -94,7 +98,7 @@ def trainNB0(trainMatrix, trainCategory):
     # probability that a document is abusive
     pAbusive = sum(trainCategory) / float(numTrainDocs)
     # build the word-occurrence count vectors
-    p0Num = ones(numWords)
+    p0Num = ones(numWords)  # [0,0,...] -> [1,1,1,...]: start counts at 1 so no word gets zero probability
     p1Num = ones(numWords)
 
     # total word count over the whole data set; the 2.0 in the denominator is likewise adjusted to match the sample / observed data
@@ -117,7 +121,7 @@ def trainNB0(trainMatrix, trainCategory):
 def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
     """
     apply the classifier
-    :param vec2Classify: the data to classify
+    :param vec2Classify: the data to classify, e.g. [0,1,1,1,1...]
     :param p0Vec: class 0, the normal documents' list [log(P(F1|C0)), log(P(F2|C0)), log(P(F3|C0)), log(P(F4|C0)), log(P(F5|C0))....]
     :param p1Vec: class 1, the abusive documents' list [log(P(F1|C1)), log(P(F2|C1)), log(P(F3|C1)), log(P(F4|C1)), log(P(F5|C1))....]
     :param pClass1: the probability that a document is class 1 (abusive)
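To make the classifyNB docstring above concrete, a toy sketch of the decision rule in log space (the probability vectors below are invented for illustration, not trained values):

    from numpy import array, log

    vec2Classify = array([0, 1, 1])      # word-presence vector of the document to classify
    p1Vec = log(array([0.1, 0.4, 0.5]))  # made-up P(Fi|C1) for the abusive class
    p0Vec = log(array([0.5, 0.3, 0.2]))  # made-up P(Fi|C0) for the normal class
    pClass1 = 0.5                        # prior P(C1)

    # a sum of logs is the log of the product P(F1|C)P(F2|C)...P(C)
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    print(1 if p1 > p0 else 0)           # prints 1 here: the abusive class wins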
diff --git a/src/python/12.FrequentPattemTree/fpGrowth.py b/src/python/12.FrequentPattemTree/fpGrowth.py
index 8f992385..75f07aea 100644
--- a/src/python/12.FrequentPattemTree/fpGrowth.py
+++ b/src/python/12.FrequentPattemTree/fpGrowth.py
@@ -1,3 +1,6 @@
+#!/usr/bin/python
+# coding:utf8
+
 '''
 Created on Jun 14, 2011
 FP-Growth FP means frequent pattern
@@ -22,15 +25,41 @@ class treeNode:
         self.children = {}
 
     def inc(self, numOccur):
+        """inc(increase the count by the given amount)
+
+        """
         self.count += numOccur
 
     def disp(self, ind=1):
+        """disp(display the tree as plain text)
+
+        """
         print ' '*ind, self.name, ' ', self.count
         for child in self.children.values():
             child.disp(ind+1)
 
 
+def loadSimpDat():
+    simpDat = [['r', 'z', 'h', 'j', 'p'],
+               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
+               ['z'],
+               ['r', 'x', 'n', 'o', 's'],
+               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
+               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
+    return simpDat
+
+
+def createInitSet(dataSet):
+    retDict = {}
+    for trans in dataSet:
+        retDict[frozenset(trans)] = 1
+    return retDict
+
+
 def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine
+    """createTree(build the FP-tree from dataSet; counting only, no mining here)
+
+    """
     headerTable = {}
     #go over dataSet twice
     for trans in dataSet:#first pass counts frequency of occurrence
@@ -76,6 +105,21 @@ def updateHeader(nodeToTest, targetNode):   #this version does not use recursion
     nodeToTest.nodeLink = targetNode
 
 
+if __name__ == "__main__":
+    rootNode = treeNode('pyramid', 9, None)
+    rootNode.children['eye'] = treeNode('eye', 13, None)
+    rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
+    # display the tree as text
+    # print rootNode.disp()
+
+    # load the sample data
+    simpDat = loadSimpDat()
+    print simpDat
+    # repackage the sample data as frozensets; a dict stores each transaction and its count
+    initSet = createInitSet(simpDat)
+    print initSet
+
+
 def ascendTree(leafNode, prefixPath): #ascends from leaf node to root
     if leafNode.parent != None:
         prefixPath.append(leafNode.name)
@@ -111,21 +155,6 @@ def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
             mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
 
 
-def loadSimpDat():
-    simpDat = [['r', 'z', 'h', 'j', 'p'],
-               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
-               ['z'],
-               ['r', 'x', 'n', 'o', 's'],
-               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
-               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
-    return simpDat
-
-
-def createInitSet(dataSet):
-    retDict = {}
-    for trans in dataSet:
-        retDict[frozenset(trans)] = 1
-    return retDict
 
 import twitter
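Assuming createTree returns the tree plus its header table, as in the Machine Learning in Action version this file follows (its return statement lies outside the hunks above), the usual end-to-end driver for this module is a few lines; minSup=3 matches the sample data:

    # Sketch: build the FP-tree from the sample data, then mine it.
    simpDat = loadSimpDat()
    initSet = createInitSet(simpDat)                 # {frozenset(transaction): count}
    myFPtree, myHeaderTab = createTree(initSet, 3)   # build the tree, minimum support 3
    myFPtree.disp()                                  # print the tree as text
    freqItems = []
    mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
    print(freqItems)                                 # all frequent itemsets found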