2017-03-18_Add course annotations for discussion

This commit is contained in:
jiangzhonglian
2017-03-18 20:55:22 +08:00
parent 096dd4c516
commit fc481871c4
4 changed files with 74 additions and 35 deletions

View File

@@ -23,6 +23,8 @@ def createDataSet():
def classify0(inX, dataSet, labels, k):
"""
inx[1,2,3]
DS=[[1,2,3],[1,2,0]]
inX: 用于分类的输入向量
dataSet: 输入的训练样本集
labels: 标签向量
@@ -36,6 +38,10 @@ def classify0(inX, dataSet, labels, k):
dataSetSize = dataSet.shape[0]
# tile builds a matrix the same shape as the training set, then subtracts the training samples
diffMat = tile(inX, (dataSetSize, 1)) - dataSet
"""
[[1,2,3],[1,2,3]]-[[1,2,3],[1,2,0]]
(A1-A2)^2+(B1-B2)^2+(c1-c2)^2
"""
# square the differences
sqDiffMat = diffMat ** 2
# sum each row of the matrix
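A minimal sketch (not part of this commit) of the distance step above, assuming NumPy: tile the input, subtract the training samples, square, sum per row, and take the square root.

from numpy import tile, array

inX = array([1, 2, 3])
dataSet = array([[1, 2, 3], [1, 2, 0]])
diffMat = tile(inX, (dataSet.shape[0], 1)) - dataSet  # [[0,0,0],[0,0,3]]
sqDistances = (diffMat ** 2).sum(axis=1)              # [0, 9]
distances = sqDistances ** 0.5                        # [0., 3.]
print(distances)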
@@ -64,7 +70,7 @@ def test1():
group, labels = createDataSet()
print str(group)
print str(labels)
print classify0([0, 0], group, labels, 3)
print classify0([0.1, 0.1], group, labels, 3)
# ----------------------------------------------------------------------------------------
@@ -119,7 +125,7 @@ def datingClassTest():
"""
hoRatio = 0.9 # hold-out ratio: this fraction of the data is used for testing, the rest as training samples
# load the data from file
datingDataMat, datingLabels = file2matrix('../../../testData/datingTestSet2.txt') # load data set from file
datingDataMat, datingLabels = file2matrix('testData/datingTestSet2.txt') # load data set from file
# normalize the data
normMat, ranges, minVals = autoNorm(datingDataMat)
m = normMat.shape[0]
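A hedged sketch of what autoNorm is assumed to do here: min-max scale every column of the feature matrix into [0, 1] (illustrative values, not the commit's code).

from numpy import array

dataSet = array([[10.0, 2.0], [20.0, 4.0], [30.0, 6.0]])
minVals = dataSet.min(0)
ranges = dataSet.max(0) - minVals
normMat = (dataSet - minVals) / ranges  # each column now lies in [0, 1]
print(normMat, ranges, minVals)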
@@ -153,7 +159,7 @@ def img2vector(filename):
def handwritingClassTest():
# 1. load the data
hwLabels = []
trainingFileList = listdir('../../../testData/trainingDigits') # load the training set
trainingFileList = listdir('testData/trainingDigits') # load the training set
m = len(trainingFileList)
trainingMat = zeros((m, 1024))
for i in range(m):
@@ -161,17 +167,17 @@ def handwritingClassTest():
fileStr = fileNameStr.split('.')[0] # take off .txt
classNumStr = int(fileStr.split('_')[0])
hwLabels.append(classNumStr)
trainingMat[i, :] = img2vector('../../../testData/trainingDigits/%s' % fileNameStr)
trainingMat[i, :] = img2vector('testData/trainingDigits/%s' % fileNameStr)
# 2. load the test data
testFileList = listdir('../../../testData/testDigits') # iterate through the test set
testFileList = listdir('testData/testDigits') # iterate through the test set
errorCount = 0.0
mTest = len(testFileList)
for i in range(mTest):
fileNameStr = testFileList[i]
fileStr = fileNameStr.split('.')[0] # take off .txt
classNumStr = int(fileStr.split('_')[0])
vectorUnderTest = img2vector('../../../testData/testDigits/%s' % fileNameStr)
vectorUnderTest = img2vector('testData/testDigits/%s' % fileNameStr)
classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
if (classifierResult != classNumStr): errorCount += 1.0
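For context, a hedged sketch of img2vector, assuming each digit file holds 32 lines of 32 '0'/'1' characters that are flattened into a 1x1024 vector (the _sketch name is illustrative, not the repo's function).

from numpy import zeros

def img2vector_sketch(filename):
    returnVect = zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect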

View File

@@ -31,6 +31,7 @@ def createDataSet():
# ['no'],
# ['no'],
# ['no']]
# labels: no surfacing (can it survive without coming to the surface), flippers (does it have flippers)
labels = ['no surfacing', 'flippers']
# change to discrete values
return dataSet, labels
@@ -122,10 +123,9 @@ def chooseBestFeatureToSplit(dataSet):
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet)/float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
# a larger gain (information gain) means this split provides more information, i.e. the feature leaves less uncertainty about the class
# in other words: the more classes a grouped column separates, the more information it carries, the lower the Shannon entropy after the split, and so the larger the gain
# gain (information gain): the change in entropy before and after splitting the data set; keep the split with the largest gain
infoGain = baseEntropy - newEntropy
# print 'infoGain=', infoGain, 'bestFeature=', i
print 'infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeature = i
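A small worked example (not from the commit) of infoGain = baseEntropy - newEntropy, assuming calcShannonEnt is the usual base-2 Shannon entropy over the class column.

from math import log

def entropy(labels):
    counts = {}
    for lab in labels:
        counts[lab] = counts.get(lab, 0) + 1
    n = float(len(labels))
    return -sum((c / n) * log(c / n, 2) for c in counts.values())

labels = ['yes', 'yes', 'no', 'no', 'no']
baseEntropy = entropy(labels)                      # ~0.971
# a split that separates the data into [yes, yes, no] and [no, no]:
newEntropy = 3 / 5.0 * entropy(['yes', 'yes', 'no']) + 2 / 5.0 * entropy(['no', 'no'])
print(baseEntropy - newEntropy)                    # infoGain ~0.42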
@@ -133,7 +133,7 @@ def chooseBestFeatureToSplit(dataSet):
def majorityCnt(classList):
"""majorityCnt(选择出线次数最多的一个结果)
"""majorityCnt(选择出次数最多的一个结果)
Args:
classList: the collection of class labels (the label column)

View File

@@ -1,6 +1,10 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from numpy import *
"""
p(xy)=p(x|y)p(y)=p(y|x)p(x)
p(x|y)=p(y|x)p(x)/p(y)
"""
def loadDataSet():
@@ -8,7 +12,7 @@ def loadDataSet():
Create the data set
:return: word lists postingList, and their classes classVec
"""
postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], # each posting will later be encoded as a 0/1 word vector, e.g. [0,0,1,1,1,...]
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
@@ -37,7 +41,7 @@ def setOfWords2Vec(vocabList, inputSet):
:param inputSet: the input data set (the words of one document)
:return: match list, e.g. [0,1,0,1,...]
"""
returnVec = [0] * len(vocabList)
returnVec = [0] * len(vocabList)  # [0, 0, ...]
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] = 1
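A hedged usage sketch of this word-to-vector step, with a made-up vocabulary, showing the 0/1 match list described in the docstring.

vocabList = ['my', 'dog', 'stupid', 'help']   # made-up vocabulary
inputSet = ['help', 'my', 'dog']
returnVec = [0] * len(vocabList)
for word in inputSet:
    if word in vocabList:
        returnVec[vocabList.index(word)] = 1
print(returnVec)  # [1, 1, 0, 1]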
@@ -49,8 +53,8 @@ def setOfWords2Vec(vocabList, inputSet):
def _trainNB0(trainMatrix, trainCategory):
"""
Training, original version
:param trainMatrix: document word matrix
:param trainCategory: the category of each document
:param trainMatrix: document word matrix, e.g. [[1,0,1,1,1,...],[],[]...]
:param trainCategory: the category of each document, e.g. [0,1,1,0,...]
:return:
"""
# number of documents
@@ -60,21 +64,21 @@ def _trainNB0(trainMatrix, trainCategory):
# probability that a document is abusive
pAbusive = sum(trainCategory) / float(numTrainDocs)
# build the word-occurrence count lists
p0Num = zeros(numWords)
p1Num = zeros(numWords)
p0Num = zeros(numWords)  # [0, 0, 0, ...]
p1Num = zeros(numWords)  # [0, 0, 0, ...]
# total number of words over the whole data set
p0Denom = 0.0
p1Denom = 0.0
for i in range(numTrainDocs):
if trainCategory[i] == 1:
p1Num += trainMatrix[i]
p1Num += trainMatrix[i]  # element-wise add: [0,1,1,...] accumulates into the running counts
p1Denom += sum(trainMatrix[i])
else:
p0Num += trainMatrix[i]
p0Denom += sum(trainMatrix[i])
# for class 1, abusive documents: the list [P(F1|C1), P(F2|C1), P(F3|C1), P(F4|C1), P(F5|C1), ...]
p1Vect = p1Num / p1Denom
p1Vect = p1Num / p1Denom  # e.g. [1,2,3,5]/90 -> [1/90, 2/90, 3/90, 5/90]
# for class 0, normal documents: the list [P(F1|C0), P(F2|C0), P(F3|C0), P(F4|C0), P(F5|C0), ...]
p0Vect = p0Num / p0Denom
return p0Vect, p1Vect, pAbusive
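A numeric sketch (made-up counts) of the division that produces p1Vect: per-word counts in abusive documents divided by the total abusive word count.

from numpy import array

p1Num = array([1.0, 2.0, 3.0, 5.0])  # made-up per-word counts in abusive documents
p1Denom = 90.0                       # made-up total word count in abusive documents
p1Vect = p1Num / p1Denom             # ~[0.011, 0.022, 0.033, 0.056] = [P(F1|C1), P(F2|C1), ...]
print(p1Vect)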
@@ -94,7 +98,7 @@ def trainNB0(trainMatrix, trainCategory):
# probability that a document is abusive
pAbusive = sum(trainCategory) / float(numTrainDocs)
# build the word-occurrence count lists
p0Num = ones(numWords)
p0Num = ones(numWords)  # [0,0,...] -> [1,1,1,1,1,...]
p1Num = ones(numWords)
# total word count over the whole data set; 2.0 is the denominator's starting value, adjusted according to the samples / the actual data
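A hedged sketch of why the counts start from ones() rather than zeros(): a single zero conditional probability would zero out the whole product when the per-word probabilities are multiplied, and adding 1 to every count (with the denominator bumped to 2.0) is a Laplace-style smoothing.

from numpy import array

raw = array([0.0, 3.0, 1.0])                  # made-up word counts for one class
print((raw / raw.sum()).prod())               # 0.0 -- the zero entry kills the product
smoothed = raw + 1.0                          # what ones(numWords) effectively does
print((smoothed / (raw.sum() + 2.0)).prod())  # small, but no longer zero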
@@ -117,7 +121,7 @@ def trainNB0(trainMatrix, trainCategory):
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
"""
Apply the trained classifier
:param vec2Classify: the data to classify
:param vec2Classify: the data to classify, e.g. [0,1,1,1,1,...]
:param p0Vec: for class 0, normal documents: the list [log(P(F1|C0)), log(P(F2|C0)), log(P(F3|C0)), log(P(F4|C0)), log(P(F5|C0)), ...]
:param p1Vec: for class 1, abusive documents: the list [log(P(F1|C1)), log(P(F2|C1)), log(P(F3|C1)), log(P(F4|C1)), log(P(F5|C1)), ...]
:param pClass1: the probability of class 1, i.e. of an abusive document
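A hedged sketch (illustrative numbers) of the log-space decision rule classifyNB is assumed to apply: multiply the word indicators by the log conditional probabilities, sum, add the log class prior, and pick the larger score.

from numpy import array, log

vec2Classify = array([0, 1, 1])
p0Vec = log(array([0.4, 0.3, 0.3]))  # log(P(Fi|C0)), made up
p1Vec = log(array([0.2, 0.5, 0.3]))  # log(P(Fi|C1)), made up
pClass1 = 0.5
p1 = (vec2Classify * p1Vec).sum() + log(pClass1)
p0 = (vec2Classify * p0Vec).sum() + log(1.0 - pClass1)
print(1 if p1 > p0 else 0)  # 1: the abusive-class score is larger here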

View File

@@ -1,3 +1,6 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Jun 14, 2011
FP-Growth FP means frequent pattern
@@ -22,15 +25,41 @@ class treeNode:
self.children = {}
def inc(self, numOccur):
"""inc(对count变量增加给定值)
"""
self.count += numOccur
def disp(self, ind=1):
"""disp(用于将树以文本形式显示)
"""
print ' '*ind, self.name, ' ', self.count
for child in self.children.values():
child.disp(ind+1)
def loadSimpDat():
simpDat = [['r', 'z', 'h', 'j', 'p'],
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat
def createInitSet(dataSet):
retDict = {}
for trans in dataSet:
retDict[frozenset(trans)] = 1
return retDict
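For orientation, a sketch of the dict shape createInitSet produces from the sample transactions: each transaction becomes a frozenset key with an initial count of 1 (exact repr and ordering vary by Python version).

simpDat = [['r', 'z', 'h', 'j', 'p'], ['z'], ['z', 'y', 'x', 'w']]
retDict = {}
for trans in simpDat:
    retDict[frozenset(trans)] = 1
print(retDict)  # e.g. {frozenset(['r','z','h','j','p']): 1, frozenset(['z']): 1, frozenset(['z','y','x','w']): 1}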
def createTree(dataSet, minSup=1): # create FP-tree from dataset but don't mine
"""createTree(build the FP-tree and its header table from dataSet; mining happens later)
"""
headerTable = {}
#go over dataSet twice
for trans in dataSet:  # first pass counts frequency of occurrence
@@ -76,6 +105,21 @@ def updateHeader(nodeToTest, targetNode): #this version does not use recursion
nodeToTest.nodeLink = targetNode
if __name__ == "__main__":
rootNode = treeNode('pyramid', 9, None)
rootNode.children['eye'] = treeNode('eye', 13, None)
rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
# display the tree as text
# print rootNode.disp()
# load the sample data
simpDat = loadSimpDat()
print simpDat
# reload as frozensets: format the sample data into a dict that stores each transaction and its count
initSet = createInitSet(simpDat)
print initSet
def ascendTree(leafNode, prefixPath): #ascends from leaf node to root
if leafNode.parent != None:
prefixPath.append(leafNode.name)
@@ -111,21 +155,6 @@ def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
def loadSimpDat():
simpDat = [['r', 'z', 'h', 'j', 'p'],
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat
def createInitSet(dataSet):
retDict = {}
for trans in dataSet:
retDict[frozenset(trans)] = 1
return retDict
import twitter