Mirror of https://github.com/apachecn/ailearning.git
2017-03-18_Add course annotations for discussion
@@ -23,6 +23,8 @@ def createDataSet():
def classify0(inX, dataSet, labels, k):
    """
    inX = [1,2,3]
    DS = [[1,2,3],[1,2,0]]
    inX: the input vector to classify
    dataSet: the training sample set
    labels: the label vector
@@ -36,6 +38,10 @@ def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    # tile repeats inX into a matrix with the same shape as the training set, then subtracts the training samples
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    """
    [[1,2,3],[1,2,3]] - [[1,2,3],[1,2,0]]
    (A1-A2)^2 + (B1-B2)^2 + (C1-C2)^2
    """
    # square the differences
    sqDiffMat = diffMat ** 2
    # sum each row of the matrix
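For readers following the comments in this hunk, here is a minimal, self-contained sketch of the distance computation that classify0 performs. The names follow the diff; the sorting and voting steps at the end are an assumption based on the standard kNN flow, since they fall outside this hunk.

import numpy as np

def knn_distance_sketch(inX, dataSet, labels, k):
    # Repeat inX once per training sample, then subtract the training matrix.
    dataSetSize = dataSet.shape[0]
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    # Square the differences, sum each row, take the square root:
    # sqrt((A1-A2)^2 + (B1-B2)^2 + (C1-C2)^2)
    distances = np.sqrt((diffMat ** 2).sum(axis=1))
    # Vote among the k nearest labels (assumed step, not shown in this hunk).
    nearest = distances.argsort()[:k]
    votes = {}
    for idx in nearest:
        votes[labels[idx]] = votes.get(labels[idx], 0) + 1
    return max(votes, key=votes.get)

# Example matching the docstring: inX = [1,2,3], DS = [[1,2,3],[1,2,0]]
# knn_distance_sketch(np.array([1, 2, 3]), np.array([[1, 2, 3], [1, 2, 0]]), ['A', 'B'], 1)  # -> 'A'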
@@ -64,7 +70,7 @@ def test1():
    group, labels = createDataSet()
    print str(group)
    print str(labels)
    print classify0([0, 0], group, labels, 3)
    print classify0([0.1, 0.1], group, labels, 3)


# ----------------------------------------------------------------------------------------
@@ -119,7 +125,7 @@ def datingClassTest():
    """
    hoRatio = 0.9  # hold-out ratio: part of the data is used for testing, the rest as training samples
    # load the data from file
    datingDataMat, datingLabels = file2matrix('../../../testData/datingTestSet2.txt')  # load data set from file
    datingDataMat, datingLabels = file2matrix('testData/datingTestSet2.txt')  # load data set from file
    # normalize the data
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
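autoNorm is only called here; its body is not part of this diff. A plausible minimal sketch of the usual min-max normalization it performs (returning the normalized matrix, the per-column ranges, and the per-column minimums) is shown below. Only the call signature comes from the code above; the body is an assumption.

import numpy as np

def autoNorm_sketch(dataSet):
    # Per-column minimum and range.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Scale every value to [0, 1]: (x - min) / (max - min).
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals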
@@ -153,7 +159,7 @@ def img2vector(filename):
def handwritingClassTest():
    # 1. load the training data
    hwLabels = []
    trainingFileList = listdir('../../../testData/trainingDigits')  # load the training set
    trainingFileList = listdir('testData/trainingDigits')  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
@@ -161,17 +167,17 @@ def handwritingClassTest():
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('../../../testData/trainingDigits/%s' % fileNameStr)
        trainingMat[i, :] = img2vector('testData/trainingDigits/%s' % fileNameStr)

    # 2. load the test data
    testFileList = listdir('../../../testData/testDigits')  # iterate through the test set
    testFileList = listdir('testData/testDigits')  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('../../../testData/testDigits/%s' % fileNameStr)
        vectorUnderTest = img2vector('testData/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
        if (classifierResult != classNumStr): errorCount += 1.0
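img2vector is referenced above but its body is outside this diff. A minimal sketch of what such a function typically does for these 32x32 text digits (reading each character as one element of a 1x1024 row vector) might look like the following; the body is an assumption, only the name and usage come from the code above.

from numpy import zeros

def img2vector_sketch(filename):
    # Each digit file is 32 lines of 32 '0'/'1' characters; flatten it into 1x1024.
    returnVect = zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect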
@@ -31,6 +31,7 @@ def createDataSet():
    #               ['no'],
    #               ['no'],
    #               ['no']]
    # labels: 'no surfacing' (can it survive without coming to the surface?) and 'flippers' (does it have flippers?)
    labels = ['no surfacing', 'flippers']
    # change to discrete values
    return dataSet, labels
@@ -122,10 +123,9 @@ def chooseBestFeatureToSplit(dataSet):
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        # The larger the gain (information gain), the more information this split provides and the less uncertainty the feature leaves about the class.
        # In other words: the better the grouping on this column separates the classes, the lower the Shannon entropy after the split and the larger the information gain.
        # gain (information gain): the change in entropy from before the split to after it; we keep the feature with the largest gain.
        infoGain = baseEntropy - newEntropy
        # print 'infoGain=', infoGain, 'bestFeature=', i
        print 'infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
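As a concrete illustration of the gain comments above, here is a small self-contained sketch of Shannon entropy and the before/after-split comparison. shannon_ent is a simplified stand-in for calcShannonEnt, and the two groups stand in for splitDataSet's output; this is not the repository's exact implementation.

from math import log

def shannon_ent(labels):
    # H = -sum(p * log2(p)) over the label frequencies.
    total = float(len(labels))
    counts = {}
    for lab in labels:
        counts[lab] = counts.get(lab, 0) + 1
    return -sum((c / total) * log(c / total, 2) for c in counts.values())

# Labels before the split: 2 'yes', 3 'no'.
labels = ['yes', 'yes', 'no', 'no', 'no']
base_entropy = shannon_ent(labels)                   # ~0.971
# Suppose a feature splits them into two groups:
group_a, group_b = ['yes', 'yes', 'no'], ['no', 'no']
new_entropy = (3 / 5.0) * shannon_ent(group_a) + (2 / 5.0) * shannon_ent(group_b)  # ~0.551
info_gain = base_entropy - new_entropy               # ~0.420; a larger gain means a better feature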
@@ -133,7 +133,7 @@ def chooseBestFeatureToSplit(dataSet):


def majorityCnt(classList):
    """majorityCnt(choose the result that occurs most often)
    """majorityCnt(choose the result that occurs most often)

    Args:
        classList  the list of class labels (the label column)
@@ -1,6 +1,10 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from numpy import *

"""
p(xy) = p(x|y) p(y) = p(y|x) p(x)
p(x|y) = p(y|x) p(x) / p(y)
"""


def loadDataSet():
@@ -8,7 +12,7 @@ def loadDataSet():
    Create the data set.
    :return: the word lists postingList and the class vector classVec
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],  # [0,0,1,1,1......]
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
@@ -37,7 +41,7 @@ def setOfWords2Vec(vocabList, inputSet):
    :param inputSet: the input document (a list of words)
    :return: the match vector [0,1,0,1...]
    """
    returnVec = [0] * len(vocabList)
    returnVec = [0] * len(vocabList)  # [0,0......]
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
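A short usage sketch of the set-of-words model described above. The vocabulary helper is assumed here (named createVocabList_sketch to make that explicit); only setOfWords2Vec's behaviour comes from the hunk.

def createVocabList_sketch(dataSet):
    # Union of every word that appears in any document (assumed helper).
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# posting_list, class_vec = loadDataSet()
# vocab = createVocabList_sketch(posting_list)
# setOfWords2Vec(vocab, posting_list[0])  # -> [0, 1, 0, 1, ...] with a 1 at each present word's index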
@@ -49,8 +53,8 @@ def setOfWords2Vec(vocabList, inputSet):
def _trainNB0(trainMatrix, trainCategory):
    """
    Original (unsmoothed) training version.
    :param trainMatrix: the document-word matrix
    :param trainCategory: the class of each document
    :param trainMatrix: the document-word matrix [[1,0,1,1,1....],[],[]...]
    :param trainCategory: the class of each document [0,1,1,0....]
    :return:
    """
    # number of documents
@@ -60,21 +64,21 @@ def _trainNB0(trainMatrix, trainCategory):
    # probability that a document is abusive
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # build the word-count vectors
    p0Num = zeros(numWords)
    p1Num = zeros(numWords)
    p0Num = zeros(numWords)  # [0,0,0,.....]
    p1Num = zeros(numWords)  # [0,0,0,.....]

    # total number of words seen in each class
    p0Denom = 0.0
    p1Denom = 0.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Num += trainMatrix[i]  # [0,1,1,....] -> [0,1,1,...]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # class 1 (abusive documents): the list [P(F1|C1), P(F2|C1), P(F3|C1), P(F4|C1), P(F5|C1)....]
    p1Vect = p1Num / p1Denom
    p1Vect = p1Num / p1Denom  # [1,2,3,5]/90 -> [1/90,...]
    # class 0 (normal documents): the list [P(F1|C0), P(F2|C0), P(F3|C0), P(F4|C0), P(F5|C0)....]
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive
@@ -94,7 +98,7 @@ def trainNB0(trainMatrix, trainCategory):
    # probability that a document is abusive
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # build the word-count vectors
    p0Num = ones(numWords)
    p0Num = ones(numWords)  # [0,0......] -> [1,1,1,1,1.....]
    p1Num = ones(numWords)

    # total number of words seen in each class; the denominator starts at 2.0 and is adjusted from the sample / observed data
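The rest of this function is not in the hunk. A minimal sketch of the usual Laplace-smoothed ending (denominators start at 2.0 and the probabilities are taken as logs to avoid underflow) follows; it is an assumption based on the comments above, not necessarily the repository's exact code.

from numpy import ones, log

def trainNB0_sketch(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Start counts at 1 and denominators at 2 so unseen words never yield P = 0.
    p0Num, p1Num = ones(numWords), ones(numWords)
    p0Denom, p1Denom = 2.0, 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Log-probabilities: log(P(word | class)).
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive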
@@ -117,7 +121,7 @@ def trainNB0(trainMatrix, trainCategory):
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    Use the trained model to classify a document.
    :param vec2Classify: the data to classify
    :param vec2Classify: the data to classify, e.g. [0,1,1,1,1...]
    :param p0Vec: class 0 (normal documents): the list [log(P(F1|C0)), log(P(F2|C0)), log(P(F3|C0)), log(P(F4|C0)), log(P(F5|C0))....]
    :param p1Vec: class 1 (abusive documents): the list [log(P(F1|C1)), log(P(F2|C1)), log(P(F3|C1)), log(P(F4|C1)), log(P(F5|C1))....]
    :param pClass1: the probability that a document is class 1 (abusive)
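The body of classifyNB is not part of this hunk. Under the usual naive-Bayes formulation (sum the log-probabilities of the present words, add the log prior, and compare the two class scores), a minimal sketch would be:

from numpy import log

def classifyNB_sketch(vec2Classify, p0Vec, p1Vec, pClass1):
    # log P(class 1 | doc) is proportional to sum(log P(word|class 1) for present words) + log P(class 1)
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0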
@@ -1,3 +1,6 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Jun 14, 2011
FP-Growth FP means frequent pattern
@@ -22,15 +25,41 @@ class treeNode:
        self.children = {}

    def inc(self, numOccur):
        """inc(increase the count variable by the given amount)

        """
        self.count += numOccur

    def disp(self, ind=1):
        """disp(display the tree as text)

        """
        print ' '*ind, self.name, ' ', self.count
        for child in self.children.values():
            child.disp(ind+1)


def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat


def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        retDict[frozenset(trans)] = 1
    return retDict


def createTree(dataSet, minSup=1):  # create FP-tree from dataset but don't mine
    """

    """
    headerTable = {}
    # go over dataSet twice
    for trans in dataSet:  # first pass counts frequency of occurrence
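The hunk only shows the start of createTree; the first pass the comment refers to counts how often each item occurs and drops items below minSup. A small sketch of that counting step is given below as an illustration, not the repository's full tree-building code.

def count_frequent_items_sketch(dataSet, minSup=1):
    # dataSet is the dict produced by createInitSet: {frozenset(transaction): count}.
    headerTable = {}
    for trans in dataSet:
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    # Drop items that do not reach the minimum support.
    return {item: cnt for item, cnt in headerTable.items() if cnt >= minSup}

# freq = count_frequent_items_sketch(createInitSet(loadSimpDat()), minSup=3)
# -> {'z': 5, 'x': 4, 'y': 3, 's': 3, 'r': 3, 't': 3}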
@@ -76,6 +105,21 @@ def updateHeader(nodeToTest, targetNode): #this version does not use recursion
    nodeToTest.nodeLink = targetNode


if __name__ == "__main__":
    rootNode = treeNode('pyramid', 9, None)
    rootNode.children['eye'] = treeNode('eye', 13, None)
    rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
    # display the tree as text
    # print rootNode.disp()

    # load the sample data
    simpDat = loadSimpDat()
    print simpDat
    # repackage the sample data as frozensets; a dict stores each transaction and its count
    initSet = createInitSet(simpDat)
    print initSet


def ascendTree(leafNode, prefixPath):  # ascends from leaf node to root
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
@@ -111,21 +155,6 @@ def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)


def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat


def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        retDict[frozenset(trans)] = 1
    return retDict
import twitter