Mirror of https://github.com/apachecn/ailearning.git (synced 2026-02-11 14:26:04 +08:00)

Commit: Add tests and comments for the first two regression methods
This commit is contained in:
@@ -28,7 +28,7 @@ randMat = mat(randArray)
 # .T returns the transpose of the matrix (rows and columns swapped); .I below returns its inverse
 invRandMat = randMat.I
 # print the results
-print randArray, '\n---\n', randMat, '\n+++\n', invRandMat
+print(randArray, '\n---\n', randMat, '\n+++\n', invRandMat)
 # multiply the matrix by its inverse; in theory this gives the 4*4 identity matrix (1s on the diagonal, 0s elsewhere)
 myEye = randMat*invRandMat
 # residual error

219
src/python/02.kNN/kNN.py
Normal file
@@ -0,0 +1,219 @@
#!/usr/bin/env python
# encoding: utf-8
'''
Import the scientific computing package numpy and the operator module.
'''
from numpy import *
import operator
from os import listdir


def createDataSet():
    """
    Create a toy data set and its labels.

    Usage:
    import kNN
    group, labels = kNN.createDataSet()
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """
    e.g. inX = [1, 2, 3], dataSet = [[1, 2, 3], [1, 2, 0]]
    inX: the input vector to classify
    dataSet: the training sample set
    labels: the label vector
    k: the number of nearest neighbours to use
    Note: labels must have as many elements as dataSet has rows; the function uses the Euclidean distance.

    To predict the class of a point, call for example:
    kNN.classify0([0, 0], group, labels, 3)
    """
    # 1. distance computation
    dataSetSize = dataSet.shape[0]
    # tile builds a matrix with one copy of inX per training sample, then the training samples are subtracted from it
    """
    tile(inx, (rows, reps)): the first number (3) is how many rows to stack, the second (1 or 2) how many times inx is repeated in each row

    In [8]: tile(inx, (3, 1))
    Out[8]:
    array([[1, 2],
           [1, 2],
           [1, 2]])

    In [9]: tile(inx, (3, 2))
    Out[9]:
    array([[1, 2, 1, 2],
           [1, 2, 1, 2],
           [1, 2, 1, 2]])
    """
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    """
    Euclidean distance: the distance between two points.
    Row 1: distance from the input point to the first point of dataSet.
    Row 2: distance from the input point to the second point of dataSet.
    ...
    Row N: distance from the input point to the N-th point of dataSet.

    e.g. [[1,2,3],[1,2,3]] - [[1,2,3],[1,2,0]]
    (A1-A2)^2 + (B1-B2)^2 + (C1-C2)^2
    """
    # square the differences
    sqDiffMat = diffMat ** 2
    # sum each row of the matrix
    sqDistances = sqDiffMat.sum(axis=1)
    # take the square root
    distances = sqDistances ** 0.5
    # sort by distance
    sortedDistIndicies = distances.argsort()

    # 2. pick the k points with the smallest distances
    classCount = {}
    for i in range(k):
        # look up the label of this sample
        voteIlabel = labels[sortedDistIndicies[i]]
        # and add one to its count in the dictionary
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # 3. sort the counts and return the most frequent label
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def test1():
    """
    Demo for the first example.
    """
    group, labels = createDataSet()
    print str(group)
    print str(labels)
    print classify0([0.1, 0.1], group, labels, 3)
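

# A minimal usage sketch (an illustration added here, not part of the original file):
# with the toy data above, the three nearest neighbours of [0.1, 0.1] are the two 'B'
# points (distances of roughly 0.10 and 0.14) and one 'A' point, so the majority vote
# returned by classify0 is 'B'.
#
#   >>> group, labels = createDataSet()
#   >>> classify0([0.1, 0.1], group, labels, 3)
#   'B'
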
# ----------------------------------------------------------------------------------------
def file2matrix(filename):
    """
    Load the training data.
    :param filename: path of the data file
    :return: the data matrix returnMat and the corresponding class list classLabelVector
    """
    fr = open(filename)
    numberOfLines = len(fr.readlines())  # get the number of lines in the file
    # create an empty matrix of the right size
    returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    fr = open(filename)
    index = 0
    for line in fr.readlines():
        line = line.strip()
        listFromLine = line.split('\t')
        # the feature columns of this row
        returnMat[index, :] = listFromLine[0:3]
        # the class column of this row
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    # return the data matrix returnMat and the class list classLabelVector
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """
    Normalise the feature values to remove the effect of features living on very different scales.
    :param dataSet: the data set
    :return: the normalised data set normDataSet, plus ranges and minVals (the per-feature range and minimum, which are returned but not used further here)

    Normalisation formula:
    Y = (X - Xmin) / (Xmax - Xmin)
    """
    # compute the minimum, maximum and range of every feature
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    # the range (max - min)
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    # matrix of differences from the per-feature minimum
    normDataSet = dataSet - tile(minVals, (m, 1))
    # divide those differences by the per-feature range
    normDataSet = normDataSet / tile(ranges, (m, 1))  # element wise divide
    return normDataSet, ranges, minVals
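

# A small worked example of the min-max normalisation above (an illustration with made-up
# values, not from the original file): for a feature whose column minimum is 0 and maximum
# is 10, a raw value of 2.5 maps to (2.5 - 0) / (10 - 0) = 0.25, so every feature ends up
# in the range [0, 1] regardless of its original scale.
#
#   >>> autoNorm(array([[0.0, 100.0], [10.0, 300.0], [5.0, 200.0]]))[0]
#   array([[ 0. ,  0. ],
#          [ 1. ,  1. ],
#          [ 0.5,  0.5]])

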
def datingClassTest():
    """
    Test the classifier on the dating-site data.
    :return: the number of errors
    """
    # fraction of the data held out for testing (so the training fraction is 1 - hoRatio)
    hoRatio = 0.1  # part of the data is used for testing, the rest as training samples
    # load the data from file
    datingDataMat, datingLabels = file2matrix('testData/datingTestSet2.txt')  # load data set from file
    # normalise the data
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # number of test samples; the remaining rows numTestVecs:m are used as training samples
    numTestVecs = int(m * hoRatio)
    print 'numTestVecs=', numTestVecs
    errorCount = 0.0
    for i in range(numTestVecs):
        # classify one held-out sample against the training part of the data
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
        if (classifierResult != datingLabels[i]): errorCount += 1.0
    print "the total error rate is: %f" % (errorCount / float(numTestVecs))
    print errorCount


def img2vector(filename):
    """
    Convert an image file to a vector.
    :param filename: the image file
    :return: a 1 x 1024 vector
    """
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def handwritingClassTest():
    # 1. load the training data
    hwLabels = []
    trainingFileList = listdir('testData/trainingDigits')  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    # hwLabels stores the digit (0~9) of each index position, trainingMat the image vector stored at that position
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('testData/trainingDigits/%s' % fileNameStr)

    # 2. load the test data
    testFileList = listdir('testData/testDigits')  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testData/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
        if (classifierResult != classNumStr): errorCount += 1.0
    print "\nthe total number of errors is: %d" % errorCount
    print "\nthe total error rate is: %f" % (errorCount / float(mTest))


if __name__ == '__main__':
    # test1()
    # datingClassTest()
    handwritingClassTest()

@@ -51,7 +51,7 @@ def predict_train(x_train, y_train):
     return y_pre, clf


-def show_precision_recall(x, clf, y_train, y_pre):
+def show_precision_recall(x, y, clf, y_train, y_pre):
     '''
     Precision and recall.
     Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html#sklearn.metrics.precision_recall_curve
@@ -110,7 +110,7 @@ if __name__ == '__main__':
     y_pre, clf = predict_train(x_train, y_train)

     # show precision and recall
-    show_precision_recall(x, clf, y_train, y_pre)
+    show_precision_recall(x, y, clf, y_train, y_pre)

     # visualise the output
     show_pdf(clf)

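For reference, the scikit-learn API that the docstring above links to is used roughly as in the sketch below (an illustration only; the toy labels and scores are placeholders, not data from this repository):

    from sklearn.metrics import precision_recall_curve

    # toy ground-truth labels and classifier scores
    y_true = [0, 0, 1, 1]
    y_score = [0.1, 0.4, 0.35, 0.8]
    # precision[i] and recall[i] are obtained by thresholding the scores at thresholds[i]
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    print(precision, recall, thresholds)
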
@@ -5,11 +5,12 @@
 Created on Oct 12, 2010
 Update on 2017-02-27
 Decision Tree Source Code for Machine Learning in Action Ch. 3
-@author: Peter Harrington/jiangzhonglian
+@author: Peter Harrington/片刻
 '''
-from math import log
+print(__doc__)
 import operator
-import DecisionTreePlot as dtPlot
+from math import log
+import decisionTreePlot as dtPlot


 def createDataSet():
@@ -19,8 +20,6 @@ def createDataSet():
     no arguments required
     Returns:
         the data set and the corresponding label list
-    Raises:
-
     """
     dataSet = [[1, 1, 'yes'],
                [1, 1, 'yes'],
@@ -32,6 +31,7 @@ def createDataSet():
     #            ['no'],
     #            ['no'],
     #            ['no']]
+    # the labels mean: "surfaces above water" and "has flippers"
     labels = ['no surfacing', 'flippers']
     # change to discrete values
     return dataSet, labels
@@ -43,9 +43,7 @@ def calcShannonEnt(dataSet):
     Args:
         dataSet  the data set
     Returns:
-        the computed Shannon entropy
-    Raises:
-
+        the expected information (Shannon entropy) of the class labels in the data set
     """
     # length of the list, i.e. how many samples take part in the training
     numEntries = len(dataSet)
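
As a quick illustration of what calcShannonEnt computes (a sketch added for clarity, not part of this diff): for a data set whose class column contains two classes with probabilities p and 1-p, the Shannon entropy is H = -p*log2(p) - (1-p)*log2(1-p); for example, 2 'yes' and 3 'no' samples give H of about 0.971 bits.

    from collections import Counter
    from math import log

    def shannon_entropy(labels):
        # H = -sum(p_i * log2(p_i)) over the distinct class labels
        n = float(len(labels))
        return -sum((c / n) * log(c / n, 2) for c in Counter(labels).values())

    print shannon_entropy(['yes', 'yes', 'no', 'no', 'no'])  # -> about 0.971
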
@@ -80,8 +78,6 @@ def splitDataSet(dataSet, axis, value):
         value  the value to keep in column axis
     Returns:
         the subset of rows whose axis column equals value (that subset has the axis column removed)
-    Raises:
-
     """
     retDataSet = []
     for featVec in dataSet:
@@ -105,10 +101,8 @@ def chooseBestFeatureToSplit(dataSet):
         dataSet  the data set
     Returns:
         bestFeature  the best feature column to split on
-    Raises:
-
     """
-    # count how many feature columns the first row has
+    # count how many feature columns the first row has; the last column is the label column
    numFeatures = len(dataSet[0]) - 1
    # the information entropy of the labels
    baseEntropy = calcShannonEnt(dataSet)
@@ -129,8 +123,9 @@ def chooseBestFeatureToSplit(dataSet):
             subDataSet = splitDataSet(dataSet, i, value)
             prob = len(subDataSet)/float(len(dataSet))
             newEntropy += prob * calcShannonEnt(subDataSet)
-        # the larger the gain (information gain), the more information this split provides and the less uncertain the classification becomes
+        # gain (information gain): the change in information before and after the split; keep the split with the largest gain
         infoGain = baseEntropy - newEntropy
+        print 'infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy
         if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
@@ -138,14 +133,12 @@ def chooseBestFeatureToSplit(dataSet):


 def majorityCnt(classList):
-    """majorityCnt(选择出线次数最多的一个结果)
+    """majorityCnt(选择出现次数最多的一个结果)

     Args:
         classList  the list of class labels
     Returns:
         bestFeature  the most frequent class label
-    Raises:
-
     """
     classCount = {}
     for vote in classList:
@@ -169,6 +162,7 @@ def createTree(dataSet, labels):

     # choose the best column to split on, and the meaning of the label that corresponds to it
     bestFeat = chooseBestFeatureToSplit(dataSet)
+    # get the name of that label
     bestFeatLabel = labels[bestFeat]
     # initialise myTree
     myTree = {bestFeatLabel: {}}
@@ -187,16 +181,26 @@ def createTree(dataSet, labels):


 def classify(inputTree, featLabels, testVec):
-    # get the key of the first node of the tree
+    """classify(classify an input vector with the decision tree)
+
+    Args:
+        inputTree   the decision tree model
+        featLabels  the names of the feature labels
+        testVec     the input data to classify
+    Returns:
+        classLabel  the classification result; map it back to the labels to get its name
+    """
+    # get the key of the root node of the tree
     firstStr = inputTree.keys()[0]
-    # get the value stored under the first node
+    # use that key to get the value stored under the root node
     secondDict = inputTree[firstStr]
-    # find the index of the root node, then use testVec to pick the matching branch of the tree
+    # find the position of the root node's feature among the labels, so we know which entry of testVec to compare against the tree
    featIndex = featLabels.index(firstStr)
+    # the value of that feature in the test vector tells us which branch to follow
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    print '+++', firstStr, 'xxx', secondDict, '---', key, '>>>', valueOfFeat
-    # check whether this branch has ended
+    # check whether this branch has ended: is valueOfFeat still a dict?
     if isinstance(valueOfFeat, dict):
         classLabel = classify(valueOfFeat, featLabels, testVec)
     else:
@@ -128,5 +128,5 @@ def retrieveTree(i):
     return listOfTrees[i]


 myTree = retrieveTree(0)
 createPlot(myTree)
 # myTree = retrieveTree(1)
 # createPlot(myTree)

181
src/python/04.NaiveBayes/bayes.py
Executable file
@@ -0,0 +1,181 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
from numpy import *
|
||||
"""
|
||||
p(xy)=p(x|y)p(y)=p(y|x)p(x)
|
||||
p(x|y)=p(y|x)p(x)/p(y)
|
||||
"""
|
||||
|
||||
|
||||
def loadDataSet():
|
||||
"""
|
||||
创建数据集
|
||||
:return: 单词列表postingList, 所属类别classVec
|
||||
"""
|
||||
postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], #[0,0,1,1,1......]
|
||||
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
|
||||
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
|
||||
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
|
||||
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
|
||||
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
|
||||
classVec = [0, 1, 0, 1, 0, 1] # 1 is abusive, 0 not
|
||||
return postingList, classVec
|
||||
|
||||
|
||||
def createVocabList(dataSet):
|
||||
"""
|
||||
获取所有单词的集合
|
||||
:param dataSet: 数据集
|
||||
:return: 所有单词的集合(即不含重复元素的单词列表)
|
||||
"""
|
||||
vocabSet = set([]) # create empty set
|
||||
for document in dataSet:
|
||||
vocabSet = vocabSet | set(document) # union of the two sets
|
||||
return list(vocabSet)
|
||||
|
||||
|
||||
def setOfWords2Vec(vocabList, inputSet):
|
||||
"""
|
||||
遍历查看该单词属否出现,出现该单词则将该单词置1
|
||||
:param vocabList: 所有单词集合列表
|
||||
:param inputSet: 输入数据集
|
||||
:return: 匹配列表[0,1,0,1...]
|
||||
"""
|
||||
returnVec = [0] * len(vocabList)# [0,0......]
|
||||
for word in inputSet:
|
||||
if word in vocabList:
|
||||
returnVec[vocabList.index(word)] = 1
|
||||
else:
|
||||
print "the word: %s is not in my Vocabulary!" % word
|
||||
return returnVec
|
||||
|
||||
|
||||
def _trainNB0(trainMatrix, trainCategory):
|
||||
"""
|
||||
训练数据原版
|
||||
:param trainMatrix: 文件单词矩阵 [[1,0,1,1,1....],[],[]...]
|
||||
:param trainCategory: 文件对应的类别[0,1,1,0....]
|
||||
:return:
|
||||
"""
|
||||
# 文件数
|
||||
numTrainDocs = len(trainMatrix)
|
||||
# 单词数
|
||||
numWords = len(trainMatrix[0])
|
||||
# 侮辱性文件的出现概率
|
||||
pAbusive = sum(trainCategory) / float(numTrainDocs)
|
||||
# 构造单词出现次数列表
|
||||
p0Num = zeros(numWords) # [0,0,0,.....]
|
||||
p1Num = zeros(numWords) # [0,0,0,.....]
|
||||
|
||||
# 整个数据集单词出现总数
|
||||
p0Denom = 0.0
|
||||
p1Denom = 0.0
|
||||
for i in range(numTrainDocs):
|
||||
if trainCategory[i] == 1:
|
||||
p1Num += trainMatrix[i] #[0,1,1,....]->[0,1,1,...]
|
||||
p1Denom += sum(trainMatrix[i])
|
||||
else:
|
||||
p0Num += trainMatrix[i]
|
||||
p0Denom += sum(trainMatrix[i])
|
||||
# 类别1,即侮辱性文档的[P(F1|C1),P(F2|C1),P(F3|C1),P(F4|C1),P(F5|C1)....]列表
|
||||
p1Vect = p1Num / p1Denom# [1,2,3,5]/90->[1/90,...]
|
||||
# 类别0,即正常文档的[P(F1|C0),P(F2|C0),P(F3|C0),P(F4|C0),P(F5|C0)....]列表
|
||||
p0Vect = p0Num / p0Denom
|
||||
return p0Vect, p1Vect, pAbusive
|
||||
|
||||
|
||||
def trainNB0(trainMatrix, trainCategory):
|
||||
"""
|
||||
训练数据优化版本
|
||||
:param trainMatrix: 文件单词矩阵
|
||||
:param trainCategory: 文件对应的类别
|
||||
:return:
|
||||
"""
|
||||
# 总文件数
|
||||
numTrainDocs = len(trainMatrix)
|
||||
# 总单词数
|
||||
numWords = len(trainMatrix[0])
|
||||
# 侮辱性文件的出现概率
|
||||
pAbusive = sum(trainCategory) / float(numTrainDocs)
|
||||
# 构造单词出现次数列表
|
||||
# p0Num 正常的统计
|
||||
# p1Num 侮辱的统计
|
||||
p0Num = ones(numWords)#[0,0......]->[1,1,1,1,1.....]
|
||||
p1Num = ones(numWords)
|
||||
|
||||
# 整个数据集单词出现总数,2.0根据样本/实际调查结果调整分母的值(2主要是避免分母为0,当然值可以调整)
|
||||
# p0Denom 正常的统计
|
||||
# p1Denom 侮辱的统计
|
||||
p0Denom = 2.0
|
||||
p1Denom = 2.0
|
||||
for i in range(numTrainDocs):
|
||||
if trainCategory[i] == 1:
|
||||
# 累加辱骂词的频次
|
||||
p1Num += trainMatrix[i]
|
||||
# 对每篇文章的辱骂的频次 进行统计汇总
|
||||
p1Denom += sum(trainMatrix[i])
|
||||
else:
|
||||
p0Num += trainMatrix[i]
|
||||
p0Denom += sum(trainMatrix[i])
|
||||
# 类别1,即侮辱性文档的[log(P(F1|C1)),log(P(F2|C1)),log(P(F3|C1)),log(P(F4|C1)),log(P(F5|C1))....]列表
|
||||
p1Vect = log(p1Num / p1Denom)
|
||||
# 类别0,即正常文档的[log(P(F1|C0)),log(P(F2|C0)),log(P(F3|C0)),log(P(F4|C0)),log(P(F5|C0))....]列表
|
||||
p0Vect = log(p0Num / p0Denom)
|
||||
return p0Vect, p1Vect, pAbusive
|
||||
|
||||
|
||||
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
|
||||
"""
|
||||
使用算法:
|
||||
# 将乘法转坏为加法
|
||||
乘法:P(C|F1F2...Fn) = P(F1F2...Fn|C)P(C)/P(F1F2...Fn)
|
||||
加法:P(F1|C)*P(F2|C)....P(Fn|C)P(C) -> log(P(F1|C))+log(P(F2|C))+....+log(P(Fn|C))+log(P(C))
|
||||
:param vec2Classify: 待测数据[0,1,1,1,1...]
|
||||
:param p0Vec: 类别1,即侮辱性文档的[log(P(F1|C1)),log(P(F2|C1)),log(P(F3|C1)),log(P(F4|C1)),log(P(F5|C1))....]列表
|
||||
:param p1Vec: 类别0,即正常文档的[log(P(F1|C0)),log(P(F2|C0)),log(P(F3|C0)),log(P(F4|C0)),log(P(F5|C0))....]列表
|
||||
:param pClass1: 类别1,侮辱性文件的出现概率
|
||||
:return: 类别1 or 0
|
||||
"""
|
||||
# 计算公式 log(P(F1|C))+log(P(F2|C))+....+log(P(Fn|C))+log(P(C))
|
||||
p1 = sum(vec2Classify * p1Vec) + log(pClass1)
|
||||
p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
|
||||
if p1 > p0:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def bagOfWords2VecMN(vocabList, inputSet):
|
||||
returnVec = [0] * len(vocabList)
|
||||
for word in inputSet:
|
||||
if word in vocabList:
|
||||
returnVec[vocabList.index(word)] += 1
|
||||
return returnVec
|
||||
|
||||
|
||||
def testingNB():
|
||||
"""
|
||||
测试朴素贝叶斯算法
|
||||
"""
|
||||
# 1. 加载数据集
|
||||
listOPosts, listClasses = loadDataSet()
|
||||
# 2. 创建单词集合
|
||||
myVocabList = createVocabList(listOPosts)
|
||||
# 3. 计算单词是否出现并创建数据矩阵
|
||||
trainMat = []
|
||||
for postinDoc in listOPosts:
|
||||
# 返回m*len(myVocabList)的矩阵, 记录的都是0,1信息
|
||||
trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
|
||||
# 4. 训练数据
|
||||
p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
|
||||
# 5. 测试数据
|
||||
testEntry = ['love', 'my', 'dalmation']
|
||||
thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
|
||||
print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
|
||||
testEntry = ['stupid', 'garbage']
|
||||
thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
|
||||
print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
testingNB()
|
||||
103
src/python/05.Logistic/core/logRegression01.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
from numpy import *
|
||||
import matplotlib.pyplot as plt
|
||||
import time
|
||||
'''
|
||||
1、需要安装模块:pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
|
||||
由于直接安装会出现问题,所以建议下载whl包进行安装,下载网址:
|
||||
https://pypi.python.org/pypi/matplotlib/1.5.0
|
||||
|
||||
2、可以看见画出的图像
|
||||
'''
|
||||
|
||||
"""
|
||||
@version:
|
||||
@author: yangjf
|
||||
@license: ApacheCN
|
||||
@contact: highfei2011@126.com
|
||||
@site: https://github.com/apachecn/MachineLearning
|
||||
@software: PyCharm
|
||||
@file: logRegression01.py
|
||||
@time: 2017/3/3 22:03
|
||||
@test result: ok
|
||||
"""
|
||||
|
||||
# sigmoid函数
|
||||
def sigmoid(inX):
|
||||
return 1.0 / (1 + exp(-inX))
|
||||
|
||||
def trainLogRegres(train_x, train_y, opts):
|
||||
# 计算训练时间
|
||||
startTime = time.time()
|
||||
|
||||
numSamples, numFeatures = shape(train_x)
|
||||
alpha = opts['alpha']; maxIter = opts['maxIter']
|
||||
weights = ones((numFeatures, 1))
|
||||
|
||||
# 通过梯度下降算法优化
|
||||
for k in range(maxIter):
|
||||
if opts['optimizeType'] == 'gradDescent': # 梯度下降算法
|
||||
output = sigmoid(train_x * weights)
|
||||
error = train_y - output
|
||||
weights = weights + alpha * train_x.transpose() * error
|
||||
elif opts['optimizeType'] == 'stocGradDescent': # 随机梯度下降
|
||||
for i in range(numSamples):
|
||||
output = sigmoid(train_x[i, :] * weights)
|
||||
error = train_y[i, 0] - output
|
||||
weights = weights + alpha * train_x[i, :].transpose() * error
|
||||
elif opts['optimizeType'] == 'smoothStocGradDescent': # 光滑随机梯度下降
|
||||
# 随机选择样本以优化以减少周期波动
|
||||
dataIndex = range(numSamples)
|
||||
for i in range(numSamples):
|
||||
alpha = 4.0 / (1.0 + k + i) + 0.01
|
||||
randIndex = int(random.uniform(0, len(dataIndex)))
|
||||
output = sigmoid(train_x[randIndex, :] * weights)
|
||||
error = train_y[randIndex, 0] - output
|
||||
weights = weights + alpha * train_x[randIndex, :].transpose() * error
|
||||
del(dataIndex[randIndex]) # 在一次交互期间,删除优化的样品
|
||||
else:
|
||||
raise NameError('Not support optimize method type!')
|
||||
|
||||
|
||||
print 'Congratulations, training complete! Took %fs!' % (time.time() - startTime)
|
||||
return weights
|
||||
|
||||
|
||||
#测试给定测试集的训练Logistic回归模型
|
||||
def testLogRegres(weights, test_x, test_y):
|
||||
numSamples, numFeatures = shape(test_x)
|
||||
matchCount = 0
|
||||
for i in xrange(numSamples):
|
||||
predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
|
||||
if predict == bool(test_y[i, 0]):
|
||||
matchCount += 1
|
||||
accuracy = float(matchCount) / numSamples
|
||||
return accuracy
|
||||
|
||||
|
||||
# 显示你的训练逻辑回归模型只有2-D数据可用
|
||||
def showLogRegres(weights, train_x, train_y):
|
||||
# 注意:train_x和train_y是垫数据类型
|
||||
numSamples, numFeatures = shape(train_x)
|
||||
if numFeatures != 3:
|
||||
print "抱歉! 我不能绘制,因为你的数据的维度不是2!"
|
||||
return 1
|
||||
|
||||
# 画出所有抽样数据
|
||||
for i in xrange(numSamples):
|
||||
if int(train_y[i, 0]) == 0:
|
||||
plt.plot(train_x[i, 1], train_x[i, 2], 'or')
|
||||
elif int(train_y[i, 0]) == 1:
|
||||
plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
|
||||
|
||||
# 画图操作
|
||||
min_x = min(train_x[:, 1])[0, 0]
|
||||
max_x = max(train_x[:, 1])[0, 0]
|
||||
weights = weights.getA() # 将mat转换为数组
|
||||
y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
|
||||
y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
|
||||
plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
|
||||
plt.xlabel('X1'); plt.ylabel('X2')
|
||||
#显示图像
|
||||
plt.show()
|
||||
54
src/python/05.Logistic/core/test_logRegression.py
Normal file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
import os
|
||||
import sys
|
||||
sys.path.append("C:\Python27")
|
||||
from numpy import *
|
||||
|
||||
from logRegression01 import *
|
||||
"""
|
||||
@version:
|
||||
@author: yangjf
|
||||
@license: ApacheCN
|
||||
@contact: highfei2011@126.com
|
||||
@site: https://github.com/apachecn/MachineLearning
|
||||
@software: PyCharm
|
||||
@file: test_logRegression.py
|
||||
@time: 2017/3/3 22:09
|
||||
@test result: ok
|
||||
"""
|
||||
|
||||
def loadData():
|
||||
train_x = []
|
||||
train_y = []
|
||||
# 获取当前文件所在路径
|
||||
project_dir = os.getcwdu()
|
||||
# 截取字符串至项目名:Test\
|
||||
project_dir = project_dir[:project_dir.find("MachineLearning\\") + 15]
|
||||
print project_dir
|
||||
fileIn = open("%s/testData/Logistic_testdata.txt" % project_dir)
|
||||
for line in fileIn.readlines():
|
||||
lineArr = line.strip().split()
|
||||
train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
|
||||
train_y.append(float(lineArr[2]))
|
||||
return mat(train_x), mat(train_y).transpose()
|
||||
|
||||
|
||||
##第一步: 加载数据
|
||||
print "step 1: load data..."
|
||||
train_x, train_y = loadData()
|
||||
test_x = train_x; test_y = train_y
|
||||
|
||||
##第二步: 训练数据...
|
||||
print "step 2: training..."
|
||||
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
|
||||
optimalWeights = trainLogRegres(train_x, train_y, opts)
|
||||
|
||||
##第三步: 测试
|
||||
print "step 3: testing..."
|
||||
accuracy = testLogRegres(optimalWeights, test_x, test_y)
|
||||
|
||||
##第四步: 显示结果
|
||||
print "step 4: show the result..."
|
||||
print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
|
||||
showLogRegres(optimalWeights, train_x, train_y)
|
||||
472
src/python/06.SVM/svmMLiA.py
Normal file
@@ -0,0 +1,472 @@
|
||||
"""
|
||||
Created on Nov 4, 2010
|
||||
Update on 2017-03-21
|
||||
Chapter 5 source file for Machine Learing in Action
|
||||
@author: Peter/geekidentity
|
||||
"""
|
||||
from numpy import *
|
||||
from time import sleep
|
||||
|
||||
def loadDataSet(fileName):
|
||||
"""
|
||||
对文件进行逐行解析,从而得到第行的类标签和整个数据矩阵
|
||||
Args:
|
||||
fileName: testSet.txt
|
||||
|
||||
Returns:
|
||||
数据矩阵, 类标签
|
||||
"""
|
||||
dataMat = []; labelMat = []
|
||||
fr = open(fileName)
|
||||
for line in fr.readlines():
|
||||
lineArr = line.strip().split('\t')
|
||||
dataMat.append([float(lineArr[0]), float(lineArr[1])])
|
||||
labelMat.append(float(lineArr[2]))
|
||||
return dataMat,labelMat
|
||||
|
||||
def selectJrand(i,m):
|
||||
"""
|
||||
随机选择一个整数
|
||||
Args:
|
||||
i: 第一个alpha的下标
|
||||
m: 所有alpha的数目
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
j=i #we want to select any J not equal to i
|
||||
while (j==i):
|
||||
j = int(random.uniform(0,m))
|
||||
return j
|
||||
|
||||
def clipAlpha(aj,H,L):
|
||||
"""
|
||||
用于调整大于H或小于L的alpha值
|
||||
Args:
|
||||
aj:
|
||||
H:
|
||||
L:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if aj > H:
|
||||
aj = H
|
||||
if L > aj:
|
||||
aj = L
|
||||
return aj
|
||||
|
||||
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
|
||||
"""
|
||||
SVM SMO算法的简单实现:
|
||||
创建一个alpha向量并将其初始化为0向量
|
||||
当迭代次数据小于最大迭代次数时(外循环)
|
||||
对数据集中的每个数据向量(内循环):
|
||||
如果该数据向量可以被优化:
|
||||
随机选择另外一个数据向量
|
||||
同时优化这两个向量
|
||||
如果两个向量都不能被优化,退出内循环
|
||||
如果所有向量都没有被优化,增加迭代数目,继续下一次循环
|
||||
Args:
|
||||
dataMatIn: 数据集
|
||||
classLabels: 类别标签
|
||||
C: 常数C
|
||||
toler: 容错率
|
||||
maxIter: 退出前最大的循环次数
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
dataMatrix = mat(dataMatIn); labelMat = mat(classLabels).transpose()
|
||||
b = 0; m,n = shape(dataMatrix)
|
||||
alphas = mat(zeros((m,1)))
|
||||
iter = 0
|
||||
while (iter < maxIter):
|
||||
alphaPairsChanged = 0
|
||||
for i in range(m):
|
||||
fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b
|
||||
Ei = fXi - float(labelMat[i])#if checks if an example violates KKT conditions
|
||||
if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):
|
||||
j = selectJrand(i,m)
|
||||
fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b
|
||||
Ej = fXj - float(labelMat[j])
|
||||
alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy()
|
||||
if (labelMat[i] != labelMat[j]):
|
||||
L = max(0, alphas[j] - alphas[i])
|
||||
H = min(C, C + alphas[j] - alphas[i])
|
||||
else:
|
||||
L = max(0, alphas[j] + alphas[i] - C)
|
||||
H = min(C, alphas[j] + alphas[i])
|
||||
if L==H: print("L==H"); continue
|
||||
eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T
|
||||
if eta >= 0: print("eta>=0"); continue
|
||||
alphas[j] -= labelMat[j]*(Ei - Ej)/eta
|
||||
alphas[j] = clipAlpha(alphas[j],H,L)
|
||||
if (abs(alphas[j] - alphaJold) < 0.00001): print("j not moving enough"); continue
|
||||
alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])#update i by the same amount as j
|
||||
#the update is in the oppostie direction
|
||||
b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
|
||||
b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
|
||||
if (0 < alphas[i]) and (C > alphas[i]): b = b1
|
||||
elif (0 < alphas[j]) and (C > alphas[j]): b = b2
|
||||
else: b = (b1 + b2)/2.0
|
||||
alphaPairsChanged += 1
|
||||
print("iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged))
|
||||
if (alphaPairsChanged == 0): iter += 1
|
||||
else: iter = 0
|
||||
print("iteration number: %d" % iter)
|
||||
return b,alphas
|
||||
|
||||
|
||||
def kernelTrans(X, A, kTup): # calc the kernel or transform data to a higher dimensional space
|
||||
m, n = shape(X)
|
||||
K = mat(zeros((m, 1)))
|
||||
if kTup[0] == 'lin':
|
||||
K = X * A.T # linear kernel
|
||||
elif kTup[0] == 'rbf':
|
||||
for j in range(m):
|
||||
deltaRow = X[j, :] - A
|
||||
K[j] = deltaRow * deltaRow.T
|
||||
K = exp(K / (-1 * kTup[1] ** 2)) # divide in NumPy is element-wise not matrix like Matlab
|
||||
else:
|
||||
raise NameError('Houston We Have a Problem -- \
|
||||
That Kernel is not recognized')
|
||||
return K
|
||||
|
||||
|
||||
class optStruct:
|
||||
def __init__(self, dataMatIn, classLabels, C, toler, kTup): # Initialize the structure with the parameters
|
||||
self.X = dataMatIn
|
||||
self.labelMat = classLabels
|
||||
self.C = C
|
||||
self.tol = toler
|
||||
self.m = shape(dataMatIn)[0]
|
||||
self.alphas = mat(zeros((self.m, 1)))
|
||||
self.b = 0
|
||||
self.eCache = mat(zeros((self.m, 2))) # first column is valid flag
|
||||
self.K = mat(zeros((self.m, self.m)))
|
||||
for i in range(self.m):
|
||||
self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)
|
||||
|
||||
|
||||
def calcEk(oS, k):
|
||||
fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
|
||||
Ek = fXk - float(oS.labelMat[k])
|
||||
return Ek
|
||||
|
||||
|
||||
def selectJ(i, oS, Ei): # this is the second choice -heurstic, and calcs Ej
|
||||
maxK = -1
|
||||
maxDeltaE = 0
|
||||
Ej = 0
|
||||
oS.eCache[i] = [1, Ei] # set valid #choose the alpha that gives the maximum delta E
|
||||
validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
|
||||
if (len(validEcacheList)) > 1:
|
||||
for k in validEcacheList: # loop through valid Ecache values and find the one that maximizes delta E
|
||||
if k == i: continue # don't calc for i, waste of time
|
||||
Ek = calcEk(oS, k)
|
||||
deltaE = abs(Ei - Ek)
|
||||
if (deltaE > maxDeltaE):
|
||||
maxK = k;
|
||||
maxDeltaE = deltaE;
|
||||
Ej = Ek
|
||||
return maxK, Ej
|
||||
else: # in this case (first time around) we don't have any valid eCache values
|
||||
j = selectJrand(i, oS.m)
|
||||
Ej = calcEk(oS, j)
|
||||
return j, Ej
|
||||
|
||||
|
||||
def updateEk(oS, k): # after any alpha has changed update the new value in the cache
|
||||
Ek = calcEk(oS, k)
|
||||
oS.eCache[k] = [1, Ek]
|
||||
|
||||
|
||||
def innerL(i, oS):
|
||||
Ei = calcEk(oS, i)
|
||||
if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or (
|
||||
(oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
|
||||
j, Ej = selectJ(i, oS, Ei) # this has been changed from selectJrand
|
||||
alphaIold = oS.alphas[i].copy();
|
||||
alphaJold = oS.alphas[j].copy();
|
||||
if (oS.labelMat[i] != oS.labelMat[j]):
|
||||
L = max(0, oS.alphas[j] - oS.alphas[i])
|
||||
H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
|
||||
else:
|
||||
L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
|
||||
H = min(oS.C, oS.alphas[j] + oS.alphas[i])
|
||||
        if L == H: print("L==H"); return 0
        eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]  # changed for kernel
        if eta >= 0: print("eta>=0"); return 0
        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEk(oS, j)  # added this for the Ecache
        if (abs(oS.alphas[j] - alphaJold) < 0.00001): print("j not moving enough"); return 0
|
||||
oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j]) # update i by the same amount as j
|
||||
updateEk(oS, i) # added this for the Ecache #the update is in the oppostie direction
|
||||
b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] - oS.labelMat[j] * (
|
||||
oS.alphas[j] - alphaJold) * oS.K[i, j]
|
||||
b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, j] - oS.labelMat[j] * (
|
||||
oS.alphas[j] - alphaJold) * oS.K[j, j]
|
||||
if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
|
||||
oS.b = b1
|
||||
elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
|
||||
oS.b = b2
|
||||
else:
|
||||
oS.b = (b1 + b2) / 2.0
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)): # full Platt SMO
|
||||
oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
|
||||
iter = 0
|
||||
entireSet = True;
|
||||
alphaPairsChanged = 0
|
||||
while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
|
||||
alphaPairsChanged = 0
|
||||
if entireSet: # go over all
|
||||
for i in range(oS.m):
|
||||
alphaPairsChanged += innerL(i, oS)
|
||||
print
|
||||
"fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged)
|
||||
iter += 1
|
||||
else: # go over non-bound (railed) alphas
|
||||
nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
|
||||
for i in nonBoundIs:
|
||||
alphaPairsChanged += innerL(i, oS)
|
||||
print
|
||||
"non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged)
|
||||
iter += 1
|
||||
if entireSet:
|
||||
entireSet = False # toggle entire set loop
|
||||
elif (alphaPairsChanged == 0):
|
||||
entireSet = True
|
||||
print
|
||||
"iteration number: %d" % iter
|
||||
return oS.b, oS.alphas
|
||||
|
||||
|
||||
def calcWs(alphas, dataArr, classLabels):
|
||||
X = mat(dataArr);
|
||||
labelMat = mat(classLabels).transpose()
|
||||
m, n = shape(X)
|
||||
w = zeros((n, 1))
|
||||
for i in range(m):
|
||||
w += multiply(alphas[i] * labelMat[i], X[i, :].T)
|
||||
return w
|
||||
|
||||
|
||||
def testRbf(k1=1.3):
|
||||
dataArr, labelArr = loadDataSet('testSetRBF.txt')
|
||||
b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1)) # C=200 important
|
||||
datMat = mat(dataArr);
|
||||
labelMat = mat(labelArr).transpose()
|
||||
svInd = nonzero(alphas.A > 0)[0]
|
||||
sVs = datMat[svInd] # get matrix of only support vectors
|
||||
labelSV = labelMat[svInd];
|
||||
print
|
||||
"there are %d Support Vectors" % shape(sVs)[0]
|
||||
m, n = shape(datMat)
|
||||
errorCount = 0
|
||||
for i in range(m):
|
||||
kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
|
||||
predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
|
||||
if sign(predict) != sign(labelArr[i]): errorCount += 1
|
||||
print
|
||||
"the training error rate is: %f" % (float(errorCount) / m)
|
||||
dataArr, labelArr = loadDataSet('testSetRBF2.txt')
|
||||
errorCount = 0
|
||||
datMat = mat(dataArr);
|
||||
labelMat = mat(labelArr).transpose()
|
||||
m, n = shape(datMat)
|
||||
for i in range(m):
|
||||
kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
|
||||
predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
|
||||
if sign(predict) != sign(labelArr[i]): errorCount += 1
|
||||
print
|
||||
"the test error rate is: %f" % (float(errorCount) / m)
|
||||
|
||||
|
||||
def img2vector(filename):
|
||||
returnVect = zeros((1, 1024))
|
||||
fr = open(filename)
|
||||
for i in range(32):
|
||||
lineStr = fr.readline()
|
||||
for j in range(32):
|
||||
returnVect[0, 32 * i + j] = int(lineStr[j])
|
||||
return returnVect
|
||||
|
||||
|
||||
def loadImages(dirName):
|
||||
from os import listdir
|
||||
hwLabels = []
|
||||
trainingFileList = listdir(dirName) # load the training set
|
||||
m = len(trainingFileList)
|
||||
trainingMat = zeros((m, 1024))
|
||||
for i in range(m):
|
||||
fileNameStr = trainingFileList[i]
|
||||
fileStr = fileNameStr.split('.')[0] # take off .txt
|
||||
classNumStr = int(fileStr.split('_')[0])
|
||||
if classNumStr == 9:
|
||||
hwLabels.append(-1)
|
||||
else:
|
||||
hwLabels.append(1)
|
||||
trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
|
||||
return trainingMat, hwLabels
|
||||
|
||||
|
||||
def testDigits(kTup=('rbf', 10)):
|
||||
dataArr, labelArr = loadImages('trainingDigits')
|
||||
b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
|
||||
datMat = mat(dataArr);
|
||||
labelMat = mat(labelArr).transpose()
|
||||
svInd = nonzero(alphas.A > 0)[0]
|
||||
sVs = datMat[svInd]
|
||||
labelSV = labelMat[svInd];
|
||||
print("there are %d Support Vectors" % shape(sVs)[0])
|
||||
m, n = shape(datMat)
|
||||
errorCount = 0
|
||||
for i in range(m):
|
||||
kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
|
||||
predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
|
||||
if sign(predict) != sign(labelArr[i]): errorCount += 1
|
||||
print
|
||||
"the training error rate is: %f" % (float(errorCount) / m)
|
||||
dataArr, labelArr = loadImages('testDigits')
|
||||
errorCount = 0
|
||||
datMat = mat(dataArr);
|
||||
labelMat = mat(labelArr).transpose()
|
||||
m, n = shape(datMat)
|
||||
for i in range(m):
|
||||
kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
|
||||
predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
|
||||
if sign(predict) != sign(labelArr[i]): errorCount += 1
|
||||
print
|
||||
"the test error rate is: %f" % (float(errorCount) / m)
|
||||
|
||||
|
||||
'''#######********************************
|
||||
Non-Kernel VErsions below
|
||||
''' #######********************************
|
||||
|
||||
|
||||
class optStructK:
|
||||
def __init__(self, dataMatIn, classLabels, C, toler): # Initialize the structure with the parameters
|
||||
self.X = dataMatIn
|
||||
self.labelMat = classLabels
|
||||
self.C = C
|
||||
self.tol = toler
|
||||
self.m = shape(dataMatIn)[0]
|
||||
self.alphas = mat(zeros((self.m, 1)))
|
||||
self.b = 0
|
||||
self.eCache = mat(zeros((self.m, 2))) # first column is valid flag
|
||||
|
||||
|
||||
def calcEkK(oS, k):
|
||||
fXk = float(multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k, :].T)) + oS.b
|
||||
Ek = fXk - float(oS.labelMat[k])
|
||||
return Ek
|
||||
|
||||
|
||||
def selectJK(i, oS, Ei): # this is the second choice -heurstic, and calcs Ej
|
||||
maxK = -1
|
||||
maxDeltaE = 0
|
||||
Ej = 0
|
||||
oS.eCache[i] = [1, Ei] # set valid #choose the alpha that gives the maximum delta E
|
||||
validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
|
||||
if (len(validEcacheList)) > 1:
|
||||
for k in validEcacheList: # loop through valid Ecache values and find the one that maximizes delta E
|
||||
if k == i: continue # don't calc for i, waste of time
|
||||
Ek = calcEk(oS, k)
|
||||
deltaE = abs(Ei - Ek)
|
||||
if (deltaE > maxDeltaE):
|
||||
maxK = k;
|
||||
maxDeltaE = deltaE;
|
||||
Ej = Ek
|
||||
return maxK, Ej
|
||||
else: # in this case (first time around) we don't have any valid eCache values
|
||||
j = selectJrand(i, oS.m)
|
||||
Ej = calcEk(oS, j)
|
||||
return j, Ej
|
||||
|
||||
|
||||
def updateEkK(oS, k): # after any alpha has changed update the new value in the cache
|
||||
Ek = calcEk(oS, k)
|
||||
oS.eCache[k] = [1, Ek]
|
||||
|
||||
|
||||
def innerLK(i, oS):
|
||||
Ei = calcEk(oS, i)
|
||||
if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or (
|
||||
(oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
|
||||
j, Ej = selectJ(i, oS, Ei) # this has been changed from selectJrand
|
||||
alphaIold = oS.alphas[i].copy();
|
||||
alphaJold = oS.alphas[j].copy();
|
||||
if (oS.labelMat[i] != oS.labelMat[j]):
|
||||
L = max(0, oS.alphas[j] - oS.alphas[i])
|
||||
H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
|
||||
else:
|
||||
L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
|
||||
H = min(oS.C, oS.alphas[j] + oS.alphas[i])
|
||||
if L == H: print
|
||||
"L==H";
|
||||
return 0
|
||||
eta = 2.0 * oS.X[i, :] * oS.X[j, :].T - oS.X[i, :] * oS.X[i, :].T - oS.X[j, :] * oS.X[j, :].T
|
||||
if eta >= 0: print
|
||||
"eta>=0";
|
||||
return 0
|
||||
oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
|
||||
oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
|
||||
updateEk(oS, j) # added this for the Ecache
|
||||
if (abs(oS.alphas[j] - alphaJold) < 0.00001): print
|
||||
"j not moving enough";
|
||||
return 0
|
||||
oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j]) # update i by the same amount as j
|
||||
updateEk(oS, i) # added this for the Ecache #the update is in the oppostie direction
|
||||
b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[i, :].T - oS.labelMat[j] * (
|
||||
oS.alphas[j] - alphaJold) * oS.X[i, :] * oS.X[j, :].T
|
||||
b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[j, :].T - oS.labelMat[j] * (
|
||||
oS.alphas[j] - alphaJold) * oS.X[j, :] * oS.X[j, :].T
|
||||
if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
|
||||
oS.b = b1
|
||||
elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
|
||||
oS.b = b2
|
||||
else:
|
||||
oS.b = (b1 + b2) / 2.0
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def smoPK(dataMatIn, classLabels, C, toler, maxIter): # full Platt SMO
|
||||
oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler)
|
||||
iter = 0
|
||||
entireSet = True;
|
||||
alphaPairsChanged = 0
|
||||
while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
|
||||
alphaPairsChanged = 0
|
||||
if entireSet: # go over all
|
||||
for i in range(oS.m):
|
||||
alphaPairsChanged += innerL(i, oS)
|
||||
print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
|
||||
iter += 1
|
||||
else: # go over non-bound (railed) alphas
|
||||
nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
|
||||
for i in nonBoundIs:
|
||||
alphaPairsChanged += innerL(i, oS)
|
||||
print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
|
||||
iter += 1
|
||||
if entireSet:
|
||||
entireSet = False # toggle entire set loop
|
||||
elif (alphaPairsChanged == 0):
|
||||
entireSet = True
|
||||
print("iteration number: %d" % iter)
|
||||
return oS.b, oS.alphas
|
||||
257
src/python/07.AdaBoost/adaboost.py
Normal file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
'''
|
||||
Created on Nov 28, 2010
|
||||
Adaboost is short for Adaptive Boosting
|
||||
@author: Peter/jiangzhonglian
|
||||
'''
|
||||
from numpy import *
|
||||
|
||||
|
||||
# def loadSimpData():
|
||||
# """ 测试数据
|
||||
# Returns:
|
||||
# dataArr feature对应的数据集
|
||||
# labelArr feature对应的分类标签
|
||||
# """
|
||||
# dataArr = array([[1., 2.1], [2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
|
||||
# labelArr = [1.0, 1.0, -1.0, -1.0, 1.0]
|
||||
# return dataArr, labelArr
|
||||
|
||||
|
||||
# general function to parse tab -delimited floats
|
||||
def loadDataSet(fileName):
|
||||
# get number of fields
|
||||
numFeat = len(open(fileName).readline().split('\t'))
|
||||
dataArr = []
|
||||
labelArr = []
|
||||
fr = open(fileName)
|
||||
for line in fr.readlines():
|
||||
lineArr = []
|
||||
curLine = line.strip().split('\t')
|
||||
for i in range(numFeat-1):
|
||||
lineArr.append(float(curLine[i]))
|
||||
dataArr.append(lineArr)
|
||||
labelArr.append(float(curLine[-1]))
|
||||
return dataArr, labelArr
|
||||
|
||||
|
||||
def stumpClassify(dataMat, dimen, threshVal, threshIneq):
|
||||
"""stumpClassify(将数据集,按照feature列的value进行 二元切分比较来赋值)
|
||||
|
||||
Args:
|
||||
dataMat Matrix数据集
|
||||
dimen 特征列
|
||||
threshVal 特征列要比较的值
|
||||
Returns:
|
||||
retArray 结果集
|
||||
"""
|
||||
retArray = ones((shape(dataMat)[0], 1))
|
||||
# dataMat[:, dimen] 表示数据集中第dimen列的所有值
|
||||
# print '-----', threshIneq, dataMat[:, dimen], threshVal
|
||||
if threshIneq == 'lt':
|
||||
retArray[dataMat[:, dimen] <= threshVal] = -1.0
|
||||
else:
|
||||
retArray[dataMat[:, dimen] > threshVal] = -1.0
|
||||
return retArray
|
||||
|
||||
|
||||
def buildStump(dataArr, labelArr, D):
|
||||
|
||||
# 转换数据
|
||||
dataMat = mat(dataArr)
|
||||
labelMat = mat(labelArr).T
|
||||
# m行 n列
|
||||
m, n = shape(dataMat)
|
||||
|
||||
# 初始化数据
|
||||
numSteps = 10.0
|
||||
bestStump = {}
|
||||
bestClasEst = mat(zeros((m, 1)))
|
||||
# 初始化的最小误差为无穷大
|
||||
minError = inf
|
||||
|
||||
# 循环所有的feature列
|
||||
for i in range(n):
|
||||
rangeMin = dataMat[:, i].min()
|
||||
rangeMax = dataMat[:, i].max()
|
||||
# print 'rangeMin=%s, rangeMax=%s' % (rangeMin, rangeMax)
|
||||
# 计算每一份的元素个数
|
||||
stepSize = (rangeMax-rangeMin)/numSteps
|
||||
# 分成-1~numSteps= 1+numSteps份, 加本身是需要+1的
|
||||
for j in range(-1, int(numSteps)+1):
|
||||
# go over less than and greater than
|
||||
for inequal in ['lt', 'gt']:
|
||||
# 如果是-1,那么得到rangeMin-stepSize; 如果是numSteps,那么得到rangeMax
|
||||
threshVal = (rangeMin + float(j) * stepSize)
|
||||
# 对单层决策树进行简单分类
|
||||
predictedVals = stumpClassify(dataMat, i, threshVal, inequal)
|
||||
# print predictedVals
|
||||
errArr = mat(ones((m, 1)))
|
||||
# 正确为0,错误为1
|
||||
errArr[predictedVals == labelMat] = 0
|
||||
# 计算 平均每个特征的概率0.2*错误概率的总和为多少,就知道错误率多高
|
||||
# calc total error multiplied by D
|
||||
weightedError = D.T*errArr
|
||||
'''
|
||||
dim 表示 feature列
|
||||
threshVal 表示树的分界值
|
||||
inequal 表示计算树左右颠倒的错误率的情况
|
||||
weightedError 表示整体结果的错误率
|
||||
'''
|
||||
# print "split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
|
||||
if weightedError < minError:
|
||||
minError = weightedError
|
||||
bestClasEst = predictedVals.copy()
|
||||
bestStump['dim'] = i
|
||||
bestStump['thresh'] = threshVal
|
||||
bestStump['ineq'] = inequal
|
||||
return bestStump, minError, bestClasEst
|
||||
|
||||
|
||||
def adaBoostTrainDS(dataArr, labelArr, numIt=40):
|
||||
weakClassArr = []
|
||||
m = shape(dataArr)[0]
|
||||
# 初始化 init D to all equal
|
||||
D = mat(ones((m, 1))/m)
|
||||
aggClassEst = mat(zeros((m, 1)))
|
||||
for i in range(numIt):
|
||||
# build Stump
|
||||
bestStump, error, classEst = buildStump(dataArr, labelArr, D)
|
||||
# print "D:", D.T
|
||||
# calc alpha, throw in max(error,eps) to account for error=0
|
||||
alpha = float(0.5*log((1.0-error)/max(error, 1e-16)))
|
||||
bestStump['alpha'] = alpha
|
||||
# store Stump Params in Array
|
||||
weakClassArr.append(bestStump)
|
||||
|
||||
# print "alpha=%s, classEst=%s, bestStump=%s, error=%s " % (alpha, classEst.T, bestStump, error)
|
||||
# -1主要是下面求e的-alpha次方; 如果判断正确,乘积为1,否则为-1,这样就可以算出分类的情况了
|
||||
expon = multiply(-1*alpha*mat(labelArr).T, classEst)
|
||||
# print 'expon=', -1*alpha*mat(labelArr).T, classEst, expon
|
||||
# 计算e的expon次方,然后计算得到一个综合的概率的值
|
||||
# 结果发现: 正确的alpha的权重值变小了,错误的变大了。也就说D里面分类的权重值变了。(可以举例验证,假设:alpha=0.6,什么的)
|
||||
D = multiply(D, exp(expon))
|
||||
D = D/D.sum()
|
||||
# print "D: ", D.T
|
||||
# 计算分类结果的值,在上一轮结果的基础上,进行加和操作
|
||||
# calc training error of all classifiers, if this is 0 quit for loop early (use break)
|
||||
aggClassEst += alpha*classEst
|
||||
# print "aggClassEst: ", aggClassEst.T
|
||||
# sign 判断正为1, 0为0, 负为-1,通过最终加和的权重值,判断符号。
|
||||
# 结果为:错误的样本标签集合,因为是 !=,那么结果就是0 正, 1 负
|
||||
aggErrors = multiply(sign(aggClassEst) != mat(labelArr).T, ones((m, 1)))
|
||||
errorRate = aggErrors.sum()/m
|
||||
# print "total error=%s " % (errorRate)
|
||||
if errorRate == 0.0:
|
||||
break
|
||||
return weakClassArr, aggClassEst
|
||||
|
||||
|
||||
def adaClassify(datToClass, classifierArr):
|
||||
# do stuff similar to last aggClassEst in adaBoostTrainDS
|
||||
dataMat = mat(datToClass)
|
||||
m = shape(dataMat)[0]
|
||||
aggClassEst = mat(zeros((m, 1)))
|
||||
|
||||
# 循环 多个分类器
|
||||
for i in range(len(classifierArr)):
|
||||
# 通过分类器来核算每一次的分类结果,然后通过alpha*每一次的结果 得到最后的权重加和的值。
|
||||
classEst = stumpClassify(dataMat, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq'])
|
||||
aggClassEst += classifierArr[i]['alpha']*classEst
|
||||
# print aggClassEst
|
||||
return sign(aggClassEst)
|
||||
|
||||
|
||||
def plotROC(predStrengths, classLabels):
|
||||
"""plotROC(打印ROC曲线,并计算AUC的面积大小)
|
||||
|
||||
Args:
|
||||
predStrengths 最终预测结果的权重值
|
||||
classLabels 原始数据的分类结果集
|
||||
"""
|
||||
import matplotlib.pyplot as plt
|
||||
# variable to calculate AUC
|
||||
ySum = 0.0
|
||||
# 对正样本的进行求和
|
||||
numPosClas = sum(array(classLabels)==1.0)
|
||||
# 正样本的概率
|
||||
yStep = 1/float(numPosClas)
|
||||
# 负样本的概率
|
||||
xStep = 1/float(len(classLabels)-numPosClas)
|
||||
# argsort函数返回的是数组值从小到大的索引值
|
||||
# get sorted index, it's reverse
|
||||
sortedIndicies = predStrengths.argsort()
|
||||
|
||||
# 开始创建模版对象
|
||||
fig = plt.figure()
|
||||
fig.clf()
|
||||
ax = plt.subplot(111)
|
||||
# cursor光标值
|
||||
cur = (1.0, 1.0)
|
||||
# loop through all the values, drawing a line segment at each point
|
||||
for index in sortedIndicies.tolist()[0]:
|
||||
if classLabels[index] == 1.0:
|
||||
delX = 0
|
||||
delY = yStep
|
||||
else:
|
||||
delX = xStep
|
||||
delY = 0
|
||||
ySum += cur[1]
|
||||
# draw line from cur to (cur[0]-delX, cur[1]-delY)
|
||||
# 画点连线 (x1, x2, y1, y2)
|
||||
# print cur[0], cur[0]-delX, cur[1], cur[1]-delY
|
||||
ax.plot([cur[0], cur[0]-delX], [cur[1], cur[1]-delY], c='b')
|
||||
cur = (cur[0]-delX, cur[1]-delY)
|
||||
# 画对角的虚线线
|
||||
ax.plot([0, 1], [0, 1], 'b--')
|
||||
plt.xlabel('False positive rate')
|
||||
plt.ylabel('True positive rate')
|
||||
plt.title('ROC curve for AdaBoost horse colic detection system')
|
||||
# 设置画图的范围区间 (x1, x2, y1, y2)
|
||||
ax.axis([0, 1, 0, 1])
|
||||
plt.show()
|
||||
'''
|
||||
参考说明:http://blog.csdn.net/wenyusuran/article/details/39056013
|
||||
为了计算AUC,我们需要对多个小矩形的面积进行累加。这些小矩形的宽度是xStep,因此
|
||||
可以先对所有矩形的高度进行累加,最后再乘以xStep得到其总面积。所有高度的和(ySum)随
|
||||
着x轴的每次移动而渐次增加。
|
||||
'''
|
||||
print "the Area Under the Curve is: ", ySum*xStep
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# dataArr, labelArr = loadSimpData()
|
||||
# print '-----\n', dataArr, '\n', labelArr
|
||||
|
||||
# # D表示最初,对1进行均分为5份,平均每一个初始的概率都为0.2
|
||||
# D = mat(ones((5, 1))/5)
|
||||
# # print '-----', D
|
||||
|
||||
# # print buildStump(dataArr, labelArr, D)
|
||||
|
||||
# # 分类器:weakClassArr
|
||||
# # 历史累计的分类结果集
|
||||
# weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9)
|
||||
# print weakClassArr, '\n-----\n', aggClassEst.T
|
||||
|
||||
# # 测试数据的分类结果
|
||||
# print adaClassify([0, 0], weakClassArr)
|
||||
# print adaClassify([[5, 5], [0, 0]], weakClassArr)
|
||||
|
||||
|
||||
# 马疝病数据集
|
||||
# 训练集合
|
||||
dataArr, labelArr = loadDataSet("testData/AB_horseColicTraining2.txt")
|
||||
weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 50)
|
||||
# 计算ROC下面的AUC的面积大小
|
||||
plotROC(aggClassEst.T, labelArr)
|
||||
|
||||
# # 测试集合
|
||||
# dataArrTest, labelArrTest = loadDataSet("testData/AB_horseColicTest2.txt")
|
||||
# m = shape(dataArrTest)[0]
|
||||
# predicting10 = adaClassify(dataArrTest, weakClassArr)
|
||||
# errArr = mat(ones((m, 1)))
|
||||
# # 测试:计算总样本数,错误样本数,错误率
|
||||
# print m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m
|
||||
@@ -224,6 +224,7 @@ def crossValidation(xArr,yArr,numVal=10):
|
||||
|
||||
|
||||
|
||||
|
||||
#test for standRegression
|
||||
def regression1():
|
||||
xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
|
||||
@@ -241,6 +242,7 @@ def regression1():
|
||||
|
||||
|
||||
|
||||
|
||||
#test for LWLR
|
||||
def regression2():
|
||||
xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
|
||||
|
||||
105
src/python/09.RegTrees/RTSklearn.py
Normal file
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
# '''
|
||||
# Created on 2017-03-10
|
||||
# Update on 2017-03-10
|
||||
# author: jiangzhonglian
|
||||
# content: 回归树
|
||||
# '''
|
||||
|
||||
# print(__doc__)
|
||||
|
||||
|
||||
# # Import the necessary modules and libraries
|
||||
# import numpy as np
|
||||
# from sklearn.tree import DecisionTreeRegressor
|
||||
# import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
# # Create a random dataset
|
||||
# rng = np.random.RandomState(1)
|
||||
# X = np.sort(5 * rng.rand(80, 1), axis=0)
|
||||
# y = np.sin(X).ravel()
|
||||
# print X, '\n\n\n-----------\n\n\n', y
|
||||
# y[::5] += 3 * (0.5 - rng.rand(16))
|
||||
|
||||
|
||||
# # Fit regression model
|
||||
# regr_1 = DecisionTreeRegressor(max_depth=2, min_samples_leaf=5)
|
||||
# regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=5)
|
||||
# regr_1.fit(X, y)
|
||||
# regr_2.fit(X, y)
|
||||
|
||||
|
||||
# # Predict
|
||||
# X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
|
||||
# y_1 = regr_1.predict(X_test)
|
||||
# y_2 = regr_2.predict(X_test)
|
||||
|
||||
|
||||
# # Plot the results
|
||||
# plt.figure()
|
||||
# plt.scatter(X, y, c="darkorange", label="data")
|
||||
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
|
||||
# plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
|
||||
# plt.xlabel("data")
|
||||
# plt.ylabel("target")
|
||||
# plt.title("Decision Tree Regression")
|
||||
# plt.legend()
|
||||
# plt.show()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
'''
|
||||
Created on 2017-03-10
|
||||
Update on 2017-03-10
|
||||
author: jiangzhonglian
|
||||
content: 模型树
|
||||
'''
|
||||
|
||||
print(__doc__)
|
||||
|
||||
# Author: Noel Dawe <noel.dawe@gmail.com>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
# importing necessary libraries
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.tree import DecisionTreeRegressor
|
||||
from sklearn.ensemble import AdaBoostRegressor
|
||||
|
||||
# Create the dataset
|
||||
rng = np.random.RandomState(1)
|
||||
X = np.linspace(0, 6, 100)[:, np.newaxis]
|
||||
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
|
||||
|
||||
# Fit regression model
|
||||
regr_1 = DecisionTreeRegressor(max_depth=4)
|
||||
|
||||
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
|
||||
n_estimators=300, random_state=rng)
|
||||
|
||||
regr_1.fit(X, y)
|
||||
regr_2.fit(X, y)
|
||||
|
||||
# Predict
|
||||
y_1 = regr_1.predict(X)
|
||||
y_2 = regr_2.predict(X)
|
||||
|
||||
# Plot the results
|
||||
plt.figure()
|
||||
plt.scatter(X, y, c="k", label="training samples")
|
||||
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
|
||||
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
|
||||
plt.xlabel("data")
|
||||
plt.ylabel("target")
|
||||
plt.title("Boosted Decision Tree Regression")
|
||||
plt.legend()
|
||||
plt.show()
|
||||
@@ -1,16 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
'''
|
||||
Created on 2017-03-06
|
||||
Update on 2017-03-06
|
||||
@author: jiangzhonglian
|
||||
'''
|
||||
|
||||
|
||||
class treeNode():
|
||||
def __init__(self, feat, val, right, left):
|
||||
self.featureToSplitOn = feat
|
||||
self.valueOfSplit = val
|
||||
self.rightBranch = right
|
||||
self.leftBranch = left
|
||||
@@ -5,8 +5,9 @@
|
||||
Created on Feb 4, 2011
|
||||
Update on 2017-03-02
|
||||
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
|
||||
@author: Peter Harrington/jiangzhonglian
|
||||
@author: Peter Harrington/片刻
|
||||
'''
|
||||
print(__doc__)
|
||||
from numpy import *
|
||||
|
||||
|
||||
|
||||
123
src/python/09.RegTrees/treeExplore.py
Normal file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/python
# coding:utf8

'''
Created on 2017-03-08
Update on 2017-03-08
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
@author: jiangzhonglian
'''
import regTrees
from Tkinter import *
from numpy import *

import matplotlib
matplotlib.use('TkAgg')  # select the Tk backend before importing it
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg


def test_widget_text(root):
    mylabel = Label(root, text="helloworld")
    # tell the geometry manager where to put the widget; with no position it defaults to row 0, column 0
    mylabel.grid()


# tolS: tolerance on the error reduction; tolN: minimum number of samples required to keep splitting
def reDraw(tolS, tolN):
    # clear the figure
    reDraw.f.clf()
    reDraw.a = reDraw.f.add_subplot(111)

    # check whether the "Model Tree" checkbox is ticked
    if chkBtnVar.get():
        if tolN < 2:
            tolN = 2
        myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf, regTrees.modelErr, (tolS, tolN))
        yHat = regTrees.createForeCast(myTree, reDraw.testDat, regTrees.modelTreeEval)
    else:
        myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN))
        yHat = regTrees.createForeCast(myTree, reDraw.testDat)

    # use scatter for data set
    reDraw.a.scatter(reDraw.rawDat[:, 0], reDraw.rawDat[:, 1], s=5)
    # use plot for yHat
    reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0, c='red')
    reDraw.canvas.show()


def getInputs():
    try:
        tolN = int(tolNentry.get())
    except:
        tolN = 10
        print("enter Integer for tolN")
        tolNentry.delete(0, END)
        tolNentry.insert(0, '10')
    try:
        tolS = float(tolSentry.get())
    except:
        tolS = 1.0
        print("enter Float for tolS")
        tolSentry.delete(0, END)
        tolSentry.insert(0, '1.0')
    return tolN, tolS


# draw a new tree
def drawNewTree():
    # get values from Entry boxes
    tolN, tolS = getInputs()
    reDraw(tolS, tolN)


def main(root):
    # placeholder label for the plot area
    Label(root, text="Plot Place Holder").grid(row=0, columnspan=3)
    # input field 1: tolN, the minimum number of samples per leaf
    Label(root, text="tolN").grid(row=1, column=0)
    global tolNentry
    tolNentry = Entry(root)
    tolNentry.grid(row=1, column=1)
    tolNentry.insert(0, '10')
    # input field 2: tolS, the error tolerance
    Label(root, text="tolS").grid(row=2, column=0)
    global tolSentry
    tolSentry = Entry(root)
    tolSentry.grid(row=2, column=1)
    # set the default value
    tolSentry.insert(0, '1.0')

    # submit button: redraws the tree with the current settings
    Button(root, text="OK", command=drawNewTree).grid(row=1, column=2, rowspan=3)

    # checkbox: toggle between regression tree and model tree
    global chkBtnVar
    chkBtnVar = IntVar()
    chkBtn = Checkbutton(root, text="Model Tree", variable=chkBtnVar)
    chkBtn.grid(row=3, column=0, columnspan=2)

    # quit button
    Button(root, text="Quit", fg="black", command=quit).grid(row=1, column=2)

    # create the matplotlib canvas embedded in the Tk window
    reDraw.f = Figure(figsize=(5, 4), dpi=100)
    reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)
    reDraw.canvas.show()
    reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3)

    reDraw.rawDat = mat(regTrees.loadDataSet('testData/RT_sine.txt'))
    reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01)
    reDraw(1.0, 10)


if __name__ == "__main__":
    # create the Tk root window
    root = Tk()
    # test_widget_text(root)
    main(root)

    # start the Tk event loop
    root.mainloop()
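treeExplore.py targets Python 2 and an older matplotlib. If you want the same Tk-embedding pattern under Python 3, a minimal assumed sketch (the module rename and the draw()-for-show() swap are the only changes; this is not part of the commit):

import matplotlib
matplotlib.use('TkAgg')                     # pick the backend before importing it
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from tkinter import Tk, Button              # Python 3 renames Tkinter to tkinter

root = Tk()
fig = Figure(figsize=(5, 4), dpi=100)
fig.add_subplot(111).plot([0, 1, 2], [0, 1, 4])
canvas = FigureCanvasTkAgg(fig, master=root)
canvas.draw()                               # canvas.show() was removed; draw() replaces it
canvas.get_tk_widget().grid(row=0, columnspan=3)
Button(root, text="Quit", command=root.quit).grid(row=1, column=0)
root.mainloop()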
50 src/python/10.kmeans/kMeans.py Normal file
@@ -0,0 +1,50 @@
#!/usr/bin/python
# coding:utf8

from numpy import *


# Build a matrix from a text file: load the file, then parse it
def loadDataSet(fileName):  # general function to parse tab-delimited floats
    dataMat = []  # assume last column is target value
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # map every field to float (wrapped in list for Python 3)
        dataMat.append(fltLine)
    return dataMat


# Euclidean distance between two vectors (other distance measures can be substituted)
def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))  # same as la.norm(vecA - vecB)


# Build a set of k random centroids for the data set. The centroids must lie inside the
# bounds of the data, found from the min and max of each dimension; a random number in
# [0, 1) is then scaled by the range and shifted by the minimum so each point stays in bounds.
def randCent(dataSet, k):
    n = shape(dataSet)[1]  # number of columns
    centroids = mat(zeros((k, n)))  # centroid matrix
    for j in range(n):  # create random cluster centroids within the bounds of each dimension
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))  # random values in range
    return centroids


# k-means clustering
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))  # per point: (assigned centroid index, squared distance)
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # assign every point to its closest centroid
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i, 0] != minIndex: clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        print(centroids)
        for cent in range(k):  # recompute the centroids
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]  # all points in this cluster
            centroids[cent, :] = mean(ptsInClust, axis=0)  # move the centroid to the cluster mean
    return centroids, clusterAssment
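A small usage sketch for the functions above, assuming it runs in the same module (the three synthetic Gaussian blobs are an illustrative assumption, not repository test data):

from numpy import mat, random, vstack

blobs = vstack([random.randn(50, 2) + offset for offset in ((0, 0), (5, 5), (0, 5))])
centroids, clusterAssment = kMeans(mat(blobs), 3)
print(centroids)              # one row per learned cluster center
print(clusterAssment[:5])     # (cluster index, squared distance) for the first few points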
324 src/python/11.Apriori/apriori.py Normal file
@@ -0,0 +1,324 @@
#!/usr/bin/python
# coding: utf8

'''
Created on Mar 24, 2011
Update on 2017-03-16
Ch 11 code
@author: Peter/片刻
'''
print(__doc__)
from numpy import *


def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                # collect every distinct item and append it to C1
                C1.append([item])
    # sort the candidates in ascending order
    C1.sort()
    # frozenset is an immutable set, so it can be used as a dictionary key
    return list(map(frozenset, C1))


def scanD(D, Ck, minSupport):
    """scanD

    Args:
        D           the original data set, used to check which candidates in Ck occur in it
        Ck          the candidate item sets
        minSupport  minimum support threshold
    Returns:
        retList     candidate sets whose support meets the threshold
        supportData dictionary with the support of every candidate
    """
    # ssCnt counts how often each candidate in Ck occurs in D
    ssCnt = {}
    for tid in D:
        for can in Ck:
            # s.issubset(t) tests whether every element of s is in t
            if can.issubset(tid):
                if can not in ssCnt:
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    # number of transactions
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        # compute the support
        support = ssCnt[key] / numItems
        if support >= minSupport:
            # prepend to retList; only sets meeting the support threshold are kept here
            retList.insert(0, key)
        # record every key together with its support
        supportData[key] = support
    return retList, supportData
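A quick worked run of scanD() on the toy data above (illustrative only; the numbers follow directly from the four transactions):

dataSet = loadDataSet()                 # [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
C1 = createC1(dataSet)                  # [{1}, {2}, {3}, {4}, {5}] as frozensets
D = list(map(set, dataSet))
L1, supportData = scanD(D, C1, 0.5)
# {2}, {3}, {5} appear in 3 of 4 transactions (support 0.75), {1} in 2 of 4 (0.5),
# {4} only once (0.25), so L1 keeps {1}, {2}, {3}, {5} and drops {4}.
print(L1)
print(supportData[frozenset([4])])      # 0.25, below the 0.5 threshold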


# creates Ck
def aprioriGen(Lk, k):
    """aprioriGen (loop over the frequent sets and merge them pairwise)

    Args:
        Lk  frequent item sets
        k   two sets are merged when their first k-2 elements are identical
    Returns:
        retList  the pairwise-merged candidate sets
    """
    retList = []
    lenLk = len(Lk)
    # loop over the Lk list
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            L1 = list(Lk[i])[: k - 2]
            L2 = list(Lk[j])[: k - 2]
            # print('-----', Lk, Lk[i], L1)
            L1.sort()
            L2.sort()
            # on the first pass L1 and L2 are empty, so every pair of 1-item sets is merged
            # if first k-2 elements are equal
            if L1 == L2:
                # set union
                # print('union=', Lk[i] | Lk[j], Lk[i], Lk[j])
                retList.append(Lk[i] | Lk[j])
    return retList
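For example, with k=3 two frequent 2-item sets are merged only when their first k-2 = 1 sorted elements agree; an illustrative check using the toy sets above:

Lk = [frozenset([2, 3]), frozenset([2, 5]), frozenset([3, 5])]
print(aprioriGen(Lk, 3))      # [frozenset({2, 3, 5})] -- only {2, 3} and {2, 5} share their first element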


def apriori(dataSet, minSupport=0.5):
    """apriori

    Args:
        dataSet     the original data set
        minSupport  minimum support threshold
    Returns:
        L            the full collection of frequent item sets
        supportData  support values for every candidate set
    """
    # freeze each transaction
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))

    # compute support; L1 holds the keys meeting minSupport, supportData holds all of them
    L1, supportData = scanD(D, C1, minSupport)
    # print("L1=", L1, "\n", "outcome: ", supportData)

    L = [L1]
    k = 2
    while (len(L[k - 2]) > 0):
        # merge the sets whose first k-2 elements agree
        Ck = aprioriGen(L[k - 2], k)
        # print('-----------', D, Ck)
        # compute the support of the merged candidates
        # Lk holds the keys meeting minSupport, supK holds all of them
        # print('Ck', Ck)
        Lk, supK = scanD(D, Ck, minSupport)
        # add new entries to the dictionary and update existing ones
        supportData.update(supK)
        if len(Lk) == 0:
            break
        # Lk is the collection of frequent item sets of size k; L keeps growing
        L.append(Lk)
        k += 1
        # print('k=', k, len(L[k-2]))
    return L, supportData


def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """calcConf

    Args:
        freqSet      a frequent item set
        H            the candidate consequents, as a list of frozensets
        supportData  support values for every item set
        brl          the (initially empty) bigRuleList
        minConf      minimum confidence threshold
    Returns:
        prunedH      the consequents whose rules meet the confidence threshold
    """
    # keep the consequents whose confidence is above the threshold
    prunedH = []
    for conseq in H:
        # confidence(freqSet-conseq -> conseq) = support(freqSet) / support(freqSet-conseq)
        # print('confidence=', freqSet, conseq, freqSet-conseq)
        conf = supportData[freqSet] / supportData[freqSet - conseq]
        if conf >= minConf:
            print(freqSet - conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """rulesFromConseq

    Args:
        freqSet      a frequent item set
        H            the candidate consequents, as a list of frozensets
        supportData  support values for every item set
        brl          the (initially empty) bigRuleList
        minConf      minimum confidence threshold
    """
    # size of the frozensets currently in H
    m = len(H[0])
    # only recurse while freqSet is large enough to split further
    if (len(freqSet) > (m + 1)):
        # merge adjacent consequents into sets of size m+1
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        # (the original comment notes that the length check below may be redundant)
        # print('Hmp1=', Hmp1)
        if (len(Hmp1) > 1):
            # at least two remaining consequents are needed for a further merge
            # print(len(freqSet), len(Hmp1[0]) + 1)
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


def generateRules(L, supportData, minConf=0.7):
    """generateRules

    Args:
        L            the full collection of frequent item sets
        supportData  support values for every item set
        minConf      minimum confidence threshold
    Returns:
        bigRuleList  list of (antecedent, consequent, confidence) triples
    """
    bigRuleList = []
    # loop over the frequent item sets of size 2, 3, ..., n (index 0 holds the 1-item sets)
    for i in range(1, len(L)):
        # every frequent item set of this size
        for freqSet in L[i]:
            # turn each single item into a frozenset and collect them in a list
            H1 = [frozenset([item]) for item in freqSet]
            # 2-item sets go straight to calcConf; larger sets go through rulesFromConseq
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
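An end-to-end sketch tying apriori() and generateRules() together on the toy data (the thresholds here are illustrative choices, not values prescribed by this commit):

dataSet = loadDataSet()
L, supportData = apriori(dataSet, minSupport=0.5)
# L[0] holds the frequent 1-item sets, L[1] the 2-item sets such as {2, 3} and {2, 5}, and so on.
rules = generateRules(L, supportData, minConf=0.7)
for antecedent, consequent, conf in rules:
    print(antecedent, '-->', consequent, 'confidence:', conf)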
|
||||
|
||||
def getActionIds():
|
||||
from time import sleep
|
||||
from votesmart import votesmart
|
||||
# votesmart.apikey = 'get your api key first'
|
||||
votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
|
||||
actionIdList = []
|
||||
billTitleList = []
|
||||
fr = open('testData/Apriori_recent20bills.txt')
|
||||
for line in fr.readlines():
|
||||
billNum = int(line.split('\t')[0])
|
||||
try:
|
||||
billDetail = votesmart.votes.getBill(billNum) # api call
|
||||
for action in billDetail.actions:
|
||||
if action.level == 'House' and (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
|
||||
actionId = int(action.actionId)
|
||||
print 'bill: %d has actionId: %d' % (billNum, actionId)
|
||||
actionIdList.append(actionId)
|
||||
billTitleList.append(line.strip().split('\t')[1])
|
||||
except:
|
||||
print "problem getting bill %d" % billNum
|
||||
sleep(1) # delay to be polite
|
||||
return actionIdList, billTitleList
|
||||
|
||||
|
||||
def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
|
||||
itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
|
||||
for billTitle in billTitleList:#fill up itemMeaning list
|
||||
itemMeaning.append('%s -- Nay' % billTitle)
|
||||
itemMeaning.append('%s -- Yea' % billTitle)
|
||||
transDict = {}#list of items in each transaction (politician)
|
||||
voteCount = 2
|
||||
for actionId in actionIdList:
|
||||
sleep(3)
|
||||
print 'getting votes for actionId: %d' % actionId
|
||||
try:
|
||||
voteList = votesmart.votes.getBillActionVotes(actionId)
|
||||
for vote in voteList:
|
||||
if not transDict.has_key(vote.candidateName):
|
||||
transDict[vote.candidateName] = []
|
||||
if vote.officeParties == 'Democratic':
|
||||
transDict[vote.candidateName].append(1)
|
||||
elif vote.officeParties == 'Republican':
|
||||
transDict[vote.candidateName].append(0)
|
||||
if vote.action == 'Nay':
|
||||
transDict[vote.candidateName].append(voteCount)
|
||||
elif vote.action == 'Yea':
|
||||
transDict[vote.candidateName].append(voteCount + 1)
|
||||
except:
|
||||
print "problem getting actionId: %d" % actionId
|
||||
voteCount += 2
|
||||
return transDict, itemMeaning
|
||||
|
||||
|
||||
# 暂时没用上
|
||||
# def pntRules(ruleList, itemMeaning):
|
||||
# for ruleTup in ruleList:
|
||||
# for item in ruleTup[0]:
|
||||
# print itemMeaning[item]
|
||||
# print " -------->"
|
||||
# for item in ruleTup[1]:
|
||||
# print itemMeaning[item]
|
||||
# print "confidence: %f" % ruleTup[2]
|
||||
# print #print a blank line
|
||||
|
||||
|
||||
def main():
|
||||
# 以前的测试
|
||||
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
|
||||
# 收集并准备数据
|
||||
# dataMat, labelMat = loadDataSet("%s/resources/Apriori_testdata.txt" % project_dir)
|
||||
|
||||
# 现在的的测试
|
||||
# # 1. 加载数据
|
||||
# dataSet = loadDataSet()
|
||||
# print(dataSet)
|
||||
# # 调用 apriori 做购物篮分析
|
||||
# # 支持度满足阈值的key集合L,和所有key的全集suppoerData
|
||||
# L, supportData = apriori(dataSet, minSupport=0.5)
|
||||
# # print L, supportData
|
||||
# print '\ngenerateRules\n'
|
||||
# rules = generateRules(L, supportData, minConf=0.05)
|
||||
# print rules
|
||||
|
||||
# 项目实战
|
||||
# 构建美国国会投票记录的事务数据集
|
||||
# actionIdList, billTitleList = getActionIds()
|
||||
# # 测试前2个
|
||||
# # transDict, itemMeaning = getTransList(actionIdList[: 2], billTitleList[: 2])
|
||||
# # transDict 表示 action_id的集合,transDict[key]这个就是action_id对应的选项,例如 [1, 2, 3]
|
||||
# transDict, itemMeaning = getTransList(actionIdList, billTitleList)
|
||||
# # 得到全集的数据
|
||||
# dataSet = [transDict[key] for key in transDict.keys()]
|
||||
# L, supportData = apriori(dataSet, minSupport=0.3)
|
||||
# rules = generateRules(L, supportData, minConf=0.95)
|
||||
# print rules
|
||||
|
||||
# 项目实战
|
||||
# 发现毒蘑菇的相似特性
|
||||
# 得到全集的数据
|
||||
dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()]
|
||||
L, supportData = apriori(dataSet, minSupport=0.3)
|
||||
# 2表示毒蘑菇,1表示可食用的蘑菇
|
||||
# 找出关于2的频繁子项出来,就知道如果是毒蘑菇,那么出现频繁的也可能是毒蘑菇
|
||||
for item in L[1]:
|
||||
if item.intersection('2'):
|
||||
print item
|
||||
|
||||
for item in L[2]:
|
||||
if item.intersection('2'):
|
||||
print item
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,12 +0,0 @@
|
||||
def loadDataSet():
|
||||
return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]
|
||||
def createC1(dataSet):
|
||||
c1=[]
|
||||
for transaction in dataSet:
|
||||
for item in transaction:
|
||||
if not [item] in c1:
|
||||
c1.append([item])
|
||||
c1.sort()
|
||||
return map(frozenset,c1)
|
||||
def scanD(D,ck,minSupport):
|
||||
ssCnt = {}
|
||||
@@ -1,19 +1,337 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
'''
|
||||
Created on Jun 14, 2011
|
||||
FP-Growth FP means frequent pattern
|
||||
the FP-Growth algorithm needs:
|
||||
1. FP-tree (class treeNode)
|
||||
2. header table (use dict)
|
||||
This finds frequent itemsets similar to apriori but does not find association rules.
|
||||
@author: Peter/片刻
|
||||
'''
|
||||
print(__doc__)
|
||||
|
||||
|
||||
class treeNode:
|
||||
def __init__(self,nameValue,numOccur,parentNode):
|
||||
def __init__(self, nameValue, numOccur, parentNode):
|
||||
self.name = nameValue
|
||||
self.count = numOccur
|
||||
self.nodeLink = None
|
||||
# needs to be updated
|
||||
self.parent = parentNode
|
||||
self.children = {}
|
||||
def inc(self,numOccur):
|
||||
|
||||
def inc(self, numOccur):
|
||||
"""inc(对count变量增加给定值)
|
||||
"""
|
||||
self.count += numOccur
|
||||
def disp(self,ind=1):
|
||||
print(' '*ind,self.name,' ',self.count)
|
||||
|
||||
def disp(self, ind=1):
|
||||
"""disp(用于将树以文本形式显示)
|
||||
|
||||
"""
|
||||
print ' '*ind, self.name, ' ', self.count
|
||||
for child in self.children.values():
|
||||
child.disp(ind+1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import fpGrowth
|
||||
rootNode = fpGrowth.treeNode('pyramid',9,None)
|
||||
rootNode.children['eye']=fpGrowth.treeNode('eye',13,None)
|
||||
rootNode.disp()
|
||||
|
||||
def loadSimpDat():
|
||||
simpDat = [['r', 'z', 'h', 'j', 'p'],
|
||||
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
|
||||
['z'],
|
||||
['r', 'x', 'n', 'o', 's'],
|
||||
['y', 'r', 'x', 'z', 'q', 't', 'p'],
|
||||
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
|
||||
return simpDat
|
||||
|
||||
|
||||
def createInitSet(dataSet):
|
||||
retDict = {}
|
||||
for trans in dataSet:
|
||||
retDict[frozenset(trans)] = 1
|
||||
return retDict
|
||||
|
||||
|
||||
# this version does not use recursion
|
||||
def updateHeader(nodeToTest, targetNode):
|
||||
"""updateHeader(更新头指针,建立相同元素之间的关系,例如: 左边的r指向右边的r值,就是后出现的相同元素 指向 已经出现的元素)
|
||||
|
||||
从头指针的nodeLink开始,一直沿着nodeLink直到到达链表末尾。这就是链表。
|
||||
性能:如果链表很长可能会遇到迭代调用的次数限制。
|
||||
|
||||
Args:
|
||||
nodeToTest 满足minSup {所有的元素+(value, treeNode)}
|
||||
targetNode Tree对象的子节点
|
||||
"""
|
||||
# 建立相同元素之间的关系,例如: 左边的r指向右边的r值
|
||||
while (nodeToTest.nodeLink is not None):
|
||||
nodeToTest = nodeToTest.nodeLink
|
||||
nodeToTest.nodeLink = targetNode
|
||||
|
||||
|
||||
def updateTree(items, inTree, headerTable, count):
|
||||
"""updateTree(更新FP-tree,第二次遍历)
|
||||
|
||||
# 针对每一行的数据
|
||||
# 最大的key, 添加
|
||||
Args:
|
||||
items 满足minSup 排序后的元素key的数组(大到小的排序)
|
||||
inTree 空的Tree对象
|
||||
headerTable 满足minSup {所有的元素+(value, treeNode)}
|
||||
count 原数据集中每一组Kay出现的次数
|
||||
"""
|
||||
# 取出 元素 出现次数最高的
|
||||
# 如果该元素在 inTree.children 这个字典中,就进行累加
|
||||
# 如果该元素不存在 就 inTree.children 字典中新增key,value为初始化的 treeNode 对象
|
||||
if items[0] in inTree.children:
|
||||
# 更新 最大元素,对应的 treeNode 对象的count进行叠加
|
||||
inTree.children[items[0]].inc(count)
|
||||
else:
|
||||
# 如果不存在子节点,我们为该inTree添加子节点
|
||||
inTree.children[items[0]] = treeNode(items[0], count, inTree)
|
||||
# 如果满足minSup的dist字典的value值第二位为null, 我们就设置该元素为 本节点对应的tree节点
|
||||
# 如果元素第二位不为null,我们就更新header节点
|
||||
if headerTable[items[0]][1] is None:
|
||||
# headerTable只记录第一次节点出现的位置
|
||||
headerTable[items[0]][1] = inTree.children[items[0]]
|
||||
else:
|
||||
# 本质上是修改headerTable的key对应的Tree,的nodeLink值
|
||||
updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
|
||||
if len(items) > 1:
|
||||
# 递归的调用,在items[0]的基础上,添加item0[1]做子节点, count只要循环的进行累计加和而已,统计出节点的最后的统计值。
|
||||
updateTree(items[1::], inTree.children[items[0]], headerTable, count)
|
||||
|
||||
|
||||
def createTree(dataSet, minSup=1):
|
||||
"""createTree(生成FP-tree,第一次遍历)
|
||||
|
||||
Args:
|
||||
dataSet dist{行:出现次数}的样本数据
|
||||
minSup 最小的支持度
|
||||
Returns:
|
||||
retTree FP-tree
|
||||
headerTable 满足minSup {所有的元素+(value, treeNode)}
|
||||
"""
|
||||
# 支持度>=minSup的dist{所有元素:出现的次数}
|
||||
headerTable = {}
|
||||
# 循环 dist{行:出现次数}的样本数据
|
||||
for trans in dataSet:
|
||||
# 对所有的行进行循环,得到行里面的所有元素
|
||||
# 统计每一行中,每个元素出现的总次数
|
||||
for item in trans:
|
||||
# 例如: {'ababa': 3} count(a)=3+3+3=9 count(b)=3+3=6
|
||||
headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
|
||||
# 删除 headerTable中,元素次数<最小支持度的元素
|
||||
for k in headerTable.keys():
|
||||
if headerTable[k] < minSup:
|
||||
del(headerTable[k])
|
||||
|
||||
# 满足minSup: set(各元素集合)
|
||||
freqItemSet = set(headerTable.keys())
|
||||
# 如果不存在,直接返回None
|
||||
if len(freqItemSet) == 0:
|
||||
return None, None
|
||||
for k in headerTable:
|
||||
# 格式化: dist{元素key: [元素次数, None]}
|
||||
headerTable[k] = [headerTable[k], None]
|
||||
|
||||
# create tree
|
||||
retTree = treeNode('Null Set', 1, None)
|
||||
# 循环 dist{行:出现次数}的样本数据
|
||||
for tranSet, count in dataSet.items():
|
||||
# print 'tranSet, count=', tranSet, count
|
||||
# localD = dist{元素key: 元素次数}
|
||||
localD = {}
|
||||
for item in tranSet:
|
||||
# 判断是否在满足minSup的集合中
|
||||
if item in freqItemSet:
|
||||
# print 'headerTable[item][0]=', headerTable[item][0], headerTable[item]
|
||||
localD[item] = headerTable[item][0]
|
||||
# print 'localD=', localD
|
||||
if len(localD) > 0:
|
||||
# p=key,value; 所以是通过value值的大小,进行从大到小进行排序
|
||||
# orderedItems 表示取出元组的key值,也就是字母本身,但是字母本身是大到小的顺序
|
||||
orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
|
||||
# print 'orderedItems=', orderedItems, 'headerTable', headerTable, '\n\n\n'
|
||||
# 填充树,通过有序的orderedItems的第一位,进行顺序填充 第一层的子节点。
|
||||
updateTree(orderedItems, retTree, headerTable, count)
|
||||
|
||||
return retTree, headerTable
|
||||
|
||||
|
||||
def ascendTree(leafNode, prefixPath):
|
||||
"""ascendTree(如果存在父节点,就记录当前节点的name值)
|
||||
|
||||
Args:
|
||||
leafNode 查询的节点对于的nodeTree
|
||||
prefixPath 要查询的节点值
|
||||
"""
|
||||
if leafNode.parent is not None:
|
||||
prefixPath.append(leafNode.name)
|
||||
ascendTree(leafNode.parent, prefixPath)
|
||||
|
||||
|
||||
def findPrefixPath(basePat, treeNode):
|
||||
"""findPrefixPath 基础数据集
|
||||
|
||||
Args:
|
||||
basePat 要查询的节点值
|
||||
treeNode 查询的节点所在的当前nodeTree
|
||||
Returns:
|
||||
condPats 对非basePat的倒叙值作为key,赋值为count数
|
||||
"""
|
||||
condPats = {}
|
||||
# 对 treeNode的link进行循环
|
||||
while treeNode is not None:
|
||||
prefixPath = []
|
||||
# 寻找改节点的父节点,相当于找到了该节点的频繁项集
|
||||
ascendTree(treeNode, prefixPath)
|
||||
# 避免 单独`Z`一个元素,添加了空节点
|
||||
if len(prefixPath) > 1:
|
||||
# 对非basePat的倒叙值作为key,赋值为count数
|
||||
# prefixPath[1:] 变frozenset后,字母就变无须了
|
||||
# condPats[frozenset(prefixPath)] = treeNode.count
|
||||
condPats[frozenset(prefixPath[1:])] = treeNode.count
|
||||
# 递归,寻找改节点的上一个 相同值的链接节点
|
||||
treeNode = treeNode.nodeLink
|
||||
# print treeNode
|
||||
return condPats
|
||||
|
||||
|
||||
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
|
||||
"""mineTree(创建条件FP树)
|
||||
|
||||
Args:
|
||||
inTree myFPtree
|
||||
headerTable 满足minSup {所有的元素+(value, treeNode)}
|
||||
minSup 最小支持项集
|
||||
preFix preFix为newFreqSet上一次的存储记录,一旦没有myHead,就不会更新
|
||||
freqItemList 用来存储频繁子项的列表
|
||||
"""
|
||||
# 通过value进行从小到大的排序, 得到频繁项集的key
|
||||
# 最小支持项集的key的list集合
|
||||
bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]
|
||||
# print '-----', sorted(headerTable.items(), key=lambda p: p[1])
|
||||
print 'bigL=', bigL
|
||||
# 循环遍历 最频繁项集的key,从小到大的递归寻找对应的频繁项集
|
||||
for basePat in bigL:
|
||||
# preFix为newFreqSet上一次的存储记录,一旦没有myHead,就不会更新
|
||||
newFreqSet = preFix.copy()
|
||||
newFreqSet.add(basePat)
|
||||
print 'newFreqSet=', newFreqSet, preFix
|
||||
|
||||
freqItemList.append(newFreqSet)
|
||||
print 'freqItemList=', freqItemList
|
||||
condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
|
||||
print 'condPattBases=', basePat, condPattBases
|
||||
|
||||
# 构建FP-tree
|
||||
myCondTree, myHead = createTree(condPattBases, minSup)
|
||||
print 'myHead=', myHead
|
||||
# 挖掘条件 FP-tree, 如果
|
||||
if myHead is not None:
|
||||
myCondTree.disp(1)
|
||||
print '\n\n\n'
|
||||
# 递归 myHead 找出频繁项集
|
||||
mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
|
||||
print '\n\n\n'
|
||||
|
||||
|
||||
import twitter
|
||||
from time import sleep
|
||||
import re
|
||||
|
||||
|
||||
def getLotsOfTweets(searchStr):
|
||||
"""
|
||||
获取 100个搜索结果页面
|
||||
"""
|
||||
CONSUMER_KEY = ''
|
||||
CONSUMER_SECRET = ''
|
||||
ACCESS_TOKEN_KEY = ''
|
||||
ACCESS_TOKEN_SECRET = ''
|
||||
api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET)
|
||||
|
||||
# you can get 1500 results 15 pages * 100 per page
|
||||
resultsPages = []
|
||||
for i in range(1, 15):
|
||||
print "fetching page %d" % i
|
||||
searchResults = api.GetSearch(searchStr, per_page=100, page=i)
|
||||
resultsPages.append(searchResults)
|
||||
sleep(6)
|
||||
return resultsPages
|
||||
|
||||
|
||||
def textParse(bigString):
|
||||
"""
|
||||
解析页面内容
|
||||
"""
|
||||
urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString)
|
||||
listOfTokens = re.split(r'\W*', urlsRemoved)
|
||||
return [tok.lower() for tok in listOfTokens if len(tok) > 2]
|
||||
|
||||
|
||||
def mineTweets(tweetArr, minSup=5):
|
||||
"""
|
||||
获取频繁项集
|
||||
"""
|
||||
parsedList = []
|
||||
for i in range(14):
|
||||
for j in range(100):
|
||||
parsedList.append(textParse(tweetArr[i][j].text))
|
||||
initSet = createInitSet(parsedList)
|
||||
myFPtree, myHeaderTab = createTree(initSet, minSup)
|
||||
myFreqList = []
|
||||
mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
|
||||
return myFreqList
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# rootNode = treeNode('pyramid', 9, None)
|
||||
# rootNode.children['eye'] = treeNode('eye', 13, None)
|
||||
# rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
|
||||
# # 将树以文本形式显示
|
||||
# # print rootNode.disp()
|
||||
|
||||
# # load样本数据
|
||||
# simpDat = loadSimpDat()
|
||||
# # print simpDat, '\n'
|
||||
# # frozen set 格式化 并 重新装载 样本数据,对所有的行进行统计求和,格式: {行:出现次数}
|
||||
# initSet = createInitSet(simpDat)
|
||||
# # print initSet
|
||||
|
||||
# # 创建FP树
|
||||
# # 输入:dist{行:出现次数}的样本数据 和 最小的支持度
|
||||
# # 输出:最终的PF-tree,通过循环获取第一层的节点,然后每一层的节点进行递归的获取每一行的字节点,也就是分支。然后所谓的指针,就是后来的指向已存在的
|
||||
# myFPtree, myHeaderTab = createTree(initSet, 3)
|
||||
# myFPtree.disp()
|
||||
|
||||
# # 抽取条件模式基
|
||||
# # 查询树节点的,频繁子项
|
||||
# # print findPrefixPath('x', myHeaderTab['x'][1])
|
||||
# # print findPrefixPath('z', myHeaderTab['z'][1])
|
||||
# # print findPrefixPath('r', myHeaderTab['r'][1])
|
||||
|
||||
# # 创建条件模式基
|
||||
# freqItemList = []
|
||||
# mineTree(myFPtree, myHeaderTab, 3, set([]), freqItemList)
|
||||
# print freqItemList
|
||||
|
||||
# # 项目实战
|
||||
# # 1.twitter项目案例
|
||||
# # 无法运行,因为没发链接twitter
|
||||
# lotsOtweets = getLotsOfTweets('RIMM')
|
||||
# listOfTerms = mineTweets(lotsOtweets, 20)
|
||||
# print len(listOfTerms)
|
||||
# for t in listOfTerms:
|
||||
# print t
|
||||
|
||||
# 2.新闻网站点击流中挖掘
|
||||
parsedDat = [line.split() for line in open('testData/FPGrowth_kosarak.dat').readlines()]
|
||||
initSet = createInitSet(parsedDat)
|
||||
myFPtree, myHeaderTab = createTree(initSet, 100000)
|
||||
|
||||
myFreList = []
|
||||
mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreList)
|
||||
print myFreList
|
||||
|
||||
155 src/python/14.SVD/svdRec.py Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/python
|
||||
# encoding: utf-8
|
||||
|
||||
from numpy import *
|
||||
from numpy import linalg as la
|
||||
|
||||
|
||||
def loadExData():
|
||||
# 利用SVD提高推荐效果,菜肴矩阵
|
||||
return[[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
|
||||
[0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],
|
||||
[3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],
|
||||
[5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
|
||||
[4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],
|
||||
[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],
|
||||
[0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
|
||||
[0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0],
|
||||
[1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]]
|
||||
"""
|
||||
# 推荐引擎示例矩阵
|
||||
return[[4, 4, 0, 2, 2],
|
||||
[4, 0, 0, 3, 3],
|
||||
[4, 0, 0, 1, 1],
|
||||
[1, 1, 1, 2, 0],
|
||||
[2, 2, 2, 0, 0],
|
||||
[1, 1, 1, 0, 0],
|
||||
[5, 5, 5, 0, 0]]
|
||||
|
||||
原矩阵
|
||||
return[[1, 1, 1, 0, 0],
|
||||
[2, 2, 2, 0, 0],
|
||||
[1, 1, 1, 0, 0],
|
||||
[5, 5, 5, 0, 0],
|
||||
[1, 1, 0, 2, 2],
|
||||
[0, 0, 0, 3, 3],
|
||||
[0, 0, 0, 1, 1]]
|
||||
"""
|
||||
|
||||
|
||||
# 欧氏距离相似度,假定inA和inB 都是列向量
|
||||
# 计算向量的第二范式,相当于计算了欧氏距离
|
||||
def ecludSim(inA, inB):
|
||||
return 1.0/(1.0 + la.norm(inA - inB))
|
||||
|
||||
|
||||
# pearsSim()函数会检查是否存在3个或更多的点。
|
||||
# corrcoef直接计算皮尔逊相关系数
|
||||
def pearsSim(inA, inB):
|
||||
# 如果不存在,该函数返回1.0,此时两个向量完全相关。
|
||||
if len(inA) < 3:
|
||||
return 1.0
|
||||
return 0.5 + 0.5*corrcoef(inA, inB, rowvar=0)[0][1]
|
||||
|
||||
|
||||
# 计算余弦相似度
|
||||
def cosSim(inA, inB):
|
||||
num = float(inA.T*inB)
|
||||
denom = la.norm(inA)*la.norm(inB)
|
||||
return 0.5 + 0.5*(num/denom)
|
||||
|
||||
|
||||
# 基于物品相似度的推荐引擎
|
||||
# standEst()函数,用来计算在给定相似度计算方法的条件下,用户对物品的估计评分值。
|
||||
# standEst()函数的参数包括数据矩阵、用户编号、物品编号和相似度计算方法
|
||||
def standEst(dataMat, user, simMeas, item):
|
||||
# 得到数据集中的物品数目
|
||||
n = shape(dataMat)[1]
|
||||
# 初始化两个评分值
|
||||
simTotal = 0.0
|
||||
ratSimTotal = 0.0
|
||||
# 遍历行中的每个物品(对用户评过分的物品进行遍历,并将它与其他物品进行比较)
|
||||
for j in range(n):
|
||||
userRating = dataMat[user, j]
|
||||
# 如果某个物品的评分值为0,则跳过这个物品
|
||||
if userRating == 0:
|
||||
continue
|
||||
# 寻找两个用户都评级的物品
|
||||
# 变量overLap 给出的是两个物品当中已经被评分的那个元素
|
||||
overLap = nonzero(logical_and(dataMat[:, item].A>0, dataMat[:, j].A>0))[0]
|
||||
# 如果相似度为0,则两着没有任何重合元素,终止本次循环
|
||||
if len(overLap) == 0:similarity =0
|
||||
# 如果存在重合的物品,则基于这些重合物重新计算相似度。
|
||||
else: similarity = simMeas(dataMat[overLap,item], \
|
||||
dataMat[overLap,j])
|
||||
# print 'the %d and %d similarity is : %f'(iten,j,similarity)
|
||||
# 相似度会不断累加,每次计算时还考虑相似度和当前用户评分的乘积
|
||||
# similarity 用户相似度, userRating 用户评分
|
||||
simTotal += similarity
|
||||
ratSimTotal += similarity * userRating
|
||||
if simTotal == 0:
|
||||
return 0
|
||||
# 通过除以所有的评分总和,对上述相似度评分的乘积进行归一化,使得最后评分在0~5之间,这些评分用来对预测值进行排序
|
||||
else:
|
||||
return ratSimTotal/simTotal
|
||||
|
||||
|
||||
# recommend()函数,就是推荐引擎,它会调用standEst()函数,产生了最高的N个推荐结果。
|
||||
# 如果不指定N的大小,则默认值为3。该函数另外的参数还包括相似度计算方法和估计方法
|
||||
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
|
||||
# 寻找未评级的物品
|
||||
# 对给定的用户建立一个未评分的物品列表
|
||||
unratedItems = nonzero(dataMat[user, :].A == 0)[1]
|
||||
# 如果不存在未评分物品,那么就退出函数
|
||||
if len(unratedItems) == 0:
|
||||
return 'you rated everything'
|
||||
# 在所有的未评分物品上进行循环
|
||||
itemScores = []
|
||||
for item in unratedItems:
|
||||
estimatedScore = estMethod(dataMat, user, simMeas, item)
|
||||
# 寻找前N个未评级物品,调用standEst()来产生该物品的预测得分,该物品的编号和估计值会放在一个元素列表itemScores中
|
||||
itemScores.append((item, estimatedScore))
|
||||
# 按照估计得分,对该列表进行排序并返回。列表逆排序,第一个值就是最大值
|
||||
return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N]
|
||||
|
||||
|
||||
# 基于SVD的评分估计
|
||||
# 在recommend() 中,这个函数用于替换对standEst()的调用,该函数对给定用户给定物品构建了一个评分估计值
|
||||
def svdEst(dataMat, user, simMeas, item):
|
||||
n = shape(dataMat)[1]
|
||||
# 对数据集进行SVD分解
|
||||
simTotal = 0.0
|
||||
ratSimTotal = 0.0
|
||||
# 在SVD分解之后,我们只利用包含了90%能量值的奇异值,这些奇异值会以NumPy数组的形式得以保存
|
||||
U, Sigma, VT = la.svd(dataMat)
|
||||
# 如果要进行矩阵运算,就必须要用这些奇异值构建出一个对角矩阵
|
||||
Sig4 = mat(eye(4) * Sigma[: 4])
|
||||
# 利用U矩阵将物品转换到低维空间中,构建转换后的物品
|
||||
xformedItems = dataMat.T * U[:, :4] * Sig4.I
|
||||
# 对于给定的用户,for循环在用户对应行的元素上进行遍历,
|
||||
# 这和standEst()函数中的for循环的目的一样,只不过这里的相似度计算时在低维空间下进行的。
|
||||
for j in range(n):
|
||||
userRating = dataMat[user, j]
|
||||
if userRating == 0 or j == item:
|
||||
continue
|
||||
# 相似度的计算方法也会作为一个参数传递给该函数
|
||||
similarity = simMeas(xformedItems[item, :].T,xformedItems[j, :].T)
|
||||
# for 循环中加入了一条print语句,以便了解相似度计算的进展情况。如果觉得累赘,可以去掉
|
||||
print 'the %d and %d similarity is: %f' % (item, j, similarity)
|
||||
# 对相似度求和
|
||||
simTotal += similarity
|
||||
# 对相似度及对应评分值的乘积求和
|
||||
ratSimTotal += similarity * userRating
|
||||
if simTotal == 0:
|
||||
return 0
|
||||
else:
|
||||
# 计算估计评分
|
||||
return ratSimTotal/simTotal
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
myMat = mat(loadExData())
|
||||
print myMat
|
||||
print recommend(myMat, 1, estMethod=svdEst)
|
||||
@@ -117,7 +117,7 @@ def plotBestFit(dataArr, labelMat, weights):
def main():
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. collect and prepare the data
    dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
    dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)

    # print dataMat, '---\n', labelMat
    # 2. train the model: fit the column vector (a1, b2, ..., nn).T in f(x) = a1*x1 + b2*x2 + ... + nn*xn

@@ -1,206 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# coding: utf8
|
||||
|
||||
'''
|
||||
Created on Mar 24, 2011
|
||||
Ch 11 code
|
||||
@author: Peter
|
||||
'''
|
||||
from numpy import *
|
||||
|
||||
def loadDataSet():
|
||||
return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
|
||||
|
||||
def createC1(dataSet):
|
||||
C1 = []
|
||||
for transaction in dataSet:
|
||||
for item in transaction:
|
||||
if not [item] in C1:
|
||||
C1.append([item])
|
||||
|
||||
C1.sort()
|
||||
return map(frozenset, C1) # use frozen set so we
|
||||
# can use it as a key in a dict
|
||||
|
||||
def scanD(D, Ck, minSupport):
|
||||
ssCnt = {}
|
||||
for tid in D:
|
||||
for can in Ck:
|
||||
# s.issubset(t) 测试是否 s 中的每一个元素都在 t 中
|
||||
if can.issubset(tid):
|
||||
if not ssCnt.has_key(can): ssCnt[can]=1
|
||||
else: ssCnt[can] += 1
|
||||
numItems = float(len(D))
|
||||
retList = []
|
||||
supportData = {}
|
||||
for key in ssCnt:
|
||||
support = ssCnt[key]/numItems
|
||||
if support >= minSupport:
|
||||
retList.insert(0, key)
|
||||
supportData[key] = support
|
||||
return retList, supportData
|
||||
|
||||
def aprioriGen(Lk, k): #creates Ck
|
||||
retList = []
|
||||
lenLk = len(Lk)
|
||||
for i in range(lenLk):
|
||||
for j in range(i+1, lenLk):
|
||||
L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
|
||||
L1.sort(); L2.sort()
|
||||
if L1==L2: #if first k-2 elements are equal
|
||||
retList.append(Lk[i] | Lk[j]) #set union
|
||||
return retList
|
||||
|
||||
def apriori(dataSet, minSupport = 0.5):
|
||||
# 冻结每一行数据
|
||||
C1 = createC1(dataSet)
|
||||
D = map(set, dataSet)
|
||||
|
||||
# 计算支持support
|
||||
L1, supportData = scanD(D, C1, minSupport)
|
||||
print("outcome: ", supportData)
|
||||
|
||||
L = [L1]
|
||||
k = 2
|
||||
while (len(L[k-2]) > 0):
|
||||
Ck = aprioriGen(L[k-2], k)
|
||||
Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk
|
||||
supportData.update(supK)
|
||||
L.append(Lk)
|
||||
k += 1
|
||||
return L, supportData
|
||||
|
||||
def main():
|
||||
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
|
||||
# 1.收集并准备数据
|
||||
# dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
|
||||
|
||||
|
||||
# 1. 加载数据
|
||||
dataSet = loadDataSet()
|
||||
print(dataSet)
|
||||
# 调用 apriori 做购物篮分析
|
||||
apriori(dataSet, minSupport = 0.7)
|
||||
|
||||
if __name__=="__main__":
|
||||
main()
|
||||
def generateRules(L, supportData, minConf=0.7): #supportData is a dict coming from scanD
|
||||
bigRuleList = []
|
||||
for i in range(1, len(L)):#only get the sets with two or more items
|
||||
for freqSet in L[i]:
|
||||
H1 = [frozenset([item]) for item in freqSet]
|
||||
if (i > 1):
|
||||
rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
|
||||
else:
|
||||
calcConf(freqSet, H1, supportData, bigRuleList, minConf)
|
||||
return bigRuleList
|
||||
|
||||
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
|
||||
prunedH = [] #create new list to return
|
||||
for conseq in H:
|
||||
conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
|
||||
if conf >= minConf:
|
||||
print freqSet-conseq,'-->',conseq,'conf:',conf
|
||||
brl.append((freqSet-conseq, conseq, conf))
|
||||
prunedH.append(conseq)
|
||||
return prunedH
|
||||
|
||||
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
|
||||
m = len(H[0])
|
||||
if (len(freqSet) > (m + 1)): #try further merging
|
||||
Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates
|
||||
Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
|
||||
if (len(Hmp1) > 1): #need at least two sets to merge
|
||||
rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
|
||||
|
||||
def pntRules(ruleList, itemMeaning):
|
||||
for ruleTup in ruleList:
|
||||
for item in ruleTup[0]:
|
||||
print itemMeaning[item]
|
||||
print " -------->"
|
||||
for item in ruleTup[1]:
|
||||
print itemMeaning[item]
|
||||
print "confidence: %f" % ruleTup[2]
|
||||
print #print a blank line
|
||||
|
||||
|
||||
# from time import sleep
|
||||
# from votesmart import votesmart
|
||||
# votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
|
||||
# #votesmart.apikey = 'get your api key first'
|
||||
# def getActionIds():
|
||||
# actionIdList = []; billTitleList = []
|
||||
# fr = open('recent20bills.txt')
|
||||
# for line in fr.readlines():
|
||||
# billNum = int(line.split('\t')[0])
|
||||
# try:
|
||||
# billDetail = votesmart.votes.getBill(billNum) #api call
|
||||
# for action in billDetail.actions:
|
||||
# if action.level == 'House' and \
|
||||
# (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
|
||||
# actionId = int(action.actionId)
|
||||
# print 'bill: %d has actionId: %d' % (billNum, actionId)
|
||||
# actionIdList.append(actionId)
|
||||
# billTitleList.append(line.strip().split('\t')[1])
|
||||
# except:
|
||||
# print "problem getting bill %d" % billNum
|
||||
# sleep(1) #delay to be polite
|
||||
# return actionIdList, billTitleList
|
||||
#
|
||||
# def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
|
||||
# itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
|
||||
# for billTitle in billTitleList:#fill up itemMeaning list
|
||||
# itemMeaning.append('%s -- Nay' % billTitle)
|
||||
# itemMeaning.append('%s -- Yea' % billTitle)
|
||||
# transDict = {}#list of items in each transaction (politician)
|
||||
# voteCount = 2
|
||||
# for actionId in actionIdList:
|
||||
# sleep(3)
|
||||
# print 'getting votes for actionId: %d' % actionId
|
||||
# try:
|
||||
# voteList = votesmart.votes.getBillActionVotes(actionId)
|
||||
# for vote in voteList:
|
||||
# if not transDict.has_key(vote.candidateName):
|
||||
# transDict[vote.candidateName] = []
|
||||
# if vote.officeParties == 'Democratic':
|
||||
# transDict[vote.candidateName].append(1)
|
||||
# elif vote.officeParties == 'Republican':
|
||||
# transDict[vote.candidateName].append(0)
|
||||
# if vote.action == 'Nay':
|
||||
# transDict[vote.candidateName].append(voteCount)
|
||||
# elif vote.action == 'Yea':
|
||||
# transDict[vote.candidateName].append(voteCount + 1)
|
||||
# except:
|
||||
# print "problem getting actionId: %d" % actionId
|
||||
# voteCount += 2
|
||||
# return transDict, itemMeaning
|
||||