Add tests and comments for the first two regression methods

This commit is contained in:
chenyyx
2017-03-24 22:44:30 +08:00
2933 changed files with 1094642 additions and 311 deletions


@@ -28,7 +28,7 @@ randMat = mat(randArray)
# .T transposes the matrix (swaps rows and columns)
invRandMat = randMat.I
# Print the results
print randArray, '\n---\n', randMat, '\n+++\n', invRandMat
print(randArray, '\n---\n', randMat, '\n+++\n', invRandMat)
# Multiply the matrix by its inverse (in theory this gives the identity matrix: for a 4*4 matrix the diagonal entries are 1 and all other entries are 0)
myEye = randMat*invRandMat
# Rounding error
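# Illustrative note (added comment, not part of the original commit): the error can be
# inspected by comparing against the identity, e.g. print(myEye - eye(4)); the entries
# should all be close to zero up to floating-point rounding.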

src/python/02.kNN/kNN.py (new file, 219 lines)

@@ -0,0 +1,219 @@
#!/usr/bin/env python
# encoding: utf-8
'''
Import the scientific computing package numpy and the operator module
'''
from numpy import *
import operator
from os import listdir
def createDataSet():
"""
Create the data set and labels.
Usage:
import kNN
group, labels = kNN.createDataSet()
"""
group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
labels = ['A', 'A', 'B', 'B']
return group, labels
def classify0(inX, dataSet, labels, k):
"""
inx[1,2,3]
DS=[[1,2,3],[1,2,0]]
inX: the input vector to classify
dataSet: the training sample set
labels: the label vector
k: the number of nearest neighbors to use
Note: the number of elements in labels equals the number of rows in dataSet; the Euclidean distance formula is used.
To predict the class of a sample, run for example:
kNN.classify0([0,0], group, labels, 3)
"""
# 1. Distance calculation
dataSetSize = dataSet.shape[0]
# tile builds a matrix of the same shape as the training set, then the training samples are subtracted
"""
tile: in tile(inx, (rows, cols)) the first number is how many times inx is stacked as rows, the second how many times it is repeated within each row
In [8]: tile(inx, (3, 1))
Out[8]:
array([[1, 2],
[1, 2],
[1, 2]])
In [9]: tile(inx, (3, 2))
Out[9]:
array([[1, 2, 1, 2],
[1, 2, 1, 2],
[1, 2, 1, 2]])
"""
diffMat = tile(inX, (dataSetSize, 1)) - dataSet
"""
Euclidean distance: the distance between two points.
Row 1: distance from the input point to the 1st point in dataSet.
Row 2: distance from the input point to the 2nd point in dataSet.
...
Row N: distance from the input point to the Nth point in dataSet.
[[1,2,3],[1,2,3]]-[[1,2,3],[1,2,0]]
(A1-A2)^2+(B1-B2)^2+(C1-C2)^2
"""
# Square the differences
sqDiffMat = diffMat ** 2
# Sum across each row of the matrix
sqDistances = sqDiffMat.sum(axis=1)
# Take the square root
distances = sqDistances ** 0.5
# Sort the distances (argsort returns the indices)
sortedDistIndicies = distances.argsort()
# 2. Pick the k points with the smallest distances
classCount = {}
for i in range(k):
# Look up the class label of this neighbor
voteIlabel = labels[sortedDistIndicies[i]]
# Increment the count for this class in the dictionary
classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
# 3. Sort and return the class that occurs most often
sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]
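# Quick sanity check (added comment, not in the original file): with the toy data from
# createDataSet(), the three nearest neighbors of [0, 0] are two 'B' points and one 'A'
# point, so classify0([0, 0], group, labels, 3) should return 'B'.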
def test1():
"""
Demo of the first example
"""
group, labels = createDataSet()
print str(group)
print str(labels)
print classify0([0.1, 0.1], group, labels, 3)
# ----------------------------------------------------------------------------------------
def file2matrix(filename):
"""
Load the training data
:param filename: path to the data file
:return: the data matrix returnMat and the corresponding class labels classLabelVector
"""
fr = open(filename)
numberOfLines = len(fr.readlines()) # get the number of lines in the file
# Create an empty matrix of the right size
returnMat = zeros((numberOfLines, 3)) # prepare matrix to return
classLabelVector = [] # prepare labels return
fr = open(filename)
index = 0
for line in fr.readlines():
line = line.strip()
listFromLine = line.split('\t')
# The feature columns
returnMat[index, :] = listFromLine[0:3]
# The class label column
classLabelVector.append(int(listFromLine[-1]))
index += 1
# Return the data matrix returnMat and the corresponding class labels classLabelVector
return returnMat, classLabelVector
def autoNorm(dataSet):
"""
Normalize the feature values to remove the effect of attributes having different scales
:param dataSet: the data set
:return: the normalized data set normDataSet, plus ranges and minVals (the ranges and minimum values; not used elsewhere)
Normalization formula:
Y = (X - Xmin) / (Xmax - Xmin)
"""
# Compute the minimum, maximum and range of every attribute
minVals = dataSet.min(0)
maxVals = dataSet.max(0)
# Range (max - min)
ranges = maxVals - minVals
normDataSet = zeros(shape(dataSet))
m = dataSet.shape[0]
# Subtract the minimum value from every row
normDataSet = dataSet - tile(minVals, (m, 1))
# Divide the differences by the range
normDataSet = normDataSet / tile(ranges, (m, 1)) # element wise divide
return normDataSet, ranges, minVals
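# Illustrative note (added, not in the original file): for a single feature column with
# values [10, 20, 30], minVals is 10 and ranges is 20, so autoNorm maps them to
# [0.0, 0.5, 1.0] via (x - 10) / 20, matching Y = (X - Xmin) / (Xmax - Xmin) above.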
def datingClassTest():
"""
Test harness for the dating-site classifier
:return: the number of errors
"""
# Fraction of the data held out for testing (training fraction = 1 - hoRatio)
hoRatio = 0.1 # test ratio: this portion is tested, the rest serves as training samples
# Load the data from file
datingDataMat, datingLabels = file2matrix('testData/datingTestSet2.txt') # load data set from file
# Normalize the data
normMat, ranges, minVals = autoNorm(datingDataMat)
m = normMat.shape[0]
# Number of test samples; rows numTestVecs..m are used as training samples
numTestVecs = int(m * hoRatio)
print 'numTestVecs=', numTestVecs
errorCount = 0.0
for i in range(numTestVecs):
# Classify each test sample
classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
if (classifierResult != datingLabels[i]): errorCount += 1.0
print "the total error rate is: %f" % (errorCount / float(numTestVecs))
print errorCount
def img2vector(filename):
"""
Convert image data into a vector
:param filename: the image file
:return: a 1 x 1024 vector
"""
returnVect = zeros((1, 1024))
fr = open(filename)
for i in range(32):
lineStr = fr.readline()
for j in range(32):
returnVect[0, 32 * i + j] = int(lineStr[j])
return returnVect
def handwritingClassTest():
# 1. Load the training data
hwLabels = []
trainingFileList = listdir('testData/trainingDigits') # load the training set
m = len(trainingFileList)
trainingMat = zeros((m, 1024))
# hwLabels stores the digit labels 0-9; each row of trainingMat stores the image vector of the corresponding file
for i in range(m):
fileNameStr = trainingFileList[i]
fileStr = fileNameStr.split('.')[0] # take off .txt
classNumStr = int(fileStr.split('_')[0])
hwLabels.append(classNumStr)
trainingMat[i, :] = img2vector('testData/trainingDigits/%s' % fileNameStr)
# 2. Load the test data
testFileList = listdir('testData/testDigits') # iterate through the test set
errorCount = 0.0
mTest = len(testFileList)
for i in range(mTest):
fileNameStr = testFileList[i]
fileStr = fileNameStr.split('.')[0] # take off .txt
classNumStr = int(fileStr.split('_')[0])
vectorUnderTest = img2vector('testData/testDigits/%s' % fileNameStr)
classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr)
if (classifierResult != classNumStr): errorCount += 1.0
print "\nthe total number of errors is: %d" % errorCount
print "\nthe total error rate is: %f" % (errorCount / float(mTest))
if __name__ == '__main__':
# test1()
# datingClassTest()
handwritingClassTest()


@@ -51,7 +51,7 @@ def predict_train(x_train, y_train):
return y_pre, clf
def show_precision_recall(x, clf, y_train, y_pre):
def show_precision_recall(x, y, clf, y_train, y_pre):
'''
Precision and recall
Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html#sklearn.metrics.precision_recall_curve
@@ -110,7 +110,7 @@ if __name__ == '__main__':
y_pre, clf = predict_train(x_train, y_train)
# Show precision and recall
show_precision_recall(x, clf, y_train, y_pre)
show_precision_recall(x, y, clf, y_train, y_pre)
# Visualize the output
show_pdf(clf)


@@ -5,11 +5,12 @@
Created on Oct 12, 2010
Update on 2017-02-27
Decision Tree Source Code for Machine Learning in Action Ch. 3
@author: Peter Harrington/jiangzhonglian
@author: Peter Harrington/片刻
'''
from math import log
print(__doc__)
import operator
import DecisionTreePlot as dtPlot
from math import log
import decisionTreePlot as dtPlot
def createDataSet():
@@ -19,8 +20,6 @@ def createDataSet():
No input arguments
Returns:
the data set and the corresponding label list
Raises:
"""
dataSet = [[1, 1, 'yes'],
[1, 1, 'yes'],
@@ -32,6 +31,7 @@ def createDataSet():
# ['no'],
# ['no'],
# ['no']]
# labels: 'no surfacing' (can it surface?), 'flippers' (does it have flippers?)
labels = ['no surfacing', 'flippers']
# change to discrete values
return dataSet, labels
@@ -43,9 +43,7 @@ def calcShannonEnt(dataSet):
Args:
dataSet: the data set
Returns:
the computed Shannon entropy
Raises:
the Shannon entropy (the expected information of the class distribution) of the data set
"""
# The length of the list is the number of samples used for training
numEntries = len(dataSet)
@@ -80,8 +78,6 @@ def splitDataSet(dataSet, axis, value):
value: the value to match in column axis
Returns:
the rows whose column axis equals value (with the axis column removed)
Raises:
"""
retDataSet = []
for featVec in dataSet:
@@ -105,10 +101,8 @@ def chooseBestFeatureToSplit(dataSet):
dataSet: the data set
Returns:
bestFeature: the index of the best feature column
Raises:
"""
# Count the feature columns in the first row
# Count the feature columns in the first row (the last column is the label)
numFeatures = len(dataSet[0]) - 1
# Entropy of the labels (the base entropy)
baseEntropy = calcShannonEnt(dataSet)
@@ -129,8 +123,9 @@ def chooseBestFeatureToSplit(dataSet):
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet)/float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
# gain (information gain): the larger it is, the more information this split provides and the less uncertain the classification becomes
# gain (information gain): the change in information before and after the split; keep the split with the largest gain
infoGain = baseEntropy - newEntropy
print 'infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeature = i
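# Worked example (added for illustration, assuming the usual 5-sample toy set with 2 'yes'
# and 3 'no' labels): the base entropy is -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971;
# splitting on 'no surfacing' leaves newEntropy ≈ 0.551, so infoGain ≈ 0.420, which beats
# the ≈ 0.171 gain of 'flippers', and feature 0 becomes bestFeature.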
@@ -138,14 +133,12 @@ def chooseBestFeatureToSplit(dataSet):
def majorityCnt(classList):
"""majorityCnt(选择出线次数最多的一个结果)
"""majorityCnt(选择出次数最多的一个结果)
Args:
classList: the list of class labels
Returns:
bestFeature: the class label that occurs most often
Raises:
"""
classCount = {}
for vote in classList:
@@ -169,6 +162,7 @@ def createTree(dataSet, labels):
# Choose the best feature column and find out what its label means
bestFeat = chooseBestFeatureToSplit(dataSet)
# Get the name of that label
bestFeatLabel = labels[bestFeat]
# Initialize myTree
myTree = {bestFeatLabel: {}}
@@ -187,16 +181,26 @@ def createTree(dataSet, labels):
def classify(inputTree, featLabels, testVec):
# Get the key of the first node of the tree
"""classify(classify the given input vector)
Args:
inputTree: the decision-tree model
featLabels: the names corresponding to the feature labels
testVec: the test input data
Returns:
classLabel: the classification result; map it through the labels to get its name
"""
# Get the key of the root node of the tree
firstStr = inputTree.keys()[0]
# Get the value of the first node
# Use the key to get the value stored under the root node
secondDict = inputTree[firstStr]
# Find the index of the root node, then use testVec to locate the matching branch of the tree
# Look up the root node's name in the label list; its position tells us which element of testVec to start comparing against the tree
featIndex = featLabels.index(firstStr)
# For the test data, the label position of the root node tells us which position of the input the classification starts from
key = testVec[featIndex]
valueOfFeat = secondDict[key]
print '+++', firstStr, 'xxx', secondDict, '---', key, '>>>', valueOfFeat
# Check whether this branch ends here
# Check whether this branch ends here, i.e. whether valueOfFeat is a dict
if isinstance(valueOfFeat, dict):
classLabel = classify(valueOfFeat, featLabels, testVec)
else:


@@ -128,5 +128,5 @@ def retrieveTree(i):
return listOfTrees[i]
myTree = retrieveTree(0)
createPlot(myTree)
# myTree = retrieveTree(1)
# createPlot(myTree)

src/python/04.NaiveBayes/bayes.py (new executable file, 181 lines)

@@ -0,0 +1,181 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from numpy import *
"""
p(xy)=p(x|y)p(y)=p(y|x)p(x)
p(x|y)=p(y|x)p(x)/p(y)
"""
def loadDataSet():
"""
Create the data set
:return: the word lists postingList and their class labels classVec
"""
postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], #[0,0,1,1,1......]
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
classVec = [0, 1, 0, 1, 0, 1] # 1 is abusive, 0 not
return postingList, classVec
def createVocabList(dataSet):
"""
Build the set of all words
:param dataSet: the data set
:return: the set of all words (a list without duplicate elements)
"""
vocabSet = set([]) # create empty set
for document in dataSet:
vocabSet = vocabSet | set(document) # union of the two sets
return list(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
"""
Check whether each input word appears in the vocabulary; if it does, set its position to 1
:param vocabList: the list of all words
:param inputSet: the input document
:return: the match vector, e.g. [0,1,0,1...]
"""
returnVec = [0] * len(vocabList)# [0,0......]
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] = 1
else:
print "the word: %s is not in my Vocabulary!" % word
return returnVec
def _trainNB0(trainMatrix, trainCategory):
"""
Train on the data (original version)
:param trainMatrix: the document-word matrix, e.g. [[1,0,1,1,1....],[],[]...]
:param trainCategory: the class of each document, e.g. [0,1,1,0....]
:return:
"""
# Number of documents
numTrainDocs = len(trainMatrix)
# Number of words
numWords = len(trainMatrix[0])
# Prior probability of an abusive document
pAbusive = sum(trainCategory) / float(numTrainDocs)
# Word-count vectors
p0Num = zeros(numWords) # [0,0,0,.....]
p1Num = zeros(numWords) # [0,0,0,.....]
# Total word counts over the whole data set
p0Denom = 0.0
p1Denom = 0.0
for i in range(numTrainDocs):
if trainCategory[i] == 1:
p1Num += trainMatrix[i] #[0,1,1,....]->[0,1,1,...]
p1Denom += sum(trainMatrix[i])
else:
p0Num += trainMatrix[i]
p0Denom += sum(trainMatrix[i])
# For class 1 (abusive documents): the list [P(F1|C1),P(F2|C1),P(F3|C1),P(F4|C1),P(F5|C1)....]
p1Vect = p1Num / p1Denom# [1,2,3,5]/90->[1/90,...]
# For class 0 (normal documents): the list [P(F1|C0),P(F2|C0),P(F3|C0),P(F4|C0),P(F5|C0)....]
p0Vect = p0Num / p0Denom
return p0Vect, p1Vect, pAbusive
def trainNB0(trainMatrix, trainCategory):
"""
Train on the data (improved version)
:param trainMatrix: the document-word matrix
:param trainCategory: the class of each document
:return:
"""
# Total number of documents
numTrainDocs = len(trainMatrix)
# Total number of words
numWords = len(trainMatrix[0])
# Prior probability of an abusive document
pAbusive = sum(trainCategory) / float(numTrainDocs)
# Word-count vectors
# p0Num: counts for the normal class
# p1Num: counts for the abusive class
p0Num = ones(numWords)#[0,0......]->[1,1,1,1,1.....]
p1Num = ones(numWords)
# Total word counts, initialized to 2.0 (smoothing: mainly to avoid a zero denominator; the value can be tuned to the data)
# p0Denom: total for the normal class
# p1Denom: total for the abusive class
p0Denom = 2.0
p1Denom = 2.0
for i in range(numTrainDocs):
if trainCategory[i] == 1:
# Accumulate the word counts of abusive documents
p1Num += trainMatrix[i]
# Accumulate the total word count of each abusive document
p1Denom += sum(trainMatrix[i])
else:
p0Num += trainMatrix[i]
p0Denom += sum(trainMatrix[i])
# For class 1 (abusive documents): the list [log(P(F1|C1)),log(P(F2|C1)),log(P(F3|C1)),log(P(F4|C1)),log(P(F5|C1))....]
p1Vect = log(p1Num / p1Denom)
# For class 0 (normal documents): the list [log(P(F1|C0)),log(P(F2|C0)),log(P(F3|C0)),log(P(F4|C0)),log(P(F5|C0))....]
p0Vect = log(p0Num / p0Denom)
return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
"""
How the classifier works:
# convert the product into a sum of logs
product: P(C|F1F2...Fn) = P(F1F2...Fn|C)P(C)/P(F1F2...Fn)
sum of logs: P(F1|C)*P(F2|C)...P(Fn|C)*P(C) -> log(P(F1|C))+log(P(F2|C))+...+log(P(Fn|C))+log(P(C))
:param vec2Classify: the data to classify, e.g. [0,1,1,1,1...]
:param p0Vec: for class 0, i.e. normal documents, the list [log(P(F1|C0)),log(P(F2|C0)),log(P(F3|C0)),...]
:param p1Vec: for class 1, i.e. abusive documents, the list [log(P(F1|C1)),log(P(F2|C1)),log(P(F3|C1)),...]
:param pClass1: the prior probability of class 1 (abusive documents)
:return: class 1 or 0
"""
# Formula: log(P(F1|C)) + log(P(F2|C)) + ... + log(P(Fn|C)) + log(P(C))
p1 = sum(vec2Classify * p1Vec) + log(pClass1)
p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
if p1 > p0:
return 1
else:
return 0
def bagOfWords2VecMN(vocabList, inputSet):
returnVec = [0] * len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] += 1
return returnVec
def testingNB():
"""
Test the naive Bayes classifier
"""
# 1. Load the data set
listOPosts, listClasses = loadDataSet()
# 2. Build the vocabulary
myVocabList = createVocabList(listOPosts)
# 3. Compute word-occurrence vectors and build the training matrix
trainMat = []
for postinDoc in listOPosts:
# Yields an m * len(myVocabList) matrix of 0/1 entries
trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
# 4. Train on the data
p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
# 5. Test on new examples
testEntry = ['love', 'my', 'dalmation']
thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
testEntry = ['stupid', 'garbage']
thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
print testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb)
if __name__ == "__main__":
testingNB()


@@ -0,0 +1,103 @@
#!/usr/bin/env python
# encoding: utf-8
from numpy import *
import matplotlib.pyplot as plt
import time
'''
1. Install the required module: pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
   Installing it directly may fail, so it is recommended to download the whl package from:
https://pypi.python.org/pypi/matplotlib/1.5.0
2. After that, the plotted figure can be displayed.
'''
"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: logRegression01.py
@time: 2017/3/3 22:03
@test result: ok
"""
# The sigmoid function
def sigmoid(inX):
return 1.0 / (1 + exp(-inX))
def trainLogRegres(train_x, train_y, opts):
# Time the training
startTime = time.time()
numSamples, numFeatures = shape(train_x)
alpha = opts['alpha']; maxIter = opts['maxIter']
weights = ones((numFeatures, 1))
# Optimize with gradient descent
for k in range(maxIter):
if opts['optimizeType'] == 'gradDescent': # gradient descent
output = sigmoid(train_x * weights)
error = train_y - output
weights = weights + alpha * train_x.transpose() * error
elif opts['optimizeType'] == 'stocGradDescent': # stochastic gradient descent
for i in range(numSamples):
output = sigmoid(train_x[i, :] * weights)
error = train_y[i, 0] - output
weights = weights + alpha * train_x[i, :].transpose() * error
elif opts['optimizeType'] == 'smoothStocGradDescent': # smoothed stochastic gradient descent
# Pick samples at random for the update to reduce periodic oscillation
dataIndex = range(numSamples)
for i in range(numSamples):
alpha = 4.0 / (1.0 + k + i) + 0.01
randIndex = int(random.uniform(0, len(dataIndex)))
output = sigmoid(train_x[randIndex, :] * weights)
error = train_y[randIndex, 0] - output
weights = weights + alpha * train_x[randIndex, :].transpose() * error
del(dataIndex[randIndex]) # remove the sample that has already been used during this pass
else:
raise NameError('Not support optimize method type!')
print 'Congratulations, training complete! Took %fs!' % (time.time() - startTime)
return weights
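# Usage sketch (added comment; it mirrors the companion test script rather than this file):
# opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
# weights = trainLogRegres(train_x, train_y, opts)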
# Evaluate the trained logistic regression model on the given test set
def testLogRegres(weights, test_x, test_y):
numSamples, numFeatures = shape(test_x)
matchCount = 0
for i in xrange(numSamples):
predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
if predict == bool(test_y[i, 0]):
matchCount += 1
accuracy = float(matchCount) / numSamples
return accuracy
# Plot the trained logistic regression model (only 2-D data is supported)
def showLogRegres(weights, train_x, train_y):
# Note: train_x and train_y are numpy mat (matrix) types
numSamples, numFeatures = shape(train_x)
if numFeatures != 3:
print "抱歉! 我不能绘制因为你的数据的维度不是2"
return 1
# Plot all the sample points
for i in xrange(numSamples):
if int(train_y[i, 0]) == 0:
plt.plot(train_x[i, 1], train_x[i, 2], 'or')
elif int(train_y[i, 0]) == 1:
plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
# Draw the decision boundary
min_x = min(train_x[:, 1])[0, 0]
max_x = max(train_x[:, 1])[0, 0]
weights = weights.getA() # convert mat to ndarray
y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
plt.xlabel('X1'); plt.ylabel('X2')
# Show the figure
plt.show()


@@ -0,0 +1,54 @@
#!/usr/bin/env python
# encoding: utf-8
import os
import sys
sys.path.append("C:\Python27")
from numpy import *
from logRegression01 import *
"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: test_logRegression.py
@time: 2017/3/3 22:09
@test result: ok
"""
def loadData():
train_x = []
train_y = []
# Get the current working directory
project_dir = os.getcwdu()
# Truncate the string up to the project name
project_dir = project_dir[:project_dir.find("MachineLearning\\") + 15]
print project_dir
fileIn = open("%s/testData/Logistic_testdata.txt" % project_dir)
for line in fileIn.readlines():
lineArr = line.strip().split()
train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
train_y.append(float(lineArr[2]))
return mat(train_x), mat(train_y).transpose()
## Step 1: load the data
print "step 1: load data..."
train_x, train_y = loadData()
test_x = train_x; test_y = train_y
## Step 2: train the model...
print "step 2: training..."
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
optimalWeights = trainLogRegres(train_x, train_y, opts)
## Step 3: test
print "step 3: testing..."
accuracy = testLogRegres(optimalWeights, test_x, test_y)
## Step 4: show the result
print "step 4: show the result..."
print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
showLogRegres(optimalWeights, train_x, train_y)


@@ -0,0 +1,472 @@
"""
Created on Nov 4, 2010
Update on 2017-03-21
Chapter 5 source file for Machine Learing in Action
@author: Peter/geekidentity
"""
from numpy import *
from time import sleep
def loadDataSet(fileName):
"""
Parse the file line by line to obtain each line's class label and the full data matrix
Args:
fileName: testSet.txt
Returns:
the data matrix and the class labels
"""
dataMat = []; labelMat = []
fr = open(fileName)
for line in fr.readlines():
lineArr = line.strip().split('\t')
dataMat.append([float(lineArr[0]), float(lineArr[1])])
labelMat.append(float(lineArr[2]))
return dataMat,labelMat
def selectJrand(i,m):
"""
Randomly select an integer index
Args:
i: the index of the first alpha
m: the total number of alphas
Returns:
"""
j=i #we want to select any J not equal to i
while (j==i):
j = int(random.uniform(0,m))
return j
def clipAlpha(aj,H,L):
"""
Clip alpha values that are greater than H or less than L
Args:
aj:
H:
L:
Returns:
"""
if aj > H:
aj = H
if L > aj:
aj = L
return aj
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
"""
A simple implementation of the SMO algorithm for SVMs:
create an alpha vector and initialize it to all zeros
while the iteration count is below the maximum number of iterations (outer loop):
for every data vector in the data set (inner loop):
if the data vector can be optimized:
randomly select another data vector
optimize the two vectors together
if neither vector can be optimized, exit the inner loop
if no vectors were optimized, increase the iteration count and continue
Args:
dataMatIn: the data set
classLabels: the class labels
C: the constant C
toler: the error tolerance
maxIter: the maximum number of passes before exiting
Returns:
"""
dataMatrix = mat(dataMatIn); labelMat = mat(classLabels).transpose()
b = 0; m,n = shape(dataMatrix)
alphas = mat(zeros((m,1)))
iter = 0
while (iter < maxIter):
alphaPairsChanged = 0
for i in range(m):
fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b
Ei = fXi - float(labelMat[i])#if checks if an example violates KKT conditions
if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):
j = selectJrand(i,m)
fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b
Ej = fXj - float(labelMat[j])
alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy()
if (labelMat[i] != labelMat[j]):
L = max(0, alphas[j] - alphas[i])
H = min(C, C + alphas[j] - alphas[i])
else:
L = max(0, alphas[j] + alphas[i] - C)
H = min(C, alphas[j] + alphas[i])
if L==H: print("L==H"); continue
eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T
if eta >= 0: print("eta>=0"); continue
alphas[j] -= labelMat[j]*(Ei - Ej)/eta
alphas[j] = clipAlpha(alphas[j],H,L)
if (abs(alphas[j] - alphaJold) < 0.00001): print("j not moving enough"); continue
alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])#update i by the same amount as j
#the update is in the oppostie direction
b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
if (0 < alphas[i]) and (C > alphas[i]): b = b1
elif (0 < alphas[j]) and (C > alphas[j]): b = b2
else: b = (b1 + b2)/2.0
alphaPairsChanged += 1
print("iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged))
if (alphaPairsChanged == 0): iter += 1
else: iter = 0
print("iteration number: %d" % iter)
return b,alphas
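# Usage sketch (added comment; the data file name follows the book's convention and is an assumption):
# dataArr, labelArr = loadDataSet('testSet.txt')
# b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
# the support vectors are the samples whose alpha ends up strictly greater than 0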
def kernelTrans(X, A, kTup): # calc the kernel or transform data to a higher dimensional space
m, n = shape(X)
K = mat(zeros((m, 1)))
if kTup[0] == 'lin':
K = X * A.T # linear kernel
elif kTup[0] == 'rbf':
for j in range(m):
deltaRow = X[j, :] - A
K[j] = deltaRow * deltaRow.T
K = exp(K / (-1 * kTup[1] ** 2)) # divide in NumPy is element-wise not matrix like Matlab
else:
raise NameError('Houston We Have a Problem -- \
That Kernel is not recognized')
return K
class optStruct:
def __init__(self, dataMatIn, classLabels, C, toler, kTup): # Initialize the structure with the parameters
self.X = dataMatIn
self.labelMat = classLabels
self.C = C
self.tol = toler
self.m = shape(dataMatIn)[0]
self.alphas = mat(zeros((self.m, 1)))
self.b = 0
self.eCache = mat(zeros((self.m, 2))) # first column is valid flag
self.K = mat(zeros((self.m, self.m)))
for i in range(self.m):
self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)
def calcEk(oS, k):
fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
Ek = fXk - float(oS.labelMat[k])
return Ek
def selectJ(i, oS, Ei): # this is the second choice -heurstic, and calcs Ej
maxK = -1
maxDeltaE = 0
Ej = 0
oS.eCache[i] = [1, Ei] # set valid #choose the alpha that gives the maximum delta E
validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
if (len(validEcacheList)) > 1:
for k in validEcacheList: # loop through valid Ecache values and find the one that maximizes delta E
if k == i: continue # don't calc for i, waste of time
Ek = calcEk(oS, k)
deltaE = abs(Ei - Ek)
if (deltaE > maxDeltaE):
maxK = k;
maxDeltaE = deltaE;
Ej = Ek
return maxK, Ej
else: # in this case (first time around) we don't have any valid eCache values
j = selectJrand(i, oS.m)
Ej = calcEk(oS, j)
return j, Ej
def updateEk(oS, k): # after any alpha has changed update the new value in the cache
Ek = calcEk(oS, k)
oS.eCache[k] = [1, Ek]
def innerL(i, oS):
Ei = calcEk(oS, i)
if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or (
(oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
j, Ej = selectJ(i, oS, Ei) # this has been changed from selectJrand
alphaIold = oS.alphas[i].copy();
alphaJold = oS.alphas[j].copy();
if (oS.labelMat[i] != oS.labelMat[j]):
L = max(0, oS.alphas[j] - oS.alphas[i])
H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
else:
L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
H = min(oS.C, oS.alphas[j] + oS.alphas[i])
if L == H: print("L==H"); return 0
eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j] # changed for kernel
if eta >= 0: print("eta>=0"); return 0
oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
updateEk(oS, j) # added this for the Ecache
if (abs(oS.alphas[j] - alphaJold) < 0.00001): print("j not moving enough"); return 0
oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j]) # update i by the same amount as j
updateEk(oS, i) # added this for the Ecache #the update is in the oppostie direction
b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] - oS.labelMat[j] * (
oS.alphas[j] - alphaJold) * oS.K[i, j]
b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, j] - oS.labelMat[j] * (
oS.alphas[j] - alphaJold) * oS.K[j, j]
if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
oS.b = b1
elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
oS.b = b2
else:
oS.b = (b1 + b2) / 2.0
return 1
else:
return 0
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)): # full Platt SMO
oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
iter = 0
entireSet = True;
alphaPairsChanged = 0
while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
alphaPairsChanged = 0
if entireSet: # go over all
for i in range(oS.m):
alphaPairsChanged += innerL(i, oS)
print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
iter += 1
else: # go over non-bound (railed) alphas
nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
for i in nonBoundIs:
alphaPairsChanged += innerL(i, oS)
print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
iter += 1
if entireSet:
entireSet = False # toggle entire set loop
elif (alphaPairsChanged == 0):
entireSet = True
print("iteration number: %d" % iter)
return oS.b, oS.alphas
def calcWs(alphas, dataArr, classLabels):
X = mat(dataArr);
labelMat = mat(classLabels).transpose()
m, n = shape(X)
w = zeros((n, 1))
for i in range(m):
w += multiply(alphas[i] * labelMat[i], X[i, :].T)
return w
def testRbf(k1=1.3):
dataArr, labelArr = loadDataSet('testSetRBF.txt')
b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1)) # C=200 important
datMat = mat(dataArr);
labelMat = mat(labelArr).transpose()
svInd = nonzero(alphas.A > 0)[0]
sVs = datMat[svInd] # get matrix of only support vectors
labelSV = labelMat[svInd];
print("there are %d Support Vectors" % shape(sVs)[0])
m, n = shape(datMat)
errorCount = 0
for i in range(m):
kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
if sign(predict) != sign(labelArr[i]): errorCount += 1
print("the training error rate is: %f" % (float(errorCount) / m))
dataArr, labelArr = loadDataSet('testSetRBF2.txt')
errorCount = 0
datMat = mat(dataArr);
labelMat = mat(labelArr).transpose()
m, n = shape(datMat)
for i in range(m):
kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
if sign(predict) != sign(labelArr[i]): errorCount += 1
print("the test error rate is: %f" % (float(errorCount) / m))
def img2vector(filename):
returnVect = zeros((1, 1024))
fr = open(filename)
for i in range(32):
lineStr = fr.readline()
for j in range(32):
returnVect[0, 32 * i + j] = int(lineStr[j])
return returnVect
def loadImages(dirName):
from os import listdir
hwLabels = []
trainingFileList = listdir(dirName) # load the training set
m = len(trainingFileList)
trainingMat = zeros((m, 1024))
for i in range(m):
fileNameStr = trainingFileList[i]
fileStr = fileNameStr.split('.')[0] # take off .txt
classNumStr = int(fileStr.split('_')[0])
if classNumStr == 9:
hwLabels.append(-1)
else:
hwLabels.append(1)
trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
return trainingMat, hwLabels
def testDigits(kTup=('rbf', 10)):
dataArr, labelArr = loadImages('trainingDigits')
b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
datMat = mat(dataArr);
labelMat = mat(labelArr).transpose()
svInd = nonzero(alphas.A > 0)[0]
sVs = datMat[svInd]
labelSV = labelMat[svInd];
print("there are %d Support Vectors" % shape(sVs)[0])
m, n = shape(datMat)
errorCount = 0
for i in range(m):
kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
if sign(predict) != sign(labelArr[i]): errorCount += 1
print("the training error rate is: %f" % (float(errorCount) / m))
dataArr, labelArr = loadImages('testDigits')
errorCount = 0
datMat = mat(dataArr);
labelMat = mat(labelArr).transpose()
m, n = shape(datMat)
for i in range(m):
kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
if sign(predict) != sign(labelArr[i]): errorCount += 1
print("the test error rate is: %f" % (float(errorCount) / m))
'''#######********************************
Non-Kernel VErsions below
''' #######********************************
class optStructK:
def __init__(self, dataMatIn, classLabels, C, toler): # Initialize the structure with the parameters
self.X = dataMatIn
self.labelMat = classLabels
self.C = C
self.tol = toler
self.m = shape(dataMatIn)[0]
self.alphas = mat(zeros((self.m, 1)))
self.b = 0
self.eCache = mat(zeros((self.m, 2))) # first column is valid flag
def calcEkK(oS, k):
fXk = float(multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k, :].T)) + oS.b
Ek = fXk - float(oS.labelMat[k])
return Ek
def selectJK(i, oS, Ei): # this is the second choice -heurstic, and calcs Ej
maxK = -1
maxDeltaE = 0
Ej = 0
oS.eCache[i] = [1, Ei] # set valid #choose the alpha that gives the maximum delta E
validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
if (len(validEcacheList)) > 1:
for k in validEcacheList: # loop through valid Ecache values and find the one that maximizes delta E
if k == i: continue # don't calc for i, waste of time
Ek = calcEk(oS, k)
deltaE = abs(Ei - Ek)
if (deltaE > maxDeltaE):
maxK = k;
maxDeltaE = deltaE;
Ej = Ek
return maxK, Ej
else: # in this case (first time around) we don't have any valid eCache values
j = selectJrand(i, oS.m)
Ej = calcEk(oS, j)
return j, Ej
def updateEkK(oS, k): # after any alpha has changed update the new value in the cache
Ek = calcEk(oS, k)
oS.eCache[k] = [1, Ek]
def innerLK(i, oS):
Ei = calcEk(oS, i)
if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or (
(oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
j, Ej = selectJ(i, oS, Ei) # this has been changed from selectJrand
alphaIold = oS.alphas[i].copy();
alphaJold = oS.alphas[j].copy();
if (oS.labelMat[i] != oS.labelMat[j]):
L = max(0, oS.alphas[j] - oS.alphas[i])
H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
else:
L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
H = min(oS.C, oS.alphas[j] + oS.alphas[i])
if L == H: print("L==H"); return 0
eta = 2.0 * oS.X[i, :] * oS.X[j, :].T - oS.X[i, :] * oS.X[i, :].T - oS.X[j, :] * oS.X[j, :].T
if eta >= 0: print("eta>=0"); return 0
oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
updateEk(oS, j) # added this for the Ecache
if (abs(oS.alphas[j] - alphaJold) < 0.00001): print("j not moving enough"); return 0
oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j]) # update i by the same amount as j
updateEk(oS, i) # added this for the Ecache #the update is in the oppostie direction
b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[i, :].T - oS.labelMat[j] * (
oS.alphas[j] - alphaJold) * oS.X[i, :] * oS.X[j, :].T
b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[j, :].T - oS.labelMat[j] * (
oS.alphas[j] - alphaJold) * oS.X[j, :] * oS.X[j, :].T
if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
oS.b = b1
elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
oS.b = b2
else:
oS.b = (b1 + b2) / 2.0
return 1
else:
return 0
def smoPK(dataMatIn, classLabels, C, toler, maxIter): # full Platt SMO
oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler)
iter = 0
entireSet = True;
alphaPairsChanged = 0
while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
alphaPairsChanged = 0
if entireSet: # go over all
for i in range(oS.m):
alphaPairsChanged += innerL(i, oS)
print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
iter += 1
else: # go over non-bound (railed) alphas
nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
for i in nonBoundIs:
alphaPairsChanged += innerL(i, oS)
print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
iter += 1
if entireSet:
entireSet = False # toggle entire set loop
elif (alphaPairsChanged == 0):
entireSet = True
print("iteration number: %d" % iter)
return oS.b, oS.alphas


@@ -0,0 +1,257 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Nov 28, 2010
Adaboost is short for Adaptive Boosting
@author: Peter/jiangzhonglian
'''
from numpy import *
# def loadSimpData():
# """ 测试数据
# Returns:
# dataArr feature对应的数据集
# labelArr feature对应的分类标签
# """
# dataArr = array([[1., 2.1], [2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
# labelArr = [1.0, 1.0, -1.0, -1.0, 1.0]
# return dataArr, labelArr
# general function to parse tab -delimited floats
def loadDataSet(fileName):
# get number of fields
numFeat = len(open(fileName).readline().split('\t'))
dataArr = []
labelArr = []
fr = open(fileName)
for line in fr.readlines():
lineArr = []
curLine = line.strip().split('\t')
for i in range(numFeat-1):
lineArr.append(float(curLine[i]))
dataArr.append(lineArr)
labelArr.append(float(curLine[-1]))
return dataArr, labelArr
def stumpClassify(dataMat, dimen, threshVal, threshIneq):
"""stumpClassify(将数据集按照feature列的value进行 二元切分比较来赋值)
Args:
dataMat Matrix数据集
dimen 特征列
threshVal 特征列要比较的值
Returns:
retArray 结果集
"""
retArray = ones((shape(dataMat)[0], 1))
# dataMat[:, dimen] is every value in column dimen of the data set
# print '-----', threshIneq, dataMat[:, dimen], threshVal
if threshIneq == 'lt':
retArray[dataMat[:, dimen] <= threshVal] = -1.0
else:
retArray[dataMat[:, dimen] > threshVal] = -1.0
return retArray
def buildStump(dataArr, labelArr, D):
# Convert the data
dataMat = mat(dataArr)
labelMat = mat(labelArr).T
# m rows, n columns
m, n = shape(dataMat)
# Initialize
numSteps = 10.0
bestStump = {}
bestClasEst = mat(zeros((m, 1)))
# Initialize the minimum error to infinity
minError = inf
# Loop over every feature column
for i in range(n):
rangeMin = dataMat[:, i].min()
rangeMax = dataMat[:, i].max()
# print 'rangeMin=%s, rangeMax=%s' % (rangeMin, rangeMax)
# Step size of each threshold increment
stepSize = (rangeMax-rangeMin)/numSteps
# j runs from -1 to numSteps, i.e. numSteps+2 threshold positions in total
for j in range(-1, int(numSteps)+1):
# go over less than and greater than
for inequal in ['lt', 'gt']:
# For j=-1 this gives rangeMin-stepSize; for j=numSteps it gives rangeMax
threshVal = (rangeMin + float(j) * stepSize)
# Classify with the decision stump
predictedVals = stumpClassify(dataMat, i, threshVal, inequal)
# print predictedVals
errArr = mat(ones((m, 1)))
# 0 for correct predictions, 1 for errors
errArr[predictedVals == labelMat] = 0
# Weighted error: summing D over the misclassified samples shows how bad this split is
# calc total error multiplied by D
weightedError = D.T*errArr
'''
dim: the feature column
threshVal: the split threshold of the stump
inequal: whether the two sides of the split are flipped ('lt' or 'gt')
weightedError: the weighted error rate of the whole result
'''
# print "split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
if weightedError < minError:
minError = weightedError
bestClasEst = predictedVals.copy()
bestStump['dim'] = i
bestStump['thresh'] = threshVal
bestStump['ineq'] = inequal
return bestStump, minError, bestClasEst
def adaBoostTrainDS(dataArr, labelArr, numIt=40):
weakClassArr = []
m = shape(dataArr)[0]
# Initialize D so every sample gets an equal weight
D = mat(ones((m, 1))/m)
aggClassEst = mat(zeros((m, 1)))
for i in range(numIt):
# build Stump
bestStump, error, classEst = buildStump(dataArr, labelArr, D)
# print "D:", D.T
# calc alpha, throw in max(error,eps) to account for error=0
alpha = float(0.5*log((1.0-error)/max(error, 1e-16)))
bestStump['alpha'] = alpha
# store Stump Params in Array
weakClassArr.append(bestStump)
# print "alpha=%s, classEst=%s, bestStump=%s, error=%s " % (alpha, classEst.T, bestStump, error)
# The -1 feeds e to the power of -alpha below: the product is +1 for a correct prediction and -1 otherwise, which encodes whether each sample was classified correctly
expon = multiply(-1*alpha*mat(labelArr).T, classEst)
# print 'expon=', -1*alpha*mat(labelArr).T, classEst, expon
# Raise e to the power expon and renormalize to get the updated weight distribution
# The effect: weights of correctly classified samples shrink and weights of misclassified ones grow, i.e. D is re-weighted (easy to verify with, say, alpha=0.6)
D = multiply(D, exp(expon))
D = D/D.sum()
# print "D: ", D.T
# Accumulate the weighted classification score on top of the previous rounds
# calc training error of all classifiers, if this is 0 quit for loop early (use break)
aggClassEst += alpha*classEst
# print "aggClassEst: ", aggClassEst.T
# sign maps positive to 1, zero to 0 and negative to -1, so the sign of the accumulated score gives the class
# aggErrors marks the misclassified samples: the != comparison yields 0 for correct and 1 for wrong predictions
aggErrors = multiply(sign(aggClassEst) != mat(labelArr).T, ones((m, 1)))
errorRate = aggErrors.sum()/m
# print "total error=%s " % (errorRate)
if errorRate == 0.0:
break
return weakClassArr, aggClassEst
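# Worked example (added for illustration): a weak classifier with weighted error 0.2 gets
# alpha = 0.5 * ln((1 - 0.2) / 0.2) = 0.5 * ln(4) ≈ 0.693, so before renormalization the
# weights of its correctly classified samples are multiplied by e^(-0.693) ≈ 0.5 and those
# of misclassified samples by e^(0.693) ≈ 2.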
def adaClassify(datToClass, classifierArr):
# do stuff similar to last aggClassEst in adaBoostTrainDS
dataMat = mat(datToClass)
m = shape(dataMat)[0]
aggClassEst = mat(zeros((m, 1)))
# Loop over the weak classifiers
for i in range(len(classifierArr)):
# Run each weak classifier, then accumulate alpha * its prediction to get the final weighted sum
classEst = stumpClassify(dataMat, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq'])
aggClassEst += classifierArr[i]['alpha']*classEst
# print aggClassEst
return sign(aggClassEst)
def plotROC(predStrengths, classLabels):
"""plotROC(打印ROC曲线并计算AUC的面积大小)
Args:
predStrengths 最终预测结果的权重值
classLabels 原始数据的分类结果集
"""
import matplotlib.pyplot as plt
# variable to calculate AUC
ySum = 0.0
# 对正样本的进行求和
numPosClas = sum(array(classLabels)==1.0)
# 正样本的概率
yStep = 1/float(numPosClas)
# 负样本的概率
xStep = 1/float(len(classLabels)-numPosClas)
# argsort函数返回的是数组值从小到大的索引值
# get sorted index, it's reverse
sortedIndicies = predStrengths.argsort()
# 开始创建模版对象
fig = plt.figure()
fig.clf()
ax = plt.subplot(111)
# cursor光标值
cur = (1.0, 1.0)
# loop through all the values, drawing a line segment at each point
for index in sortedIndicies.tolist()[0]:
if classLabels[index] == 1.0:
delX = 0
delY = yStep
else:
delX = xStep
delY = 0
ySum += cur[1]
# draw line from cur to (cur[0]-delX, cur[1]-delY)
# plot a connecting segment (x1, x2, y1, y2)
# print cur[0], cur[0]-delX, cur[1], cur[1]-delY
ax.plot([cur[0], cur[0]-delX], [cur[1], cur[1]-delY], c='b')
cur = (cur[0]-delX, cur[1]-delY)
# Draw the diagonal dashed reference line
ax.plot([0, 1], [0, 1], 'b--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for AdaBoost horse colic detection system')
# Set the axis limits (x1, x2, y1, y2)
ax.axis([0, 1, 0, 1])
plt.show()
'''
Reference: http://blog.csdn.net/wenyusuran/article/details/39056013
To compute the AUC we accumulate the areas of many small rectangles. All of these rectangles
have width xStep, so we can first sum their heights and multiply the total by xStep at the end
to get the overall area. The running sum of heights (ySum) grows each time the cursor moves along the x axis.
'''
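# Tiny check (added for illustration): with 2 positives, 2 negatives and a perfect ranking,
# xStep = yStep = 0.5; the two negatives are visited first while the cursor height is still 1.0,
# so ySum = 2.0 and the AUC evaluates to ySum * xStep = 1.0, as expected for a perfect classifier.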
print "the Area Under the Curve is: ", ySum*xStep
if __name__ == "__main__":
# dataArr, labelArr = loadSimpData()
# print '-----\n', dataArr, '\n', labelArr
# # D splits the initial total weight of 1 evenly over the 5 samples, so each starts at 0.2
# D = mat(ones((5, 1))/5)
# # print '-----', D
# # print buildStump(dataArr, labelArr, D)
# # weakClassArr: the weak classifiers
# # aggClassEst: the accumulated classification scores over the rounds
# weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9)
# print weakClassArr, '\n-----\n', aggClassEst.T
# # Classification results for the test points
# print adaClassify([0, 0], weakClassArr)
# print adaClassify([[5, 5], [0, 0]], weakClassArr)
# Horse colic data set
# Training set
dataArr, labelArr = loadDataSet("testData/AB_horseColicTraining2.txt")
weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 50)
# Compute the AUC under the ROC curve
plotROC(aggClassEst.T, labelArr)
# # Test set
# dataArrTest, labelArrTest = loadDataSet("testData/AB_horseColicTest2.txt")
# m = shape(dataArrTest)[0]
# predicting10 = adaClassify(dataArrTest, weakClassArr)
# errArr = mat(ones((m, 1)))
# # Test: total number of samples, number of errors, error rate
# print m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m


@@ -224,6 +224,7 @@ def crossValidation(xArr,yArr,numVal=10):
#test for standRegression
def regression1():
xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
@@ -241,6 +242,7 @@ def regression1():
#test for LWLR
def regression2():
xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")


@@ -0,0 +1,105 @@
#!/usr/bin/python
# coding:utf8
# '''
# Created on 2017-03-10
# Update on 2017-03-10
# author: jiangzhonglian
# content: regression tree
# '''
# print(__doc__)
# # Import the necessary modules and libraries
# import numpy as np
# from sklearn.tree import DecisionTreeRegressor
# import matplotlib.pyplot as plt
# # Create a random dataset
# rng = np.random.RandomState(1)
# X = np.sort(5 * rng.rand(80, 1), axis=0)
# y = np.sin(X).ravel()
# print X, '\n\n\n-----------\n\n\n', y
# y[::5] += 3 * (0.5 - rng.rand(16))
# # Fit regression model
# regr_1 = DecisionTreeRegressor(max_depth=2, min_samples_leaf=5)
# regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=5)
# regr_1.fit(X, y)
# regr_2.fit(X, y)
# # Predict
# X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
# y_1 = regr_1.predict(X_test)
# y_2 = regr_2.predict(X_test)
# # Plot the results
# plt.figure()
# plt.scatter(X, y, c="darkorange", label="data")
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
# plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
# plt.xlabel("data")
# plt.ylabel("target")
# plt.title("Decision Tree Regression")
# plt.legend()
# plt.show()
'''
Created on 2017-03-10
Update on 2017-03-10
author: jiangzhonglian
content: model tree
'''
print(__doc__)
# Author: Noel Dawe <noel.dawe@gmail.com>
#
# License: BSD 3 clause
# importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
# Create the dataset
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
n_estimators=300, random_state=rng)
regr_1.fit(X, y)
regr_2.fit(X, y)
# Predict
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)
# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()


@@ -1,16 +0,0 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-03-06
Update on 2017-03-06
@author: jiangzhonglian
'''
class treeNode():
def __init__(self, feat, val, right, left):
self.featureToSplitOn = feat
self.valueOfSplit = val
self.rightBranch = right
self.leftBranch = left


@@ -5,8 +5,9 @@
Created on Feb 4, 2011
Update on 2017-03-02
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
@author: Peter Harrington/jiangzhonglian
@author: Peter Harrington/片刻
'''
print(__doc__)
from numpy import *


@@ -0,0 +1,123 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-03-08
Update on 2017-03-08
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
@author: jiangzhonglian
'''
import regTrees
from Tkinter import *
from numpy import *
import matplotlib
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')
def test_widget_text(root):
mylabel = Label(root, text="helloworld")
# Tells the geometry manager where to place the widget; without an explicit position it defaults to row 0, column 0
mylabel.grid()
# tolS: the error tolerance; tolN: the minimum number of samples per leaf
def reDraw(tolS, tolN):
# clear the figure
reDraw.f.clf()
reDraw.a = reDraw.f.add_subplot(111)
# Check whether the checkbox is ticked
if chkBtnVar.get():
if tolN < 2:
tolN = 2
myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf, regTrees.modelErr, (tolS, tolN))
yHat = regTrees.createForeCast(myTree, reDraw.testDat, regTrees.modelTreeEval)
else:
myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN))
yHat = regTrees.createForeCast(myTree, reDraw.testDat)
# use scatter for data set
reDraw.a.scatter(reDraw.rawDat[:, 0], reDraw.rawDat[:, 1], s=5)
# use plot for yHat
reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0, c='red')
reDraw.canvas.show()
def getInputs():
try:
tolN = int(tolNentry.get())
except:
tolN = 10
print "enter Integer for tolN"
tolNentry.delete(0, END)
tolNentry.insert(0, '10')
try:
tolS = float(tolSentry.get())
except:
tolS = 1.0
print "enter Float for tolS"
tolSentry.delete(0, END)
tolSentry.insert(0, '1.0')
return tolN, tolS
# Draw a new tree
def drawNewTree():
# #get values from Entry boxes
tolN, tolS = getInputs()
reDraw(tolS, tolN)
def main(root):
# Title
Label(root, text="Plot Place Holder").grid(row=0, columnspan=3)
# Input field 1: tolN, the leaf-count parameter
Label(root, text="tolN").grid(row=1, column=0)
global tolNentry
tolNentry = Entry(root)
tolNentry.grid(row=1, column=1)
tolNentry.insert(0, '10')
# Input field 2: tolS, the error tolerance
Label(root, text="tolS").grid(row=2, column=0)
global tolSentry
tolSentry = Entry(root)
tolSentry.grid(row=2, column=1)
# Set the default value
tolSentry.insert(0,'1.0')
# Submit button
Button(root, text="OK", command=drawNewTree).grid(row=1, column=2, rowspan=3)
# Checkbox for switching to a model tree
global chkBtnVar
chkBtnVar = IntVar()
chkBtn = Checkbutton(root, text="Model Tree", variable = chkBtnVar)
chkBtn.grid(row=3, column=0, columnspan=2)
# Quit button
Button(root, text="Quit", fg="black", command=quit).grid(row=1, column=2)
# Create a drawing canvas
reDraw.f = Figure(figsize=(5, 4), dpi=100)
reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)
reDraw.canvas.show()
reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3)
reDraw.rawDat = mat(regTrees.loadDataSet('testData/RT_sine.txt'))
reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01)
reDraw(1.0, 10)
if __name__ == "__main__":
# Create the Tk root window
root = Tk()
# test_widget_text(root)
main(root)
# Start the event loop
root.mainloop()


@@ -0,0 +1,50 @@
#!/usr/bin/python
# coding:utf8
from numpy import *
# Build a matrix from text: load the text file, then parse it
def loadDataSet(fileName): # general function to parse tab-delimited floats
dataMat = [] # assume last column is target value
fr = open(fileName)
for line in fr.readlines():
curLine = line.strip().split('\t')
fltLine = map(float,curLine) # map every element to float
dataMat.append(fltLine)
return dataMat
# Euclidean distance between two vectors (other distance measures can be chosen to fit the scenario)
def distEclud(vecA, vecB):
return sqrt(sum(power(vecA - vecB, 2))) # la.norm(vecA-vecB)
# Build a set of k random centroids for the given data set. The centroids must lie within the bounds of the data: find the min and max of every dimension, generate random numbers in 0~1.0, and scale them by the range and offset by the minimum so the random points stay inside the data's bounds.
def randCent(dataSet, k):
n = shape(dataSet)[1] # number of columns
centroids = mat(zeros((k,n))) # matrix holding the centroids
for j in range(n): # create random cluster centroids within the bounds of each dimension
minJ = min(dataSet[:,j])
rangeJ = float(max(dataSet[:,j]) - minJ)
centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1)) # generate random values
return centroids
# The k-means clustering algorithm
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
m = shape(dataSet)[0]
clusterAssment = mat(zeros((m,2))) # matrix holding each point's cluster assignment and squared error
centroids = createCent(dataSet, k)
clusterChanged = True
while clusterChanged:
clusterChanged = False
for i in range(m): # assign every data point to the nearest centroid
minDist = inf; minIndex = -1
for j in range(k):
distJI = distMeas(centroids[j,:],dataSet[i,:])
if distJI < minDist:
minDist = distJI; minIndex = j
if clusterAssment[i,0] != minIndex: clusterChanged = True
clusterAssment[i,:] = minIndex,minDist**2
print centroids
for cent in range(k): # recompute the centroids
ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]] # all points assigned to this cluster
centroids[cent,:] = mean(ptsInClust, axis=0) # move the centroid to the mean of the cluster
return centroids, clusterAssment
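# Usage sketch (added comment; the data file name is an assumption, not part of this commit):
# dataMat = mat(loadDataSet('testData/KMeans_testSet.txt'))
# centroids, clusterAssment = kMeans(dataMat, 4)
# clusterAssment[:, 0] holds each point's cluster index, clusterAssment[:, 1] its squared distance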


@@ -0,0 +1,324 @@
#!/usr/bin/python
# coding: utf8
'''
Created on Mar 24, 2011
Update on 2017-03-16
Ch 11 code
@author: Peter/片刻
'''
print(__doc__)
from numpy import *
def loadDataSet():
return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def createC1(dataSet):
C1 = []
for transaction in dataSet:
for item in transaction:
if not [item] in C1:
# Walk over every item and append it to C1
C1.append([item])
# Sort the list in ascending order
C1.sort()
# frozenset is an immutable set, so it can be used as a dictionary key
return map(frozenset, C1)
def scanD(D, Ck, minSupport):
"""scanD
Args:
D: the original data set, used to check whether the candidates in Ck occur in the data
Ck: the candidate item sets
Returns:
retList: the item sets whose support exceeds the threshold
supportData: a dictionary with the support of every key
"""
# ssCnt temporarily holds the candidates from Ck and counts how many transactions each occurs in
ssCnt = {}
for tid in D:
for can in Ck:
# s.issubset(t) tests whether every element of s is in t
if can.issubset(tid):
if not ssCnt.has_key(can):
ssCnt[can] = 1
else:
ssCnt[can] += 1
# Number of transactions
numItems = float(len(D))
retList = []
supportData = {}
for key in ssCnt:
# Compute the support
support = ssCnt[key]/numItems
if support >= minSupport:
# Insert at the front of retList; only item sets meeting the support threshold are kept here
retList.insert(0, key)
# Store every key together with its support value
supportData[key] = support
return retList, supportData
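# Tiny example (added for illustration): with the 4 transactions from loadDataSet(),
# the item {5} appears in 3 of the 4 transactions, so its support is 3/4 = 0.75 and it
# survives minSupport=0.5, while {4} (support 1/4 = 0.25) is dropped.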
# creates Ck
def aprioriGen(Lk, k):
"""aprioriGen(循环数据集,然后进行两两合并)
Args:
Lk 频繁项集
k 元素的前k-2相同就进行合并
Returns:
retList 元素两两合并的数据集
"""
retList = []
lenLk = len(Lk)
# Loop over the list Lk
for i in range(lenLk):
for j in range(i+1, lenLk):
L1 = list(Lk[i])[: k-2]
L2 = list(Lk[j])[: k-2]
# print '-----', Lk, Lk[i], L1
L1.sort()
L2.sort()
# On the first pass L1 and L2 are empty lists, so every pair is merged directly, returning the pairwise unions
# if first k-2 elements are equal
if L1 == L2:
# set union
# print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
retList.append(Lk[i] | Lk[j])
return retList
def apriori(dataSet, minSupport=0.5):
"""apriori
Args:
dataSet: the original data set
minSupport: the support threshold
Returns:
L: the full collection of frequent item sets
supportData: the support of every item set
"""
# Build the frozen 1-item candidates
C1 = createC1(dataSet)
D = map(set, dataSet)
# Compute the support: L1 holds the keys meeting the threshold, supportData the full dictionary
L1, supportData = scanD(D, C1, minSupport)
# print "L1=", L1, "\n", "outcome: ", supportData
L = [L1]
k = 2
while (len(L[k-2]) > 0):
# Merge item sets whose first k-2 elements are equal
Ck = aprioriGen(L[k-2], k)
# print '-----------', D, Ck
# Compute the support of the merged candidates
# Lk: the keys meeting the support threshold; supK: the full support dictionary
# print 'Ck', Ck
Lk, supK = scanD(D, Ck, minSupport)
# update adds new keys and overwrites existing ones
supportData.update(supK)
if len(Lk) == 0:
break
# Lk holds the newly found frequent item sets; L keeps growing
L.append(Lk)
k += 1
# print 'k=', k, len(L[k-2])
return L, supportData
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
"""calcConf
Args:
freqSet: one frequent item set
H: the candidate consequents, each as a frozenset
supportData: the support of every item set
brl: the (initially empty) bigRuleList
minConf: the confidence threshold
Returns:
prunedH: the consequents whose confidence exceeds the threshold
"""
# Consequents whose confidence exceeds the threshold
prunedH = []
for conseq in H:
# Compute the confidence; e.g. for H = set(1, 2), look up supportData[1] and supportData[2]
# print 'confidence=', freqSet, conseq, freqSet-conseq
conf = supportData[freqSet]/supportData[freqSet-conseq]
if conf >= minConf:
print freqSet-conseq, '-->', conseq, 'conf:', conf
brl.append((freqSet-conseq, conseq, conf))
prunedH.append(conseq)
return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
"""rulesFromConseq
Args:
freqSet: one frequent item set
H: the candidate consequents, each as a frozenset
supportData: the support of every item set
brl: the (initially empty) bigRuleList
minConf: the confidence threshold
Returns:
prunedH: the consequents whose confidence exceeds the threshold
"""
# Size of the frozensets currently in the list H
m = len(H[0])
# Only recurse if freqSet is longer than the combination size + 1
if (len(freqSet) > (m + 1)):
# Merge the consequents into combinations of size m+1 (2/3/.../n)
Hmp1 = aprioriGen(H, m+1)
Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
# If at least 2 consequents survive, keep generating rules (personally I think this check is redundant)
# print 'Hmp1=', Hmp1
if (len(Hmp1) > 1):
# print '-------'
# print len(freqSet), len(Hmp1[0]) + 1
rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
def generateRules(L, supportData, minConf=0.7):
"""generateRules
Args:
L: the full collection of frequent item sets
supportData: the support of every item set
minConf: the confidence threshold
Returns:
bigRuleList: a list of 3-field tuples (A -> B, confidence)
"""
bigRuleList = []
# Loop over the frequent item sets grouped by size (2/../n items), starting with the 2-item group
for i in range(1, len(L)):
# Each freqSet is one frequent item set of that size
for freqSet in L[i]:
# Turn every single item of the set into a frozenset and collect them in a list
H1 = [frozenset([item]) for item in freqSet]
# 2-item sets take the else branch, larger sets take the if branch
if (i > 1):
rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
else:
calcConf(freqSet, H1, supportData, bigRuleList, minConf)
return bigRuleList
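# Worked example (added for illustration): in the toy data, support({1,3}) = 0.5 and
# support({3}) = 0.75, so the rule {3} -> {1} has confidence 0.5 / 0.75 ≈ 0.67 and is
# pruned at minConf=0.7, whereas {1} -> {3} has confidence 0.5 / 0.5 = 1.0 and is kept.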
def getActionIds():
from time import sleep
from votesmart import votesmart
# votesmart.apikey = 'get your api key first'
votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
actionIdList = []
billTitleList = []
fr = open('testData/Apriori_recent20bills.txt')
for line in fr.readlines():
billNum = int(line.split('\t')[0])
try:
billDetail = votesmart.votes.getBill(billNum) # api call
for action in billDetail.actions:
if action.level == 'House' and (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
actionId = int(action.actionId)
print 'bill: %d has actionId: %d' % (billNum, actionId)
actionIdList.append(actionId)
billTitleList.append(line.strip().split('\t')[1])
except:
print "problem getting bill %d" % billNum
sleep(1) # delay to be polite
return actionIdList, billTitleList
def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
for billTitle in billTitleList:#fill up itemMeaning list
itemMeaning.append('%s -- Nay' % billTitle)
itemMeaning.append('%s -- Yea' % billTitle)
transDict = {}#list of items in each transaction (politician)
voteCount = 2
for actionId in actionIdList:
sleep(3)
print 'getting votes for actionId: %d' % actionId
try:
voteList = votesmart.votes.getBillActionVotes(actionId)
for vote in voteList:
if not transDict.has_key(vote.candidateName):
transDict[vote.candidateName] = []
if vote.officeParties == 'Democratic':
transDict[vote.candidateName].append(1)
elif vote.officeParties == 'Republican':
transDict[vote.candidateName].append(0)
if vote.action == 'Nay':
transDict[vote.candidateName].append(voteCount)
elif vote.action == 'Yea':
transDict[vote.candidateName].append(voteCount + 1)
except:
print "problem getting actionId: %d" % actionId
voteCount += 2
return transDict, itemMeaning
# Not used for now
# def pntRules(ruleList, itemMeaning):
# for ruleTup in ruleList:
# for item in ruleTup[0]:
# print itemMeaning[item]
# print " -------->"
# for item in ruleTup[1]:
# print itemMeaning[item]
# print "confidence: %f" % ruleTup[2]
# print #print a blank line
def main():
# Earlier test
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# Collect and prepare the data
# dataMat, labelMat = loadDataSet("%s/resources/Apriori_testdata.txt" % project_dir)
# Current test
# # 1. Load the data
# dataSet = loadDataSet()
# print(dataSet)
# # Run apriori for market-basket analysis
# # L: the item sets meeting the support threshold; supportData: the support of every key
# L, supportData = apriori(dataSet, minSupport=0.5)
# # print L, supportData
# print '\ngenerateRules\n'
# rules = generateRules(L, supportData, minConf=0.05)
# print rules
# Project: build a transaction data set from US congressional voting records
# (each politician becomes one transaction)
# actionIdList, billTitleList = getActionIds()
# # Test with the first 2 only
# # transDict, itemMeaning = getTransList(actionIdList[: 2], billTitleList[: 2])
# # transDict maps each politician to the item list for the action ids, e.g. [1, 2, 3]
# transDict, itemMeaning = getTransList(actionIdList, billTitleList)
# # Build the full data set
# dataSet = [transDict[key] for key in transDict.keys()]
# L, supportData = apriori(dataSet, minSupport=0.3)
# rules = generateRules(L, supportData, minConf=0.95)
# print rules
# Project: find features shared by poisonous mushrooms
# (features that frequently co-occur with the 'poisonous' label)
# Build the full data set
dataSet = [line.split() for line in open("testData/Apriori_mushroom.dat").readlines()]
L, supportData = apriori(dataSet, minSupport=0.3)
# Feature value 2 means poisonous, 1 means edible
# Look at frequent item sets containing '2': features that frequently occur together with poisonous mushrooms may themselves indicate a poisonous mushroom
for item in L[1]:
if item.intersection('2'):
print item
for item in L[2]:
if item.intersection('2'):
print item
if __name__ == "__main__":
main()


@@ -1,12 +0,0 @@
def loadDataSet():
return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]
def createC1(dataSet):
c1=[]
for transaction in dataSet:
for item in transaction:
if not [item] in c1:
c1.append([item])
c1.sort()
return map(frozenset,c1)
def scanD(D,ck,minSupport):
ssCnt = {}


@@ -1,19 +1,337 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Jun 14, 2011
FP-Growth FP means frequent pattern
the FP-Growth algorithm needs:
1. FP-tree (class treeNode)
2. header table (use dict)
This finds frequent itemsets similar to apriori but does not find association rules.
@author: Peter/片刻
'''
print(__doc__)
class treeNode:
def __init__(self,nameValue,numOccur,parentNode):
def __init__(self, nameValue, numOccur, parentNode):
self.name = nameValue
self.count = numOccur
self.nodeLink = None
# needs to be updated
self.parent = parentNode
self.children = {}
def inc(self,numOccur):
def inc(self, numOccur):
"""inc(对count变量增加给定值)
"""
self.count += numOccur
def disp(self,ind=1):
print(' '*ind,self.name,' ',self.count)
def disp(self, ind=1):
"""disp(用于将树以文本形式显示)
"""
print ' '*ind, self.name, ' ', self.count
for child in self.children.values():
child.disp(ind+1)
if __name__ == "__main__":
import fpGrowth
rootNode = fpGrowth.treeNode('pyramid',9,None)
rootNode.children['eye']=fpGrowth.treeNode('eye',13,None)
rootNode.disp()
def loadSimpDat():
simpDat = [['r', 'z', 'h', 'j', 'p'],
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat
def createInitSet(dataSet):
retDict = {}
for trans in dataSet:
retDict[frozenset(trans)] = 1
return retDict
# this version does not use recursion
def updateHeader(nodeToTest, targetNode):
"""updateHeader(更新头指针,建立相同元素之间的关系,例如: 左边的r指向右边的r值就是后出现的相同元素 指向 已经出现的元素)
从头指针的nodeLink开始一直沿着nodeLink直到到达链表末尾。这就是链表。
性能:如果链表很长可能会遇到迭代调用的次数限制。
Args:
nodeToTest 满足minSup {所有的元素+(value, treeNode)}
targetNode Tree对象的子节点
"""
# 建立相同元素之间的关系,例如: 左边的r指向右边的r值
while (nodeToTest.nodeLink is not None):
nodeToTest = nodeToTest.nodeLink
nodeToTest.nodeLink = targetNode
def updateTree(items, inTree, headerTable, count):
"""updateTree(更新FP-tree第二次遍历)
# 针对每一行的数据
# 最大的key, 添加
Args:
items 满足minSup 排序后的元素key的数组大到小的排序
inTree 空的Tree对象
headerTable 满足minSup {所有的元素+(value, treeNode)}
        count 原数据集中每一组key出现的次数
"""
# 取出 元素 出现次数最高的
# 如果该元素在 inTree.children 这个字典中,就进行累加
# 如果该元素不存在 就 inTree.children 字典中新增keyvalue为初始化的 treeNode 对象
if items[0] in inTree.children:
# 更新 最大元素,对应的 treeNode 对象的count进行叠加
inTree.children[items[0]].inc(count)
else:
# 如果不存在子节点我们为该inTree添加子节点
inTree.children[items[0]] = treeNode(items[0], count, inTree)
# 如果满足minSup的dist字典的value值第二位为null 我们就设置该元素为 本节点对应的tree节点
# 如果元素第二位不为null我们就更新header节点
if headerTable[items[0]][1] is None:
# headerTable只记录第一次节点出现的位置
headerTable[items[0]][1] = inTree.children[items[0]]
else:
# 本质上是修改headerTable的key对应的Tree的nodeLink值
updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
if len(items) > 1:
        # 递归调用在items[0]对应的子节点上继续插入items[1:]count在递归中不断累加统计出各节点最终的计数值。
updateTree(items[1::], inTree.children[items[0]], headerTable, count)
def createTree(dataSet, minSup=1):
"""createTree(生成FP-tree第一次遍历)
Args:
dataSet dist{行:出现次数}的样本数据
minSup 最小的支持度
Returns:
retTree FP-tree
headerTable 满足minSup {所有的元素+(value, treeNode)}
"""
# 支持度>=minSup的dist{所有元素:出现的次数}
headerTable = {}
# 循环 dist{行:出现次数}的样本数据
for trans in dataSet:
# 对所有的行进行循环,得到行里面的所有元素
# 统计每一行中,每个元素出现的总次数
for item in trans:
# 例如: {'ababa': 3} count(a)=3+3+3=9 count(b)=3+3=6
headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
# 删除 headerTable中元素次数<最小支持度的元素
    for k in list(headerTable.keys()):  # 先转成list避免在遍历过程中删除key报错
if headerTable[k] < minSup:
del(headerTable[k])
# 满足minSup: set(各元素集合)
freqItemSet = set(headerTable.keys())
# 如果不存在直接返回None
if len(freqItemSet) == 0:
return None, None
for k in headerTable:
# 格式化: dist{元素key: [元素次数, None]}
headerTable[k] = [headerTable[k], None]
# create tree
retTree = treeNode('Null Set', 1, None)
# 循环 dist{行:出现次数}的样本数据
for tranSet, count in dataSet.items():
# print 'tranSet, count=', tranSet, count
# localD = dist{元素key: 元素次数}
localD = {}
for item in tranSet:
# 判断是否在满足minSup的集合中
if item in freqItemSet:
# print 'headerTable[item][0]=', headerTable[item][0], headerTable[item]
localD[item] = headerTable[item][0]
# print 'localD=', localD
if len(localD) > 0:
# p=key,value; 所以是通过value值的大小进行从大到小进行排序
            # orderedItems 表示取出元组的key值也就是元素本身但排列顺序是按出现次数从大到小
orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
# print 'orderedItems=', orderedItems, 'headerTable', headerTable, '\n\n\n'
# 填充树通过有序的orderedItems的第一位进行顺序填充 第一层的子节点。
updateTree(orderedItems, retTree, headerTable, count)
return retTree, headerTable
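# 下面是一个最小的建树示例草图非原仓库代码函数名为演示用的假设命名假设直接使用上面的
# loadSimpDat、createInitSet 和 createTree演示两次遍历数据后得到 FP-tree 与头指针表的调用方式。
def _demoCreateTree(minSup=3):
    simpDat = loadSimpDat()
    # 格式化为 {frozenset(一条事务): 出现次数} 的形式
    initSet = createInitSet(simpDat)
    myFPtree, myHeaderTab = createTree(initSet, minSup)
    if myFPtree is not None:
        # 以文本形式打印树结构;头指针表中记录每个频繁元素的总次数和第一个树节点
        myFPtree.disp()
    return myFPtree, myHeaderTab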
def ascendTree(leafNode, prefixPath):
"""ascendTree(如果存在父节点就记录当前节点的name值)
Args:
        leafNode 查询的节点对应的treeNode
prefixPath 要查询的节点值
"""
if leafNode.parent is not None:
prefixPath.append(leafNode.name)
ascendTree(leafNode.parent, prefixPath)
def findPrefixPath(basePat, treeNode):
"""findPrefixPath 基础数据集
Args:
basePat 要查询的节点值
treeNode 查询的节点所在的当前nodeTree
Returns:
        condPats 以不含basePat本身的前缀路径frozenset作为key对应的计数作为value
"""
condPats = {}
# 对 treeNode的link进行循环
while treeNode is not None:
prefixPath = []
        # 沿父节点向上回溯,得到该节点的前缀路径
ascendTree(treeNode, prefixPath)
# 避免 单独`Z`一个元素,添加了空节点
if len(prefixPath) > 1:
            # 以不含basePat本身的前缀路径作为key对应的计数作为value
            # prefixPath[1:] 转成frozenset后元素就变成无序的了
# condPats[frozenset(prefixPath)] = treeNode.count
condPats[frozenset(prefixPath[1:])] = treeNode.count
        # 沿nodeLink继续寻找该元素下一个相同值的链接节点
treeNode = treeNode.nodeLink
# print treeNode
return condPats
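# 下面是一个条件模式基查询的示例草图非原仓库代码函数名为演示用的假设命名先用上面的函数
# 建好FP-tree再沿头指针表中basePat的第一个树节点查出它的所有前缀路径。
def _demoFindPrefixPath(basePat='x', minSup=3):
    initSet = createInitSet(loadSimpDat())
    myFPtree, myHeaderTab = createTree(initSet, minSup)
    if myHeaderTab is None or basePat not in myHeaderTab:
        return {}
    # headerTable[basePat][1] 是basePat在树中第一次出现的节点沿nodeLink可遍历它的所有出现位置
    return findPrefixPath(basePat, myHeaderTab[basePat][1])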
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
"""mineTree(创建条件FP树)
Args:
inTree myFPtree
headerTable 满足minSup {所有的元素+(value, treeNode)}
minSup 最小支持项集
preFix preFix为newFreqSet上一次的存储记录一旦没有myHead就不会更新
freqItemList 用来存储频繁子项的列表
"""
# 通过value进行从小到大的排序 得到频繁项集的key
# 最小支持项集的key的list集合
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]  # 按计数排序避免直接比较treeNode对象
# print '-----', sorted(headerTable.items(), key=lambda p: p[1])
    print('bigL=', bigL)
# 循环遍历 最频繁项集的key从小到大的递归寻找对应的频繁项集
for basePat in bigL:
# preFix为newFreqSet上一次的存储记录一旦没有myHead就不会更新
newFreqSet = preFix.copy()
newFreqSet.add(basePat)
        print('newFreqSet=', newFreqSet, preFix)
        freqItemList.append(newFreqSet)
        print('freqItemList=', freqItemList)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        print('condPattBases=', basePat, condPattBases)
# 构建FP-tree
myCondTree, myHead = createTree(condPattBases, minSup)
        print('myHead=', myHead)
        # 挖掘条件FP-tree如果条件树中仍有满足minSup的元素就继续递归挖掘
if myHead is not None:
myCondTree.disp(1)
            print('\n\n\n')
# 递归 myHead 找出频繁项集
mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
        print('\n\n\n')
import twitter
from time import sleep
import re
def getLotsOfTweets(searchStr):
"""
获取 100个搜索结果页面
"""
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
ACCESS_TOKEN_KEY = ''
ACCESS_TOKEN_SECRET = ''
api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET)
# you can get 1500 results 15 pages * 100 per page
resultsPages = []
for i in range(1, 15):
print "fetching page %d" % i
searchResults = api.GetSearch(searchStr, per_page=100, page=i)
resultsPages.append(searchResults)
sleep(6)
return resultsPages
def textParse(bigString):
"""
解析页面内容
"""
urlsRemoved = re.sub('(http:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString)
    listOfTokens = re.split(r'\W+', urlsRemoved)  # 用\W+切分避免新版Python对空匹配逐字符切分
return [tok.lower() for tok in listOfTokens if len(tok) > 2]
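# 下面是一个最小的解析示例草图非原仓库代码字符串内容为假设的示例文本演示textParse
# 去掉URL、按非单词字符切分并过滤短词后的效果。
def _demoTextParse():
    sample = 'Check out http://example.com for RIMM news!!!'
    # 预期得到类似 ['check', 'out', 'for', 'rimm', 'news'] 的小写词列表
    return textParse(sample)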
def mineTweets(tweetArr, minSup=5):
"""
获取频繁项集
"""
parsedList = []
for i in range(14):
for j in range(100):
parsedList.append(textParse(tweetArr[i][j].text))
initSet = createInitSet(parsedList)
myFPtree, myHeaderTab = createTree(initSet, minSup)
myFreqList = []
mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
return myFreqList
if __name__ == "__main__":
# rootNode = treeNode('pyramid', 9, None)
# rootNode.children['eye'] = treeNode('eye', 13, None)
# rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
# # 将树以文本形式显示
# # print rootNode.disp()
# # load样本数据
# simpDat = loadSimpDat()
# # print simpDat, '\n'
# # frozen set 格式化 并 重新装载 样本数据,对所有的行进行统计求和,格式: {行:出现次数}
# initSet = createInitSet(simpDat)
# # print initSet
# # 创建FP树
# # 输入dist{行:出现次数}的样本数据 和 最小的支持度
# # 输出最终的PF-tree通过循环获取第一层的节点然后每一层的节点进行递归的获取每一行的字节点也就是分支。然后所谓的指针就是后来的指向已存在的
# myFPtree, myHeaderTab = createTree(initSet, 3)
# myFPtree.disp()
# # 抽取条件模式基
# # 查询树节点的,频繁子项
# # print findPrefixPath('x', myHeaderTab['x'][1])
# # print findPrefixPath('z', myHeaderTab['z'][1])
# # print findPrefixPath('r', myHeaderTab['r'][1])
# # 创建条件模式基
# freqItemList = []
# mineTree(myFPtree, myHeaderTab, 3, set([]), freqItemList)
# print freqItemList
# # 项目实战
# # 1.twitter项目案例
# # 无法运行因为没发链接twitter
# lotsOtweets = getLotsOfTweets('RIMM')
# listOfTerms = mineTweets(lotsOtweets, 20)
# print len(listOfTerms)
# for t in listOfTerms:
# print t
# 2.新闻网站点击流中挖掘
parsedDat = [line.split() for line in open('testData/FPGrowth_kosarak.dat').readlines()]
initSet = createInitSet(parsedDat)
myFPtree, myHeaderTab = createTree(initSet, 100000)
myFreList = []
mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreList)
    print(myFreList)

155
src/python/14.SVD/svdRec.py Normal file
View File

@@ -0,0 +1,155 @@
#!/usr/bin/python
# encoding: utf-8
from numpy import *
from numpy import linalg as la
def loadExData():
# 利用SVD提高推荐效果菜肴矩阵
return[[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],
[3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],
[5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
[4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],
[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],
[0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
[0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0],
[1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]]
"""
# 推荐引擎示例矩阵
return[[4, 4, 0, 2, 2],
[4, 0, 0, 3, 3],
[4, 0, 0, 1, 1],
[1, 1, 1, 2, 0],
[2, 2, 2, 0, 0],
[1, 1, 1, 0, 0],
[5, 5, 5, 0, 0]]
原矩阵
return[[1, 1, 1, 0, 0],
[2, 2, 2, 0, 0],
[1, 1, 1, 0, 0],
[5, 5, 5, 0, 0],
[1, 1, 0, 2, 2],
[0, 0, 0, 3, 3],
[0, 0, 0, 1, 1]]
"""
# 欧氏距离相似度假定inA和inB 都是列向量
# 计算向量的第二范式,相当于计算了欧氏距离
def ecludSim(inA, inB):
return 1.0/(1.0 + la.norm(inA - inB))
# pearsSim()函数会检查是否存在3个或更多的点。
# corrcoef直接计算皮尔逊相关系数
def pearsSim(inA, inB):
# 如果不存在该函数返回1.0,此时两个向量完全相关。
if len(inA) < 3:
return 1.0
return 0.5 + 0.5*corrcoef(inA, inB, rowvar=0)[0][1]
# 计算余弦相似度
def cosSim(inA, inB):
num = float(inA.T*inB)
denom = la.norm(inA)*la.norm(inB)
return 0.5 + 0.5*(num/denom)
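# 下面是一个最小的相似度调用示例草图非原仓库代码函数名为演示用的假设命名三个相似度函数
# 都假定输入是列向量返回值都被规范到0~1之间。
def _demoSimilarity():
    myMat = mat(loadExData())
    # 取第0列和第4列作为两个物品的评分列向量
    inA, inB = myMat[:, 0], myMat[:, 4]
    print('euclid :', ecludSim(inA, inB))
    print('pearson:', pearsSim(inA, inB))
    print('cosine :', cosSim(inA, inB))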
# 基于物品相似度的推荐引擎
# standEst()函数,用来计算在给定相似度计算方法的条件下,用户对物品的估计评分值。
# standEst()函数的参数包括数据矩阵、用户编号、物品编号和相似度计算方法
def standEst(dataMat, user, simMeas, item):
# 得到数据集中的物品数目
n = shape(dataMat)[1]
# 初始化两个评分值
simTotal = 0.0
ratSimTotal = 0.0
# 遍历行中的每个物品(对用户评过分的物品进行遍历,并将它与其他物品进行比较)
for j in range(n):
userRating = dataMat[user, j]
# 如果某个物品的评分值为0则跳过这个物品
if userRating == 0:
continue
# 寻找两个用户都评级的物品
# 变量overLap 给出的是两个物品当中已经被评分的那个元素
overLap = nonzero(logical_and(dataMat[:, item].A>0, dataMat[:, j].A>0))[0]
        # 如果两个物品没有任何共同被评分的用户,则相似度记为 0跳过相似度计算
        if len(overLap) == 0: similarity = 0
        # 如果存在重合的物品,则基于这些重合物品重新计算相似度。
        else: similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        # print('the %d and %d similarity is : %f' % (item, j, similarity))
# 相似度会不断累加,每次计算时还考虑相似度和当前用户评分的乘积
# similarity 用户相似度, userRating 用户评分
simTotal += similarity
ratSimTotal += similarity * userRating
if simTotal == 0:
return 0
# 通过除以所有的评分总和对上述相似度评分的乘积进行归一化使得最后评分在0~5之间这些评分用来对预测值进行排序
else:
return ratSimTotal/simTotal
# recommend()函数就是推荐引擎它会调用standEst()函数产生了最高的N个推荐结果。
# 如果不指定N的大小则默认值为3。该函数另外的参数还包括相似度计算方法和估计方法
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
# 寻找未评级的物品
# 对给定的用户建立一个未评分的物品列表
unratedItems = nonzero(dataMat[user, :].A == 0)[1]
# 如果不存在未评分物品,那么就退出函数
if len(unratedItems) == 0:
return 'you rated everything'
# 在所有的未评分物品上进行循环
itemScores = []
for item in unratedItems:
estimatedScore = estMethod(dataMat, user, simMeas, item)
# 寻找前N个未评级物品调用standEst()来产生该物品的预测得分该物品的编号和估计值会放在一个元素列表itemScores中
itemScores.append((item, estimatedScore))
# 按照估计得分,对该列表进行排序并返回。列表逆排序,第一个值就是最大值
return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N]
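# 下面是一个默认参数下的调用示例草图非原仓库代码函数名为演示用的假设命名不指定estMethod时
# recommend 使用 standEst + cosSim对2号用户返回前3个(物品编号, 预测评分)。
def _demoRecommend():
    myMat = mat(loadExData())
    return recommend(myMat, 2)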
# 基于SVD的评分估计
# 在recommend() 中这个函数用于替换对standEst()的调用,该函数对给定用户给定物品构建了一个评分估计值
def svdEst(dataMat, user, simMeas, item):
n = shape(dataMat)[1]
# 对数据集进行SVD分解
simTotal = 0.0
ratSimTotal = 0.0
    # 在SVD分解之后我们只利用包含了90%能量值的奇异值本例的数据直接取前4个这些奇异值会以NumPy数组的形式得以保存
U, Sigma, VT = la.svd(dataMat)
# 如果要进行矩阵运算,就必须要用这些奇异值构建出一个对角矩阵
Sig4 = mat(eye(4) * Sigma[: 4])
# 利用U矩阵将物品转换到低维空间中构建转换后的物品
xformedItems = dataMat.T * U[:, :4] * Sig4.I
# 对于给定的用户for循环在用户对应行的元素上进行遍历
# 这和standEst()函数中的for循环的目的一样只不过这里的相似度计算时在低维空间下进行的。
for j in range(n):
userRating = dataMat[user, j]
if userRating == 0 or j == item:
continue
# 相似度的计算方法也会作为一个参数传递给该函数
similarity = simMeas(xformedItems[item, :].T,xformedItems[j, :].T)
# for 循环中加入了一条print语句以便了解相似度计算的进展情况。如果觉得累赘可以去掉
        print('the %d and %d similarity is: %f' % (item, j, similarity))
# 对相似度求和
simTotal += similarity
# 对相似度及对应评分值的乘积求和
ratSimTotal += similarity * userRating
if simTotal == 0:
return 0
else:
# 计算估计评分
return ratSimTotal/simTotal
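# 下面是一个挑选奇异值个数的示例草图非原仓库代码函数名为演示用的假设命名上面的svdEst
# 直接硬编码取前4个奇异值这里按"累计能量(奇异值平方和)达到90%"的思路计算需要保留的个数,
# 可以用来替代硬编码的4。
def _numSingularValues(dataMat, energyRatio=0.9):
    U, Sigma, VT = la.svd(mat(dataMat))
    sig2 = Sigma ** 2
    threshold = sig2.sum() * energyRatio
    cumulative = 0.0
    for k in range(len(sig2)):
        cumulative += sig2[k]
        if cumulative >= threshold:
            return k + 1
    return len(sig2)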
if __name__ == "__main__":
myMat = mat(loadExData())
    print(myMat)
    print(recommend(myMat, 1, estMethod=svdEst))

View File

@@ -117,7 +117,7 @@ def plotBestFit(dataArr, labelMat, weights):
def main():
project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# 1.收集并准备数据
dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
# print dataMat, '---\n', labelMat
# 2.训练模型, f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值

View File

@@ -1,206 +0,0 @@
#!/usr/bin/python
# coding: utf8
'''
Created on Mar 24, 2011
Ch 11 code
@author: Peter
'''
from numpy import *
def loadDataSet():
return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def createC1(dataSet):
C1 = []
for transaction in dataSet:
for item in transaction:
if not [item] in C1:
C1.append([item])
C1.sort()
return map(frozenset, C1) # use frozen set so we
# can use it as a key in a dict
def scanD(D, Ck, minSupport):
ssCnt = {}
for tid in D:
for can in Ck:
# s.issubset(t) 测试是否 s 中的每一个元素都在 t 中
if can.issubset(tid):
if not ssCnt.has_key(can): ssCnt[can]=1
else: ssCnt[can] += 1
numItems = float(len(D))
retList = []
supportData = {}
for key in ssCnt:
support = ssCnt[key]/numItems
if support >= minSupport:
retList.insert(0, key)
supportData[key] = support
return retList, supportData
def aprioriGen(Lk, k): #creates Ck
retList = []
lenLk = len(Lk)
for i in range(lenLk):
for j in range(i+1, lenLk):
L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
L1.sort(); L2.sort()
if L1==L2: #if first k-2 elements are equal
retList.append(Lk[i] | Lk[j]) #set union
return retList
def apriori(dataSet, minSupport = 0.5):
# 冻结每一行数据
C1 = createC1(dataSet)
D = map(set, dataSet)
# 计算支持support
L1, supportData = scanD(D, C1, minSupport)
print("outcome: ", supportData)
L = [L1]
k = 2
while (len(L[k-2]) > 0):
Ck = aprioriGen(L[k-2], k)
Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk
supportData.update(supK)
L.append(Lk)
k += 1
return L, supportData
def main():
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# 1.收集并准备数据
# dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
# 1. 加载数据
dataSet = loadDataSet()
print(dataSet)
# 调用 apriori 做购物篮分析
apriori(dataSet, minSupport = 0.7)
if __name__=="__main__":
main()
def generateRules(L, supportData, minConf=0.7): #supportData is a dict coming from scanD
bigRuleList = []
for i in range(1, len(L)):#only get the sets with two or more items
for freqSet in L[i]:
H1 = [frozenset([item]) for item in freqSet]
if (i > 1):
rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
else:
calcConf(freqSet, H1, supportData, bigRuleList, minConf)
return bigRuleList
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
prunedH = [] #create new list to return
for conseq in H:
conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
if conf >= minConf:
print freqSet-conseq,'-->',conseq,'conf:',conf
brl.append((freqSet-conseq, conseq, conf))
prunedH.append(conseq)
return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
m = len(H[0])
if (len(freqSet) > (m + 1)): #try further merging
Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates
Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
if (len(Hmp1) > 1): #need at least two sets to merge
rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
def pntRules(ruleList, itemMeaning):
for ruleTup in ruleList:
for item in ruleTup[0]:
print itemMeaning[item]
print " -------->"
for item in ruleTup[1]:
print itemMeaning[item]
print "confidence: %f" % ruleTup[2]
print #print a blank line
# from time import sleep
# from votesmart import votesmart
# votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
# #votesmart.apikey = 'get your api key first'
# def getActionIds():
# actionIdList = []; billTitleList = []
# fr = open('recent20bills.txt')
# for line in fr.readlines():
# billNum = int(line.split('\t')[0])
# try:
# billDetail = votesmart.votes.getBill(billNum) #api call
# for action in billDetail.actions:
# if action.level == 'House' and \
# (action.stage == 'Passage' or action.stage == 'Amendment Vote'):
# actionId = int(action.actionId)
# print 'bill: %d has actionId: %d' % (billNum, actionId)
# actionIdList.append(actionId)
# billTitleList.append(line.strip().split('\t')[1])
# except:
# print "problem getting bill %d" % billNum
# sleep(1) #delay to be polite
# return actionIdList, billTitleList
#
# def getTransList(actionIdList, billTitleList): #this will return a list of lists containing ints
# itemMeaning = ['Republican', 'Democratic']#list of what each item stands for
# for billTitle in billTitleList:#fill up itemMeaning list
# itemMeaning.append('%s -- Nay' % billTitle)
# itemMeaning.append('%s -- Yea' % billTitle)
# transDict = {}#list of items in each transaction (politician)
# voteCount = 2
# for actionId in actionIdList:
# sleep(3)
# print 'getting votes for actionId: %d' % actionId
# try:
# voteList = votesmart.votes.getBillActionVotes(actionId)
# for vote in voteList:
# if not transDict.has_key(vote.candidateName):
# transDict[vote.candidateName] = []
# if vote.officeParties == 'Democratic':
# transDict[vote.candidateName].append(1)
# elif vote.officeParties == 'Republican':
# transDict[vote.candidateName].append(0)
# if vote.action == 'Nay':
# transDict[vote.candidateName].append(voteCount)
# elif vote.action == 'Yea':
# transDict[vote.candidateName].append(voteCount + 1)
# except:
# print "problem getting actionId: %d" % actionId
# voteCount += 2
# return transDict, itemMeaning