# Conflicts:
#	docs/5.Logistic回归.md
This commit is contained in:
yangjifei
2017-03-08 18:37:30 +08:00
19 changed files with 1985 additions and 10 deletions

View File

@@ -20,10 +20,15 @@ randArray = random.rand(4, 4)
# 转化关系, 数组转化为矩阵
randMat = mat(randArray)
# .I表示对矩阵求逆
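# .I computes the inverse of the matrix (it can be obtained with elementary row operations)
# # Intuition: the inverse matrix is a tool for judging similarity. Multiplying the inverse matrix A by a column vector p yields a column vector q; the i-th component of q expresses how similar p is to the i-th column vector of A.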
# # 参考案例链接:
# # https://www.zhihu.com/question/33258489
# # http://blog.csdn.net/vernice/article/details/48506027
# .T表示对矩阵转置(行列颠倒)
invRandMat = randMat.I
# 输出结果
print randArray, '\n', randMat, '\n', invRandMat
print randArray, '\n---\n', randMat, '\n+++\n', invRandMat
# 矩阵和逆矩阵 进行求积 (单位矩阵对角线都为1嘛理论上4*4的矩阵其他的都为0)
myEye = randMat*invRandMat
# 误差

View File

@@ -104,6 +104,7 @@ def show_pdf(clf):
# from IPython.display import Image
# Image(graph.create_png())
if __name__ == '__main__':
x, y = createDataSet()

View File

@@ -77,9 +77,9 @@ def plotTree(myTree, parentPt, nodeTxt):
plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
for key in secondDict.keys():
# 判断该节点是否是Node节点
if type(secondDict[key]).__name__=='dict':
if type(secondDict[key]).__name__ == 'dict':
# 如果是就递归调用[recursion]
plotTree(secondDict[key],cntrPt,str(key))
plotTree(secondDict[key], cntrPt, str(key))
else:
# 如果不是,就在原来节点一半的地方找到节点的坐标
plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
@@ -121,7 +121,7 @@ def createPlot(inTree):
# 测试数据集
def retrieveTree(i):
listOfTrees =[
listOfTrees = [
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
{'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
]

View File

@@ -0,0 +1,16 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-03-06
Update on 2017-03-06
@author: jiangzhonglian
'''
class treeNode():
    """Node of a binary split tree: the split description plus two children."""

    def __init__(self, feat, val, right, left):
        # Split description: which feature is tested and against what value.
        self.featureToSplitOn, self.valueOfSplit = feat, val
        # Child branches; note the constructor takes `right` before `left`.
        self.rightBranch, self.leftBranch = right, left

View File

@@ -0,0 +1,324 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Feb 4, 2011
Update on 2017-03-02
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
@author: Peter Harrington/jiangzhonglian
'''
from numpy import *
# General-purpose parser for tab-delimited files whose fields are all numeric.
def loadDataSet(fileName):
    """Parse a tab-delimited text file into a list of float rows.

    Every field of every line is converted with float(); no column is
    treated specially here.

    Args:
        fileName: path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats in file order.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the handle even when a field fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            # A blank (e.g. trailing) line would otherwise crash on float('').
            if not stripped:
                continue
            # Build a real list: on Python 3 map() returns a lazy iterator,
            # which would not behave as a numeric row downstream.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def binSplitDataSet(dataSet, feature, value):
    """Split the rows of dataSet in two by comparing one column to a threshold.

    Args:
        dataSet: 2-D data supporting dataSet[:, feature] and row indexing.
        feature: index of the column to compare.
        value: threshold the column is compared against.

    Returns:
        (mat0, mat1): mat0 holds the rows whose feature is <= value (the
        "left" side), mat1 the rows whose feature is > value (the "right"
        side).
    """
    column = dataSet[:, feature]
    # nonzero(...)[0] yields the row indices where the comparison is true.
    leftRows = nonzero(column <= value)[0]
    rightRows = nonzero(column > value)[0]
    return dataSet[leftRows, :], dataSet[rightRows, :]
# Leaf generator for regression trees: the value stored in a leaf.
def regLeaf(dataSet):
    """Return the mean of the last column of dataSet."""
    targets = dataSet[:, -1]
    return mean(targets)
# Total squared error of the targets = variance * number of samples.
def regErr(dataSet):
    """Return var(last column) multiplied by the number of rows."""
    rowCount = shape(dataSet)[0]
    return var(dataSet[:, -1]) * rowCount
# 1. Find the best binary split of the data set.
# 2. When no acceptable split exists, produce the leaf value instead.
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Pick the (feature, value) pair whose binary split minimises the error.

    Args:
        dataSet: data whose last column is the target. Column slices are
            read with `.T.tolist()[0]`, so a NumPy matrix is expected.
        leafType: function building a leaf value from a data set.
        errType: function computing the total error of a data set.
        ops: (tolS, tolN) - minimum error reduction required to split, and
            minimum number of rows allowed on each side of a split.

    Returns:
        (bestIndex, bestValue) for an accepted split, or
        (None, leafType(dataSet)) when a stop condition is hit.
    """
    tolS, tolN = ops[0], ops[1]

    # exit cond 1: every target value is identical, nothing left to split.
    distinctTargets = set(dataSet[:, -1].T.tolist()[0])
    if len(distinctTargets) == 1:
        return None, leafType(dataSet)

    _, n = shape(dataSet)
    # Error of the unsplit data; the best feature is the one that reduces
    # this RSS-from-the-mean the most.
    S = errType(dataSet)
    bestS, bestIndex, bestValue = inf, 0, 0

    # The last column is the target, so only the first n-1 are candidates.
    for featIndex in range(n - 1):
        candidateValues = set(dataSet[:, featIndex].T.tolist()[0])
        for splitVal in candidateValues:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # Skip splits that leave either side with fewer than tolN rows.
            if min(shape(mat0)[0], shape(mat1)[0]) < tolN:
                continue
            newS = errType(mat0) + errType(mat1)
            # Strict comparison: the first split reaching a given error wins.
            if newS < bestS:
                bestS, bestIndex, bestValue = newS, featIndex, splitVal

    # exit cond 2: the error reduction (S - bestS) is below the threshold.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)

    # exit cond 3: re-check the side sizes of the chosen split.
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if min(shape(mat0)[0], shape(mat1)[0]) < tolN:
        return None, leafType(dataSet)

    return bestIndex, bestValue
# Assumes dataSet is a NumPy matrix so that array filtering can be used.
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a tree as nested dicts.

    Args:
        dataSet: data to fit; passed to chooseBestSplit and binSplitDataSet.
        leafType: function building a leaf value from a data set.
        errType: function computing the total error of a data set.
        ops: stop-condition parameters forwarded to chooseBestSplit.

    Returns:
        Either a leaf value (when chooseBestSplit reports a stop condition)
        or a dict with keys 'spInd', 'spVal', 'left' and 'right'.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # A stop condition was hit: `val` is already the leaf value.
    if feat is None:
        return val

    # Rows with feature <= val go left, the rest go right.
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
# Tell internal nodes (dicts) apart from leaf values.
def isTree(obj):
    """Return True when obj is a dict, i.e. an internal tree node.

    isinstance() is used rather than comparing the type's name, so dict
    subclasses are recognised as well.
    """
    return isinstance(obj, dict)
# Collapse a subtree into the average of its two branches.
def getMean(tree):
    """Return the mean of the left and right branch values of `tree`.

    Branches that are themselves trees are first collapsed recursively;
    the collapsed value is written back into `tree` (in-place mutation).
    """
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0
# Post-pruning: check whether branches should be merged.
def prune(tree, testData):
    """Prune `tree` bottom-up using held-out test data.

    Sub-branches are pruned first (in place). When both children end up as
    leaves, the squared error on testData with and without merging them is
    compared, and the merged value is returned when it is strictly lower.

    Args:
        tree: dict node with keys 'spInd', 'spVal', 'left', 'right'.
        testData: test rows reaching this node; last column is the target.

    Returns:
        The (possibly mutated) tree dict, or a single merged leaf value.
    """
    # No test data reaches this node: collapse the whole subtree.
    if shape(testData)[0] == 0:
        return getMean(tree)

    # Both the recursive step and the merge test need the same split of
    # testData, so it is computed once.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # A child is still a subtree, so merging is not possible here.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    # Both children are leaves: compare the error of keeping them against
    # replacing the node by their mean. power(x, 2) squares element-wise;
    # `sum` is whatever the module's star import of numpy provides.
    errorNoMerge = (sum(power(lSet[:, -1] - tree['left'], 2)) +
                    sum(power(rSet[:, -1] - tree['right'], 2)))
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = sum(power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        # Parenthesised single-argument form prints the same on Python 2
        # and is also valid Python 3.
        print("merging")
        return treeMean
    return tree
# Leaf generator for model trees: the coefficients ws of the linear model
# f(x) = w0 + w1*feature1 + w2*feature2 + ...
def modelLeaf(dataSet):
    """Fit a linear model to dataSet and return its coefficients."""
    coefficients, _, _ = linearSolve(dataSet)
    return coefficients
# Error measure for model trees.
def modelErr(dataSet):
    """Return the sum of squared residuals of a linear fit to dataSet."""
    ws, X, Y = linearSolve(dataSet)
    residual = Y - X * ws
    return sum(power(residual, 2))
# Helper used by both modelLeaf and modelErr.
def linearSolve(dataSet):
    """Solve ordinary least squares for dataSet with an intercept term.

    Args:
        dataSet: m x n data; the first n-1 columns are features and the
            last column is the target.

    Returns:
        (ws, X, Y): the coefficient column vector, the design matrix whose
        column 0 is all ones (the constant term), and the target column
        taken from dataSet.

    Raises:
        NameError: if det(X.T * X) is exactly 0.0, i.e. the normal-equation
            matrix cannot be inverted.
    """
    m, n = shape(dataSet)
    # Start from all ones so that column 0 stays the intercept column.
    X = mat(ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]
    # Y is taken straight from the data; no placeholder allocation needed.
    Y = dataSet[:, -1]
    xTx = X.T * X
    # Inverting a singular matrix would fail, so refuse up front.
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\ntry increasing the second value of ops')
    # Normal equations: ws = (X^T X)^-1 X^T Y
    ws = xTx.I * (X.T * Y)
    return ws, X, Y
# Leaf evaluator for regression trees.
def regTreeEval(model, inDat):
    """Return the leaf value `model` as a float; `inDat` is not used."""
    leafValue = float(model)
    return leafValue
# Leaf evaluator for model trees.
def modelTreeEval(model, inDat):
    """Evaluate the linear model `model` on one sample.

    Args:
        model: coefficient column vector; its first entry multiplies the
            constant 1 prepended to the sample.
        inDat: sample with its features along axis 1 (shape(inDat)[1]
            columns).

    Returns:
        The prediction X * model converted to a float.
    """
    featureCount = shape(inDat)[1]
    # Row of ones; column 0 stays 1 for the intercept, the rest is the sample.
    X = mat(ones((1, featureCount + 1)))
    X[:, 1:] = inDat
    prediction = X * model
    return float(prediction)
# Predict the value for a single sample.
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Walk `tree` for one sample and evaluate the leaf that is reached.

    Args:
        tree: a leaf value, or a dict node with keys 'spInd', 'spVal',
            'left' and 'right'.
        inData: one sample, either a 1 x n matrix or a flat sequence of
            feature values.
        modelEval: function (leaf, inData) -> prediction.

    Returns:
        Whatever modelEval returns for the reached leaf.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten before indexing: on a 1 x n matrix, inData[k] selects ROW k,
    # which only happens to work when there is a single feature.
    featureValue = asarray(inData).ravel()[tree['spInd']]
    # Values <= the split value go left, matching binSplitDataSet.
    branch = 'left' if featureValue <= tree['spVal'] else 'right'
    # The recursive call handles both sub-trees and leaves.
    return treeForeCast(tree[branch], inData, modelEval)
# Predict the value for every row of a test set.
def createForeCast(tree, testData, modelEval=regTreeEval):
    """Run treeForeCast on each row of testData.

    Args:
        tree: tree (or leaf) passed to treeForeCast.
        testData: sequence of samples; len(testData) rows are evaluated.
        modelEval: leaf evaluator forwarded to treeForeCast.

    Returns:
        A len(testData) x 1 matrix of predictions.
    """
    rowCount = len(testData)
    yHat = mat(zeros((rowCount, 1)))
    for rowIdx in range(rowCount):
        sample = mat(testData[rowIdx])
        yHat[rowIdx, 0] = treeForeCast(tree, sample, modelEval)
    return yHat
if __name__ == "__main__":
    # Demo driver. The commented-out snippets below are earlier experiments
    # kept for reference.

    # # Toy data set for binSplitDataSet
    # testMat = mat(eye(4))
    # print testMat
    # print type(testMat)
    # mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
    # print mat0, '\n-----------\n', mat1

    # Regression tree
    # myDat = loadDataSet('testData/RT_data1.txt')
    # myDat = loadDataSet('testData/RT_data2.txt')
    # myMat = mat(myDat)
    # myTree = createTree(myMat)

    # 1. Pre-pruning: set the error tolerance and minimum sample count up front
    # myDat = loadDataSet('testData/RT_data3.txt')
    # myMat = mat(myDat)
    # myTree = createTree(myMat, ops=(0, 1))
    # print myTree

    # 2. Post-pruning
    # myDatTest = loadDataSet('testData/RT_data3test.txt')
    # myMat2Test = mat(myDatTest)
    # myFinalTree = prune(myTree, myMat2Test)
    # print '\n\n\n-------------------'
    # print myFinalTree

    # --------
    # Model tree
    # myDat = loadDataSet('testData/RT_data4.txt')
    # myMat = mat(myDat)
    # myTree = createTree(myMat, modelLeaf, modelErr)
    # print myTree

    # Regression tree VS model tree VS plain linear regression.
    # NOTE: every print below uses a single parenthesised argument, which
    # produces the same output as the Python 2 print statement and is also
    # valid Python 3 syntax.
    trainMat = mat(loadDataSet('testData/RT_bikeSpeedVsIq_train.txt'))
    testMat = mat(loadDataSet('testData/RT_bikeSpeedVsIq_test.txt'))

    # Regression tree
    myTree1 = createTree(trainMat, ops=(1, 20))
    print(myTree1)
    yHat1 = createForeCast(myTree1, testMat[:, 0])
    print("%s %s" % ("回归树:", corrcoef(yHat1, testMat[:, 1], rowvar=0)[0, 1]))

    # Model tree
    myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
    yHat2 = createForeCast(myTree2, testMat[:, 0], modelTreeEval)
    print(myTree2)
    print("%s %s" % ("模型树:", corrcoef(yHat2, testMat[:, 1], rowvar=0)[0, 1]))

    # Plain linear regression
    ws, X, Y = linearSolve(trainMat)
    print(ws)
    # One row count drives both the allocation and the loop.
    m = shape(testMat)[0]
    yHat3 = mat(zeros((m, 1)))
    for i in range(m):
        yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
    print("%s %s" % ("线性回归:", corrcoef(yHat3, testMat[:, 1], rowvar=0)[0, 1]))

View File

@@ -11,6 +11,7 @@ import os
from numpy import *
import matplotlib.pylab as plt
def loadDataSet(fileName): #general function to parse tab -delimited floats
numFeat = len(open(fileName).readline().split('\t')) - 1 #get number of fields
dataMat = []; labelMat = []
@@ -24,6 +25,7 @@ def loadDataSet(fileName): #general function to parse tab -delimited floats
labelMat.append(float(curLine[-1]))
return dataMat,labelMat
def standRegres(xArr,yArr):
# >>> A.T # transpose, 转置
xMat = mat(xArr); yMat = mat(yArr).T
@@ -37,6 +39,7 @@ def standRegres(xArr,yArr):
ws = xTx.I * (xMat.T*yMat) # 最小二乘法求最优解
return ws
def plotBestFit(xArr, yArr, ws):
xMat = mat(xArr)
@@ -60,6 +63,7 @@ def plotBestFit(xArr, yArr, ws):
plt.xlabel('X'); plt.ylabel('Y')
plt.show()
def main1():
# w0*x0+w1*x1+w2*x2=f(x)
project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
@@ -91,6 +95,7 @@ def lwlr(testPoint, xArr, yArr,k=1.0):
ws = xTx.I * (xMat.T * (weights * yMat))
return testPoint * ws
def lwlrTest(testArr,xArr,yArr,k=1.0): #loops over all the data points and applies lwlr to each one
m = shape(testArr)[0]
# m*1的矩阵
@@ -101,6 +106,7 @@ def lwlrTest(testArr,xArr,yArr,k=1.0): #loops over all the data points and appl
yHat[i] = lwlr(testArr[i],xArr,yArr,k)
return yHat
def lwlrTestPlot(xArr, yArr, yHat):
xMat = mat(xArr)
@@ -123,11 +129,13 @@ def lwlrTestPlot(xArr, yArr, yHat):
plt.xlabel('X'); plt.ylabel('Y')
plt.show()
def main2():
# w0*x0+w1*x1+w2*x2=f(x)
project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# 1.收集并准备数据
xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
# xArr, yArr = loadDataSet("%s/resources/ex0.txt" % project_dir)
xArr, yArr = loadDataSet("testData/Regression_data.txt")
# print xArr, '---\n', yArr
# 2.训练模型, f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值
yHat = lwlrTest(xArr, xArr, yArr, 0.003)
@@ -136,12 +144,14 @@ def main2():
# 数据可视化
lwlrTestPlot(xArr, yArr, yHat)
if __name__=="__main__":
if __name__ == "__main__":
# 线性回归
# main1()
# 局部加权线性回归
main2()
def rssError(yArr, yHatArr):
    """Return the sum of squared differences between yArr and yHatArr.

    Both arguments need to be arrays.
    """
    residual = yArr - yHatArr
    return (residual ** 2).sum()