Update comments

This commit is contained in:
jiangzhonglian
2017-03-26 18:12:25 +08:00
parent 0bf2130751
commit 95a29ca47e
6 changed files with 46 additions and 346 deletions

View File

@@ -1,103 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
from numpy import *
import matplotlib.pyplot as plt
import time
'''
1. Requires the matplotlib module: pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
   Installing it directly can fail, so downloading the whl package is recommended; download from:
   https://pypi.python.org/pypi/matplotlib/1.5.0
2. The plotted figure can then be viewed.
'''
"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: logRegression01.py
@time: 2017/3/3 22:03
@test result: ok
"""
# Sigmoid function
def sigmoid(inX):
return 1.0 / (1 + exp(-inX))
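# sigmoid(x) = 1 / (1 + e^(-x)) squashes any real-valued input into (0, 1), so the
# output can be read as the probability that a sample belongs to the positive class.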
def trainLogRegres(train_x, train_y, opts):
    # record the training start time
startTime = time.time()
numSamples, numFeatures = shape(train_x)
alpha = opts['alpha']; maxIter = opts['maxIter']
weights = ones((numFeatures, 1))
    # optimize the weights with the chosen gradient method
for k in range(maxIter):
        if opts['optimizeType'] == 'gradDescent':  # full-batch gradient update
output = sigmoid(train_x * weights)
error = train_y - output
weights = weights + alpha * train_x.transpose() * error
        elif opts['optimizeType'] == 'stocGradDescent':  # stochastic gradient update
for i in range(numSamples):
output = sigmoid(train_x[i, :] * weights)
error = train_y[i, 0] - output
weights = weights + alpha * train_x[i, :].transpose() * error
        elif opts['optimizeType'] == 'smoothStocGradDescent':  # smoothed stochastic gradient update
            # pick samples at random to reduce periodic fluctuations in the updates
            dataIndex = range(numSamples)
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01
                randIndex = int(random.uniform(0, len(dataIndex)))
                sampleIndex = dataIndex[randIndex]  # map the draw into the indices not yet used this pass
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                del(dataIndex[randIndex])  # remove the used sample for the rest of this pass
else:
            raise NameError('Unsupported optimize method type!')
print 'Congratulations, training complete! Took %fs!' % (time.time() - startTime)
return weights
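# A minimal usage sketch (shapes as built by the companion test script: train_x is an
# m x 3 NumPy mat whose first column is the constant 1.0 bias term, train_y an m x 1
# mat of 0/1 labels):
#   opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
#   weights = trainLogRegres(train_x, train_y, opts)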
# Evaluate the trained logistic regression model on a given test set
def testLogRegres(weights, test_x, test_y):
numSamples, numFeatures = shape(test_x)
matchCount = 0
for i in xrange(numSamples):
predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
if predict == bool(test_y[i, 0]):
matchCount += 1
accuracy = float(matchCount) / numSamples
return accuracy
# Plot the trained logistic regression model; only 2-D data is supported
def showLogRegres(weights, train_x, train_y):
    # note: train_x and train_y are NumPy mat (matrix) types
numSamples, numFeatures = shape(train_x)
if numFeatures != 3:
print "抱歉! 我不能绘制因为你的数据的维度不是2"
return 1
    # plot all the sample points
for i in xrange(numSamples):
if int(train_y[i, 0]) == 0:
plt.plot(train_x[i, 1], train_x[i, 2], 'or')
elif int(train_y[i, 0]) == 1:
plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
    # draw the fitted decision boundary
min_x = min(train_x[:, 1])[0, 0]
max_x = max(train_x[:, 1])[0, 0]
    weights = weights.getA()  # convert the mat back to an ndarray
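    # On the boundary the weighted sum is zero: w0 + w1*x1 + w2*x2 = 0 (sigmoid output 0.5),
    # so x2 = (-w0 - w1*x1) / w2, evaluated below at the two x1 extremes.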
y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
plt.xlabel('X1'); plt.ylabel('X2')
    # display the figure
plt.show()

View File

@@ -1,54 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
import os
import sys
sys.path.append(r"C:\Python27")
from numpy import *
from logRegression01 import *
"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: test_logRegression.py
@time: 2017/3/3 22:09
@test result: ok
"""
def loadData():
train_x = []
train_y = []
    # get the current working directory (os.getcwdu returns it as unicode)
    project_dir = os.getcwdu()
    # keep the path up through the project name "MachineLearning" (15 characters)
    project_dir = project_dir[:project_dir.find("MachineLearning\\") + 15]
    print project_dir
fileIn = open("%s/testData/Logistic_testdata.txt" % project_dir)
for line in fileIn.readlines():
lineArr = line.strip().split()
train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
train_y.append(float(lineArr[2]))
return mat(train_x), mat(train_y).transpose()
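# Each line of Logistic_testdata.txt is expected to hold "x1 x2 label"; the constant
# 1.0 prepended above lets the intercept be learned as the first weight.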
## Step 1: load the data
print "step 1: load data..."
train_x, train_y = loadData()
test_x = train_x; test_y = train_y
## Step 2: train the model
print "step 2: training..."
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
optimalWeights = trainLogRegres(train_x, train_y, opts)
## Step 3: test the model
print "step 3: testing..."
accuracy = testLogRegres(optimalWeights, test_x, test_y)
## Step 4: show the result
print "step 4: show the result..."
print 'The classification accuracy is: %.3f%%' % (accuracy * 100)
showLogRegres(optimalWeights, train_x, train_y)

View File

@@ -1,164 +0,0 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from numpy import *
def loadDataSet():
dataMat = []
labelMat = []
fr = open('testSet.txt')
for line in fr.readlines():
lineArr = line.strip().split()
dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
labelMat.append(int(lineArr[2]))
return dataMat, labelMat
def sigmoid(inX):
return 1.0 / (1 + exp(-inX))
def gradAscent(dataMatIn, classLabels):
    # convert the input to a NumPy matrix, e.g. [[1,1,2],[1,1,2]....]
dataMatrix = mat(dataMatIn) # convert to NumPy matrix
    # convert the labels to a matrix [[0,1,0,1,0,1.....]] and transpose it to a column [[0],[1],[0].....]
x = mat(classLabels)
labelMat = x.transpose() # convert to NumPy matrix
    # m -> number of samples, n -> number of features
m, n = shape(dataMatrix)
    # learning rate (step size)
alpha = 0.001
    # number of iterations
maxCycles = 500
    # a column of ones, one entry per feature (n is 3 here) -> [[1],[1],[1]]
weights = ones((n, 1))
for k in range(maxCycles): # heavy on matrix operations
        # 1. dataMatrix * weights is a matrix product:
        #    [[1,1,2],[1,1,2]....] * [[1],[1],[1]] -> an m x 1 column
        s = dataMatrix * weights
        # feed the weighted sum of each sample's features into the sigmoid function
        h = sigmoid(s)
        # [[x,x,x,x,x,...... one error per sample (100 here)]]
error = (labelMat - h) # vector subtraction
        # dataMatrix.transpose() * error (derivation omitted)
        # [[x,x,x,x.... 100 numbers per row],[],[]]
data_tran = dataMatrix.transpose()
        # an n x 1 column [[a],[b],[c]]
data_tran_error = data_tran * error
# weights = weights + alpha * dataMatrix.transpose() * error # matrix mult
weights = weights + alpha * data_tran_error
return weights
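# In vector form each pass above applies w <- w + alpha * X^T (y - sigmoid(X w)),
# i.e. gradient ascent on the log-likelihood of the logistic model.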
# Stochastic gradient ascent
# Batch gradient ascent must scan the entire data set for every update, which is computationally expensive.
# Stochastic gradient ascent updates the regression coefficients with one sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
m, n = shape(dataMatrix)
alpha = 0.01
weights = ones(n) # initialize to all ones
for i in range(m):
h = sigmoid(sum(dataMatrix[i] * weights))
error = classLabels[i] - h
weights = weights + alpha * error * dataMatrix[i]
return weights
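# One full pass of stocGradAscent0 costs about as much arithmetic as a single batch
# iteration above, but performs m weight updates along the way.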
def plotBestFit(weights):
import matplotlib.pyplot as plt
dataMat, labelMat = loadDataSet()
dataArr = array(dataMat)
n = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
ax.scatter(xcord2, ycord2, s=30, c='green')
x = arange(-3.0, 3.0, 0.1)
y = (-weights[0] - weights[1] * x) / weights[2]
ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
m, n = shape(dataMatrix)
weights = ones(n) # initialize to all ones
for j in range(numIter):
dataIndex = range(m)
        for i in range(m):
            # the step size shrinks with each iteration but never reaches 0, thanks to the constant term
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick a random sample to reduce periodic fluctuations
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]  # map the draw into the indices not yet used this pass
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del (dataIndex[randIndex])
return weights
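# A minimal usage sketch (dataMatrix is an m x 3 ndarray whose first column is the
# 1.0 bias term, as produced by loadDataSet above):
#   dataArr, labelMat = loadDataSet()
#   weights = stocGradAscent1(array(dataArr), labelMat, numIter=150)
#   plotBestFit(weights)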
# a, b = loadDataSet()
# weights = gradAscent(a, b)
# plotBestFit(weights)
#
######################################################################################################################
def classifyVector(inX, weights):
prob = sigmoid(sum(inX * weights))
if prob > 0.5:
return 1.0
else:
return 0.0
def colicTest():
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
for line in frTrain.readlines():
currLine = line.strip().split('\t')
lineArr = []
for i in range(21):
lineArr.append(float(currLine[i]))
trainingSet.append(lineArr)
trainingLabels.append(float(currLine[21]))
trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0
    numTestVec = 0.0
for line in frTest.readlines():
numTestVec += 1.0
currLine = line.strip().split('\t')
lineArr = []
for i in range(21):
lineArr.append(float(currLine[i]))
if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
errorCount += 1
errorRate = (float(errorCount) / numTestVec)
print "the error rate of this test is: %f" % errorRate
return errorRate
def multiTest():
    numTests = 10
    errorSum = 0.0
for k in range(numTests):
errorSum += colicTest()
print "after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests))
# multiTest()
colicTest()

View File

@@ -6,10 +6,10 @@ Created on Oct 27, 2010
Logistic Regression Working Module
@author: Peter
'''
import os
from numpy import *
import matplotlib.pyplot as plt
# parse the data
def loadDataSet(file_name):
    # dataMat holds the raw data, labelMat the corresponding labels
@@ -25,17 +25,24 @@ def loadDataSet(file_name):
def sigmoid(inX):
return 1.0/(1+exp(-inX))
# the standard (full-batch) approach
def gradAscent(dataMatIn, classLabels):
    # convert the input to a matrix [[1,1,2],[1,1,2]....]
dataMatrix = mat(dataMatIn) #convert to NumPy matrix
    # convert the labels to a matrix [[0,1,0,1,0,1.....]] and transpose it to [[0],[1],[0].....]
    # transpose() swaps rows and columns,
    # turning the row matrix into a column matrix => the matrix transpose
labelMat = mat(classLabels).transpose() #convert to NumPy matrix
    # m -> number of samples, n -> number of features
m,n = shape(dataMatrix)
# print m, n, '__'*10, shape(dataMatrix.transpose()), '__'*100
    # learning rate (step size)
alpha = 0.001
    # number of iterations
maxCycles = 500
    # weights: the regression coefficients,
    # initialized as a column of ones with one entry per feature (n is 3 here) -> [[1],[1],[1]]
weights = ones((n,1))
for k in range(maxCycles): #heavy on matrix operations
        # an m x 3 matrix times a 3 x 1 column gives an m x 1 matrix
@@ -49,7 +56,10 @@ def gradAscent(dataMatIn, classLabels):
weights = weights + alpha * dataMatrix.transpose() * error #matrix mult
return array(weights)
# gradient ascent algorithm
# Stochastic gradient ascent
# Batch gradient ascent must scan the entire data set for every update, which is computationally expensive.
# Stochastic gradient ascent updates the regression coefficients with one sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
m,n = shape(dataMatrix)
alpha = 0.01
@@ -65,6 +75,7 @@ def stocGradAscent0(dataMatrix, classLabels):
weights = weights + alpha * error * dataMatrix[i]
return weights
# stochastic gradient ascent (with random sample selection)
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
m,n = shape(dataMatrix)
@@ -86,6 +97,7 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150):
del(dataIndex[randIndex])
return weights
# visualization
def plotBestFit(dataArr, labelMat, weights):
n = shape(dataArr)[0]
@@ -114,10 +126,12 @@ def plotBestFit(dataArr, labelMat, weights):
plt.xlabel('X'); plt.ylabel('Y')
plt.show()
def main():
project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. Collect and prepare the data
dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
# dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
dataMat, labelMat = loadDataSet("testData/Logistic_testdata.txt")
# print dataMat, '---\n', labelMat
    # 2. Train the model: find the coefficient column (a1, b2, .., nn).T in f(x)=a1*x1+b2*x2+..+nn*xn
@@ -132,5 +146,6 @@ def main():
    # visualize the data
plotBestFit(dataArr, labelMat, weights)
if __name__=="__main__":
main()
if __name__ == "__main__":
main()

View File

@@ -267,10 +267,10 @@ if __name__ == "__main__":
D = mat(ones((5, 1))/5)
print 'D=', D.T
bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
print 'bestStump=', bestStump
print 'minError=', minError
print 'bestClasEst=', bestClasEst.T
# bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
# print 'bestStump=', bestStump
# print 'minError=', minError
# print 'bestClasEst=', bestClasEst.T
# the list of weak classifiers, weakClassArr
@@ -288,14 +288,14 @@ if __name__ == "__main__":
print adaClassify([0, 0], weakClassArr).T
print adaClassify([[5, 5], [0, 0]], weakClassArr).T
# Horse colic data set
# Training set
# # Horse colic data set
# # Training set
# dataArr, labelArr = loadDataSet("testData/AB_horseColicTraining2.txt")
# weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40)
# print weakClassArr, '\n-----\n', aggClassEst.T
# Compute the AUC (area under the ROC curve)
# # Compute the AUC (area under the ROC curve)
# plotROC(aggClassEst.T, labelArr)
# Test set
# # Test set
# dataArrTest, labelArrTest = loadDataSet("testData/AB_horseColicTest2.txt")
# m = shape(dataArrTest)[0]
# predicting10 = adaClassify(dataArrTest, weakClassArr)

View File

@@ -1,3 +1,6 @@
#!/usr/bin/python
# coding:utf8
'''
Created by ApacheCN-小瑶
Date: 2017-02-27
@@ -26,6 +29,8 @@ def standRegres(xArr,yArr): # linear regression
    if linalg.det(xTx) == 0.0:  # we need the inverse of xTx, so first make sure it is invertible: the determinant must be non-zero
print ("This matrix is singular, cannot do inverse")
return
    # least squares
# http://www.apache.wiki/pages/viewpage.action?pageId=5505133
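    # Closed-form least-squares solution (the normal equation): w = (X^T X)^(-1) X^T y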
    ws = xTx.I * (xMat.T*yMat)  # the book's formula for the optimal w
return ws
@@ -69,7 +74,7 @@ def ridgeRegres(xMat,yMat,lam=0.2): # ridge regression
return
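    # Ridge solution: w = (X^T X + lam*I)^(-1) X^T y; the lam*I term keeps the matrix invertible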
ws = denom.I * (xMat.T*yMat)
return ws
def ridgeTest(xArr,yArr):
xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)  # compute the mean of Y
@@ -85,6 +90,7 @@ def ridgeTest(xArr,yArr):
wMat[i,:]=ws.T
return wMat
def regularize(xMat):  # standardize each column
inMat = xMat.copy()
    inMeans = mean(inMat,0)  # compute the column means, then subtract them
@@ -227,7 +233,7 @@ def crossValidation(xArr,yArr,numVal=10):
#test for standRegression
def regression1():
xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
xArr, yArr = loadDataSet("testData/Regression_data.txt")
xMat = mat(xArr)
yMat = mat(yArr)
ws = standRegres(xArr, yArr)
@@ -245,7 +251,7 @@ def regression1():
#test for LWLR
def regression2():
xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
xArr, yArr = loadDataSet("testData/Regression_data.txt")
yHat = lwlrTest(xArr, xArr, yArr, 0.003)
xMat = mat(xArr)
    srtInd = xMat[:,1].argsort(0)  # argsort() sorts the elements in ascending order and returns their indices
@@ -259,7 +265,7 @@ def regression2():
#test for ridgeRegression
def regression3():
abX,abY = loadDataSet("../../../testData/Regression_abalone.txt")
abX,abY = loadDataSet("testData/Regression_abalone.txt")
ridgeWeights = ridgeTest(abX, abY)
fig = plt.figure()
ax = fig.add_subplot(111)
@@ -269,7 +275,7 @@ def regression3():
#test for stageWise
def regression4():
xArr,yArr=loadDataSet("../../../testData/Regression_abalone.txt")
xArr,yArr=loadDataSet("testData/Regression_abalone.txt")
stageWise(xArr,yArr,0.01,200)
xMat = mat(xArr)
yMat = mat(yArr).T
@@ -280,7 +286,7 @@ def regression4():
print (weights.T)
if __name__ == "__main__":
#regression1()
#regression2()
#regression3()
regression4()
# regression1()
regression2()
# regression3()
# regression4()