mirror of https://github.com/apachecn/ailearning.git
Update comments
logRegression01.py (deleted file)
@@ -1,103 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
from numpy import *
import matplotlib.pyplot as plt
import time

'''
1. Required module: pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
   Installing it directly can fail, so downloading the whl package and
   installing from that is recommended: https://pypi.python.org/pypi/matplotlib/1.5.0

2. The plotted figure should be displayed.
'''

"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: logRegression01.py
@time: 2017/3/3 22:03
@test result: ok
"""


# the sigmoid function
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))


# Train a logistic regression model.
# train_x: numpy mat of samples, one 1*n row per sample (first column is the constant 1.0)
# train_y: numpy mat of labels, shape numSamples*1
# opts: dict with 'alpha' (step size), 'maxIter' (passes) and 'optimizeType'
def trainLogRegres(train_x, train_y, opts):
    # time the training
    startTime = time.time()

    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']; maxIter = opts['maxIter']
    weights = ones((numFeatures, 1))

    # optimize the weights with the chosen gradient method
    for k in range(maxIter):
        if opts['optimizeType'] == 'gradDescent':  # batch gradient method
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif opts['optimizeType'] == 'stocGradDescent':  # stochastic gradient method
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif opts['optimizeType'] == 'smoothStocGradDescent':  # smoothed stochastic gradient method
            # pick samples at random to reduce cyclic fluctuation
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01
                randIndex = int(random.uniform(0, len(dataIndex)))
                sampleIndex = dataIndex[randIndex]  # map the list position to a real row index
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                del(dataIndex[randIndex])  # drop the used sample for the rest of this pass
        else:
            raise NameError('Unsupported optimize method type!')

    print 'Congratulations, training complete! Took %fs!' % (time.time() - startTime)
    return weights
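

# In symbols, trainLogRegres fits the usual logistic model (a notational
# sketch matching the matrix shapes above; the symbols are mine, not from
# the original comments):
#     P(y = 1 | x) = sigmoid(x * w) = 1 / (1 + exp(-x * w))
# where x is a 1*n sample row whose first entry is the constant 1.0 and w is
# the n*1 weight column, so train_x * weights is the m*1 vector of scores.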


# Test the trained logistic regression model on a given test set.
def testLogRegres(weights, test_x, test_y):
    numSamples, numFeatures = shape(test_x)
    matchCount = 0
    for i in xrange(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    accuracy = float(matchCount) / numSamples
    return accuracy
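

# The 0.5 cutoff above is the same as thresholding the raw score at zero,
# since sigmoid(0) = 0.5 and sigmoid is monotone; a tiny illustration
# (values chosen only for demonstration):
#     sigmoid(mat([[0.3, -0.7]])) > 0.5   ->  matrix([[ True, False]])
#     mat([[0.3, -0.7]]) > 0              ->  matrix([[ True, False]])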


# Plot the trained logistic regression model; only 2-D data can be shown.
def showLogRegres(weights, train_x, train_y):
    # note: train_x and train_y are numpy mat types
    numSamples, numFeatures = shape(train_x)
    if numFeatures != 3:
        print "Sorry! Cannot plot: the data is not 2-dimensional!"
        return 1

    # plot all the samples
    for i in xrange(numSamples):
        if int(train_y[i, 0]) == 0:
            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
        elif int(train_y[i, 0]) == 1:
            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')

    # plot the decision boundary
    min_x = min(train_x[:, 1])[0, 0]
    max_x = max(train_x[:, 1])[0, 0]
    weights = weights.getA()  # convert mat to array
    y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
    y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    plt.xlabel('X1'); plt.ylabel('X2')
    # show the figure
    plt.show()
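
# Why the plotted segment follows y = (-w0 - w1*x) / w2: with the constant
# feature x0 = 1, the decision boundary is where the score is zero,
#     w0 + w1*x1 + w2*x2 = 0   =>   x2 = (-w0 - w1*x1) / w2,
# and x2 is the quantity on the figure's y axis.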
test_logRegression.py (deleted file)
@@ -1,54 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
import os
import sys
sys.path.append("C:\\Python27")
from numpy import *

from logRegression01 import *
"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: test_logRegression.py
@time: 2017/3/3 22:09
@test result: ok
"""


def loadData():
    train_x = []
    train_y = []
    # get the current working directory
    project_dir = os.getcwdu()
    # truncate the path at the project name: MachineLearning\
    project_dir = project_dir[:project_dir.find("MachineLearning\\") + 15]
    print project_dir
    fileIn = open("%s/testData/Logistic_testdata.txt" % project_dir)
    for line in fileIn.readlines():
        lineArr = line.strip().split()
        train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
        train_y.append(float(lineArr[2]))
    return mat(train_x), mat(train_y).transpose()


## step 1: load data
print "step 1: load data..."
train_x, train_y = loadData()
test_x = train_x; test_y = train_y

## step 2: training
print "step 2: training..."
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
optimalWeights = trainLogRegres(train_x, train_y, opts)

## step 3: testing
print "step 3: testing..."
accuracy = testLogRegres(optimalWeights, test_x, test_y)

## step 4: show the result
print "step 4: show the result..."
print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
showLogRegres(optimalWeights, train_x, train_y)
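
## optional extra (a minimal sketch): compare all three optimizeType values
## supported by trainLogRegres on the same data
for cmpType in ('gradDescent', 'stocGradDescent', 'smoothStocGradDescent'):
    cmpOpts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': cmpType}
    cmpWeights = trainLogRegres(train_x, train_y, cmpOpts)
    print '%s accuracy: %.3f%%' % (cmpType, testLogRegres(cmpWeights, test_x, test_y) * 100)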
@@ -1,164 +0,0 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from numpy import *


def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))


def gradAscent(dataMatIn, classLabels):
    # convert to a NumPy matrix: [[1,x1,x2],[1,x1,x2]....]
    dataMatrix = mat(dataMatIn)
    # convert the label list [0,1,0,1,...] to a matrix and transpose the
    # row into a column: [[0],[1],[0],...]
    x = mat(classLabels)
    labelMat = x.transpose()
    # m -> number of samples, n -> number of features
    m, n = shape(dataMatrix)
    # step size
    alpha = 0.001
    # number of iterations
    maxCycles = 500
    # an all-ones column with one entry per feature; here n is 3 -> [[1],[1],[1]]
    weights = ones((n, 1))
    for k in range(maxCycles):  # heavy on matrix operations
        # dataMatrix * weights, matrix product: (m*n) * (n*1) -> m*1, i.e.
        # the weighted feature sum of every sample, pushed through the sigmoid
        h = sigmoid(dataMatrix * weights)  # matrix mult
        # m*1 column of errors, one per sample
        error = (labelMat - h)  # vector subtraction
        # dataMatrix.transpose() * error: (n*m) * (m*1) -> n*1
        # (derivation omitted here; see the note after this function)
        data_tran = dataMatrix.transpose()
        data_tran_error = data_tran * error

        # weights = weights + alpha * dataMatrix.transpose() * error  # matrix mult
        weights = weights + alpha * data_tran_error
    return weights
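

# The update above, w <- w + alpha * X^T * (y - h), is gradient ascent on the
# log-likelihood; filling in the derivation that the comment above omits:
#     l(w)  = sum_i [ y_i*log(h_i) + (1 - y_i)*log(1 - h_i) ],   h = sigmoid(X*w)
#     dl/dw = X^T * (y - h)     (using sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z)))
# so stepping along the gradient with step size alpha gives the update rule.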


# Stochastic gradient ascent.
# Plain gradient ascent scans the whole data set for every update, which is
# computationally expensive; stochastic gradient ascent updates the
# coefficients with one sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)  # initialize to all ones
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights


def plotBestFit(weights):
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)  # initialize to all ones
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # the step size keeps shrinking with each iteration, but the
            # constant keeps it from ever reaching 0
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick samples at random to reduce cyclic fluctuation
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]  # map the list position to a real row index
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del(dataIndex[randIndex])  # drop the used sample for the rest of this pass
    return weights
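

# The schedule alpha = 4/(1.0+j+i) + 0.0001 starts large and decays toward
# its 0.0001 floor, so early updates move fast while later ones settle; the
# first few values (j = pass number, i = step within the pass) are roughly:
#     j=0: 4.000, 2.000, 1.333, 1.000, ...
#     j=1: 2.000, 1.333, 1.000, 0.800, ...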


# a, b = loadDataSet()
# weights = gradAscent(a, b)
# plotBestFit(weights)
#

######################################################################################################################

def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0


def colicTest():
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0
    numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print "the error rate of this test is: %f" % errorRate
    return errorRate


def multiTest():
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print "after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests))


# multiTest()

colicTest()
src/python/05.Logistic/logistic.py (new file, 151 lines)
@@ -0,0 +1,151 @@
#!/usr/bin/python
# coding: utf8

'''
Created on Oct 27, 2010
Logistic Regression Working Module
@author: Peter
'''
from numpy import *
import matplotlib.pyplot as plt


# parse the data
def loadDataSet(file_name):
    # dataMat holds the raw samples, labelMat their labels
    dataMat = []; labelMat = []
    fr = open(file_name)
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


# the sigmoid function (a smooth step function)
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))


# the straightforward approach
def gradAscent(dataMatIn, classLabels):
    # convert to a NumPy matrix: [[1,x1,x2],[1,x1,x2]....]
    dataMatrix = mat(dataMatIn)
    # convert the label list [0,1,0,1,...] to a matrix and use transpose()
    # to turn the row into a column: [[0],[1],[0],...]
    labelMat = mat(classLabels).transpose()
    # m -> number of samples, n -> number of features
    m, n = shape(dataMatrix)
    # print m, n, '__'*10, shape(dataMatrix.transpose()), '__'*100
    # step size
    alpha = 0.001
    # number of iterations
    maxCycles = 500
    # regression coefficients: an all-ones column with one entry per
    # feature; here n is 3 -> [[1],[1],[1]]
    weights = ones((n, 1))
    for k in range(maxCycles):  # heavy on matrix operations
        # (m*n matrix) * (n*1 column) = m*1 column: one matrix product
        # yields the model's predicted value for every sample
        # (on what matrix multiplication means, see:
        # https://www.zhihu.com/question/21351965/answer/31050145)
        h = sigmoid(dataMatrix * weights)  # matrix mult
        # labelMat holds the actual values
        error = (labelMat - h)  # vector subtraction
        # alpha * (n*m)*(m*1): the accumulated per-coefficient error, i.e.
        # the offsets for the coefficients of x1, x2, .., xn
        weights = weights + alpha * dataMatrix.transpose() * error  # matrix mult
    return array(weights)


# Stochastic gradient ascent.
# Plain gradient ascent scans the whole data set for every update, which is
# computationally expensive; stochastic gradient ascent updates the
# coefficients with one sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    # ones() creates an all-ones array of length n
    weights = ones(n)  # initialize to all ones
    for i in range(m):
        # sum(dataMatrix[i]*weights) evaluates f(x) = w0*x0 + w1*x1 + .. + wn*xn
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        # 0.01 * (1*1) * (1*n)
        # print weights, "*"*10, dataMatrix[i], "*"*10, error
        weights = weights + alpha * error * dataMatrix[i]
    return weights


# stochastic gradient ascent with randomized sample order
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)  # initialize to all ones
    # run numIter random passes (150 by default) and watch for convergence
    for j in range(numIter):
        # [0, 1, 2 .. m-1]
        dataIndex = list(range(m))
        for i in range(m):
            # as i and j grow, alpha keeps shrinking, but the 0.0001
            # constant keeps it from reaching 0
            alpha = 4 / (1.0 + j + i) + 0.0001
            # draw a random position between 0 and len(dataIndex)
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]  # map the list position to a real row index
            # sum(dataMatrix[sampleIndex]*weights) evaluates f(x) = w0*x0 + w1*x1 + .. + wn*xn
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            # print weights, '__h=%s' % h, '__'*20, alpha, '__'*20, error, '__'*20, dataMatrix[sampleIndex]
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del(dataIndex[randIndex])  # drop the used sample for the rest of this pass
    return weights


# visualize the result
def plotBestFit(dataArr, labelMat, weights):
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    """
    Where does y come from?
    In theory it looks like this: each sample was loaded as
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
    so the model is w0*x0 + w1*x1 + w2*x2 = f(x), where x0 is fixed at 1.0
    and x2 is the value we plot on the y axis, while f(x) has been absorbed
    into w0, w1, w2 through the fitted error.
    On the decision boundary: w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2
    """
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X'); plt.ylabel('Y')
    plt.show()


def main():
    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. collect and prepare the data
    # dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
    dataMat, labelMat = loadDataSet("testData/Logistic_testdata.txt")

    # print dataMat, '---\n', labelMat
    # 2. train the model: find (w1, w2, .., wn).T in f(x) = w1*x1 + w2*x2 + .. + wn*xn
    # use an array rather than a mat: array multiplication is elementwise
    dataArr = array(dataMat)
    # print dataArr
    # weights = gradAscent(dataArr, labelMat)
    # weights = stocGradAscent0(dataArr, labelMat)
    weights = stocGradAscent1(dataArr, labelMat)
    # print '*'*30, weights

    # 3. visualize the data
    plotBestFit(dataArr, labelMat, weights)


if __name__ == "__main__":
    main()