mirror of
https://github.com/apachecn/ailearning.git
synced 2026-05-10 08:12:00 +08:00
136 lines
5.1 KiB
Python
136 lines
5.1 KiB
Python
#!/usr/bin/python
|
||
# coding: utf8
|
||
|
||
'''
|
||
Created on Oct 27, 2010
|
||
Logistic Regression Working Module
|
||
@author: Peter
|
||
'''
|
||
|
||
import os
|
||
from numpy import *
|
||
import matplotlib.pyplot as plt
|
||
# 解析数据
|
||
def loadDataSet(file_name):
    """Load a whitespace-delimited, two-feature data set from a text file.

    Every non-blank line must contain at least three whitespace-separated
    fields: two float features followed by an integer class label.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the constant input that
        pairs with the intercept weight) and ``labelMat`` is the list of
        integer labels, both in file order.

    Raises:
        IOError / OSError: If the file cannot be opened.
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If a field cannot be converted to float / int.
    """
    dataMat = []
    labelMat = []
    # The context manager closes the handle even when parsing raises.
    with open(file_name) as fr:
        # Iterate lazily instead of materialising the file with readlines().
        for line in fr:
            lineArr = line.split()
            if not lineArr:
                # Tolerate blank lines (e.g. a trailing empty line at EOF).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
|
||
|
||
# Sigmoid (logistic) function: a smooth approximation of a step function
|
||
def sigmoid(inX):
    """Evaluate the logistic function ``1 / (1 + e**(-inX))``.

    Args:
        inX: A number, or a NumPy array / matrix that is processed
            element-wise by ``exp``.

    Returns:
        The logistic value(s) of ``inX``, each strictly between 0 and 1
        in exact arithmetic.
    """
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
|
||
|
||
# Batch gradient ascent: every iteration uses the whole data set
|
||
def gradAscent(dataMatIn, classLabels):
    """Fit logistic-regression weights with batch gradient ascent.

    Runs a fixed 500 iterations with a constant step size of 0.001,
    starting from an all-ones weight vector.

    Args:
        dataMatIn: 2-D array-like of samples, one row per sample and one
            column per feature (m x n).
        classLabels: Sequence of m numeric class labels (presumably 0/1 --
            confirm against the data file).

    Returns:
        An n x 1 NumPy array holding the fitted weights.
    """
    features = mat(dataMatIn)                # m x n NumPy matrix
    targets = mat(classLabels).transpose()   # row of labels -> m x 1 column
    featuresT = features.transpose()         # n x m, reused by every update
    numFeatures = shape(features)[1]

    stepSize = 0.001
    iterations = 500
    weights = ones((numFeatures, 1))

    for _ in range(iterations):
        # (m x n) * (n x 1) -> m x 1 column of predicted probabilities.
        predicted = sigmoid(features * weights)
        # Observed label minus prediction, one entry per sample.
        residual = targets - predicted
        # (n x m) * (m x 1) -> n x 1: per-weight adjustment accumulated
        # over all samples, scaled by the step size.
        weights = weights + stepSize * featuresT * residual
    return array(weights)
|
||
|
||
# Stochastic gradient ascent: a single pass that updates the weights one sample at a time
|
||
def stocGradAscent0(dataMatrix, classLabels):
    """Fit logistic-regression weights with one pass of stochastic gradient ascent.

    The samples are visited once, in order, and the weights are updated
    after each sample with a constant step size of 0.01. A debug trace of
    the current weights, sample and error is printed before every update.

    Args:
        dataMatrix: 2-D NumPy array, one row per sample (m x n). Row-times-
            weights is meant to be an element-wise product, so a plain
            ``ndarray`` is expected rather than a list or ``matrix``.
        classLabels: Sequence of m numeric class labels.

    Returns:
        A 1-D NumPy array of n fitted weights.
    """
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)  # start from all ones
    for i in range(m):
        # sum(row * weights) is the linear score w1*x1 + w2*x2 + ... + wn*xn.
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        # Debug trace. Written as a single pre-formatted argument so it is
        # valid and prints identically on both Python 2 and Python 3 (the
        # original `print a, b, ...` statement is a SyntaxError on Python 3).
        print("%s %s %s %s %s" % (weights, "*" * 10, dataMatrix[i], "*" * 10, error))
        # scalar step * scalar error * (n,) sample -> (n,) adjustment
        weights = weights + alpha * error * dataMatrix[i]
    return weights
|
||
|
||
# Stochastic gradient ascent with a random sample order and a decaying step size
|
||
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights with randomised stochastic gradient ascent.

    Each of the ``numIter`` passes visits every sample exactly once in a
    random order, updating the weights after each sample with a step size
    that shrinks as the pass and sample counters grow.

    Args:
        dataMatrix: 2-D NumPy array, one row per sample (m x n). Row-times-
            weights is meant to be an element-wise product, so a plain
            ``ndarray`` is expected rather than a list or ``matrix``.
        classLabels: Sequence of m numeric class labels.
        numIter: Number of full passes over the data (default 150).

    Returns:
        A 1-D NumPy array of n fitted weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)  # start from all ones
    for j in range(numIter):
        # Pool of sample indices not yet used in this pass. It must be a
        # real list: entries are removed below, which a Python 3 `range`
        # object does not support.
        dataIndex = list(range(m))
        for i in range(m):
            # The step size decreases as j and i grow, but the constant
            # term keeps it from ever reaching zero.
            alpha = 4 / (1.0 + j + i) + 0.0001
            # Random position inside the pool of remaining indices.
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Map the pool position to the actual sample row and remove it
            # from the pool, so no sample is drawn twice within one pass.
            # (Indexing the data with the pool position directly would make
            # the removal meaningless and skew sampling towards low rows.)
            sampleIndex = dataIndex.pop(randIndex)
            # sum(row * weights) is the linear score w1*x1 + ... + wn*xn.
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
    return weights
|
||
|
||
# 可视化展示
|
||
def plotBestFit(dataArr, labelMat, weights):
    """Scatter-plot the two classes and draw the fitted decision boundary.

    Args:
        dataArr: 2-D NumPy array of samples; column 1 is plotted on the
            horizontal axis and column 2 on the vertical axis.
        labelMat: Sequence of class labels, one per row of ``dataArr``.
            Samples whose label converts to integer 1 are drawn as red
            squares, all others as green markers.
        weights: Indexable holding at least three weights ``w0, w1, w2``.

    Returns:
        None. Opens a matplotlib window via ``plt.show()``.
    """
    labelOneX, labelOneY = [], []
    otherX, otherY = [], []
    for idx in range(shape(dataArr)[0]):
        # Choose the pair of coordinate lists that matches this sample's class.
        if int(labelMat[idx]) == 1:
            xs, ys = labelOneX, labelOneY
        else:
            xs, ys = otherX, otherY
        xs.append(dataArr[idx, 1])
        ys.append(dataArr[idx, 2])

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(labelOneX, labelOneY, s=30, c='red', marker='s')
    axes.scatter(otherX, otherY, s=30, c='green')

    boundaryX = arange(-3.0, 3.0, 0.1)
    # Where the line comes from: each sample is stored as [1.0, x1, x2], so
    # the model score is w0*1 + w1*x1 + w2*x2. The decision boundary is
    # where that score equals zero; with x1 on the horizontal axis and x2
    # on the vertical axis:
    #     w0 + w1*x + w2*y = 0   =>   y = (-w0 - w1*x) / w2
    w0 = weights[0]
    w1 = weights[1]
    w2 = weights[2]
    boundaryY = (-w0 - w1 * boundaryX) / w2
    axes.plot(boundaryX, boundaryY)

    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
|
||
|
||
def main():
    """Load the sample data, train a logistic-regression model and plot it.

    The data file is looked up at ``<project_dir>/resources/testSet.txt``,
    where ``project_dir`` is two directory levels above the current working
    directory, so the result depends on where the script is started from.
    """
    # 1. Collect and prepare the data.
    workDir = os.getcwd()
    project_dir = os.path.dirname(os.path.dirname(workDir))
    dataPath = "%s/resources/testSet.txt" % project_dir
    samples, labels = loadDataSet(dataPath)

    # 2. Train the model, i.e. find the weight of each input column.
    # The stochastic trainers multiply a row by the weights element-wise,
    # which needs a NumPy array rather than a list of lists.
    dataArr = array(samples)
    # gradAscent(dataArr, labels) and stocGradAscent0(dataArr, labels) are
    # drop-in alternatives for the trainer used here.
    weights = stocGradAscent1(dataArr, labels)

    # 3. Visualise the samples and the fitted boundary.
    plotBestFit(dataArr, labels, weights)
|
||
|
||
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()