Files
ailearning/src/python/05.Logistic/logRegres.py
2017-03-25 14:03:24 +08:00

164 lines
5.4 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from numpy import *
def loadDataSet():
dataMat = []
labelMat = []
fr = open('testSet.txt')
for line in fr.readlines():
lineArr = line.strip().split()
dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
labelMat.append(int(lineArr[2]))
return dataMat, labelMat
def sigmoid(inX):
return 1.0 / (1 + exp(-inX))
def gradAscent(dataMatIn, classLabels):
# 转化为矩阵[[1,1,2],[1,1,2]....]
dataMatrix = mat(dataMatIn) # convert to NumPy matrix
# 转化为矩阵[[0,1,0,1,0,1.....]],并转制[[0],[1],[0].....]
x = mat(classLabels)
labelMat = x.transpose() # convert to NumPy matrix
# m->数据量 n->特征数
m, n = shape(dataMatrix)
# 步长
alpha = 0.001
# 迭代次数
maxCycles = 500
# 生成一个长度和特征数相同的矩阵此处n为3 -> [[1],[1],[1]]
weights = ones((n, 1))
for k in range(maxCycles): # heavy on matrix operations
# 1. dataMatrix * weights 矩阵乘法: [[1,1,2],[1,1,2]....] * [[1],[1],[1]] -> [[]]
s = dataMatrix * weights
# 把每个特征与系数的乘积只和带入Sigmoid函数
h = sigmoid(dataMatrix * weights) # matrix mult
# [[x,x,x,x,x,......一共一百个误差]]
error = (labelMat - h) # vector subtraction
# dataMatrix.transpose() * error 推理略去
# [[x,x,x,x....一共一百个数],[],[]]
data_tran = dataMatrix.transpose()
# [[a,b,c]]
data_tran_error = data_tran * error
# weights = weights + alpha * dataMatrix.transpose() * error # matrix mult
weights = weights + alpha * data_tran_error
return weights
# 随机梯度上升
# 梯度上升优化算法在每次更新数据集时都需要遍历整个数据集,计算复杂都较高
# 随机梯度上升一次只用一个样本点来更新回归系数
def stocGradAscent0(dataMatrix, classLabels):
m, n = shape(dataMatrix)
alpha = 0.01
weights = ones(n) # initialize to all ones
for i in range(m):
h = sigmoid(sum(dataMatrix[i] * weights))
error = classLabels[i] - h
weights = weights + alpha * error * dataMatrix[i]
return weights
def plotBestFit(weights):
import matplotlib.pyplot as plt
dataMat, labelMat = loadDataSet()
dataArr = array(dataMat)
n = shape(dataArr)[0]
xcord1 = [];
ycord1 = []
xcord2 = [];
ycord2 = []
for i in range(n):
if int(labelMat[i]) == 1:
xcord1.append(dataArr[i, 1]);
ycord1.append(dataArr[i, 2])
else:
xcord2.append(dataArr[i, 1]);
ycord2.append(dataArr[i, 2])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
ax.scatter(xcord2, ycord2, s=30, c='green')
x = arange(-3.0, 3.0, 0.1)
y = (-weights[0] - weights[1] * x) / weights[2]
ax.plot(x, y)
plt.xlabel('X1');
plt.ylabel('X2');
plt.show()
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
m, n = shape(dataMatrix)
weights = ones(n) # initialize to all ones
for j in range(numIter):
dataIndex = range(m)
for i in range(m):
# 步长在不断减小
alpha = 4 / (1.0 + j + i) + 0.0001 # apha decreases with iteration, does not
# 随机选取样本减少周期波动
randIndex = int(random.uniform(0, len(dataIndex))) # go to 0 because of the constant
h = sigmoid(sum(dataMatrix[randIndex] * weights))
error = classLabels[randIndex] - h
weights = weights + alpha * error * dataMatrix[randIndex]
del (dataIndex[randIndex])
return weights
# a, b = loadDataSet()
# weights = gradAscent(a, b)
# plotBestFit(weights)
#
######################################################################################################################
def classifyVector(inX, weights):
prob = sigmoid(sum(inX * weights))
if prob > 0.5:
return 1.0
else:
return 0.0
def colicTest():
frTrain = open('horseColicTraining.txt');
frTest = open('horseColicTest.txt')
trainingSet = [];
trainingLabels = []
for line in frTrain.readlines():
currLine = line.strip().split('\t')
lineArr = []
for i in range(21):
lineArr.append(float(currLine[i]))
trainingSet.append(lineArr)
trainingLabels.append(float(currLine[21]))
trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
errorCount = 0;
numTestVec = 0.0
for line in frTest.readlines():
numTestVec += 1.0
currLine = line.strip().split('\t')
lineArr = []
for i in range(21):
lineArr.append(float(currLine[i]))
if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
errorCount += 1
errorRate = (float(errorCount) / numTestVec)
print "the error rate of this test is: %f" % errorRate
return errorRate
def multiTest():
numTests = 10;
errorSum = 0.0
for k in range(numTests):
errorSum += colicTest()
print "after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests))
# multiTest()
colicTest()