# Logistic regression (Machine Learning in Action, ch.5):
# batch / stochastic gradient ascent and the horse-colic classification example.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from numpy import *
def loadDataSet():
    """Load the sample data set from 'testSet.txt'.

    Each line of the file holds two feature values and an integer class
    label, whitespace separated: "x1 x2 label".

    Returns:
        dataMat: list of [1.0, x1, x2] rows (leading 1.0 is the bias term).
        labelMat: list of int class labels, one per row.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the handle is closed (the original leaked it).
    with open('testSet.txt') as fr:
        for line in fr:
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def sigmoid(inX):
    """Logistic function mapping inX (scalar or NumPy array/matrix) into (0, 1)."""
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
def gradAscent(dataMatIn, classLabels):
    """Batch gradient ascent for logistic regression.

    Args:
        dataMatIn: 2-D sequence of samples, each row [1.0, x1, x2, ...]
            (first column is the bias term).
        classLabels: sequence of 0/1 class labels, one per sample.

    Returns:
        numpy matrix of shape (n, 1) holding the fitted weights.
    """
    # Convert to NumPy matrices: X is (m, n), labels become an (m, 1) column.
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    # m -> number of samples, n -> number of features.
    m, n = shape(dataMatrix)
    # Step size (learning rate).
    alpha = 0.001
    # Fixed number of full-batch iterations.
    maxCycles = 500
    # (n, 1) weight column initialised to all ones, e.g. [[1], [1], [1]].
    weights = ones((n, 1))
    for k in range(maxCycles):  # heavy on matrix operations
        # Predicted probability for every sample: sigmoid(X @ w), shape (m, 1).
        # (The original computed dataMatrix * weights twice; the first result
        # was stored in an unused local and has been removed.)
        h = sigmoid(dataMatrix * weights)
        # Per-sample prediction error, shape (m, 1).
        error = labelMat - h
        # Gradient-ascent step: w += alpha * X^T @ error.
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights
# Stochastic gradient ascent: the batch version must sweep the whole data set
# for every single update, which is expensive; the stochastic version updates
# the coefficients from one sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
    """One-pass stochastic gradient ascent.

    Args:
        dataMatrix: numpy array of shape (m, n), rows are samples.
        classLabels: sequence of m class labels (0 or 1).

    Returns:
        numpy array of n fitted weights.
    """
    sampleCount, featureCount = shape(dataMatrix)
    alpha = 0.01
    # Start from all-ones weights.
    weights = ones(featureCount)
    for idx in range(sampleCount):
        # Scalar prediction for this single sample.
        prediction = sigmoid(sum(dataMatrix[idx] * weights))
        # Scalar error drives the update along the sample's feature vector.
        delta = classLabels[idx] - prediction
        weights = weights + alpha * delta * dataMatrix[idx]
    return weights
def plotBestFit(weights):
    """Scatter-plot the data set and draw the fitted decision boundary.

    Args:
        weights: fitted coefficient vector [w0, w1, w2]; the boundary is
            the line where w0 + w1*x1 + w2*x2 == 0.
    """
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    # Split the points by class so each class gets its own colour.
    xcord1, ycord1 = [], []
    xcord2, ycord2 = [], []
    for row, label in zip(dataArr, labelMat):
        if int(label) == 1:
            xcord1.append(row[1])
            ycord1.append(row[2])
        else:
            xcord2.append(row[1])
            ycord2.append(row[2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # Boundary line: solve w0 + w1*x + w2*y = 0 for y over x in [-3, 3).
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    Two refinements over stocGradAscent0:
      * the step size alpha shrinks as training progresses (never reaching
        zero thanks to the 0.0001 constant), damping oscillation;
      * within each pass, samples are drawn in random order without
        replacement, reducing periodic fluctuation of the weights.

    Args:
        dataMatrix: numpy array of shape (m, n), rows are samples.
        classLabels: sequence of m class labels (0 or 1).
        numIter: number of full passes over the data set.

    Returns:
        numpy array of n fitted weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)  # initialize to all ones
    for j in range(numIter):
        # list() is required: Python 3 range objects don't support item
        # deletion (the original Python 2 code relied on range() -> list).
        dataIndex = list(range(m))
        for i in range(m):
            # Step size decays with iteration but stays >= 0.0001.
            alpha = 4 / (1.0 + j + i) + 0.0001
            # Pick a random position among the not-yet-used samples.
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Map through dataIndex so deletion really gives sampling
            # without replacement (the original indexed dataMatrix by
            # randIndex directly, making the del below ineffective).
            sampleIdx = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            # Remove the used sample so each is visited once per pass.
            del dataIndex[randIndex]
    return weights
# a, b = loadDataSet()
# weights = gradAscent(a, b)
# plotBestFit(weights)

######################################################################################################################
def classifyVector(inX, weights):
    """Classify one sample with trained logistic-regression weights.

    Args:
        inX: feature vector (numpy array) for a single sample.
        weights: fitted weight vector of the same length.

    Returns:
        1.0 if the predicted probability exceeds 0.5, otherwise 0.0.
    """
    probability = sigmoid(sum(inX * weights))
    return 1.0 if probability > 0.5 else 0.0
def colicTest():
    """Train on the horse-colic training file and score the test file.

    Both files are tab separated: 21 feature columns followed by a 0/1
    label in column 21.  Trains stocGradAscent1 for 1000 passes, then
    counts misclassified test rows.

    Returns:
        The error rate (float in [0, 1]) on the test set.
    """
    trainingSet = []
    trainingLabels = []
    # 'with' closes the handle (the original leaked both files).
    with open('horseColicTraining.txt') as frTrain:
        for line in frTrain:
            currLine = line.strip().split('\t')
            lineArr = [float(currLine[i]) for i in range(21)]
            trainingSet.append(lineArr)
            trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0
    numTestVec = 0.0
    with open('horseColicTest.txt') as frTest:
        for line in frTest:
            numTestVec += 1.0
            currLine = line.strip().split('\t')
            lineArr = [float(currLine[i]) for i in range(21)]
            if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    # Parenthesized single-argument print works under both Python 2 and 3
    # (the original used the Python-2-only print statement).
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
def multiTest():
    """Run colicTest() ten times and report the average error rate.

    Stochastic gradient ascent draws samples at random, so individual
    runs differ; averaging several runs gives a steadier estimate.
    """
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    # Parenthesized single-argument print works under both Python 2 and 3.
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))
# multiTest()

# Guard the demo run so importing this module does not trigger training I/O.
if __name__ == '__main__':
    colicTest()