mirror of
https://github.com/apachecn/ailearning.git
synced 2026-05-07 14:13:14 +08:00
添加logistic回归源代码中的预测病马死亡率的代码,修改相应的md文件
This commit is contained in:
@@ -321,7 +321,7 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150):
|
||||
|
||||
[完整代码地址](https://github.com/apachecn/MachineLearning/blob/master/src/python/5.Logistic/logistic.py): <https://github.com/apachecn/MachineLearning/blob/master/src/python/5.Logistic/logistic.py>
|
||||
|
||||
### 项目案例2: 从疝气病症预测病马的死亡率
|
||||
|
||||
#### 项目概述
|
||||
|
||||
@@ -473,27 +473,50 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150):
|
||||
Logistic 回归分类函数
|
||||
|
||||
```python
|
||||
# 分类函数,根据回归系数和特征向量来计算 Sigmoid的值
|
||||
def classifyVector(inX, weights):
|
||||
'''
|
||||
Desc:
|
||||
最终的分类函数,根据回归系数和特征向量来计算 Sigmoid 的值,大于0.5函数返回1,否则返回0
|
||||
Args:
|
||||
inX -- 特征向量,features
|
||||
weights -- 根据梯度下降/随机梯度下降 计算得到的回归系数
|
||||
Returns:
|
||||
如果 prob 计算大于 0.5 函数返回 1
|
||||
否则返回 0
|
||||
'''
|
||||
prob = sigmoid(sum(inX*weights))
|
||||
if prob > 0.5: return 1.0
|
||||
else: return 0.0
|
||||
|
||||
|
||||
# 打开测试集和训练集,并对数据进行格式化处理
|
||||
def colicTest():
|
||||
frTrain = open('horseColicTraining.txt')
|
||||
frTest = open('horseColicTest.txt')
|
||||
'''
|
||||
Desc:
|
||||
打开测试集和训练集,并对数据进行格式化处理
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
errorRate -- 分类错误率
|
||||
'''
|
||||
frTrain = open('input/5.Logistic/horseColicTraining.txt')
|
||||
frTest = open('input/5.Logistic/horseColicTest.txt')
|
||||
trainingSet = []
|
||||
trainingLabels = []
|
||||
# 解析训练数据集中的数据特征和Labels
|
||||
# trainingSet 中存储训练数据集的特征,trainingLabels 存储训练数据集的样本对应的分类标签
|
||||
for line in frTrain.readlines():
|
||||
currLine = line.strip().split('\t')
|
||||
lineArr = []
|
||||
for i in range(21):
|
||||
lineArr.append(float(currLine[i]))
|
||||
trainingSet.append(lineArr)
|
||||
        trainingLabels.append(float(currLine[21]))
|
||||
# 使用 改进后的 随机梯度下降算法 求得在此数据集上的最佳回归系数 trainWeights
|
||||
trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
|
||||
errorCount = 0
|
||||
numTestVec = 0.0
|
||||
# 读取 测试数据集 进行测试,计算分类错误的样本条数和最终的错误率
|
||||
for line in frTest.readlines():
|
||||
numTestVec += 1.0
|
||||
currLine = line.strip().split('\t')
|
||||
@@ -501,12 +524,13 @@ def colicTest():
|
||||
for i in range(21):
|
||||
lineArr.append(float(currLine[i]))
|
||||
if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
|
||||
            errorCount += 1
|
||||
errorRate = (float(errorCount) / numTestVec)
|
||||
print "the error rate of this test is: %f" % errorRate
|
||||
return errorRate
|
||||
|
||||
|
||||
# 调用 colicTest() 10次并求结果的平均值
|
||||
def multiTest():
|
||||
numTests = 10
|
||||
errorSum = 0.0
|
||||
|
||||
@@ -12,6 +12,9 @@ from numpy import *
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 使用 Logistic 回归在简单数据集上的分类
|
||||
|
||||
# 解析数据
|
||||
def loadDataSet(file_name):
|
||||
# dataMat为原始数据, labelMat为原始数据的标签
|
||||
@@ -149,7 +152,7 @@ def plotBestFit(dataArr, labelMat, weights):
|
||||
plt.show()
|
||||
|
||||
|
||||
def main():
|
||||
def simpleTest():
|
||||
# 1.收集并准备数据
|
||||
dataMat, labelMat = loadDataSet("input/5.Logistic/TestSet.txt")
|
||||
|
||||
@@ -167,5 +170,75 @@ def main():
|
||||
plotBestFit(dataArr, labelMat, weights)
|
||||
|
||||
|
||||
#--------------------------------------------------------------------------------
|
||||
# 从疝气病症预测病马的死亡率
|
||||
|
||||
# 分类函数,根据回归系数和特征向量来计算 Sigmoid的值
|
||||
def classifyVector(inX, weights):
    """
    Final classification function: push the weighted feature sum through
    the Sigmoid and threshold the resulting probability at 0.5.

    Args:
        inX -- feature vector of one sample
        weights -- regression coefficients obtained from (stochastic)
                   gradient ascent
    Returns:
        1.0 when the computed probability is greater than 0.5,
        0.0 otherwise
    """
    # Probability that the sample belongs to class 1.
    p = sigmoid(sum(inX * weights))
    return 1.0 if p > 0.5 else 0.0
|
||||
|
||||
# 打开测试集和训练集,并对数据进行格式化处理
|
||||
def colicTest():
|
||||
'''
|
||||
Desc:
|
||||
打开测试集和训练集,并对数据进行格式化处理
|
||||
Args:
|
||||
None
|
||||
Returns:
|
||||
errorRate -- 分类错误率
|
||||
'''
|
||||
frTrain = open('input/5.Logistic/horseColicTraining.txt')
|
||||
frTest = open('input/5.Logistic/horseColicTest.txt')
|
||||
trainingSet = []
|
||||
trainingLabels = []
|
||||
# 解析训练数据集中的数据特征和Labels
|
||||
# trainingSet 中存储训练数据集的特征,trainingLabels 存储训练数据集的样本对应的分类标签
|
||||
for line in frTrain.readlines():
|
||||
currLine = line.strip().split('\t')
|
||||
lineArr = []
|
||||
for i in range(21):
|
||||
lineArr.append(float(currLine[i]))
|
||||
trainingSet.append(lineArr)
|
||||
trainingLabels.append(float(currLine[21]))
|
||||
# 使用 改进后的 随机梯度下降算法 求得在此数据集上的最佳回归系数 trainWeights
|
||||
trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
|
||||
errorCount = 0
|
||||
numTestVec = 0.0
|
||||
# 读取 测试数据集 进行测试,计算分类错误的样本条数和最终的错误率
|
||||
for line in frTest.readlines():
|
||||
numTestVec += 1.0
|
||||
currLine = line.strip().split('\t')
|
||||
lineArr = []
|
||||
for i in range(21):
|
||||
lineArr.append(float(currLine[i]))
|
||||
if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
|
||||
errorCount += 1
|
||||
errorRate = (float(errorCount) / numTestVec)
|
||||
print "the error rate of this test is: %f" % errorRate
|
||||
return errorRate
|
||||
|
||||
|
||||
# 调用 colicTest() 10次并求结果的平均值
|
||||
def multiTest():
|
||||
numTests = 10
|
||||
errorSum = 0.0
|
||||
for k in range(numTests):
|
||||
errorSum += colicTest()
|
||||
print "after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # This commit renamed main() to simpleTest(); the guard must call the
    # surviving entry point only. Uncomment multiTest() to run the
    # horse-colic mortality prediction instead.
    simpleTest()
    # multiTest()
|
||||
|
||||
Reference in New Issue
Block a user