From 05f383ee51dab056cac58bda6101710df4e5eec5 Mon Sep 17 00:00:00 2001 From: chenyyx Date: Mon, 11 Sep 2017 14:50:14 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0logistic=E5=9B=9E=E5=BD=92?= =?UTF-8?q?=E6=BA=90=E4=BB=A3=E7=A0=81=E4=B8=AD=E7=9A=84=E9=A2=84=E6=B5=8B?= =?UTF-8?q?=E7=97=85=E9=A9=AC=E6=AD=BB=E4=BA=A1=E7=8E=87=E7=9A=84=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E4=BF=AE=E6=94=B9=E7=9B=B8=E5=BA=94=E7=9A=84?= =?UTF-8?q?md=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/5.Logistic回归.md | 36 ++++++++++++--- src/python/5.Logistic/logistic.py | 77 ++++++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 8 deletions(-) diff --git a/docs/5.Logistic回归.md b/docs/5.Logistic回归.md index 700946ed..95172758 100644 --- a/docs/5.Logistic回归.md +++ b/docs/5.Logistic回归.md @@ -321,7 +321,7 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150): [完整代码地址](https://github.com/apachecn/MachineLearning/blob/master/src/python/5.Logistic/logistic.py): -### 项目案例3: 从疝气病症预测病马的死亡率 +### 项目案例2: 从疝气病症预测病马的死亡率 #### 项目概述 @@ -473,27 +473,50 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150): Logistic 回归分类函数 ```python +# 分类函数,根据回归系数和特征向量来计算 Sigmoid的值 def classifyVector(inX, weights): + ''' + Desc: + 最终的分类函数,根据回归系数和特征向量来计算 Sigmoid 的值,大于0.5函数返回1,否则返回0 + Args: + inX -- 特征向量,features + weights -- 根据梯度下降/随机梯度下降 计算得到的回归系数 + Returns: + 如果 prob 计算大于 0.5 函数返回 1 + 否则返回 0 + ''' prob = sigmoid(sum(inX*weights)) if prob > 0.5: return 1.0 else: return 0.0 - +# 打开测试集和训练集,并对数据进行格式化处理 def colicTest(): - frTrain = open('horseColicTraining.txt') - frTest = open('horseColicTest.txt') + ''' + Desc: + 打开测试集和训练集,并对数据进行格式化处理 + Args: + None + Returns: + errorRate -- 分类错误率 + ''' + frTrain = open('input/5.Logistic/horseColicTraining.txt') + frTest = open('input/5.Logistic/horseColicTest.txt') trainingSet = [] trainingLabels = [] + # 解析训练数据集中的数据特征和Labels + # trainingSet 中存储训练数据集的特征,trainingLabels 存储训练数据集的样本对应的分类标签 for line in frTrain.readlines(): currLine = line.strip().split('\t') lineArr = [] for i in range(21): lineArr.append(float(currLine[i])) trainingSet.append(lineArr) - trainLabels.append(float(currLine[21])) + trainingLabels.append(float(currLine[21])) + # 使用 改进后的 随机梯度下降算法 求得在此数据集上的最佳回归系数 trainWeights trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500) errorCount = 0 numTestVec = 0.0 + # 读取 测试数据集 进行测试,计算分类错误的样本条数和最终的错误率 for line in frTest.readlines(): numTestVec += 1.0 currLine = line.strip().split('\t') @@ -501,12 +524,13 @@ def colicTest(): for i in range(21): lineArr.append(float(currLine[i])) if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]): - errorCount += 1 + errorCount += 1 errorRate = (float(errorCount) / numTestVec) print "the error rate of this test is: %f" % errorRate return errorRate +# 调用 colicTest() 10次并求结果的平均值 def multiTest(): numTests = 10 errorSum = 0.0 diff --git a/src/python/5.Logistic/logistic.py b/src/python/5.Logistic/logistic.py index c8e85021..60641821 100644 --- a/src/python/5.Logistic/logistic.py +++ b/src/python/5.Logistic/logistic.py @@ -12,6 +12,9 @@ from numpy import * import matplotlib.pyplot as plt +# --------------------------------------------------------------------------- +# 使用 Logistic 回归在简单数据集上的分类 + # 解析数据 def loadDataSet(file_name): # dataMat为原始数据, labelMat为原始数据的标签 @@ -149,7 +152,7 @@ def plotBestFit(dataArr, labelMat, weights): plt.show() -def main(): +def simpleTest(): # 1.收集并准备数据 dataMat, labelMat = loadDataSet("input/5.Logistic/TestSet.txt") @@ -167,5 +170,75 @@ def main(): plotBestFit(dataArr, labelMat, weights) +#-------------------------------------------------------------------------------- +# 从疝气病症预测病马的死亡率 + +# 分类函数,根据回归系数和特征向量来计算 Sigmoid的值 +def classifyVector(inX, weights): + ''' + Desc: + 最终的分类函数,根据回归系数和特征向量来计算 Sigmoid 的值,大于0.5函数返回1,否则返回0 + Args: + inX -- 特征向量,features + weights -- 根据梯度下降/随机梯度下降 计算得到的回归系数 + Returns: + 如果 prob 计算大于 0.5 函数返回 1 + 否则返回 0 + ''' + prob = sigmoid(sum(inX*weights)) + if prob > 0.5: return 1.0 + else: return 0.0 + +# 打开测试集和训练集,并对数据进行格式化处理 +def colicTest(): + ''' + Desc: + 打开测试集和训练集,并对数据进行格式化处理 + Args: + None + Returns: + errorRate -- 分类错误率 + ''' + frTrain = open('input/5.Logistic/horseColicTraining.txt') + frTest = open('input/5.Logistic/horseColicTest.txt') + trainingSet = [] + trainingLabels = [] + # 解析训练数据集中的数据特征和Labels + # trainingSet 中存储训练数据集的特征,trainingLabels 存储训练数据集的样本对应的分类标签 + for line in frTrain.readlines(): + currLine = line.strip().split('\t') + lineArr = [] + for i in range(21): + lineArr.append(float(currLine[i])) + trainingSet.append(lineArr) + trainingLabels.append(float(currLine[21])) + # 使用 改进后的 随机梯度下降算法 求得在此数据集上的最佳回归系数 trainWeights + trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500) + errorCount = 0 + numTestVec = 0.0 + # 读取 测试数据集 进行测试,计算分类错误的样本条数和最终的错误率 + for line in frTest.readlines(): + numTestVec += 1.0 + currLine = line.strip().split('\t') + lineArr = [] + for i in range(21): + lineArr.append(float(currLine[i])) + if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]): + errorCount += 1 + errorRate = (float(errorCount) / numTestVec) + print "the error rate of this test is: %f" % errorRate + return errorRate + + +# 调用 colicTest() 10次并求结果的平均值 +def multiTest(): + numTests = 10 + errorSum = 0.0 + for k in range(numTests): + errorSum += colicTest() + print "after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests)) + + if __name__ == "__main__": - main() + simpleTest() + # multiTest()