mirror of https://github.com/apachecn/ailearning.git
Update comments
logRegression01.py (deleted file)
@@ -1,103 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
from numpy import *
import matplotlib.pyplot as plt
import time

'''
1. Required module: pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
   Installing it directly can fail, so downloading the whl package and
   installing from that is recommended: https://pypi.python.org/pypi/matplotlib/1.5.0

2. The plotted figure should be displayed.
'''

"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: logRegression01.py
@time: 2017/3/3 22:03
@test result: ok
"""


# the sigmoid function
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))


# Train a logistic regression model.
# train_x: numpy mat of samples, one 1*n row per sample (first column is the constant 1.0)
# train_y: numpy mat of labels, shape numSamples*1
# opts: dict with 'alpha' (step size), 'maxIter' (passes) and 'optimizeType'
def trainLogRegres(train_x, train_y, opts):
    # time the training
    startTime = time.time()

    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']; maxIter = opts['maxIter']
    weights = ones((numFeatures, 1))

    # optimize the weights with the chosen gradient method
    for k in range(maxIter):
        if opts['optimizeType'] == 'gradDescent':  # batch gradient method
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif opts['optimizeType'] == 'stocGradDescent':  # stochastic gradient method
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif opts['optimizeType'] == 'smoothStocGradDescent':  # smoothed stochastic gradient method
            # pick samples at random to reduce cyclic fluctuation
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01
                randIndex = int(random.uniform(0, len(dataIndex)))
                sampleIndex = dataIndex[randIndex]  # map the list position to a real row index
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                del(dataIndex[randIndex])  # drop the used sample for the rest of this pass
        else:
            raise NameError('Unsupported optimize method type!')

    print 'Congratulations, training complete! Took %fs!' % (time.time() - startTime)
    return weights
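

# In symbols, trainLogRegres fits the usual logistic model (a notational
# sketch matching the matrix shapes above; the symbols are mine, not from
# the original comments):
#     P(y = 1 | x) = sigmoid(x * w) = 1 / (1 + exp(-x * w))
# where x is a 1*n sample row whose first entry is the constant 1.0 and w is
# the n*1 weight column, so train_x * weights is the m*1 vector of scores.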


# Test the trained logistic regression model on a given test set.
def testLogRegres(weights, test_x, test_y):
    numSamples, numFeatures = shape(test_x)
    matchCount = 0
    for i in xrange(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    accuracy = float(matchCount) / numSamples
    return accuracy
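

# The 0.5 cutoff above is the same as thresholding the raw score at zero,
# since sigmoid(0) = 0.5 and sigmoid is monotone; a tiny illustration
# (values chosen only for demonstration):
#     sigmoid(mat([[0.3, -0.7]])) > 0.5   ->  matrix([[ True, False]])
#     mat([[0.3, -0.7]]) > 0              ->  matrix([[ True, False]])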


# Plot the trained logistic regression model; only 2-D data can be shown.
def showLogRegres(weights, train_x, train_y):
    # note: train_x and train_y are numpy mat types
    numSamples, numFeatures = shape(train_x)
    if numFeatures != 3:
        print "Sorry! Cannot plot: the data is not 2-dimensional!"
        return 1

    # plot all the samples
    for i in xrange(numSamples):
        if int(train_y[i, 0]) == 0:
            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
        elif int(train_y[i, 0]) == 1:
            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')

    # plot the decision boundary
    min_x = min(train_x[:, 1])[0, 0]
    max_x = max(train_x[:, 1])[0, 0]
    weights = weights.getA()  # convert mat to array
    y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
    y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    plt.xlabel('X1'); plt.ylabel('X2')
    # show the figure
    plt.show()
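
# Why the plotted segment follows y = (-w0 - w1*x) / w2: with the constant
# feature x0 = 1, the decision boundary is where the score is zero,
#     w0 + w1*x1 + w2*x2 = 0   =>   x2 = (-w0 - w1*x1) / w2,
# and x2 is the quantity on the figure's y axis.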
test_logRegression.py (deleted file)
@@ -1,54 +0,0 @@
#!/usr/bin/env python
# encoding: utf-8
import os
import sys
sys.path.append("C:\\Python27")
from numpy import *

from logRegression01 import *
"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: test_logRegression.py
@time: 2017/3/3 22:09
@test result: ok
"""


def loadData():
    train_x = []
    train_y = []
    # get the current working directory
    project_dir = os.getcwdu()
    # truncate the path at the project name: MachineLearning\
    project_dir = project_dir[:project_dir.find("MachineLearning\\") + 15]
    print project_dir
    fileIn = open("%s/testData/Logistic_testdata.txt" % project_dir)
    for line in fileIn.readlines():
        lineArr = line.strip().split()
        train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
        train_y.append(float(lineArr[2]))
    return mat(train_x), mat(train_y).transpose()


## step 1: load data
print "step 1: load data..."
train_x, train_y = loadData()
test_x = train_x; test_y = train_y

## step 2: training
print "step 2: training..."
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
optimalWeights = trainLogRegres(train_x, train_y, opts)

## step 3: testing
print "step 3: testing..."
accuracy = testLogRegres(optimalWeights, test_x, test_y)

## step 4: show the result
print "step 4: show the result..."
print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
showLogRegres(optimalWeights, train_x, train_y)
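
## optional extra (a minimal sketch): compare all three optimizeType values
## supported by trainLogRegres on the same data
for cmpType in ('gradDescent', 'stocGradDescent', 'smoothStocGradDescent'):
    cmpOpts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': cmpType}
    cmpWeights = trainLogRegres(train_x, train_y, cmpOpts)
    print '%s accuracy: %.3f%%' % (cmpType, testLogRegres(cmpWeights, test_x, test_y) * 100)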
@@ -1,164 +0,0 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from numpy import *


def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))


def gradAscent(dataMatIn, classLabels):
    # convert to a NumPy matrix: [[1,x1,x2],[1,x1,x2]....]
    dataMatrix = mat(dataMatIn)
    # convert the label list [0,1,0,1,...] to a matrix and transpose the
    # row into a column: [[0],[1],[0],...]
    x = mat(classLabels)
    labelMat = x.transpose()
    # m -> number of samples, n -> number of features
    m, n = shape(dataMatrix)
    # step size
    alpha = 0.001
    # number of iterations
    maxCycles = 500
    # an all-ones column with one entry per feature; here n is 3 -> [[1],[1],[1]]
    weights = ones((n, 1))
    for k in range(maxCycles):  # heavy on matrix operations
        # dataMatrix * weights, matrix product: (m*n) * (n*1) -> m*1, i.e.
        # the weighted feature sum of every sample, pushed through the sigmoid
        h = sigmoid(dataMatrix * weights)  # matrix mult
        # m*1 column of errors, one per sample
        error = (labelMat - h)  # vector subtraction
        # dataMatrix.transpose() * error: (n*m) * (m*1) -> n*1
        # (derivation omitted here; see the note after this function)
        data_tran = dataMatrix.transpose()
        data_tran_error = data_tran * error

        # weights = weights + alpha * dataMatrix.transpose() * error  # matrix mult
        weights = weights + alpha * data_tran_error
    return weights
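

# The update above, w <- w + alpha * X^T * (y - h), is gradient ascent on the
# log-likelihood; filling in the derivation that the comment above omits:
#     l(w)  = sum_i [ y_i*log(h_i) + (1 - y_i)*log(1 - h_i) ],   h = sigmoid(X*w)
#     dl/dw = X^T * (y - h)     (using sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z)))
# so stepping along the gradient with step size alpha gives the update rule.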


# Stochastic gradient ascent.
# Plain gradient ascent scans the whole data set for every update, which is
# computationally expensive; stochastic gradient ascent updates the
# coefficients with one sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)  # initialize to all ones
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights


def plotBestFit(weights):
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)  # initialize to all ones
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # the step size keeps shrinking with each iteration, but the
            # constant keeps it from ever reaching 0
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick samples at random to reduce cyclic fluctuation
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]  # map the list position to a real row index
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del(dataIndex[randIndex])  # drop the used sample for the rest of this pass
    return weights
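

# The schedule alpha = 4/(1.0+j+i) + 0.0001 starts large and decays toward
# its 0.0001 floor, so early updates move fast while later ones settle; the
# first few values (j = pass number, i = step within the pass) are roughly:
#     j=0: 4.000, 2.000, 1.333, 1.000, ...
#     j=1: 2.000, 1.333, 1.000, 0.800, ...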


# a, b = loadDataSet()
# weights = gradAscent(a, b)
# plotBestFit(weights)
#

######################################################################################################################

def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0


def colicTest():
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0
    numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print "the error rate of this test is: %f" % errorRate
    return errorRate


def multiTest():
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print "after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests))


# multiTest()

colicTest()
src/python/05.Logistic/logistic.py (new file, 151 lines)
@@ -0,0 +1,151 @@
#!/usr/bin/python
# coding: utf8

'''
Created on Oct 27, 2010
Logistic Regression Working Module
@author: Peter
'''
from numpy import *
import matplotlib.pyplot as plt


# parse the data
def loadDataSet(file_name):
    # dataMat holds the raw samples, labelMat their labels
    dataMat = []; labelMat = []
    fr = open(file_name)
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


# the sigmoid function (a smooth step function)
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))


# the straightforward approach
def gradAscent(dataMatIn, classLabels):
    # convert to a NumPy matrix: [[1,x1,x2],[1,x1,x2]....]
    dataMatrix = mat(dataMatIn)
    # convert the label list [0,1,0,1,...] to a matrix and use transpose()
    # to turn the row into a column: [[0],[1],[0],...]
    labelMat = mat(classLabels).transpose()
    # m -> number of samples, n -> number of features
    m, n = shape(dataMatrix)
    # print m, n, '__'*10, shape(dataMatrix.transpose()), '__'*100
    # step size
    alpha = 0.001
    # number of iterations
    maxCycles = 500
    # regression coefficients: an all-ones column with one entry per
    # feature; here n is 3 -> [[1],[1],[1]]
    weights = ones((n, 1))
    for k in range(maxCycles):  # heavy on matrix operations
        # (m*n matrix) * (n*1 column) = m*1 column: one matrix product
        # yields the model's predicted value for every sample
        # (on what matrix multiplication means, see:
        # https://www.zhihu.com/question/21351965/answer/31050145)
        h = sigmoid(dataMatrix * weights)  # matrix mult
        # labelMat holds the actual values
        error = (labelMat - h)  # vector subtraction
        # alpha * (n*m)*(m*1): the accumulated per-coefficient error, i.e.
        # the offsets for the coefficients of x1, x2, .., xn
        weights = weights + alpha * dataMatrix.transpose() * error  # matrix mult
    return array(weights)


# Stochastic gradient ascent.
# Plain gradient ascent scans the whole data set for every update, which is
# computationally expensive; stochastic gradient ascent updates the
# coefficients with one sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    # ones() creates an all-ones array of length n
    weights = ones(n)  # initialize to all ones
    for i in range(m):
        # sum(dataMatrix[i]*weights) evaluates f(x) = w0*x0 + w1*x1 + .. + wn*xn
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        # 0.01 * (1*1) * (1*n)
        # print weights, "*"*10, dataMatrix[i], "*"*10, error
        weights = weights + alpha * error * dataMatrix[i]
    return weights


# stochastic gradient ascent with randomized sample order
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)  # initialize to all ones
    # run numIter random passes (150 by default) and watch for convergence
    for j in range(numIter):
        # [0, 1, 2 .. m-1]
        dataIndex = list(range(m))
        for i in range(m):
            # as i and j grow, alpha keeps shrinking, but the 0.0001
            # constant keeps it from reaching 0
            alpha = 4 / (1.0 + j + i) + 0.0001
            # draw a random position between 0 and len(dataIndex)
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]  # map the list position to a real row index
            # sum(dataMatrix[sampleIndex]*weights) evaluates f(x) = w0*x0 + w1*x1 + .. + wn*xn
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            # print weights, '__h=%s' % h, '__'*20, alpha, '__'*20, error, '__'*20, dataMatrix[sampleIndex]
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del(dataIndex[randIndex])  # drop the used sample for the rest of this pass
    return weights


# visualize the result
def plotBestFit(dataArr, labelMat, weights):
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    """
    Where does y come from?
    In theory it looks like this: each sample was loaded as
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
    so the model is w0*x0 + w1*x1 + w2*x2 = f(x), where x0 is fixed at 1.0
    and x2 is the value we plot on the y axis, while f(x) has been absorbed
    into w0, w1, w2 through the fitted error.
    On the decision boundary: w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2
    """
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X'); plt.ylabel('Y')
    plt.show()


def main():
    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. collect and prepare the data
    # dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
    dataMat, labelMat = loadDataSet("testData/Logistic_testdata.txt")

    # print dataMat, '---\n', labelMat
    # 2. train the model: find (w1, w2, .., wn).T in f(x) = w1*x1 + w2*x2 + .. + wn*xn
    # use an array rather than a mat: array multiplication is elementwise
    dataArr = array(dataMat)
    # print dataArr
    # weights = gradAscent(dataArr, labelMat)
    # weights = stocGradAscent0(dataArr, labelMat)
    weights = stocGradAscent1(dataArr, labelMat)
    # print '*'*30, weights

    # 3. visualize the data
    plotBestFit(dataArr, labelMat, weights)


if __name__ == "__main__":
    main()