From 95a29ca47e8fc83ed0dd1607eb88b7c2804581ae Mon Sep 17 00:00:00 2001
From: jiangzhonglian
Date: Sun, 26 Mar 2017 18:12:25 +0800
Subject: [PATCH] Update comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../05.Logistic/core/logRegression01.py      | 103 -----------
 .../05.Logistic/core/test_logRegression.py   |  54 ------
 src/python/05.Logistic/logRegres.py          | 164 ------------------
 .../{Logistic.py => 05.Logistic/logistic.py} |  31 +++-
 src/python/07.AdaBoost/adaboost.py           |  16 +-
 .../regression.py                            |  24 ++-
 6 files changed, 46 insertions(+), 346 deletions(-)
 delete mode 100644 src/python/05.Logistic/core/logRegression01.py
 delete mode 100644 src/python/05.Logistic/core/test_logRegression.py
 delete mode 100755 src/python/05.Logistic/logRegres.py
 rename src/python/{Logistic.py => 05.Logistic/logistic.py} (87%)

diff --git a/src/python/05.Logistic/core/logRegression01.py b/src/python/05.Logistic/core/logRegression01.py
deleted file mode 100644
index caa026be..00000000
--- a/src/python/05.Logistic/core/logRegression01.py
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-from numpy import *
-import matplotlib.pyplot as plt
-import time
-'''
-1. The matplotlib module is required: pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
-Installing it directly can fail, so downloading the whl package is recommended:
-https://pypi.python.org/pypi/matplotlib/1.5.0
-
-2. The plotted figure can then be displayed
-'''
-
-"""
-@version:
-@author: yangjf
-@license: ApacheCN
-@contact: highfei2011@126.com
-@site: https://github.com/apachecn/MachineLearning
-@software: PyCharm
-@file: logRegression01.py
-@time: 2017/3/3 22:03
-@test result: ok
-"""
-
-# the sigmoid function
-def sigmoid(inX):
-    return 1.0 / (1 + exp(-inX))
-
-def trainLogRegres(train_x, train_y, opts):
-    # time the training run
-    startTime = time.time()
-
-    numSamples, numFeatures = shape(train_x)
-    alpha = opts['alpha']; maxIter = opts['maxIter']
-    weights = ones((numFeatures, 1))
-
-    # optimize with the selected gradient-descent variant
-    for k in range(maxIter):
-        if opts['optimizeType'] == 'gradDescent':  # batch gradient descent
-            output = sigmoid(train_x * weights)
-            error = train_y - output
-            weights = weights + alpha * train_x.transpose() * error
-        elif opts['optimizeType'] == 'stocGradDescent':  # stochastic gradient descent
-            for i in range(numSamples):
-                output = sigmoid(train_x[i, :] * weights)
-                error = train_y[i, 0] - output
-                weights = weights + alpha * train_x[i, :].transpose() * error
-        elif opts['optimizeType'] == 'smoothStocGradDescent':  # improved stochastic gradient descent
-            # pick samples at random to reduce cyclic fluctuation
-            dataIndex = range(numSamples)
-            for i in range(numSamples):
-                alpha = 4.0 / (1.0 + k + i) + 0.01
-                randIndex = int(random.uniform(0, len(dataIndex)))
-                output = sigmoid(train_x[randIndex, :] * weights)
-                error = train_y[randIndex, 0] - output
-                weights = weights + alpha * train_x[randIndex, :].transpose() * error
-                del(dataIndex[randIndex])  # drop the sample already used in this pass
-        else:
-            raise NameError('Unsupported optimize method type!')
-
-    print 'Congratulations, training complete! Took %fs!' % (time.time() - startTime)
-    return weights
-
-
-# test the trained logistic regression model on a given test set
-def testLogRegres(weights, test_x, test_y):
-    numSamples, numFeatures = shape(test_x)
-    matchCount = 0
-    for i in xrange(numSamples):
-        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
-        if predict == bool(test_y[i, 0]):
-            matchCount += 1
-    accuracy = float(matchCount) / numSamples
-    return accuracy
-
-
-# plot the trained logistic regression model; only 2-D data is supported
-def showLogRegres(weights, train_x, train_y):
-    # note: train_x and train_y are mat data types
-    numSamples, numFeatures = shape(train_x)
-    if numFeatures != 3:
-        print "Sorry! I cannot plot because the dimension of your data is not 2!"
-        return 1
-
-    # plot all the sample points
-    for i in xrange(numSamples):
-        if int(train_y[i, 0]) == 0:
-            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
-        elif int(train_y[i, 0]) == 1:
-            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
-
-    # draw the separating line
-    min_x = min(train_x[:, 1])[0, 0]
-    max_x = max(train_x[:, 1])[0, 0]
-    weights = weights.getA()  # convert the mat to an array
-    y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
-    y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
-    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
-    plt.xlabel('X1'); plt.ylabel('X2')
-    # show the figure
-    plt.show()
\ No newline at end of file
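The plotting helper above derives the separating line from the weights: the sigmoid crosses 0.5 exactly where w0 + w1*x1 + w2*x2 = 0, so x2 = (-w0 - w1*x1) / w2. A minimal standalone sketch of that computation, with made-up weights (the array values below are assumptions for illustration, not output of this patch):

from numpy import array

# Assumed toy weights: w0 multiplies the constant 1.0 feature, so the
# decision boundary sigmoid(w0 + w1*x1 + w2*x2) = 0.5 is the line
# w0 + w1*x1 + w2*x2 = 0, i.e. x2 = (-w0 - w1*x1) / w2.
weights = array([4.0, 0.5, -0.6])
for x1 in (-3.0, 0.0, 3.0):
    x2 = (-weights[0] - weights[1] * x1) / weights[2]
    print('boundary passes through (%.1f, %.2f)' % (x1, x2))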
diff --git a/src/python/05.Logistic/core/test_logRegression.py b/src/python/05.Logistic/core/test_logRegression.py
deleted file mode 100644
index c7d5d50d..00000000
--- a/src/python/05.Logistic/core/test_logRegression.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-import os
-import sys
-sys.path.append("C:\Python27")
-from numpy import *
-
-from logRegression01 import *
-"""
-@version:
-@author: yangjf
-@license: ApacheCN
-@contact: highfei2011@126.com
-@site: https://github.com/apachecn/MachineLearning
-@software: PyCharm
-@file: test_logRegression.py
-@time: 2017/3/3 22:09
-@test result: ok
-"""
-
-def loadData():
-    train_x = []
-    train_y = []
-    # path of the current working directory
-    project_dir = os.getcwdu()
-    # truncate the string at the project name: MachineLearning\
-    project_dir = project_dir[:project_dir.find("MachineLearning\\") + 15]
-    print project_dir
-    fileIn = open("%s/testData/Logistic_testdata.txt" % project_dir)
-    for line in fileIn.readlines():
-        lineArr = line.strip().split()
-        train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
-        train_y.append(float(lineArr[2]))
-    return mat(train_x), mat(train_y).transpose()
-
-
-## step 1: load the data
-print "step 1: load data..."
-train_x, train_y = loadData()
-test_x = train_x; test_y = train_y
-
-## step 2: train the model
-print "step 2: training..."
-opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
-optimalWeights = trainLogRegres(train_x, train_y, opts)
-
-## step 3: test the model
-print "step 3: testing..."
-accuracy = testLogRegres(optimalWeights, test_x, test_y)
-
-## step 4: show the result
-print "step 4: show the result..."
-print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
-showLogRegres(optimalWeights, train_x, train_y)
\ No newline at end of file
diff --git a/src/python/05.Logistic/logRegres.py b/src/python/05.Logistic/logRegres.py
deleted file mode 100755
index 89216058..00000000
--- a/src/python/05.Logistic/logRegres.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:utf-8 -*-
-from numpy import *
-
-
-def loadDataSet():
-    dataMat = []
-    labelMat = []
-    fr = open('testSet.txt')
-    for line in fr.readlines():
-        lineArr = line.strip().split()
-        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
-        labelMat.append(int(lineArr[2]))
-    return dataMat, labelMat
-
-
-def sigmoid(inX):
-    return 1.0 / (1 + exp(-inX))
-
-
-def gradAscent(dataMatIn, classLabels):
-    # convert to a matrix [[1,1,2],[1,1,2]....]
-    dataMatrix = mat(dataMatIn)  # convert to NumPy matrix
-    # convert to a matrix [[0,1,0,1,0,1.....]], then transpose it to [[0],[1],[0].....]
-    x = mat(classLabels)
-    labelMat = x.transpose()  # convert to NumPy matrix
-    # m -> number of samples, n -> number of features
-    m, n = shape(dataMatrix)
-    # step size
-    alpha = 0.001
-    # number of iterations
-    maxCycles = 500
-    # a matrix with one entry per feature, here n is 3 -> [[1],[1],[1]]
-    weights = ones((n, 1))
-    for k in range(maxCycles):  # heavy on matrix operations
-        # 1. dataMatrix * weights is a matrix product: (m x 3) * (3 x 1) -> (m x 1)
-        s = dataMatrix * weights
-        # feed the sum of each feature times its coefficient into the sigmoid
-        h = sigmoid(dataMatrix * weights)  # matrix mult
-        # an (m x 1) vector of errors, one per sample
-        error = (labelMat - h)  # vector subtraction
-        # dataMatrix.transpose() * error (derivation omitted)
-        data_tran = dataMatrix.transpose()
-        # a (3 x 1) update direction
-        data_tran_error = data_tran * error
-
-        # weights = weights + alpha * dataMatrix.transpose() * error  # matrix mult
-        weights = weights + alpha * data_tran_error
-    return weights
-
-
-# stochastic gradient ascent
-# plain gradient ascent must sweep the whole data set for every update, which is computationally expensive
-# stochastic gradient ascent updates the regression coefficients with one sample at a time
-def stocGradAscent0(dataMatrix, classLabels):
-    m, n = shape(dataMatrix)
-    alpha = 0.01
-    weights = ones(n)  # initialize to all ones
-    for i in range(m):
-        h = sigmoid(sum(dataMatrix[i] * weights))
-        error = classLabels[i] - h
-        weights = weights + alpha * error * dataMatrix[i]
-    return weights
-
-
-def plotBestFit(weights):
-    import matplotlib.pyplot as plt
-    dataMat, labelMat = loadDataSet()
-    dataArr = array(dataMat)
-    n = shape(dataArr)[0]
-    xcord1 = []; ycord1 = []
-    xcord2 = []; ycord2 = []
-    for i in range(n):
-        if int(labelMat[i]) == 1:
-            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
-        else:
-            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
-    fig = plt.figure()
-    ax = fig.add_subplot(111)
-    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
-    ax.scatter(xcord2, ycord2, s=30, c='green')
-    x = arange(-3.0, 3.0, 0.1)
-    y = (-weights[0] - weights[1] * x) / weights[2]
-    ax.plot(x, y)
-    plt.xlabel('X1'); plt.ylabel('X2')
-    plt.show()
-
-
-def stocGradAscent1(dataMatrix, classLabels, numIter=150):
-    m, n = shape(dataMatrix)
-    weights = ones(n)  # initialize to all ones
-    for j in range(numIter):
-        dataIndex = range(m)
-        for i in range(m):
-            # the step size keeps shrinking
-            alpha = 4 / (1.0 + j + i) + 0.0001  # alpha decreases with iteration, does not
-            # pick samples at random to reduce cyclic fluctuation
-            randIndex = int(random.uniform(0, len(dataIndex)))  # go to 0 because of the constant
-            h = sigmoid(sum(dataMatrix[randIndex] * weights))
-            error = classLabels[randIndex] - h
-            weights = weights + alpha * error * dataMatrix[randIndex]
-            del(dataIndex[randIndex])
-    return weights
-
-
-# a, b = loadDataSet()
-# weights = gradAscent(a, b)
-# plotBestFit(weights)
-#
-
-######################################################################################################################
-
-def classifyVector(inX, weights):
-    prob = sigmoid(sum(inX * weights))
-    if prob > 0.5:
-        return 1.0
-    else:
-        return 0.0
-
-
-def colicTest():
-    frTrain = open('horseColicTraining.txt')
-    frTest = open('horseColicTest.txt')
-    trainingSet = []; trainingLabels = []
-    for line in frTrain.readlines():
-        currLine = line.strip().split('\t')
-        lineArr = []
-        for i in range(21):
-            lineArr.append(float(currLine[i]))
-        trainingSet.append(lineArr)
-        trainingLabels.append(float(currLine[21]))
-    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
-    errorCount = 0; numTestVec = 0.0
-    for line in frTest.readlines():
-        numTestVec += 1.0
-        currLine = line.strip().split('\t')
-        lineArr = []
-        for i in range(21):
-            lineArr.append(float(currLine[i]))
-        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
-            errorCount += 1
-    errorRate = (float(errorCount) / numTestVec)
-    print "the error rate of this test is: %f" % errorRate
-    return errorRate
-
-
-def multiTest():
-    numTests = 10; errorSum = 0.0
-    for k in range(numTests):
-        errorSum += colicTest()
-    print "after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests))
-
-
-# multiTest()
-
-colicTest()
\ No newline at end of file
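The split comment on stocGradAscent1 above ("alpha decreases with iteration ... does not go to 0 because of the constant") is easiest to see by printing the schedule. A minimal sketch, with loop bounds chosen arbitrarily for illustration:

# Step-size schedule used by stocGradAscent1: alpha decays with both the
# pass index j and the in-pass index i, but the additive constant keeps
# it strictly positive.
for j in range(3):                # passes over the data set
    for i in (0, 25, 50, 75):     # sample positions within a pass
        alpha = 4 / (1.0 + j + i) + 0.0001
        print('j=%d i=%2d alpha=%.4f' % (j, i, alpha))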
diff --git a/src/python/Logistic.py b/src/python/05.Logistic/logistic.py
similarity index 87%
rename from src/python/Logistic.py
rename to src/python/05.Logistic/logistic.py
index c59e022d..7a727dc2 100644
--- a/src/python/Logistic.py
+++ b/src/python/05.Logistic/logistic.py
@@ -6,10 +6,10 @@ Created on Oct 27, 2010
 Logistic Regression Working Module
 @author: Peter
 '''
-
-import os
 from numpy import *
 import matplotlib.pyplot as plt
+
+
 # parse the data
 def loadDataSet(file_name):
     # dataMat holds the raw data, labelMat the corresponding labels
@@ -25,17 +25,24 @@ def loadDataSet(file_name):
 def sigmoid(inX):
     return 1.0/(1+exp(-inX))
 
+
 # the standard approach: batch gradient ascent
 def gradAscent(dataMatIn, classLabels):
+    # convert to a matrix [[1,1,2],[1,1,2]....]
     dataMatrix = mat(dataMatIn) #convert to NumPy matrix
+    # convert to a matrix [[0,1,0,1,0,1.....]], then transpose it to [[0],[1],[0].....]
     # transpose() swaps rows and columns
     # i.e. turns the row matrix into a column matrix => the matrix transpose
     labelMat = mat(classLabels).transpose() #convert to NumPy matrix
+    # m -> number of samples, n -> number of features
    m,n = shape(dataMatrix)
     # print m, n, '__'*10, shape(dataMatrix.transpose()), '__'*100
+    # step size
     alpha = 0.001
+    # number of iterations
     maxCycles = 500
-    # weights
+    # a matrix with one entry per feature, here n is 3 -> [[1],[1],[1]]
+    # the regression coefficients
     weights = ones((n,1))
     for k in range(maxCycles): #heavy on matrix operations
         # an (m x 3) matrix times the (3 x 1) weight vector gives an (m x 1) matrix
@@ -49,7 +56,10 @@ def gradAscent(dataMatIn, classLabels):
         weights = weights + alpha * dataMatrix.transpose() * error #matrix mult
     return array(weights)
 
-# gradient ascent algorithm
+
+# stochastic gradient ascent
+# plain gradient ascent must sweep the whole data set for every update, which is computationally expensive
+# stochastic gradient ascent updates the regression coefficients with one sample at a time
 def stocGradAscent0(dataMatrix, classLabels):
     m,n = shape(dataMatrix)
     alpha = 0.01
@@ -65,6 +75,7 @@ def stocGradAscent0(dataMatrix, classLabels):
         weights = weights + alpha * error * dataMatrix[i]
     return weights
 
+
 # stochastic gradient ascent (with randomized sample selection)
 def stocGradAscent1(dataMatrix, classLabels, numIter=150):
     m,n = shape(dataMatrix)
@@ -86,6 +97,7 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150):
         del(dataIndex[randIndex])
     return weights
 
+
 # visualize the result
 def plotBestFit(dataArr, labelMat, weights):
     n = shape(dataArr)[0]
@@ -114,10 +126,12 @@ def plotBestFit(dataArr, labelMat, weights):
     plt.xlabel('X'); plt.ylabel('Y')
     plt.show()
 
+
 def main():
-    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
     # 1. collect and prepare the data
-    dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
+    # dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
+    dataMat, labelMat = loadDataSet("testData/Logistic_testdata.txt")
 
     # print dataMat, '---\n', labelMat
     # 2. train the model: the matrix of coefficients (a1, b2, .., nn).T in f(x)=a1*x1+b2*x2+..+nn*xn
@@ -132,5 +146,6 @@ def main():
     # visualize the data
     plotBestFit(dataArr, labelMat, weights)
 
-if __name__=="__main__":
-    main()
\ No newline at end of file
+
+if __name__ == "__main__":
+    main()
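The comments added to gradAscent above describe the batch update w <- w + alpha * X^T (y - sigmoid(Xw)). A minimal self-contained sketch of that loop on a made-up 4x3 design matrix (the data, alpha, and iteration count below are assumptions for illustration, not values from the patch):

from numpy import exp, mat, ones

def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))

# Made-up data: a constant 1.0 column plus two features, with labels y.
X = mat([[1.0, 1.0, 2.0],
         [1.0, 2.0, 1.0],
         [1.0, -1.0, -2.0],
         [1.0, -2.0, -1.0]])
y = mat([1, 1, 0, 0]).transpose()

alpha = 0.001
weights = ones((3, 1))
for k in range(500):
    error = y - sigmoid(X * weights)                   # (4 x 1) residuals
    weights = weights + alpha * X.transpose() * error  # (3 x 1) update
print(weights.T)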
diff --git a/src/python/07.AdaBoost/adaboost.py b/src/python/07.AdaBoost/adaboost.py
index 8f6a44f9..676cdff8 100644
--- a/src/python/07.AdaBoost/adaboost.py
+++ b/src/python/07.AdaBoost/adaboost.py
@@ -267,10 +267,10 @@ if __name__ == "__main__":
     D = mat(ones((5, 1))/5)
     print 'D=', D.T
 
-    bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
-    print 'bestStump=', bestStump
-    print 'minError=', minError
-    print 'bestClasEst=', bestClasEst.T
+    # bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
+    # print 'bestStump=', bestStump
+    # print 'minError=', minError
+    # print 'bestClasEst=', bestClasEst.T
 
 
     # classifier: weakClassArr
@@ -288,14 +288,14 @@ if __name__ == "__main__":
     print adaClassify([0, 0], weakClassArr).T
     print adaClassify([[5, 5], [0, 0]], weakClassArr).T
 
-    # horse colic data set
-    # training set
+    # # horse colic data set
+    # # training set
     # dataArr, labelArr = loadDataSet("testData/AB_horseColicTraining2.txt")
     # weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40)
     # print weakClassArr, '\n-----\n', aggClassEst.T
-    # compute the AUC (area under the ROC curve)
+    # # compute the AUC (area under the ROC curve)
    # plotROC(aggClassEst.T, labelArr)
-    # test set
+    # # test set
     # dataArrTest, labelArrTest = loadDataSet("testData/AB_horseColicTest2.txt")
     # m = shape(dataArrTest)[0]
     # predicting10 = adaClassify(dataArrTest, weakClassArr)
diff --git a/src/python/08.Predictive numerical data regression/regression.py b/src/python/08.Predictive numerical data regression/regression.py
index c47a09fd..e2ef8aa1 100644
--- a/src/python/08.Predictive numerical data regression/regression.py
+++ b/src/python/08.Predictive numerical data regression/regression.py
@@ -1,3 +1,6 @@
+#!/usr/bin/python
+# coding:utf8
+
 '''
 Created by ApacheCN-小瑶
 Date: 2017-02-27
@@ -26,6 +29,8 @@ def standRegres(xArr,yArr): #linear regression
     if linalg.det(xTx) == 0.0: # xTx must be inverted, so first check that it is invertible: its determinant must be non-zero
         print ("This matrix is singular, cannot do inverse")
         return
+    # ordinary least squares
+    # http://www.apache.wiki/pages/viewpage.action?pageId=5505133
     ws = xTx.I * (xMat.T*yMat) # the formula from the book, giving the optimal solution for w
     return ws
 
@@ -69,7 +74,7 @@ def ridgeRegres(xMat,yMat,lam=0.2): #ridge regression
         return
     ws = denom.I * (xMat.T*yMat)
     return ws
-    
+
 def ridgeTest(xArr,yArr):
     xMat = mat(xArr); yMat=mat(yArr).T
     yMean = mean(yMat,0) #compute the mean of Y
@@ -85,6 +90,7 @@ def ridgeTest(xArr,yArr):
         wMat[i,:]=ws.T
     return wMat
 
+
 def regularize(xMat):#normalize column by column
     inMat = xMat.copy()
     inMeans = mean(inMat,0) #compute the mean, then subtract it
@@ -227,7 +233,7 @@ def crossValidation(xArr,yArr,numVal=10):
 
 #test for standRegression
 def regression1():
-    xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
+    xArr, yArr = loadDataSet("testData/Regression_data.txt")
     xMat = mat(xArr)
     yMat = mat(yArr)
     ws = standRegres(xArr, yArr)
@@ -245,7 +251,7 @@ def regression1():
 
 #test for LWLR
 def regression2():
-    xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
+    xArr, yArr = loadDataSet("testData/Regression_data.txt")
     yHat = lwlrTest(xArr, xArr, yArr, 0.003)
     xMat = mat(xArr)
     srtInd = xMat[:,1].argsort(0) #argsort() sorts the elements in ascending order and returns the corresponding indices
@@ -259,7 +265,7 @@ def regression2():
 
 #test for ridgeRegression
 def regression3():
-    abX,abY = loadDataSet("../../../testData/Regression_abalone.txt")
+    abX,abY = loadDataSet("testData/Regression_abalone.txt")
     ridgeWeights = ridgeTest(abX, abY)
     fig = plt.figure()
     ax = fig.add_subplot(111)
@@ -269,7 +275,7 @@ def regression3():
 
 #test for stageWise
 def regression4():
-    xArr,yArr=loadDataSet("../../../testData/Regression_abalone.txt")
+    xArr,yArr=loadDataSet("testData/Regression_abalone.txt")
     stageWise(xArr,yArr,0.01,200)
     xMat = mat(xArr)
     yMat = mat(yArr).T
@@ -280,7 +286,7 @@ def regression4():
     print (weights.T)
 
 if __name__ == "__main__":
-    #regression1()
-    #regression2()
-    #regression3()
-    regression4()
\ No newline at end of file
+    # regression1()
+    regression2()
+    # regression3()
+    # regression4()
\ No newline at end of file
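For reference, the ridgeRegres function touched above implements the closed form w = (X^T X + lam*I)^-1 X^T y; the least-squares link added in the standRegres hunk covers the lam = 0 case. A minimal sketch on made-up toy data (all names and numbers below are assumptions for illustration):

from numpy import eye, mat

# Made-up 3x2 design matrix (bias column plus one feature) and targets.
X = mat([[1.0, 1.0],
         [1.0, 2.0],
         [1.0, 3.0]])
y = mat([[1.1], [1.9], [3.2]])

lam = 0.2
xTx = X.T * X
# The lam * I term keeps the matrix invertible even when X^T X is singular.
ws = (xTx + eye(2) * lam).I * (X.T * y)
print(ws.T)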