Merge pull request #26 from highfei2011/master

修改目录结构和代码
2026-06-29 17:56:13 +08:00 · 2017-03-09 15:07:45 +08:00
parent 304eea7a89 bd8db6b800
commit cd9195d6f2
8 changed files with 298 additions and 11 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -88,4 +88,3 @@ ENV/
 # Rope project settings
 .ropeproject
 .vscode
-.idea
--- a/docs/5.Logistic回归.md
+++ b/docs/5.Logistic回归.md
@@ -1,11 +1,36 @@

-# 1) 逻辑回归基础
+# 5) 逻辑回归基础

  * 逻辑回归(Logistic Regression)
-    * 1.1 分类问题
-    * 1.2 假说表示
-    * 1.3 判定边界
-    * 1.4 代价函数
-    * 1.5 简化的成本函数和梯度下降
-    * 1.6 高级优化
-    * 1.7 多类分类：一个对所有
+    * 5.1 分类问题
+        * 在分类问题中，尝试预测的是结果是否属于某一个类（例如正确或错误）。
+        * 分类问题的例子有：
+            * 判断一封电子邮件是否是垃圾邮件；
+            * 判断一次金融交易是否是欺诈等等。
+        * 从二元的分类问题开始讨论:
+             将因变量(dependent variable)可能属于的两个类分别称为负向类（negative class）和正向类（positive class），则因变量
+             y属于{0，1}
+             注：其中 0 表示负向类，1 表示正向类。
+    * 5.2 假说表示
+
+    * 5.3 判定边界
+        * 在逻辑回归中，我们预测：
+             当 hθ 大于等于 0.5 时，预测 y=1
+             当 hθ 小于 0.5 时，预测 y=0
+        * 根据上面绘制出的 S 形函数图像，我们知道当
+             z=0时 ，g(z)=0.5
+             z>0时 ，g(z)>0.5
+             z<0时 ，g(z)<0.5
+             又 z=θ^T·X（θ 的转置与 X 的积），即：
+               z大于等于0时，预测：y=1
+               z小于0时，预测：y=0
+        * 现在假设我们有一个模型：Hθ(x)=g(θ0+θ1*x1+θ2*x2)
+             并且参数θ是向量[-3 1 1]。则当-3+x1+x2大于等于0，即x1+x2大于等于3时，模型将预测y=1。
+             我们可以绘制直线x1+x2=3，这条线便是我们模型的分界线，将预测为1的区域和预测为0的区域分隔开。
+        * 假使我们的数据呈现这样的分布情况，怎样的模型才能适合呢？
+          因为需要用曲线才能分隔 y=0 的区域和 y=1 的区域，我们需要二次方特征： 假设模型是 Hθ(x)=g(θ0+θ1*x1+θ2*x2+θ3*(x1^2)+θ4*(x2^2))，
+          参数 θ 是[-1 0 0 1 1]，则我们得到的判定边界恰好是圆心在原点且半径为 1 的圆形。可以用非常复杂的模型来适应非常复杂形状的判定边界。
+    * 5.4 代价函数
+    * 5.5 简化的成本函数和梯度下降
+    * 5.6 高级优化
+    * 5.7 多类分类：一个对所有
--- a/src/python/03.DecisionTree/DTSklearn.py
+++ b/src/python/03.DecisionTree/DTSklearn.py
@@ -6,6 +6,12 @@ from sklearn import tree
 from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import classification_report
 from sklearn.cross_validation import train_test_split
+"""
+需要安装依赖模块：
+pip install scikit_learn-0.18-cp27-cp27m-win_amd64.whl
+非常完整的网址：
+http://www.lfd.uci.edu/~gohlke/pythonlibs/#numpy
+"""


 def createDataSet():
--- a/src/python/05.Logistic/core/logRegression01.py
+++ b/src/python/05.Logistic/core/logRegression01.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+# encoding: utf-8
+from numpy import *
+import matplotlib.pyplot as plt
+import time
+'''
+1、需要安装模块：pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
+由于直接安装会出现问题，所以建议下载whl包进行安装，下载网址：
+https://pypi.python.org/pypi/matplotlib/1.5.0
+
+2、可以看见画出的图像
+'''
+
+"""
+@version: 
+@author: yangjf
+@license: ApacheCN
+@contact: highfei2011@126.com
+@site: https://github.com/apachecn/MachineLearning
+@software: PyCharm
+@file: logRegression01.py
+@time: 2017/3/3 22:03
+@test result: ok
+"""
+
+# Sigmoid (logistic) function
+def sigmoid(inX):
+    """Return 1 / (1 + exp(-inX)), evaluated with numpy's ``exp``.
+
+    Whatever ``exp`` accepts can be passed in; the result has the form that
+    ``exp(-inX)`` produces.
+    """
+    denominator = 1 + exp(-inX)
+    return 1.0 / denominator
+
+def trainLogRegres(train_x, train_y, opts):
+    """Train logistic-regression weights and report the elapsed time.
+
+    Every update has the form ``weights += alpha * x.T * (y - sigmoid(x * weights))``.
+
+    Args:
+        train_x: samples, one per row. ``*`` is used as a matrix product, so
+            this is assumed to be a numpy matrix (confirm against callers).
+        train_y: labels, read as ``train_y[i, 0]`` (a column).
+        opts: dict with the keys
+            'alpha'        - step size (replaced by a decaying schedule in the
+                             'smoothStocGradDescent' mode),
+            'maxIter'      - number of passes over the data,
+            'optimizeType' - 'gradDescent', 'stocGradDescent' or
+                             'smoothStocGradDescent'.
+
+    Returns:
+        The weight column, one row per feature, initialised to ones.
+
+    Raises:
+        NameError: if ``opts['optimizeType']`` is not a supported mode
+            (raised on the first pass).
+    """
+    startTime = time.time()
+
+    numSamples, numFeatures = shape(train_x)
+    alpha = opts['alpha']
+    maxIter = opts['maxIter']
+    optimizeType = opts['optimizeType']
+    weights = ones((numFeatures, 1))
+
+    for k in range(maxIter):
+        if optimizeType == 'gradDescent':
+            # Batch update: all samples contribute to a single step.
+            output = sigmoid(train_x * weights)
+            error = train_y - output
+            weights = weights + alpha * train_x.transpose() * error
+        elif optimizeType == 'stocGradDescent':
+            # One step per sample, visited in file order.
+            for i in range(numSamples):
+                sample = train_x[i, :]
+                error = train_y[i, 0] - sigmoid(sample * weights)
+                weights = weights + alpha * sample.transpose() * error
+        elif optimizeType == 'smoothStocGradDescent':
+            # Visit every sample once per pass in random order. A real list is
+            # required so that entries can be deleted on Python 3 as well.
+            dataIndex = list(range(numSamples))
+            for i in range(numSamples):
+                # Step size decays with pass and position; the constant keeps
+                # it from reaching zero.
+                alpha = 4.0 / (1.0 + k + i) + 0.01
+                randIndex = int(random.uniform(0, len(dataIndex)))
+                # randIndex is a position inside dataIndex; map it to the row
+                # it stands for, otherwise rows are not drawn without
+                # replacement and low-numbered rows are over-sampled.
+                sampleIndex = dataIndex[randIndex]
+                sample = train_x[sampleIndex, :]
+                error = train_y[sampleIndex, 0] - sigmoid(sample * weights)
+                weights = weights + alpha * sample.transpose() * error
+                del dataIndex[randIndex]  # this row is done for the current pass
+        else:
+            raise NameError('Not support optimize method type!')
+
+    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
+    return weights
+
+
+# Evaluate trained logistic-regression weights on a labelled test set
+def testLogRegres(weights, test_x, test_y):
+    """Return the fraction of rows of test_x that are classified correctly.
+
+    A row is predicted positive when ``sigmoid(row * weights)`` is strictly
+    greater than 0.5; the prediction is compared with ``bool(test_y[i, 0])``.
+    """
+    numSamples, _ = shape(test_x)
+    hits = sum(
+        1
+        for row in range(numSamples)
+        if (sigmoid(test_x[row, :] * weights)[0, 0] > 0.5) == bool(test_y[row, 0])
+    )
+    return float(hits) / numSamples
+
+
+# Plot the samples and the fitted decision boundary (2-D data only)
+def showLogRegres(weights, train_x, train_y):
+    """Draw the samples coloured by label and the line w0 + w1*x1 + w2*x2 = 0.
+
+    Args:
+        weights: numpy matrix of weights (``getA()`` is called on it); the
+            first three entries are used as w0, w1, w2.
+        train_x: numpy matrix with exactly three columns; columns 1 and 2 are
+            plotted (column 0 is presumably the constant bias term - confirm
+            against the loader).
+        train_y: label column; ``int(label)`` 0 is drawn red, 1 blue, any
+            other value is not drawn.
+
+    Returns:
+        1 if train_x does not have three columns (nothing is drawn),
+        otherwise None once the plot window has been shown.
+    """
+    numSamples, numFeatures = shape(train_x)
+    if numFeatures != 3:
+        print("抱歉! 我不能绘制，因为你的数据的维度不是2！")
+        return 1
+
+    # Draw every sample, coloured by its class.
+    markers = {0: 'or', 1: 'ob'}
+    for i in range(numSamples):
+        marker = markers.get(int(train_y[i, 0]))
+        if marker is not None:
+            plt.plot(train_x[i, 1], train_x[i, 2], marker)
+
+    # Draw the boundary between the smallest and largest x1 value.
+    min_x = train_x[:, 1].min()
+    max_x = train_x[:, 1].max()
+    # Flatten to plain scalars so the endpoints are numbers rather than
+    # one-element arrays (float() on such arrays is deprecated in NumPy).
+    flat = weights.getA().ravel()
+    w0, w1, w2 = flat[0], flat[1], flat[2]
+    y_min_x = (-w0 - w1 * min_x) / w2
+    y_max_x = (-w0 - w1 * max_x) / w2
+    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
+    plt.xlabel('X1')
+    plt.ylabel('X2')
+    # Show the figure
+    plt.show()
--- a/src/python/05.Logistic/core/test_logRegression.py
+++ b/src/python/05.Logistic/core/test_logRegression.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import os
+import sys
+sys.path.append("C:\Python27")
+from numpy import *
+
+from  logRegression01 import *
+"""
+@version: 
+@author: yangjf
+@license: ApacheCN
+@contact: highfei2011@126.com
+@site: https://github.com/apachecn/MachineLearning
+@software: PyCharm
+@file: test_logRegression.py
+@time: 2017/3/3 22:09
+@test result: ok
+"""
+
+def loadData():
+    """Load testData/Logistic_testdata.txt from the MachineLearning project.
+
+    The project root is the current working directory cut off right after the
+    first "MachineLearning" path component; if the working directory contains
+    no such component it is used as the root unchanged.
+
+    Each non-blank line of the file is split on whitespace: the first two
+    fields are features, the third is the label. A constant 1.0 is prepended
+    to every feature row.
+
+    Returns:
+        (train_x, train_y): numpy matrices with one row per sample; train_x
+        has three columns and train_y one.
+    """
+    train_x = []
+    train_y = []
+    # os.getcwdu() only exists on Python 2; use os.getcwd() elsewhere.
+    project_dir = getattr(os, 'getcwdu', os.getcwd)()
+    # Trim the path to the project root using the platform's separator.
+    project_name = "MachineLearning"
+    cut = project_dir.find(project_name + os.sep)
+    if cut != -1:
+        project_dir = project_dir[:cut + len(project_name)]
+    print(project_dir)
+    # The with-block guarantees the file is closed, even on a parse error.
+    with open("%s/testData/Logistic_testdata.txt" % project_dir) as fileIn:
+        for line in fileIn:
+            lineArr = line.strip().split()
+            if not lineArr:
+                continue  # tolerate blank lines, e.g. at the end of the file
+            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
+            train_y.append(float(lineArr[2]))
+    return mat(train_x), mat(train_y).transpose()
+
+
+## Step 1: load the data
+print("step 1: load data...")
+train_x, train_y = loadData()
+# The model is evaluated on the very samples it was trained on.
+test_x = train_x
+test_y = train_y
+
+## Step 2: train the model
+print("step 2: training...")
+opts = {
+    'alpha': 0.01,
+    'maxIter': 20,
+    'optimizeType': 'smoothStocGradDescent',
+}
+optimalWeights = trainLogRegres(train_x, train_y, opts)
+
+## Step 3: test
+print("step 3: testing...")
+accuracy = testLogRegres(optimalWeights, test_x, test_y)
+
+## Step 4: show the result
+print("step 4: show the result...")
+print('The classify accuracy is: %.3f%%' % (accuracy * 100))
+showLogRegres(optimalWeights, train_x, train_y)
--- a/src/python/Logistic.py
+++ b/src/python/Logistic.py
@@ -117,7 +117,7 @@ def plotBestFit(dataArr, labelMat, weights):
 def main():
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1.收集并准备数据
-    dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
+    dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)

    # print dataMat, '---\n', labelMat
    # 2.训练模型，  f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值
--- a/src/python/apriori.py
+++ b/src/python/apriori.py
@@ -73,7 +73,7 @@ def apriori(dataSet, minSupport = 0.5):
 def main():
    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1.收集并准备数据
-    # dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
+    # dataMat, labelMat = loadDataSet("%s/resources/Logistic_testdata.txt" % project_dir)


    # 1. 加载数据
--- a/testData/Logistic_testdata.txt
+++ b/testData/Logistic_testdata.txt
@@ -0,0 +1,100 @@
+-0.017612   14.053064   0
+-1.395634   4.662541    1
+-0.752157   6.538620    0
+-1.322371   7.152853    0
+0.423363    11.054677   0
+0.406704    7.067335    1
+0.667394    12.741452   0
+-2.460150   6.866805    1
+0.569411    9.548755    0
+-0.026632   10.427743   0
+0.850433    6.920334    1
+1.347183    13.175500   0
+1.176813    3.167020    1
+-1.781871   9.097953    0
+-0.566606   5.749003    1
+0.931635    1.589505    1
+-0.024205   6.151823    1
+-0.036453   2.690988    1
+-0.196949   0.444165    1
+1.014459    5.754399    1
+1.985298    3.230619    1
+-1.693453   -0.557540   1
+-0.576525   11.778922   0
+-0.346811   -1.678730   1
+-2.124484   2.672471    1
+1.217916    9.597015    0
+-0.733928   9.098687    0
+-3.642001   -1.618087   1
+0.315985    3.523953    1
+1.416614    9.619232    0
+-0.386323   3.989286    1
+0.556921    8.294984    1
+1.224863    11.587360   0
+-1.347803   -2.406051   1
+1.196604    4.951851    1
+0.275221    9.543647    0
+0.470575    9.332488    0
+-1.889567   9.542662    0
+-1.527893   12.150579   0
+-1.185247   11.309318   0
+-0.445678   3.297303    1
+1.042222    6.105155    1
+-0.618787   10.320986   0
+1.152083    0.548467    1
+0.828534    2.676045    1
+-1.237728   10.549033   0
+-0.683565   -2.166125   1
+0.229456    5.921938    1
+-0.959885   11.555336   0
+0.492911    10.993324   0
+0.184992    8.721488    0
+-0.355715   10.325976   0
+-0.397822   8.058397    0
+0.824839    13.730343   0
+1.507278    5.027866    1
+0.099671    6.835839    1
+-0.344008   10.717485   0
+1.785928    7.718645    1
+-0.918801   11.560217   0
+-0.364009   4.747300    1
+-0.841722   4.119083    1
+0.490426    1.960539    1
+-0.007194   9.075792    0
+0.356107    12.447863   0
+0.342578    12.281162   0
+-0.810823   -1.466018   1
+2.530777    6.476801    1
+1.296683    11.607559   0
+0.475487    12.040035   0
+-0.783277   11.009725   0
+0.074798    11.023650   0
+-1.337472   0.468339    1
+-0.102781   13.763651   0
+-0.147324   2.874846    1
+0.518389    9.887035    0
+1.015399    7.571882    0
+-1.658086   -0.027255   1
+1.319944    2.171228    1
+2.056216    5.019981    1
+-0.851633   4.375691    1
+-1.510047   6.061992    0
+-1.076637   -3.181888   1
+1.821096    10.283990   0
+3.010150    8.401766    1
+-1.099458   1.688274    1
+-0.834872   -1.733869   1
+-0.846637   3.849075    1
+1.400102    12.628781   0
+1.752842    5.468166    1
+0.078557    0.059736    1
+0.089392    -0.715300   1
+1.825662    12.693808   0
+0.197445    9.744638    0
+0.126117    0.922311    1
+-0.679797   1.220530    1
+0.677983    2.556666    1
+0.761349    10.693862   0
+-2.168791   0.143632    1
+1.388610    9.341997    0
+0.317029    14.739025   0