Add the Python logistic regression code

2026-07-04 20:26:18 +08:00 · 2017-03-04 20:38:51 +08:00
parent d12b8eaa19
commit 0c43981c76
8 changed files with 311 additions and 9 deletions
--- a/src/python/05.Logistic/core/com/apachecn/logistic/logRegression.py
+++ b/src/python/05.Logistic/core/com/apachecn/logistic/logRegression.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+# encoding: utf-8
+from numpy import *
+import matplotlib.pyplot as plt
+import time
+
+
+"""
+@version: 
+@author: yangjf
+@license: ApacheCN
+@contact: highfei2011@126.com
+@site: https://github.com/apachecn/MachineLearning
+@software: PyCharm
+@file: logRegression01.py
+@time: 2017/3/3 22:03
+@test result:not pass
+"""
+
+# Sigmoid (logistic) function.
+def sigmoid(inX):
+    """Return 1 / (1 + exp(-inX)), using numpy's exp on inX."""
+    denominator = 1 + exp(-inX)
+    return 1.0 / denominator
+
+def trainLogRegres(train_x, train_y, opts):
+    """Fit logistic-regression weights with one of three update schemes.
+
+    Every scheme moves the weights by alpha * x^T * (y - sigmoid(x * w)),
+    starting from a column of ones.
+
+    Args:
+        train_x: samples, one per row (numSamples x numFeatures); expected
+            to be a numpy matrix so that `*` is a matrix product.
+        train_y: labels, one per row (numSamples x 1 numpy matrix).
+        opts: dict with the keys 'alpha' (step size), 'maxIter' (number of
+            passes over the data) and 'optimizeType', one of 'gradDescent',
+            'stocGradDescent' or 'smoothStocGradDescent'. The smooth
+            variant ignores 'alpha' and uses its own decaying step size.
+
+    Returns:
+        The weights as a (numFeatures x 1) column.
+
+    Raises:
+        NameError: if opts['optimizeType'] is not a supported method.
+    """
+    # Measure the total training time.
+    startTime = time.time()
+
+    numSamples, numFeatures = shape(train_x)
+    alpha = opts['alpha']
+    maxIter = opts['maxIter']
+    optimizeType = opts['optimizeType']
+    # Reject an unknown optimizer once, before doing any work.
+    if optimizeType not in ('gradDescent', 'stocGradDescent',
+                            'smoothStocGradDescent'):
+        raise NameError('Not support optimize method type!')
+    weights = ones((numFeatures, 1))
+
+    for k in range(maxIter):
+        if optimizeType == 'gradDescent':
+            # Batch update: use every sample at once.
+            output = sigmoid(train_x * weights)
+            error = train_y - output
+            weights = weights + alpha * train_x.transpose() * error
+        elif optimizeType == 'stocGradDescent':
+            # Stochastic update: one sample at a time, in file order.
+            for i in range(numSamples):
+                output = sigmoid(train_x[i, :] * weights)
+                error = train_y[i, 0] - output
+                weights = weights + alpha * train_x[i, :].transpose() * error
+        else:
+            # Smoothed stochastic update: visit the samples in random order,
+            # each exactly once per pass, to damp periodic fluctuations.
+            dataIndex = list(range(numSamples))
+            for i in range(numSamples):
+                # Step size decays with the iteration count but never
+                # reaches zero because of the constant term.
+                alpha = 4.0 / (1.0 + k + i) + 0.01
+                randIndex = int(random.uniform(0, len(dataIndex)))
+                # randIndex is a position inside the shrinking dataIndex
+                # list; translate it to the real row before using it.
+                sampleIndex = dataIndex[randIndex]
+                output = sigmoid(train_x[sampleIndex, :] * weights)
+                error = train_y[sampleIndex, 0] - output
+                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
+                # Remove the sample so it is not drawn again in this pass.
+                del dataIndex[randIndex]
+
+    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
+    return weights
+
+
+# Evaluate a trained logistic-regression model on a given test set.
+def testLogRegres(weights, test_x, test_y):
+    """Return the fraction of test rows that the model classifies correctly.
+
+    A row is predicted positive when sigmoid(row * weights) exceeds 0.5; the
+    prediction is compared with bool(label) taken from column 0 of test_y.
+    """
+    sampleCount, _ = shape(test_x)
+    hits = 0
+    for row in range(sampleCount):
+        probability = sigmoid(test_x[row, :] * weights)[0, 0]
+        isPositive = probability > 0.5
+        if isPositive == bool(test_y[row, 0]):
+            hits += 1
+    return float(hits) / sampleCount
+
+
+# Plot a trained logistic-regression model; only 2-D feature data is supported.
+def showLogRegres(weights, train_x, train_y):
+    """Scatter the samples and draw the line w0 + w1*x1 + w2*x2 = 0.
+
+    Returns 1 without plotting when train_x does not have exactly three
+    columns; otherwise shows the figure and returns None.
+    """
+    # Note: train_x and train_y are expected to be numpy matrices.
+    sampleCount, featureCount = shape(train_x)
+    if featureCount != 3:
+        print("抱歉! 我不能绘制，因为你的数据的维度不是2！")
+        return 1
+
+    # Draw every sample: label 0 as red dots, label 1 as blue dots.
+    for row in range(sampleCount):
+        if int(train_y[row, 0]) == 0:
+            marker = 'or'
+        elif int(train_y[row, 0]) == 1:
+            marker = 'ob'
+        else:
+            # Any other label is left off the plot.
+            continue
+        plt.plot(train_x[row, 1], train_x[row, 2], marker)
+
+    # Draw the decision boundary between the smallest and largest x1 value.
+    left = min(train_x[:, 1])[0, 0]
+    right = max(train_x[:, 1])[0, 0]
+    coef = weights.getA()  # convert the matrix to a plain array
+    boundary = [float(-coef[0] - coef[1] * x1) / coef[2] for x1 in (left, right)]
+    plt.plot([left, right], boundary, '-g')
+    plt.xlabel('X1')
+    plt.ylabel('X2')
+    # Display the figure.
+    plt.show()
--- a/src/python/05.Logistic/test/test_logRegression.py
+++ b/src/python/05.Logistic/test/test_logRegression.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import sys
+sys.path.append("C:\Python27")
+
+from numpy import *
+import matplotlib.pyplot as plt
+from core.com.apachecn.logistic import logRegression
+
+"""
+@version: 
+@author: yangjf
+@license: ApacheCN
+@contact: highfei2011@126.com
+@site: https://github.com/apachecn/MachineLearning
+@software: PyCharm
+@file: test_logRegression.py
+@time: 2017/3/3 22:09
+"""
+
+def loadData():
+    """Load the training samples from 'testData/testSet.txt'.
+
+    Each non-blank line holds three whitespace-separated fields: two feature
+    values followed by the label. A constant 1.0 is prepended to every
+    feature row.
+
+    Returns:
+        A pair (train_x, train_y) of numpy matrices with shapes
+        (numSamples x 3) and (numSamples x 1).
+    """
+    train_x = []
+    train_y = []
+    # The with-block closes the file even if a line fails to parse.
+    with open('testData/testSet.txt') as fileIn:
+        for line in fileIn:
+            lineArr = line.strip().split()
+            if not lineArr:
+                # Skip blank lines (e.g. a trailing newline at end of file).
+                continue
+            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
+            train_y.append(float(lineArr[2]))
+    return mat(train_x), mat(train_y).transpose()
+
+
+## Step 1: load the data.
+print("step 1: load data...")
+train_x, train_y = loadData()
+# The model is evaluated on the same samples it was trained on.
+test_x = train_x
+test_y = train_y
+
+## Step 2: train the model.
+print("step 2: training...")
+opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
+# Only the logRegression module itself is imported, so its functions have
+# to be reached through the module name.
+optimalWeights = logRegression.trainLogRegres(train_x, train_y, opts)
+
+## Step 3: test the model.
+print("step 3: testing...")
+accuracy = logRegression.testLogRegres(optimalWeights, test_x, test_y)
+
+## Step 4: show the result.
+print("step 4: show the result...")
+print('The classify accuracy is: %.3f%%' % (accuracy * 100))
+logRegression.showLogRegres(optimalWeights, train_x, train_y)