From 6000e5f36cc192022aac5dac6f5a59c4c9a60302 Mon Sep 17 00:00:00 2001
From: sheepmen
Date: Sat, 25 Mar 2017 14:03:24 +0800
Subject: [PATCH] DEV: ADD CHAPTER 4 CODE FILE

---
 src/python/03.DecisionTree/DecisionTree.py |  13 +-
 src/python/05.Logistic/logRegres.py        | 188 +++++++++++++++++++++
 2 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100755 src/python/05.Logistic/logRegres.py

diff --git a/src/python/03.DecisionTree/DecisionTree.py b/src/python/03.DecisionTree/DecisionTree.py
index f6f4bf1f..0d7e37f3 100644
--- a/src/python/03.DecisionTree/DecisionTree.py
+++ b/src/python/03.DecisionTree/DecisionTree.py
@@ -10,7 +10,7 @@ Decision Tree Source Code for Machine Learning in Action Ch. 3
 print(__doc__)
 import operator
 from math import log
-import decisionTreePlot as dtPlot
+import DecisionTreePlot as dtPlot
 
 
 def createDataSet():
@@ -116,14 +116,21 @@ def chooseBestFeatureToSplit(dataSet):
         # get a set of unique values
         # deduplicated set of the values in this column
         uniqueVals = set(featList)
+        print('uniqueVals:' + str(uniqueVals))
         # temporary entropy accumulator for this candidate split
         newEntropy = 0.0
         # iterate over the unique values of this column and accumulate the entropy of each split
         for value in uniqueVals:
             subDataSet = splitDataSet(dataSet, i, value)
+            print(subDataSet)
             prob = len(subDataSet) / float(len(dataSet))
-            newEntropy += prob * calcShannonEnt(subDataSet)
-        # gain[信息增益]: 划分数据集前后的信息变化, 获取信息熵最大的值
+            subEntropy = calcShannonEnt(subDataSet)
+            print('----------' + str(subEntropy))
+            newEntropy += prob * subEntropy
+        # gain (information gain): the change in entropy before and after the split.
+        # The larger the gain, the more class information this feature provides and
+        # the less uncertainty remains after splitting on it, so the feature with
+        # the largest gain is the best one to split on.
         infoGain = baseEntropy - newEntropy
         print 'infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy
         if (infoGain > bestInfoGain):
diff --git a/src/python/05.Logistic/logRegres.py b/src/python/05.Logistic/logRegres.py
new file mode 100755
index 00000000..89216058
--- /dev/null
+++ b/src/python/05.Logistic/logRegres.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+from numpy import *
+
+
+def loadDataSet():
+    dataMat = []
+    labelMat = []
+    fr = open('testSet.txt')
+    for line in fr.readlines():
+        lineArr = line.strip().split()
+        # each row: [bias term 1.0, feature x1, feature x2]
+        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
+        labelMat.append(int(lineArr[2]))
+    return dataMat, labelMat
+
+
+def sigmoid(inX):
+    # logistic function: maps any real number into (0, 1)
+    return 1.0 / (1 + exp(-inX))
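+
+
+# Illustrative usage sketch (not part of the book's listing; it assumes a
+# testSet.txt next to this script with rows of "x1 x2 label"):
+#
+#     dataArr, labelMat = loadDataSet()
+#     weights = gradAscent(dataArr, labelMat)
+#     plotBestFit(weights.getA())  # getA() turns the NumPy matrix into an array
+#
+# Note sigmoid(0) == 0.5, so 0.5 is the natural decision threshold on the output.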
+
+
+def gradAscent(dataMatIn, classLabels):
+    # convert the list of lists [[1,1,2],[1,1,2]...] to a NumPy matrix
+    dataMatrix = mat(dataMatIn)
+    # convert the label list [0,1,0,1,...] to a matrix and transpose it
+    # into a column vector [[0],[1],[0],...]
+    labelMat = mat(classLabels).transpose()
+    # m -> number of samples, n -> number of features
+    m, n = shape(dataMatrix)
+    # learning rate (step size)
+    alpha = 0.001
+    # number of iterations
+    maxCycles = 500
+    # one weight per feature, here n = 3 -> [[1],[1],[1]]
+    weights = ones((n, 1))
+    for k in range(maxCycles):  # heavy on matrix operations
+        # dataMatrix * weights is an (m x n) by (n x 1) matrix product: one
+        # weighted feature sum per sample, pushed through the sigmoid -> (m x 1)
+        h = sigmoid(dataMatrix * weights)
+        # (m x 1) column of prediction errors, one entry per sample
+        error = labelMat - h
+        # gradient step: dataMatrix.transpose() * error is the (n x 1) gradient
+        # of the log-likelihood (derivation omitted, as in the book)
+        weights = weights + alpha * dataMatrix.transpose() * error
+    return weights
+
+
+# Stochastic gradient ascent.
+# Plain gradient ascent walks over the whole data set for every update, which
+# is computationally expensive; stochastic gradient ascent updates the
+# regression coefficients with one sample at a time.
+def stocGradAscent0(dataMatrix, classLabels):
+    m, n = shape(dataMatrix)
+    alpha = 0.01
+    weights = ones(n)  # initialize to all ones
+    for i in range(m):
+        # weighted feature sum of a single sample, through the sigmoid
+        h = sigmoid(sum(dataMatrix[i] * weights))
+        error = classLabels[i] - h
+        weights = weights + alpha * error * dataMatrix[i]
+    return weights
+
+
+def plotBestFit(weights):
+    import matplotlib.pyplot as plt
+    dataMat, labelMat = loadDataSet()
+    dataArr = array(dataMat)
+    n = shape(dataArr)[0]
+    xcord1 = []
+    ycord1 = []
+    xcord2 = []
+    ycord2 = []
+    for i in range(n):
+        if int(labelMat[i]) == 1:
+            xcord1.append(dataArr[i, 1])
+            ycord1.append(dataArr[i, 2])
+        else:
+            xcord2.append(dataArr[i, 1])
+            ycord2.append(dataArr[i, 2])
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
+    ax.scatter(xcord2, ycord2, s=30, c='green')
+    x = arange(-3.0, 3.0, 0.1)
+    # decision boundary: the line where w0 + w1*x1 + w2*x2 == 0
+    y = (-weights[0] - weights[1] * x) / weights[2]
+    ax.plot(x, y)
+    plt.xlabel('X1')
+    plt.ylabel('X2')
+    plt.show()
+
+
+def stocGradAscent1(dataMatrix, classLabels, numIter=150):
+    m, n = shape(dataMatrix)
+    weights = ones(n)  # initialize to all ones
+    for j in range(numIter):
+        dataIndex = list(range(m))
+        for i in range(m):
+            # alpha decreases with each iteration but never reaches 0,
+            # thanks to the constant term; this damps the oscillations
+            alpha = 4 / (1.0 + j + i) + 0.0001
+            # pick samples at random to reduce periodic fluctuations
+            randIndex = int(random.uniform(0, len(dataIndex)))
+            sampleIndex = dataIndex[randIndex]
+            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
+            error = classLabels[sampleIndex] - h
+            weights = weights + alpha * error * dataMatrix[sampleIndex]
+            # drop the used index so each sample is visited once per pass
+            del dataIndex[randIndex]
+    return weights
+
+
+# a, b = loadDataSet()
+# weights = gradAscent(a, b)
+# plotBestFit(weights.getA())  # gradAscent returns a matrix; getA() flattens it
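+
+# Quick arithmetic check on the alpha schedule above (illustrative): the first
+# update (j = 0, i = 0) uses alpha = 4 / 1.0 + 0.0001 = 4.0001, while with the
+# default numIter = 150 and m = 100 samples the last update (j = 149, i = 99)
+# uses 4 / 249.0 + 0.0001, roughly 0.016, so early updates move the weights
+# far more than late ones.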
+
+
+######################################################################################################################
+
+def classifyVector(inX, weights):
+    # sigmoid output above 0.5 -> class 1, otherwise class 0
+    # (see the worked example at the end of this file)
+    prob = sigmoid(sum(inX * weights))
+    if prob > 0.5:
+        return 1.0
+    else:
+        return 0.0
+
+
+def colicTest():
+    frTrain = open('horseColicTraining.txt')
+    frTest = open('horseColicTest.txt')
+    trainingSet = []
+    trainingLabels = []
+    for line in frTrain.readlines():
+        currLine = line.strip().split('\t')
+        lineArr = []
+        # the first 21 columns are features, the 22nd is the label
+        for i in range(21):
+            lineArr.append(float(currLine[i]))
+        trainingSet.append(lineArr)
+        trainingLabels.append(float(currLine[21]))
+    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
+    errorCount = 0
+    numTestVec = 0.0
+    for line in frTest.readlines():
+        numTestVec += 1.0
+        currLine = line.strip().split('\t')
+        lineArr = []
+        for i in range(21):
+            lineArr.append(float(currLine[i]))
+        if int(classifyVector(array(lineArr), trainWeights)) != int(float(currLine[21])):
+            errorCount += 1
+    errorRate = float(errorCount) / numTestVec
+    print("the error rate of this test is: %f" % errorRate)
+    return errorRate
+
+
+def multiTest():
+    numTests = 10
+    errorSum = 0.0
+    for k in range(numTests):
+        errorSum += colicTest()
+    print("after %d iterations the average error rate is: %f"
+          % (numTests, errorSum / float(numTests)))
+
+
+if __name__ == '__main__':
+    # multiTest()
+    colicTest()
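+
+# Worked example for classifyVector (illustrative numbers, not from the data
+# set): with weights w = [1.0, -1.0, 0.5] and input x = [1.0, 2.0, 2.0],
+# sum(x * w) = 1.0 - 2.0 + 1.0 = 0.0 and sigmoid(0.0) = 0.5; since 0.5 is not
+# strictly greater than 0.5, the sample is classified as 0.0.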