From 6000e5f36cc192022aac5dac6f5a59c4c9a60302 Mon Sep 17 00:00:00 2001
From: sheepmen
Date: Sat, 25 Mar 2017 14:03:24 +0800
Subject: [PATCH] DEV: ADD CHAPTER 4 CODE FILE

---
 src/python/03.DecisionTree/DecisionTree.py |  13 +-
 src/python/05.Logistic/logRegres.py        | 188 +++++++++++++++++++++
 2 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100755 src/python/05.Logistic/logRegres.py

diff --git a/src/python/03.DecisionTree/DecisionTree.py b/src/python/03.DecisionTree/DecisionTree.py
index f6f4bf1f..0d7e37f3 100644
--- a/src/python/03.DecisionTree/DecisionTree.py
+++ b/src/python/03.DecisionTree/DecisionTree.py
@@ -10,7 +10,7 @@ Decision Tree Source Code for Machine Learning in Action Ch. 3
 print(__doc__)
 import operator
 from math import log
-import decisionTreePlot as dtPlot
+import DecisionTreePlot as dtPlot
 
 
 def createDataSet():
@@ -116,14 +116,21 @@ def chooseBestFeatureToSplit(dataSet):
         # get a set of unique values
         # deduplicated set of the values in this column
         uniqueVals = set(featList)
+        print('uniqueVals:' + str(uniqueVals))
         # temporary entropy accumulator for this candidate split
         newEntropy = 0.0
         # iterate over the unique values of this column and accumulate the entropy of each split
         for value in uniqueVals:
             subDataSet = splitDataSet(dataSet, i, value)
+            print(subDataSet)
             prob = len(subDataSet) / float(len(dataSet))
-            newEntropy += prob * calcShannonEnt(subDataSet)
-        # gain[信息增益]: 划分数据集前后的信息变化, 获取信息熵最大的值
+            subEntropy = calcShannonEnt(subDataSet)
+            print('----------' + str(subEntropy))
+            newEntropy += prob * subEntropy
+        # gain (information gain): the change in entropy before and after the split.
+        # The larger the gain, the more class information this feature provides and
+        # the less uncertainty remains after splitting on it, so the feature with
+        # the largest gain is the best one to split on.
         infoGain = baseEntropy - newEntropy
         print 'infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy
         if (infoGain > bestInfoGain):
diff --git a/src/python/05.Logistic/logRegres.py b/src/python/05.Logistic/logRegres.py
new file mode 100755
index 00000000..89216058
--- /dev/null
+++ b/src/python/05.Logistic/logRegres.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+from numpy import *
+
+
+def loadDataSet():
+    dataMat = []
+    labelMat = []
+    fr = open('testSet.txt')
+    for line in fr.readlines():
+        lineArr = line.strip().split()
+        # each row: [bias term 1.0, feature x1, feature x2]
+        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
+        labelMat.append(int(lineArr[2]))
+    return dataMat, labelMat
+
+
+def sigmoid(inX):
+    # logistic function: maps any real number into (0, 1)
+    return 1.0 / (1 + exp(-inX))
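+
+
+# Illustrative usage sketch (not part of the book's listing; it assumes a
+# testSet.txt next to this script with rows of "x1 x2 label"):
+#
+#     dataArr, labelMat = loadDataSet()
+#     weights = gradAscent(dataArr, labelMat)
+#     plotBestFit(weights.getA())  # getA() turns the NumPy matrix into an array
+#
+# Note sigmoid(0) == 0.5, so 0.5 is the natural decision threshold on the output.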
+
+
+def gradAscent(dataMatIn, classLabels):
+    # convert the list of lists [[1,1,2],[1,1,2]...] to a NumPy matrix
+    dataMatrix = mat(dataMatIn)
+    # convert the label list [0,1,0,1,...] to a matrix and transpose it
+    # into a column vector [[0],[1],[0],...]
+    labelMat = mat(classLabels).transpose()
+    # m -> number of samples, n -> number of features
+    m, n = shape(dataMatrix)
+    # learning rate (step size)
+    alpha = 0.001
+    # number of iterations
+    maxCycles = 500
+    # one weight per feature, here n = 3 -> [[1],[1],[1]]
+    weights = ones((n, 1))
+    for k in range(maxCycles):  # heavy on matrix operations
+        # dataMatrix * weights is an (m x n) by (n x 1) matrix product: one
+        # weighted feature sum per sample, pushed through the sigmoid -> (m x 1)
+        h = sigmoid(dataMatrix * weights)
+        # (m x 1) column of prediction errors, one entry per sample
+        error = labelMat - h
+        # gradient step: dataMatrix.transpose() * error is the (n x 1) gradient
+        # of the log-likelihood (derivation omitted, as in the book)
+        weights = weights + alpha * dataMatrix.transpose() * error
+    return weights
+
+
+# Stochastic gradient ascent.
+# Plain gradient ascent walks over the whole data set for every update, which
+# is computationally expensive; stochastic gradient ascent updates the
+# regression coefficients with one sample at a time.
+def stocGradAscent0(dataMatrix, classLabels):
+    m, n = shape(dataMatrix)
+    alpha = 0.01
+    weights = ones(n)  # initialize to all ones
+    for i in range(m):
+        # weighted feature sum of a single sample, through the sigmoid
+        h = sigmoid(sum(dataMatrix[i] * weights))
+        error = classLabels[i] - h
+        weights = weights + alpha * error * dataMatrix[i]
+    return weights
+
+
+def plotBestFit(weights):
+    import matplotlib.pyplot as plt
+    dataMat, labelMat = loadDataSet()
+    dataArr = array(dataMat)
+    n = shape(dataArr)[0]
+    xcord1 = []
+    ycord1 = []
+    xcord2 = []
+    ycord2 = []
+    for i in range(n):
+        if int(labelMat[i]) == 1:
+            xcord1.append(dataArr[i, 1])
+            ycord1.append(dataArr[i, 2])
+        else:
+            xcord2.append(dataArr[i, 1])
+            ycord2.append(dataArr[i, 2])
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
+    ax.scatter(xcord2, ycord2, s=30, c='green')
+    x = arange(-3.0, 3.0, 0.1)
+    # decision boundary: the line where w0 + w1*x1 + w2*x2 == 0
+    y = (-weights[0] - weights[1] * x) / weights[2]
+    ax.plot(x, y)
+    plt.xlabel('X1')
+    plt.ylabel('X2')
+    plt.show()
+
+
+def stocGradAscent1(dataMatrix, classLabels, numIter=150):
+    m, n = shape(dataMatrix)
+    weights = ones(n)  # initialize to all ones
+    for j in range(numIter):
+        dataIndex = list(range(m))
+        for i in range(m):
+            # alpha decreases with each iteration but never reaches 0,
+            # thanks to the constant term; this damps the oscillations
+            alpha = 4 / (1.0 + j + i) + 0.0001
+            # pick samples at random to reduce periodic fluctuations
+            randIndex = int(random.uniform(0, len(dataIndex)))
+            sampleIndex = dataIndex[randIndex]
+            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
+            error = classLabels[sampleIndex] - h
+            weights = weights + alpha * error * dataMatrix[sampleIndex]
+            # drop the used index so each sample is visited once per pass
+            del dataIndex[randIndex]
+    return weights
+
+
+# a, b = loadDataSet()
+# weights = gradAscent(a, b)
+# plotBestFit(weights.getA())  # gradAscent returns a matrix; getA() flattens it
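+
+# Quick arithmetic check on the alpha schedule above (illustrative): the first
+# update (j = 0, i = 0) uses alpha = 4 / 1.0 + 0.0001 = 4.0001, while with the
+# default numIter = 150 and m = 100 samples the last update (j = 149, i = 99)
+# uses 4 / 249.0 + 0.0001, roughly 0.016, so early updates move the weights
+# far more than late ones.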
+
+
+######################################################################################################################
+
+def classifyVector(inX, weights):
+    # sigmoid output above 0.5 -> class 1, otherwise class 0
+    # (see the worked example at the end of this file)
+    prob = sigmoid(sum(inX * weights))
+    if prob > 0.5:
+        return 1.0
+    else:
+        return 0.0
+
+
+def colicTest():
+    frTrain = open('horseColicTraining.txt')
+    frTest = open('horseColicTest.txt')
+    trainingSet = []
+    trainingLabels = []
+    for line in frTrain.readlines():
+        currLine = line.strip().split('\t')
+        lineArr = []
+        # the first 21 columns are features, the 22nd is the label
+        for i in range(21):
+            lineArr.append(float(currLine[i]))
+        trainingSet.append(lineArr)
+        trainingLabels.append(float(currLine[21]))
+    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
+    errorCount = 0
+    numTestVec = 0.0
+    for line in frTest.readlines():
+        numTestVec += 1.0
+        currLine = line.strip().split('\t')
+        lineArr = []
+        for i in range(21):
+            lineArr.append(float(currLine[i]))
+        if int(classifyVector(array(lineArr), trainWeights)) != int(float(currLine[21])):
+            errorCount += 1
+    errorRate = float(errorCount) / numTestVec
+    print("the error rate of this test is: %f" % errorRate)
+    return errorRate
+
+
+def multiTest():
+    numTests = 10
+    errorSum = 0.0
+    for k in range(numTests):
+        errorSum += colicTest()
+    print("after %d iterations the average error rate is: %f"
+          % (numTests, errorSum / float(numTests)))
+
+
+if __name__ == '__main__':
+    # multiTest()
+    colicTest()
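+
+# Worked example for classifyVector (illustrative numbers, not from the data
+# set): with weights w = [1.0, -1.0, 0.5] and input x = [1.0, 2.0, 2.0],
+# sum(x * w) = 1.0 - 2.0 + 1.0 = 0.0 and sigmoid(0.0) = 0.5; since 0.5 is not
+# strictly greater than 0.5, the sample is classified as 0.0.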