DEV: ADD CHAPTER 4 CODE FILE
@@ -10,7 +10,7 @@ Decision Tree Source Code for Machine Learning in Action Ch. 3
 print(__doc__)
 import operator
 from math import log
-import decisionTreePlot as dtPlot
+import DecisionTreePlot as dtPlot


 def createDataSet():
@@ -116,14 +116,24 @@ def chooseBestFeatureToSplit(dataSet):
         # get a set of unique (deduplicated) values for this column
         uniqueVals = set(featList)
+        print('uniqueVals:' + str(uniqueVals))
         # temporary entropy for this candidate split
         newEntropy = 0.0
         # iterate over this column's unique values and compute the split's entropy
         for value in uniqueVals:
             subDataSet = splitDataSet(dataSet, i, value)
+            print(subDataSet)
             prob = len(subDataSet) / float(len(dataSet))
-            newEntropy += prob * calcShannonEnt(subDataSet)
+            text = calcShannonEnt(subDataSet)
+            print('----------' + str(text))
+            newEntropy += prob * text
         # gain (information gain): the change in entropy before and after the
         # split; keep the feature with the largest gain. A larger gain means the
         # feature removes more uncertainty about the class: the purer the groups
         # this column produces, the lower their Shannon entropy and the larger
         # the information gain.
         infoGain = baseEntropy - newEntropy
         print('infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy)
         if (infoGain > bestInfoGain):
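For context, the entropy that chooseBestFeatureToSplit accumulates is the Shannon entropy of the class labels. Below is a minimal standalone sketch of a calcShannonEnt-style function in the MLiA layout, assuming the class label is the last column of each row; the function name and the toy rows are made up for illustration:

from math import log

def calcShannonEntSketch(dataSet):
    # H = -sum(p * log2(p)) over the class-label frequencies
    labelCounts = {}
    for featVec in dataSet:
        label = featVec[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
    entropy = 0.0
    for count in labelCounts.values():
        prob = float(count) / len(dataSet)
        entropy -= prob * log(prob, 2)
    return entropy

# two 'yes' and one 'no' -> -(2/3)*log2(2/3) - (1/3)*log2(1/3) ~= 0.918
print(calcShannonEntSketch([[1, 'yes'], [1, 'yes'], [0, 'no']]))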
src/python/05.Logistic/logRegres.py  (Executable file, 164 lines)
@@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
from numpy import *
|
||||
|
||||
|
||||
def loadDataSet():
|
||||
dataMat = []
|
||||
labelMat = []
|
||||
fr = open('testSet.txt')
|
||||
for line in fr.readlines():
|
||||
lineArr = line.strip().split()
|
||||
dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
|
||||
labelMat.append(int(lineArr[2]))
|
||||
return dataMat, labelMat
|
||||
|
||||
|
||||
def sigmoid(inX):
|
||||
return 1.0 / (1 + exp(-inX))
|
||||
|
||||
|
||||
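One caveat about sigmoid as written: with NumPy, exp(-inX) overflows float64 once inX drops below about -709 and emits a RuntimeWarning (the result still degrades gracefully to 0.0, since 1/(1+inf) underflows). If the warning matters, clipping the input first is a simple guard; this is a sketch, not part of the original file, and it relies on the file's `from numpy import *` for exp and clip:

def sigmoidClipped(inX):
    # clip the input so exp() stays within float64 range (exp(500) is finite)
    return 1.0 / (1 + exp(-clip(inX, -500, 500)))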
def gradAscent(dataMatIn, classLabels):
    # convert to a NumPy matrix: [[1,1,2],[1,1,2]....]
    dataMatrix = mat(dataMatIn)
    # convert the labels [0,1,0,1,...] to a matrix and transpose it
    # into a column vector [[0],[1],[0],...]
    labelMat = mat(classLabels).transpose()
    # m -> number of samples, n -> number of features
    m, n = shape(dataMatrix)
    # step size (learning rate)
    alpha = 0.001
    # number of iterations
    maxCycles = 500
    # one weight per feature; here n is 3 -> [[1],[1],[1]]
    weights = ones((n, 1))
    for k in range(maxCycles):  # heavy on matrix operations
        # 1. dataMatrix * weights is a matrix product:
        #    [[1,1,2],[1,1,2]....] * [[1],[1],[1]] -> a column vector of weighted sums
        s = dataMatrix * weights
        # 2. feed each sample's weighted feature sum into the sigmoid
        h = sigmoid(s)  # matrix mult
        # 3. one error term per sample (100 here)
        error = labelMat - h  # vector subtraction
        # 4. dataMatrix.transpose() * error (derivation omitted):
        #    an (n x m) matrix times an (m x 1) vector -> [[a],[b],[c]]
        data_tran = dataMatrix.transpose()
        data_tran_error = data_tran * error

        # weights = weights + alpha * dataMatrix.transpose() * error  # matrix mult
        weights = weights + alpha * data_tran_error
    return weights
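The update above implements w <- w + alpha * X^T(y - sigmoid(Xw)), which is gradient ascent on the log-likelihood of logistic regression; that is the derivation the comment omits. As a sanity check, the analytic gradient X^T(y - h) can be compared against a finite-difference estimate of the log-likelihood. This is a standalone sketch with made-up numbers, not part of logRegres.py:

from numpy import mat, log, zeros, exp

def _sigmoid(z):
    return 1.0 / (1 + exp(-z))

X = mat([[1.0, 2.0, 1.5], [1.0, -1.0, 0.5], [1.0, 0.3, -2.0]])  # made-up samples
y = mat([[1.0], [0.0], [1.0]])                                  # made-up labels
w = mat([[0.1], [0.2], [0.3]])

def logLik(w):
    # log-likelihood: sum(y*log(h) + (1-y)*log(1-h))
    h = _sigmoid(X * w)
    return float(y.T * log(h) + (1 - y).T * log(1 - h))

analytic = X.T * (y - _sigmoid(X * w))  # the gradient gradAscent climbs
eps = 1e-6
for j in range(3):
    d = mat(zeros((3, 1)))
    d[j] = eps
    numeric = (logLik(w + d) - logLik(w - d)) / (2 * eps)
    print(float(analytic[j]), numeric)  # each pair should agree closely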
# stochastic gradient ascent
# plain gradient ascent has to sweep the whole data set every time the
# coefficients are updated, which is computationally expensive;
# stochastic gradient ascent updates the regression coefficients with
# a single sample at a time
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)  # initialize to all ones
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
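As a quick usage sketch (made-up, linearly separable points; the functions and the numpy import come from this file), the two routines can be called on the same data: stocGradAscent0 performs just m cheap elementwise updates in one pass, while gradAscent runs maxCycles full-matrix iterations. Note stocGradAscent0 expects a NumPy array, since it relies on elementwise `*`:

toyData = array([[1.0, -1.0, -1.5],
                 [1.0, -0.5, -1.0],
                 [1.0, 0.5, 1.0],
                 [1.0, 1.0, 1.5]])
toyLabels = [0, 0, 1, 1]
print(stocGradAscent0(toyData, toyLabels))  # 4 one-sample updates
print(gradAscent(toyData, toyLabels))       # 500 full matrix passes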
def plotBestFit(weights):
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # the decision boundary is where w0 + w1*x1 + w2*x2 = 0 (sigmoid input 0),
    # so solve for x2: x2 = (-w0 - w1*x1) / w2
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)  # initialize to all ones
    for j in range(numIter):
        dataIndex = list(range(m))  # list() so indices can be deleted below
        for i in range(m):
            # the step size shrinks as iterations accumulate, but the constant
            # term keeps it from ever reaching 0
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick the next sample at random to reduce periodic oscillation
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            # remove the chosen index so each sample is used once per pass
            del dataIndex[randIndex]
    return weights
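The schedule in stocGradAscent1 is worth seeing concretely: 4/(1.0+j+i) + 0.0001 decays toward its 0.0001 floor rather than to zero, which is what lets late samples still move the weights. A standalone sketch printing a few values:

# sketch: how the stocGradAscent1 step size decays over iterations
for j in [0, 1, 10, 100]:
    for i in [0, 50]:
        print('j=%d i=%d alpha=%f' % (j, i, 4 / (1.0 + j + i) + 0.0001))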
# a, b = loadDataSet()
# weights = gradAscent(a, b)
# plotBestFit(weights)
#


######################################################################################################################

def classifyVector(inX, weights):
    # classify with the learned weights: sigmoid output above 0.5 -> class 1
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0
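For instance, a weighted sum of exactly 0 sits on the decision boundary: sigmoid(0) = 0.5, which is not greater than 0.5, so classifyVector returns 0.0. A one-line sketch with made-up numbers:

# 1.0*0.5 + 2.0*0.25 + (-1.0)*1.0 = 0.0 -> sigmoid(0) = 0.5 -> class 0.0
print(classifyVector(array([1.0, 2.0, -1.0]), array([0.5, 0.25, 1.0])))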
def colicTest():
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        # the first 21 tab-separated fields are features, the 22nd is the label
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0
    numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate


def multiTest():
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))


# multiTest()

colicTest()