From 95a29ca47e8fc83ed0dd1607eb88b7c2804581ae Mon Sep 17 00:00:00 2001
From: jiangzhonglian
Date: Sun, 26 Mar 2017 18:12:25 +0800
Subject: [PATCH] Update comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../05.Logistic/core/logRegression01.py      | 103 -----------
 .../05.Logistic/core/test_logRegression.py   |  54 ------
 src/python/05.Logistic/logRegres.py          | 164 ------------------
 .../{Logistic.py => 05.Logistic/logistic.py} |  31 +++-
 src/python/07.AdaBoost/adaboost.py           |  16 +-
 .../regression.py                            |  24 ++-
 6 files changed, 46 insertions(+), 346 deletions(-)
 delete mode 100644 src/python/05.Logistic/core/logRegression01.py
 delete mode 100644 src/python/05.Logistic/core/test_logRegression.py
 delete mode 100755 src/python/05.Logistic/logRegres.py
 rename src/python/{Logistic.py => 05.Logistic/logistic.py} (87%)

diff --git a/src/python/05.Logistic/core/logRegression01.py b/src/python/05.Logistic/core/logRegression01.py
deleted file mode 100644
index caa026be..00000000
--- a/src/python/05.Logistic/core/logRegression01.py
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-from numpy import *
-import matplotlib.pyplot as plt
-import time
-'''
-1. The matplotlib module is required: pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
-Installing it directly can fail, so downloading the whl package is recommended:
-https://pypi.python.org/pypi/matplotlib/1.5.0
-
-2. The plotted figure can then be displayed
-'''
-
-"""
-@version:
-@author: yangjf
-@license: ApacheCN
-@contact: highfei2011@126.com
-@site: https://github.com/apachecn/MachineLearning
-@software: PyCharm
-@file: logRegression01.py
-@time: 2017/3/3 22:03
-@test result: ok
-"""
-
-# the sigmoid function
-def sigmoid(inX):
-    return 1.0 / (1 + exp(-inX))
-
-def trainLogRegres(train_x, train_y, opts):
-    # time the training run
-    startTime = time.time()
-
-    numSamples, numFeatures = shape(train_x)
-    alpha = opts['alpha']; maxIter = opts['maxIter']
-    weights = ones((numFeatures, 1))
-
-    # optimize with the selected gradient-descent variant
-    for k in range(maxIter):
-        if opts['optimizeType'] == 'gradDescent':  # batch gradient descent
-            output = sigmoid(train_x * weights)
-            error = train_y - output
-            weights = weights + alpha * train_x.transpose() * error
-        elif opts['optimizeType'] == 'stocGradDescent':  # stochastic gradient descent
-            for i in range(numSamples):
-                output = sigmoid(train_x[i, :] * weights)
-                error = train_y[i, 0] - output
-                weights = weights + alpha * train_x[i, :].transpose() * error
-        elif opts['optimizeType'] == 'smoothStocGradDescent':  # improved stochastic gradient descent
-            # pick samples at random to reduce cyclic fluctuation
-            dataIndex = range(numSamples)
-            for i in range(numSamples):
-                alpha = 4.0 / (1.0 + k + i) + 0.01
-                randIndex = int(random.uniform(0, len(dataIndex)))
-                output = sigmoid(train_x[randIndex, :] * weights)
-                error = train_y[randIndex, 0] - output
-                weights = weights + alpha * train_x[randIndex, :].transpose() * error
-                del(dataIndex[randIndex])  # drop the sample already used in this pass
-        else:
-            raise NameError('Unsupported optimize method type!')
-
-    print 'Congratulations, training complete! Took %fs!' % (time.time() - startTime)
-    return weights
-
-
-# test the trained logistic regression model on a given test set
-def testLogRegres(weights, test_x, test_y):
-    numSamples, numFeatures = shape(test_x)
-    matchCount = 0
-    for i in xrange(numSamples):
-        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
-        if predict == bool(test_y[i, 0]):
-            matchCount += 1
-    accuracy = float(matchCount) / numSamples
-    return accuracy
-
-
-# plot the trained logistic regression model; only 2-D data is supported
-def showLogRegres(weights, train_x, train_y):
-    # note: train_x and train_y are mat data types
-    numSamples, numFeatures = shape(train_x)
-    if numFeatures != 3:
-        print "Sorry! I cannot plot because the dimension of your data is not 2!"
-        return 1
-
-    # plot all the sample points
-    for i in xrange(numSamples):
-        if int(train_y[i, 0]) == 0:
-            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
-        elif int(train_y[i, 0]) == 1:
-            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
-
-    # draw the separating line
-    min_x = min(train_x[:, 1])[0, 0]
-    max_x = max(train_x[:, 1])[0, 0]
-    weights = weights.getA()  # convert the mat to an array
-    y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
-    y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
-    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
-    plt.xlabel('X1'); plt.ylabel('X2')
-    # show the figure
-    plt.show()
\ No newline at end of file
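The plotting helper above derives the separating line from the weights: the sigmoid crosses 0.5 exactly where w0 + w1*x1 + w2*x2 = 0, so x2 = (-w0 - w1*x1) / w2. A minimal standalone sketch of that computation, with made-up weights (the array values below are assumptions for illustration, not output of this patch):

from numpy import array

# Assumed toy weights: w0 multiplies the constant 1.0 feature, so the
# decision boundary sigmoid(w0 + w1*x1 + w2*x2) = 0.5 is the line
# w0 + w1*x1 + w2*x2 = 0, i.e. x2 = (-w0 - w1*x1) / w2.
weights = array([4.0, 0.5, -0.6])
for x1 in (-3.0, 0.0, 3.0):
    x2 = (-weights[0] - weights[1] * x1) / weights[2]
    print('boundary passes through (%.1f, %.2f)' % (x1, x2))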
diff --git a/src/python/05.Logistic/core/test_logRegression.py b/src/python/05.Logistic/core/test_logRegression.py
deleted file mode 100644
index c7d5d50d..00000000
--- a/src/python/05.Logistic/core/test_logRegression.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-import os
-import sys
-sys.path.append("C:\Python27")
-from numpy import *
-
-from logRegression01 import *
-"""
-@version:
-@author: yangjf
-@license: ApacheCN
-@contact: highfei2011@126.com
-@site: https://github.com/apachecn/MachineLearning
-@software: PyCharm
-@file: test_logRegression.py
-@time: 2017/3/3 22:09
-@test result: ok
-"""
-
-def loadData():
-    train_x = []
-    train_y = []
-    # path of the current working directory
-    project_dir = os.getcwdu()
-    # truncate the string at the project name: MachineLearning\
-    project_dir = project_dir[:project_dir.find("MachineLearning\\") + 15]
-    print project_dir
-    fileIn = open("%s/testData/Logistic_testdata.txt" % project_dir)
-    for line in fileIn.readlines():
-        lineArr = line.strip().split()
-        train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
-        train_y.append(float(lineArr[2]))
-    return mat(train_x), mat(train_y).transpose()
-
-
-## step 1: load the data
-print "step 1: load data..."
-train_x, train_y = loadData()
-test_x = train_x; test_y = train_y
-
-## step 2: train the model
-print "step 2: training..."
-opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
-optimalWeights = trainLogRegres(train_x, train_y, opts)
-
-## step 3: test the model
-print "step 3: testing..."
-accuracy = testLogRegres(optimalWeights, test_x, test_y)
-
-## step 4: show the result
-print "step 4: show the result..."
-print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
-showLogRegres(optimalWeights, train_x, train_y)
\ No newline at end of file
diff --git a/src/python/05.Logistic/logRegres.py b/src/python/05.Logistic/logRegres.py
deleted file mode 100755
index 89216058..00000000
--- a/src/python/05.Logistic/logRegres.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:utf-8 -*-
-from numpy import *
-
-
-def loadDataSet():
-    dataMat = []
-    labelMat = []
-    fr = open('testSet.txt')
-    for line in fr.readlines():
-        lineArr = line.strip().split()
-        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
-        labelMat.append(int(lineArr[2]))
-    return dataMat, labelMat
-
-
-def sigmoid(inX):
-    return 1.0 / (1 + exp(-inX))
-
-
-def gradAscent(dataMatIn, classLabels):
-    # convert to a matrix [[1,1,2],[1,1,2]....]
-    dataMatrix = mat(dataMatIn)  # convert to NumPy matrix
-    # convert to a matrix [[0,1,0,1,0,1.....]], then transpose it to [[0],[1],[0].....]
-    x = mat(classLabels)
-    labelMat = x.transpose()  # convert to NumPy matrix
-    # m -> number of samples, n -> number of features
-    m, n = shape(dataMatrix)
-    # step size
-    alpha = 0.001
-    # number of iterations
-    maxCycles = 500
-    # a matrix with one entry per feature, here n is 3 -> [[1],[1],[1]]
-    weights = ones((n, 1))
-    for k in range(maxCycles):  # heavy on matrix operations
-        # 1. dataMatrix * weights is a matrix product: (m x 3) * (3 x 1) -> (m x 1)
-        s = dataMatrix * weights
-        # feed the sum of each feature times its coefficient into the sigmoid
-        h = sigmoid(dataMatrix * weights)  # matrix mult
-        # an (m x 1) vector of errors, one per sample
-        error = (labelMat - h)  # vector subtraction
-        # dataMatrix.transpose() * error (derivation omitted)
-        data_tran = dataMatrix.transpose()
-        # a (3 x 1) update direction
-        data_tran_error = data_tran * error
-
-        # weights = weights + alpha * dataMatrix.transpose() * error  # matrix mult
-        weights = weights + alpha * data_tran_error
-    return weights
-
-
-# stochastic gradient ascent
-# plain gradient ascent must sweep the whole data set for every update, which is computationally expensive
-# stochastic gradient ascent updates the regression coefficients with one sample at a time
-def stocGradAscent0(dataMatrix, classLabels):
-    m, n = shape(dataMatrix)
-    alpha = 0.01
-    weights = ones(n)  # initialize to all ones
-    for i in range(m):
-        h = sigmoid(sum(dataMatrix[i] * weights))
-        error = classLabels[i] - h
-        weights = weights + alpha * error * dataMatrix[i]
-    return weights
-
-
-def plotBestFit(weights):
-    import matplotlib.pyplot as plt
-    dataMat, labelMat = loadDataSet()
-    dataArr = array(dataMat)
-    n = shape(dataArr)[0]
-    xcord1 = []; ycord1 = []
-    xcord2 = []; ycord2 = []
-    for i in range(n):
-        if int(labelMat[i]) == 1:
-            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
-        else:
-            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
-    fig = plt.figure()
-    ax = fig.add_subplot(111)
-    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
-    ax.scatter(xcord2, ycord2, s=30, c='green')
-    x = arange(-3.0, 3.0, 0.1)
-    y = (-weights[0] - weights[1] * x) / weights[2]
-    ax.plot(x, y)
-    plt.xlabel('X1'); plt.ylabel('X2')
-    plt.show()
-
-
-def stocGradAscent1(dataMatrix, classLabels, numIter=150):
-    m, n = shape(dataMatrix)
-    weights = ones(n)  # initialize to all ones
-    for j in range(numIter):
-        dataIndex = range(m)
-        for i in range(m):
-            # the step size keeps shrinking
-            alpha = 4 / (1.0 + j + i) + 0.0001  # alpha decreases with iteration, does not
-            # pick samples at random to reduce cyclic fluctuation
-            randIndex = int(random.uniform(0, len(dataIndex)))  # go to 0 because of the constant
-            h = sigmoid(sum(dataMatrix[randIndex] * weights))
-            error = classLabels[randIndex] - h
-            weights = weights + alpha * error * dataMatrix[randIndex]
-            del(dataIndex[randIndex])
-    return weights
-
-
-# a, b = loadDataSet()
-# weights = gradAscent(a, b)
-# plotBestFit(weights)
-#
-
-######################################################################################################################
-
-def classifyVector(inX, weights):
-    prob = sigmoid(sum(inX * weights))
-    if prob > 0.5:
-        return 1.0
-    else:
-        return 0.0
-
-
-def colicTest():
-    frTrain = open('horseColicTraining.txt')
-    frTest = open('horseColicTest.txt')
-    trainingSet = []; trainingLabels = []
-    for line in frTrain.readlines():
-        currLine = line.strip().split('\t')
-        lineArr = []
-        for i in range(21):
-            lineArr.append(float(currLine[i]))
-        trainingSet.append(lineArr)
-        trainingLabels.append(float(currLine[21]))
-    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
-    errorCount = 0; numTestVec = 0.0
-    for line in frTest.readlines():
-        numTestVec += 1.0
-        currLine = line.strip().split('\t')
-        lineArr = []
-        for i in range(21):
-            lineArr.append(float(currLine[i]))
-        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
-            errorCount += 1
-    errorRate = (float(errorCount) / numTestVec)
-    print "the error rate of this test is: %f" % errorRate
-    return errorRate
-
-
-def multiTest():
-    numTests = 10; errorSum = 0.0
-    for k in range(numTests):
-        errorSum += colicTest()
-    print "after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests))
-
-
-# multiTest()
-
-colicTest()
\ No newline at end of file
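The split comment on stocGradAscent1 above ("alpha decreases with iteration ... does not go to 0 because of the constant") is easiest to see by printing the schedule. A minimal sketch, with loop bounds chosen arbitrarily for illustration:

# Step-size schedule used by stocGradAscent1: alpha decays with both the
# pass index j and the in-pass index i, but the additive constant keeps
# it strictly positive.
for j in range(3):                # passes over the data set
    for i in (0, 25, 50, 75):     # sample positions within a pass
        alpha = 4 / (1.0 + j + i) + 0.0001
        print('j=%d i=%2d alpha=%.4f' % (j, i, alpha))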
diff --git a/src/python/Logistic.py b/src/python/05.Logistic/logistic.py
similarity index 87%
rename from src/python/Logistic.py
rename to src/python/05.Logistic/logistic.py
index c59e022d..7a727dc2 100644
--- a/src/python/Logistic.py
+++ b/src/python/05.Logistic/logistic.py
@@ -6,10 +6,10 @@ Created on Oct 27, 2010
 Logistic Regression Working Module
 @author: Peter
 '''
-
-import os
 from numpy import *
 import matplotlib.pyplot as plt
+
+
 # parse the data
 def loadDataSet(file_name):
     # dataMat holds the raw data, labelMat the corresponding labels
@@ -25,17 +25,24 @@ def loadDataSet(file_name):
 def sigmoid(inX):
     return 1.0/(1+exp(-inX))
 
+
 # the standard approach: batch gradient ascent
 def gradAscent(dataMatIn, classLabels):
+    # convert to a matrix [[1,1,2],[1,1,2]....]
     dataMatrix = mat(dataMatIn) #convert to NumPy matrix
+    # convert to a matrix [[0,1,0,1,0,1.....]], then transpose it to [[0],[1],[0].....]
     # transpose() swaps rows and columns
     # i.e. turns the row matrix into a column matrix => the matrix transpose
     labelMat = mat(classLabels).transpose() #convert to NumPy matrix
+    # m -> number of samples, n -> number of features
    m,n = shape(dataMatrix)
     # print m, n, '__'*10, shape(dataMatrix.transpose()), '__'*100
+    # step size
     alpha = 0.001
+    # number of iterations
     maxCycles = 500
-    # weights
+    # a matrix with one entry per feature, here n is 3 -> [[1],[1],[1]]
+    # the regression coefficients
     weights = ones((n,1))
     for k in range(maxCycles): #heavy on matrix operations
         # an (m x 3) matrix times the (3 x 1) weight vector gives an (m x 1) matrix
@@ -49,7 +56,10 @@ def gradAscent(dataMatIn, classLabels):
         weights = weights + alpha * dataMatrix.transpose() * error #matrix mult
     return array(weights)
 
-# gradient ascent algorithm
+
+# stochastic gradient ascent
+# plain gradient ascent must sweep the whole data set for every update, which is computationally expensive
+# stochastic gradient ascent updates the regression coefficients with one sample at a time
 def stocGradAscent0(dataMatrix, classLabels):
     m,n = shape(dataMatrix)
     alpha = 0.01
@@ -65,6 +75,7 @@ def stocGradAscent0(dataMatrix, classLabels):
         weights = weights + alpha * error * dataMatrix[i]
     return weights
 
+
 # stochastic gradient ascent (with randomized sample selection)
 def stocGradAscent1(dataMatrix, classLabels, numIter=150):
     m,n = shape(dataMatrix)
@@ -86,6 +97,7 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150):
         del(dataIndex[randIndex])
     return weights
 
+
 # visualize the result
 def plotBestFit(dataArr, labelMat, weights):
     n = shape(dataArr)[0]
@@ -114,10 +126,12 @@ def plotBestFit(dataArr, labelMat, weights):
     plt.xlabel('X'); plt.ylabel('Y')
     plt.show()
 
+
 def main():
-    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
+    # project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
     # 1. collect and prepare the data
-    dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
+    # dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
+    dataMat, labelMat = loadDataSet("testData/Logistic_testdata.txt")
 
     # print dataMat, '---\n', labelMat
     # 2. train the model: the matrix of coefficients (a1, b2, .., nn).T in f(x)=a1*x1+b2*x2+..+nn*xn
@@ -132,5 +146,6 @@ def main():
     # visualize the data
     plotBestFit(dataArr, labelMat, weights)
 
-if __name__=="__main__":
-    main()
\ No newline at end of file
+
+if __name__ == "__main__":
+    main()
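The comments added to gradAscent above describe the batch update w <- w + alpha * X^T (y - sigmoid(Xw)). A minimal self-contained sketch of that loop on a made-up 4x3 design matrix (the data, alpha, and iteration count below are assumptions for illustration, not values from the patch):

from numpy import exp, mat, ones

def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))

# Made-up data: a constant 1.0 column plus two features, with labels y.
X = mat([[1.0, 1.0, 2.0],
         [1.0, 2.0, 1.0],
         [1.0, -1.0, -2.0],
         [1.0, -2.0, -1.0]])
y = mat([1, 1, 0, 0]).transpose()

alpha = 0.001
weights = ones((3, 1))
for k in range(500):
    error = y - sigmoid(X * weights)                   # (4 x 1) residuals
    weights = weights + alpha * X.transpose() * error  # (3 x 1) update
print(weights.T)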
diff --git a/src/python/07.AdaBoost/adaboost.py b/src/python/07.AdaBoost/adaboost.py
index 8f6a44f9..676cdff8 100644
--- a/src/python/07.AdaBoost/adaboost.py
+++ b/src/python/07.AdaBoost/adaboost.py
@@ -267,10 +267,10 @@ if __name__ == "__main__":
     D = mat(ones((5, 1))/5)
     print 'D=', D.T
 
-    bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
-    print 'bestStump=', bestStump
-    print 'minError=', minError
-    print 'bestClasEst=', bestClasEst.T
+    # bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
+    # print 'bestStump=', bestStump
+    # print 'minError=', minError
+    # print 'bestClasEst=', bestClasEst.T
 
 
     # classifier: weakClassArr
@@ -288,14 +288,14 @@ if __name__ == "__main__":
     print adaClassify([0, 0], weakClassArr).T
     print adaClassify([[5, 5], [0, 0]], weakClassArr).T
 
-    # horse colic data set
-    # training set
+    # # horse colic data set
+    # # training set
     # dataArr, labelArr = loadDataSet("testData/AB_horseColicTraining2.txt")
     # weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40)
     # print weakClassArr, '\n-----\n', aggClassEst.T
-    # compute the AUC (area under the ROC curve)
+    # # compute the AUC (area under the ROC curve)
    # plotROC(aggClassEst.T, labelArr)
-    # test set
+    # # test set
     # dataArrTest, labelArrTest = loadDataSet("testData/AB_horseColicTest2.txt")
     # m = shape(dataArrTest)[0]
     # predicting10 = adaClassify(dataArrTest, weakClassArr)
diff --git a/src/python/08.Predictive numerical data regression/regression.py b/src/python/08.Predictive numerical data regression/regression.py
index c47a09fd..e2ef8aa1 100644
--- a/src/python/08.Predictive numerical data regression/regression.py
+++ b/src/python/08.Predictive numerical data regression/regression.py
@@ -1,3 +1,6 @@
+#!/usr/bin/python
+# coding:utf8
+
 '''
 Created by ApacheCN-小瑶
 Date: 2017-02-27
@@ -26,6 +29,8 @@ def standRegres(xArr,yArr): #linear regression
     if linalg.det(xTx) == 0.0: # xTx must be inverted, so first check that it is invertible: its determinant must be non-zero
         print ("This matrix is singular, cannot do inverse")
         return
+    # ordinary least squares
+    # http://www.apache.wiki/pages/viewpage.action?pageId=5505133
     ws = xTx.I * (xMat.T*yMat) # the formula from the book, giving the optimal solution for w
     return ws
 
@@ -69,7 +74,7 @@ def ridgeRegres(xMat,yMat,lam=0.2): #ridge regression
         return
     ws = denom.I * (xMat.T*yMat)
     return ws
-    
+
 def ridgeTest(xArr,yArr):
     xMat = mat(xArr); yMat=mat(yArr).T
     yMean = mean(yMat,0) #compute the mean of Y
@@ -85,6 +90,7 @@ def ridgeTest(xArr,yArr):
         wMat[i,:]=ws.T
     return wMat
 
+
 def regularize(xMat):#normalize column by column
     inMat = xMat.copy()
     inMeans = mean(inMat,0) #compute the mean, then subtract it
@@ -227,7 +233,7 @@ def crossValidation(xArr,yArr,numVal=10):
 
 #test for standRegression
 def regression1():
-    xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
+    xArr, yArr = loadDataSet("testData/Regression_data.txt")
     xMat = mat(xArr)
     yMat = mat(yArr)
     ws = standRegres(xArr, yArr)
@@ -245,7 +251,7 @@ def regression1():
 
 #test for LWLR
 def regression2():
-    xArr, yArr = loadDataSet("../../../testData/Regression_data.txt")
+    xArr, yArr = loadDataSet("testData/Regression_data.txt")
     yHat = lwlrTest(xArr, xArr, yArr, 0.003)
     xMat = mat(xArr)
     srtInd = xMat[:,1].argsort(0) #argsort() sorts the elements in ascending order and returns the corresponding indices
@@ -259,7 +265,7 @@ def regression2():
 
 #test for ridgeRegression
 def regression3():
-    abX,abY = loadDataSet("../../../testData/Regression_abalone.txt")
+    abX,abY = loadDataSet("testData/Regression_abalone.txt")
     ridgeWeights = ridgeTest(abX, abY)
     fig = plt.figure()
     ax = fig.add_subplot(111)
@@ -269,7 +275,7 @@ def regression3():
 
 #test for stageWise
 def regression4():
-    xArr,yArr=loadDataSet("../../../testData/Regression_abalone.txt")
+    xArr,yArr=loadDataSet("testData/Regression_abalone.txt")
     stageWise(xArr,yArr,0.01,200)
     xMat = mat(xArr)
     yMat = mat(yArr).T
@@ -280,7 +286,7 @@ def regression4():
     print (weights.T)
 
 if __name__ == "__main__":
-    #regression1()
-    #regression2()
-    #regression3()
-    regression4()
\ No newline at end of file
+    # regression1()
+    regression2()
+    # regression3()
+    # regression4()
\ No newline at end of file
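For reference, the ridgeRegres function touched above implements the closed form w = (X^T X + lam*I)^-1 X^T y; the least-squares link added in the standRegres hunk covers the lam = 0 case. A minimal sketch on made-up toy data (all names and numbers below are assumptions for illustration):

from numpy import eye, mat

# Made-up 3x2 design matrix (bias column plus one feature) and targets.
X = mat([[1.0, 1.0],
         [1.0, 2.0],
         [1.0, 3.0]])
y = mat([[1.1], [1.9], [3.2]])

lam = 0.2
xTx = X.T * X
# The lam * I term keeps the matrix invertible even when X^T X is singular.
ws = (xTx + eye(2) * lam).I * (X.T * y)
print(ws.T)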