#!/usr/bin/python
# coding: utf8
'''
Created on Oct 27, 2010
Logistic Regression Working Module
@author: Peter
'''

import os

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: the training routines below only need NumPy.
    plt = None


def loadDataSet(file_name):
    """Parse a whitespace-separated data file into features and labels.

    Each non-blank line must hold two float features followed by an integer
    class label.  A constant 1.0 is prepended to every feature row so that
    the first weight acts as the intercept term.

    Args:
        file_name: path of the text file to read.

    Returns:
        A tuple ``(dataMat, labelMat)``: a list of ``[1.0, x1, x2]`` rows and
        the list of integer labels, in file order.
    """
    dataMat = []
    labelMat = []
    # The context manager guarantees the handle is closed even on a bad line.
    with open(file_name) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^-x), element-wise for arrays."""
    return 1.0 / (1 + np.exp(-inX))


def gradAscent(dataMatIn, classLabels):
    """Fit logistic-regression weights with batch gradient ascent.

    Runs 500 full-batch updates with a fixed step size of 0.001, starting
    from all-ones weights.

    Args:
        dataMatIn: m x n feature rows (list of lists or array).
        classLabels: sequence of m class labels (0 or 1).

    Returns:
        An ``(n, 1)`` ndarray of fitted weights.
    """
    # Plain ndarrays with explicit .dot() replace the legacy matrix type.
    dataMatrix = np.asarray(dataMatIn, dtype=float)
    # Column vector of the observed labels, shape (m, 1).
    labelCol = np.asarray(classLabels, dtype=float).reshape(-1, 1)
    m, n = dataMatrix.shape
    alpha = 0.001
    maxCycles = 500
    weights = np.ones((n, 1))
    for _ in range(maxCycles):
        # (m x n) . (n x 1) -> (m x 1): predicted probability for every row.
        h = sigmoid(dataMatrix.dot(weights))
        # Difference between the observed labels and the predictions.
        error = labelCol - h
        # (n x m) . (m x 1) -> (n x 1): the error accumulated per feature
        # column, i.e. the direction in which each weight is moved.
        weights = weights + alpha * dataMatrix.T.dot(error)
    return weights


def stocGradAscent0(dataMatrix, classLabels):
    """Fit weights with one sequential pass of stochastic gradient ascent.

    Visits every sample once, in order, with a fixed step size of 0.01, and
    prints the current weights, the sample and its error at every step.

    Args:
        dataMatrix: m x n array of feature rows.
        classLabels: sequence of m class labels (0 or 1).

    Returns:
        A 1-D ndarray holding the n fitted weights.
    """
    dataMatrix = np.asarray(dataMatrix, dtype=float)
    m, n = dataMatrix.shape
    alpha = 0.01
    weights = np.ones(n)  # initialize to all ones
    for i in range(m):
        # w . x for the current sample, squashed to a probability.
        h = sigmoid(np.sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        # A single pre-formatted argument prints the same text on
        # Python 2 and Python 3.
        print("%s %s %s %s %s" % (weights, "*" * 10, dataMatrix[i], "*" * 10, error))
        weights = weights + alpha * error * dataMatrix[i]
    return weights


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit weights with randomized stochastic gradient ascent.

    Every pass visits each sample exactly once in random order.  The step
    size shrinks as the pass number and the position within the pass grow,
    but the added constant keeps it from ever reaching zero.

    Args:
        dataMatrix: m x n array of feature rows.
        classLabels: sequence of m class labels (0 or 1).
        numIter: number of passes over the data set.

    Returns:
        A 1-D ndarray holding the n fitted weights.
    """
    dataMatrix = np.asarray(dataMatrix, dtype=float)
    m, n = dataMatrix.shape
    weights = np.ones(n)  # initialize to all ones
    for j in range(numIter):
        # Sample indices not yet used in this pass.  It must be a real list:
        # a Python 3 range object does not support item deletion.
        dataIndex = list(range(m))
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.0001
            # Pick a random position among the remaining indices ...
            randPos = int(np.random.uniform(0, len(dataIndex)))
            # ... and map it to the actual sample index, so the draw is
            # without replacement within a pass.
            sampleIdx = dataIndex[randPos]
            h = sigmoid(np.sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            del dataIndex[randPos]
    return weights


def plotBestFit(dataArr, labelMat, weights):
    """Scatter-plot the two classes and draw the fitted decision boundary.

    Args:
        dataArr: m x 3 array of ``[1.0, x1, x2]`` feature rows.
        labelMat: sequence of m class labels; label 1 is drawn as red
            squares, everything else as green dots.
        weights: the three fitted weights, as an ``(n,)`` or ``(n, 1)``
            array.

    Raises:
        ImportError: if matplotlib could not be imported.
    """
    if plt is None:
        raise ImportError("matplotlib is required for plotBestFit")
    dataArr = np.asarray(dataArr)
    # Flatten so weight vectors of shape (n,) and (n, 1) both work.
    weights = np.asarray(weights, dtype=float).ravel()
    n = dataArr.shape[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    # Where the boundary line comes from: every row is [1.0, x1, x2], so the
    # model is f(x) = w0*1 + w1*x1 + w2*x2.  The boundary is where f(x) = 0,
    # and x2 is the plot's vertical axis, hence
    #     w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()


def main():
    """Load the sample data set, train a model and plot the result."""
    # The data directory is resolved two levels above the working directory.
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    # 1. Collect and prepare the data.
    dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
    # 2. Train the model: find the coefficients of f(x) = w0 + w1*x1 + w2*x2.
    # An ndarray is used so that `*` means element-wise multiplication.
    dataArr = np.array(dataMat)
    # gradAscent and stocGradAscent0 accept the same arguments and can be
    # substituted here.
    weights = stocGradAscent1(dataArr, labelMat)
    # 3. Visualize the data and the fitted boundary.
    plotBestFit(dataArr, labelMat, weights)


if __name__ == "__main__":
    main()