更新2.KNN文件位置

2026-07-01 10:46:11 +08:00 · 2017-04-07 16:43:54 +08:00
parent ebc6bbad78
commit a1b53e46fa
2894 changed files with 5 additions and 5 deletions
--- a/src/python/5.Logistic/logistic.py
+++ b/src/python/5.Logistic/logistic.py
@@ -0,0 +1,149 @@
+#!/usr/bin/python
+# coding: utf8
+
+'''
+Created on Oct 27, 2010
+Logistic Regression Working Module
+@author: Peter
+'''
+from numpy import *
+import matplotlib.pyplot as plt
+
+
+# 解析数据
+def loadDataSet(file_name):
+    """Read a whitespace-delimited sample file into features and labels.
+
+    Each non-blank line must hold at least three fields: two numeric
+    feature values followed by an integer class label.
+
+    Args:
+        file_name: path of the text file to read.
+
+    Returns:
+        A tuple ``(dataMat, labelMat)``. ``dataMat`` is a list of
+        ``[1.0, x1, x2]`` rows, where the leading 1.0 is a constant input
+        that pairs with the intercept weight; ``labelMat`` is the list of
+        integer labels in file order.
+
+    Raises:
+        IOError: if the file cannot be opened.
+        ValueError: if a field cannot be parsed as a number.
+        IndexError: if a non-blank line has fewer than three fields.
+    """
+    dataMat = []
+    labelMat = []
+    # The context manager closes the handle even when parsing raises.
+    with open(file_name) as fr:
+        for line in fr:
+            lineArr = line.split()
+            if not lineArr:
+                # Blank lines (e.g. a trailing newline) carry no sample.
+                continue
+            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
+            labelMat.append(int(lineArr[2]))
+    return dataMat, labelMat
+
+# sigmoid跳跃函数
+def sigmoid(inX):
+    """Return the logistic function 1 / (1 + e^-inX) of ``inX``.
+
+    ``exp`` comes from NumPy, so the result is computed element-wise when
+    ``inX`` is an array or matrix.
+    """
+    denominator = 1 + exp(-inX)
+    return 1.0 / denominator
+
+
+# 正常的处理方案
+def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
+    """Fit logistic-regression weights with batch gradient ascent.
+
+    Every iteration uses the whole data set: predictions are computed for
+    all samples at once and the weights move along the summed gradient.
+
+    Args:
+        dataMatIn: 2-D array-like with one row per sample (m rows) and one
+            column per feature (n columns).
+        classLabels: flat sequence of m numeric labels; the error term
+            compares them with sigmoid outputs, which lie in (0, 1).
+        alpha: step size applied to every update (default 0.001).
+        maxCycles: number of full-batch iterations (default 500).
+
+    Returns:
+        ndarray of shape (n, 1) holding the fitted weights.
+    """
+    dataMatrix = mat(dataMatIn)                  # m x n matrix
+    # mat() of a flat sequence is a 1 x m row; transpose to an m x 1 column
+    # so it lines up with the prediction column below.
+    labelMat = mat(classLabels).transpose()
+    n = shape(dataMatrix)[1]
+    # Regression coefficients start as an n x 1 column of ones.
+    weights = ones((n, 1))
+    for _ in range(maxCycles):
+        # (m x n) * (n x 1) -> m x 1 column of predicted probabilities.
+        h = sigmoid(dataMatrix * weights)
+        # Difference between actual labels and predictions, m x 1.
+        error = labelMat - h
+        # (n x m) * (m x 1) -> n x 1: per-feature sum of errors weighted by
+        # that feature's values, scaled by the step size.
+        weights = weights + alpha * dataMatrix.transpose() * error
+    return array(weights)
+
+
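+# Stochastic gradient ascent
+# Batch gradient ascent has to traverse the entire data set on every update, so its computational cost is high.
+# Stochastic gradient ascent updates the regression coefficients using only one sample at a time.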
+def stocGradAscent0(dataMatrix, classLabels):
+    """Run a single in-order pass of stochastic gradient ascent.
+
+    The samples are visited once, from first to last, and the weights are
+    updated after each one with a fixed step size of 0.01. Before every
+    update the current weights, the sample and its error are printed.
+
+    Args:
+        dataMatrix: 2-D array with one row per sample; rows are multiplied
+            element-wise with the weight vector.
+        classLabels: sequence of labels indexed in step with the rows.
+
+    Returns:
+        1-D array of weights, one per column of ``dataMatrix``.
+    """
+    sampleCount, featureCount = shape(dataMatrix)
+    stepSize = 0.01
+    # All-ones starting point, one weight per feature.
+    weights = ones(featureCount)
+    for idx in range(sampleCount):
+        sample = dataMatrix[idx]
+        # Weighted sum of the features, squashed by the sigmoid.
+        predicted = sigmoid(sum(sample * weights))
+        residual = classLabels[idx] - predicted
+        # Trace of the state going into this update.
+        print weights, "*"*10 , sample, "*"*10 , residual
+        weights = weights + stepSize * residual * sample
+    return weights
+
+
+# 随机梯度上升算法（随机化）
+def stocGradAscent1(dataMatrix, classLabels, numIter=150):
+    """Fit weights with randomized stochastic gradient ascent.
+
+    Each of the ``numIter`` passes visits every sample exactly once in a
+    random order, updating the weights after each sample with a step size
+    that decays as the pass and step counters grow.
+
+    Args:
+        dataMatrix: 2-D array with one row per sample; rows are multiplied
+            element-wise with the weight vector.
+        classLabels: sequence of labels indexed in step with the rows.
+        numIter: number of passes over the data set (default 150).
+
+    Returns:
+        1-D array of weights, one per column of ``dataMatrix``.
+    """
+    m, n = shape(dataMatrix)
+    weights = ones(n)   # all-ones starting point, one weight per feature
+    for j in range(numIter):
+        # Indices of the samples not yet used in this pass. A real list is
+        # required because entries are removed as they are consumed.
+        dataIndex = list(range(m))
+        for i in range(m):
+            # Shrinks as j and i grow, but the added constant keeps it
+            # from ever reaching 0.
+            alpha = 4 / (1.0 + j + i) + 0.0001
+            # Random position within the list of remaining indices.
+            randIndex = int(random.uniform(0, len(dataIndex)))
+            # Translate that position into the actual sample index and
+            # retire it, so each sample is used once per pass. Using the
+            # position itself as the sample index would skew the draws
+            # toward low-numbered rows as the list shrinks.
+            sampleIndex = dataIndex.pop(randIndex)
+            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
+            error = classLabels[sampleIndex] - h
+            weights = weights + alpha * error * dataMatrix[sampleIndex]
+    return weights
+
+
+# 可视化展示
+def plotBestFit(dataArr, labelMat, weights):
+    """Scatter-plot the samples by class and draw the fitted boundary line.
+
+    Args:
+        dataArr: 2-D array indexed as ``dataArr[i, 1]`` and
+            ``dataArr[i, 2]``; those two columns are plotted as x and y.
+        labelMat: per-sample labels. Samples whose ``int(label)`` equals 1
+            are drawn as red squares, all others as green points.
+        weights: indexable holding w0, w1 and w2 of the fitted model.
+    """
+    sampleCount = shape(dataArr)[0]
+    # (x values, y values) for the label-1 class and for everything else.
+    positives = ([], [])
+    others = ([], [])
+    for row in range(sampleCount):
+        xs, ys = positives if int(labelMat[row]) == 1 else others
+        xs.append(dataArr[row, 1])
+        ys.append(dataArr[row, 2])
+    axes = plt.figure().add_subplot(111)
+    axes.scatter(positives[0], positives[1], s=30, c='red', marker='s')
+    axes.scatter(others[0], others[1], s=30, c='green')
+    x = arange(-3.0, 3.0, 0.1)
+    # Where the line comes from: the model is w0*x0 + w1*x1 + w2*x2 with
+    # x0 fixed at 1.0, x1 on the horizontal axis and x2 on the vertical
+    # axis. The boundary is where that sum is 0:
+    #     w0 + w1*x + w2*y = 0   =>   y = (-w0 - w1*x) / w2
+    y = (-weights[0] - weights[1] * x) / weights[2]
+    axes.plot(x, y)
+    plt.xlabel('X')
+    plt.ylabel('Y')
+    plt.show()
+
+
+def main():
+    """Load the sample set, train a model and plot the fitted boundary."""
+    # 1. Collect and prepare the data.
+    features, labels = loadDataSet("input/05.Logistic/TestSet.txt")
+
+    # 2. Train the model, i.e. find the coefficients of
+    #    f(x) = w0*x0 + w1*x1 + ... + wn*xn.
+    # An ndarray is used rather than the nested list so that multiplying a
+    # row by the weights is an element-wise product.
+    samples = array(features)
+    # gradAscent(samples, labels) and stocGradAscent0(samples, labels) take
+    # the same arguments and can be swapped in here.
+    coefficients = stocGradAscent1(samples, labels)
+
+    # 3. Visualize the data and the boundary.
+    plotBestFit(samples, labels, coefficients)
+
+
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()