添加了SVM无核函数的，注释参考有有核函数的

2026-06-16 07:16:36 +08:00 · 2017-04-18 16:50:38 +08:00
parent fa6fb45082
commit c413c5f626
4 changed files with 592 additions and 12 deletions
--- a/src/python/6.SVM/svm-complete.py
+++ b/src/python/6.SVM/svm-complete.py
@@ -123,7 +123,7 @@ def selectJrand(i, m):


 def selectJ(i, oS, Ei):  # this is the second choice -heurstic, and calcs Ej
-    """selectJ（）
+    """selectJ（返回最优的j和Ej）

    内循环的启发式方法。
    选择第二个(内循环)alpha的alpha值
@@ -187,9 +187,6 @@ def updateEk(oS, k):
    Args:
        oS  optStruct对象
        k   某一列的行号
-
-    Returns:
-
    """

    # 求 误差：预测值-真实值的差
@@ -214,14 +211,15 @@ def clipAlpha(aj, H, L):


 def innerL(i, oS):
-    """
+    """innerL
    内循环代码
    Args:
        i   具体的某一行
        oS  optStruct对象

    Returns:
-
+        0   找不到最优的值
+        1   找到了最优的值，并且oS.Cache到缓存中
    """

    # 求 Ek误差：预测值-真实值的差
@@ -311,20 +309,29 @@ def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    entireSet = True
    alphaPairsChanged = 0

-    # 循环遍历：
+    # 循环遍历：循环maxIter次 并且 （alphaPairsChanged存在可以改变 or 所有行遍历一遍）
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
-        if entireSet:  # 在数据集上遍历所有可能的alpha
+
+        #  当entireSet=true or 非边界alpha对没有了；就开始寻找 alpha对，然后决定是否要进行else。
+        if entireSet:
+            # 在数据集上遍历所有可能的alpha
            for i in range(oS.m):
+                # 是否存在alpha对，存在就+1
                alphaPairsChanged += innerL(i, oS)
                print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
            iter += 1
-        else:  # 遍历所有的非边界alpha值，也就是不在边界0或C上的值。
+
+        # 对已存在 alpha对，选出非边界的alpha值，进行优化。
+        else:
+            # 遍历所有的非边界alpha值，也就是不在边界0或C上的值。
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i, oS)
                print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
            iter += 1
+
+        # 如果找到alpha对，就优化非边界alpha值，否则，就重新进行寻找，如果寻找一遍 遍历所有的行还是没找到，就退出循环。
        if entireSet:
            entireSet = False  # toggle entire set loop
        elif (alphaPairsChanged == 0):
@@ -438,6 +445,7 @@ if __name__ == "__main__":



+


 def testRbf(k1=1.3):
@@ -548,9 +556,6 @@ def calcEkK(oS, k):
    return Ek


-
-
-
 def selectJK(i, oS, Ei):  # this is the second choice -heurstic, and calcs Ej
    maxK = -1
    maxDeltaE = 0
--- a/src/python/6.SVM/svm-complete_Non-Kernel.py
+++ b/src/python/6.SVM/svm-complete_Non-Kernel.py
@@ -0,0 +1,375 @@
+#!/usr/bin/python
+# coding:utf8
+
+"""
+Created on Nov 4, 2010
+Update on 2017-03-21
+Chapter 5 source file for Machine Learing in Action
+@author: Peter/geekidentity/片刻
+"""
+from numpy import *
+import matplotlib.pyplot as plt
+
+
+def loadDataSet(fileName):
+    """loadDataSet（对文件进行逐行解析，从而得到第行的类标签和整个数据矩阵）
+
+    Args:
+        fileName 文件名
+    Returns:
+        dataMat  数据矩阵
+        labelMat 类标签
+    """
+    dataMat = []
+    labelMat = []
+    fr = open(fileName)
+    for line in fr.readlines():
+        lineArr = line.strip().split('\t')
+        dataMat.append([float(lineArr[0]), float(lineArr[1])])
+        labelMat.append(float(lineArr[2]))
+    return dataMat, labelMat
+
+
+def selectJrand(i, m):
+    """
+    随机选择一个整数
+    Args:
+        i  第一个alpha的下标
+        m  所有alpha的数目
+    Returns:
+        j  返回一个不为i的随机数，在0~m之间的整数值
+    """
+    j = i
+    while j == i:
+        j = int(random.uniform(0, m))
+    return j
+
+
+def clipAlpha(aj, H, L):
+    """clipAlpha(调整aj的值，使aj处于 L<=aj<=H)
+    Args:
+        aj  目标值
+        H   最大值
+        L   最小值
+    Returns:
+        aj  目标值
+    """
+    if aj > H:
+        aj = H
+    if L > aj:
+        aj = L
+    return aj
+
+
+def calcWs(alphas, dataArr, classLabels):
+    """
+    基于alpha计算w值
+    Args:
+        alphas        拉格朗日乘子
+        dataArr       feature数据集
+        classLabels   目标变量数据集
+
+    Returns:
+        wc  回归系数
+    """
+    X = mat(dataArr)
+    labelMat = mat(classLabels).transpose()
+    m, n = shape(X)
+    w = zeros((n, 1))
+    for i in range(m):
+        w += multiply(alphas[i] * labelMat[i], X[i, :].T)
+    return w
+
+
+'''
+#######********************************
+Non-Kernel VErsions below
+#######********************************
+'''
+
+class optStruct:
+    def __init__(self, dataMatIn, classLabels, C, toler):  # Initialize the structure with the parameters
+        self.X = dataMatIn
+        self.labelMat = classLabels
+        self.C = C
+        self.tol = toler
+        self.m = shape(dataMatIn)[0]
+        self.alphas = mat(zeros((self.m, 1)))
+        self.b = 0
+        self.eCache = mat(zeros((self.m, 2)))  # first column is valid flag
+
+
+def calcEk(oS, k):
+    fXk = float(multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k, :].T)) + oS.b
+    Ek = fXk - float(oS.labelMat[k])
+    return Ek
+
+
+def selectJ(i, oS, Ei):  # this is the second choice -heurstic, and calcs Ej
+    maxK = -1
+    maxDeltaE = 0
+    Ej = 0
+    oS.eCache[i] = [1, Ei]  # set valid #choose the alpha that gives the maximum delta E
+    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
+    if (len(validEcacheList)) > 1:
+        for k in validEcacheList:  # loop through valid Ecache values and find the one that maximizes delta E
+            if k == i: continue  # don't calc for i, waste of time
+            Ek = calcEk(oS, k)
+            deltaE = abs(Ei - Ek)
+            if (deltaE > maxDeltaE):
+                maxK = k
+                maxDeltaE = deltaE
+                Ej = Ek
+        return maxK, Ej
+    else:  # in this case (first time around) we don't have any valid eCache values
+        j = selectJrand(i, oS.m)
+        Ej = calcEk(oS, j)
+    return j, Ej
+
+
+def updateEk(oS, k):  # after any alpha has changed update the new value in the cache
+    Ek = calcEk(oS, k)
+    oS.eCache[k] = [1, Ek]
+
+
+def innerL(i, oS):
+    Ei = calcEk(oS, i)
+    if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or (
+        (oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
+        j, Ej = selectJ(i, oS, Ei)  # this has been changed from selectJrand
+        alphaIold = oS.alphas[i].copy()
+        alphaJold = oS.alphas[j].copy()
+        if (oS.labelMat[i] != oS.labelMat[j]):
+            L = max(0, oS.alphas[j] - oS.alphas[i])
+            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
+        else:
+            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
+            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
+        if L == H:
+            print("L==H")
+            return 0
+        eta = 2.0 * oS.X[i, :] * oS.X[j, :].T - oS.X[i, :] * oS.X[i, :].T - oS.X[j, :] * oS.X[j, :].T
+        if eta >= 0:
+            print("eta>=0")
+            return 0
+        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
+        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
+        updateEk(oS, j)  # added this for the Ecache
+        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
+            print("j not moving enough")
+            return 0
+        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])  # update i by the same amount as j
+        updateEk(oS, i)  # added this for the Ecache                    #the update is in the oppostie direction
+        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[i, :].T - oS.labelMat[j] * (
+        oS.alphas[j] - alphaJold) * oS.X[i, :] * oS.X[j, :].T
+        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[j, :].T - oS.labelMat[j] * (
+        oS.alphas[j] - alphaJold) * oS.X[j, :] * oS.X[j, :].T
+        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
+            oS.b = b1
+        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
+            oS.b = b2
+        else:
+            oS.b = (b1 + b2) / 2.0
+        return 1
+    else:
+        return 0
+
+
+def smoP(dataMatIn, classLabels, C, toler, maxIter):  # full Platt SMO
+    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler)
+    iter = 0
+    entireSet = True
+    alphaPairsChanged = 0
+    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
+        alphaPairsChanged = 0
+        if entireSet:  # go over all
+            for i in range(oS.m):
+                alphaPairsChanged += innerL(i, oS)
+                print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
+            iter += 1
+        else:  # go over non-bound (railed) alphas
+            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
+            for i in nonBoundIs:
+                alphaPairsChanged += innerL(i, oS)
+                print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
+            iter += 1
+        if entireSet:
+            entireSet = False  # toggle entire set loop
+        elif (alphaPairsChanged == 0):
+            entireSet = True
+        print("iteration number: %d" % iter)
+    return oS.b, oS.alphas
+
+
+def plotfig_SVM(xArr, yArr, ws, b, alphas):
+    """
+    参考地址：
+       http://blog.csdn.net/maoersong/article/details/24315633
+       http://www.cnblogs.com/JustForCS/p/5283489.html
+       http://blog.csdn.net/kkxgx/article/details/6951959
+    """
+
+    xMat = mat(xArr)
+    yMat = mat(yArr)
+
+    # b原来是矩阵，先转为数组类型后其数组大小为（1,1），所以后面加[0]，变为(1,)
+    b = array(b)[0]
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+
+    # 注意flatten的用法
+    ax.scatter(xMat[:, 0].flatten().A[0], xMat[:, 1].flatten().A[0])
+
+    # x最大值，最小值根据原数据集dataArr[:, 0]的大小而定
+    x = arange(-1.0, 10.0, 0.1)
+
+    # 根据x.w + b = 0 得到，其式子展开为w0.x1 + w1.x2 + b = 0, x2就是y值
+    y = (-b-ws[0, 0]*x)/ws[1, 0]
+    ax.plot(x, y)
+
+    for i in range(shape(yMat[0, :])[1]):
+        if yMat[0, i] > 0:
+            ax.plot(xMat[i, 0], xMat[i, 1], 'cx')
+        else:
+            ax.plot(xMat[i, 0], xMat[i, 1], 'kp')
+
+    # 找到支持向量，并在图中标红
+    for i in range(100):
+        if alphas[i] > 0.0:
+            ax.plot(xMat[i, 0], xMat[i, 1], 'ro')
+    plt.show()
+
+
+if __name__ == "__main__":
+    # 获取特征和目标变量
+    dataArr, labelArr = loadDataSet('input/6.SVM/testSet.txt')
+    # print labelArr
+
+    # b是常量值， alphas是拉格朗日乘子
+    b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)
+    print '/n/n/n'
+    print 'b=', b
+    print 'alphas[alphas>0]=', alphas[alphas > 0]
+    print 'shape(alphas[alphas > 0])=', shape(alphas[alphas > 0])
+    for i in range(100):
+        if alphas[i] > 0:
+            print dataArr[i], labelArr[i]
+    # 画图
+    ws = calcWs(alphas, dataArr, labelArr)
+    plotfig_SVM(dataArr, labelArr, ws, b, alphas)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def testRbf(k1=1.3):
+    dataArr, labelArr = loadDataSet('testSetRBF.txt')
+    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))  # C=200 important
+    datMat = mat(dataArr)
+    labelMat = mat(labelArr).transpose()
+    svInd = nonzero(alphas.A > 0)[0]
+    sVs = datMat[svInd]  # get matrix of only support vectors
+    labelSV = labelMat[svInd]
+    print("there are %d Support Vectors" % shape(sVs)[0])
+    m, n = shape(datMat)
+    errorCount = 0
+    for i in range(m):
+        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
+        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
+        if sign(predict) != sign(labelArr[i]): errorCount += 1
+    print("the training error rate is: %f" % (float(errorCount) / m))
+    dataArr, labelArr = loadDataSet('testSetRBF2.txt')
+    errorCount = 0
+    datMat = mat(dataArr)
+    labelMat = mat(labelArr).transpose()
+    m, n = shape(datMat)
+    for i in range(m):
+        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
+        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
+        if sign(predict) != sign(labelArr[i]): errorCount += 1
+    print("the test error rate is: %f" % (float(errorCount) / m))
+
+
+def img2vector(filename):
+    returnVect = zeros((1, 1024))
+    fr = open(filename)
+    for i in range(32):
+        lineStr = fr.readline()
+        for j in range(32):
+            returnVect[0, 32 * i + j] = int(lineStr[j])
+    return returnVect
+
+
+def loadImages(dirName):
+    from os import listdir
+    hwLabels = []
+    print(dirName)
+    trainingFileList = listdir(dirName)  # load the training set
+    m = len(trainingFileList)
+    trainingMat = zeros((m, 1024))
+    for i in range(m):
+        fileNameStr = trainingFileList[i]
+        fileStr = fileNameStr.split('.')[0]  # take off .txt
+        classNumStr = int(fileStr.split('_')[0])
+        if classNumStr == 9:
+            hwLabels.append(-1)
+        else:
+            hwLabels.append(1)
+        trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
+    return trainingMat, hwLabels
+
+
+def testDigits(kTup=('rbf', 10)):
+    dataArr, labelArr = loadImages('trainingDigits')
+    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
+    datMat = mat(dataArr)
+    labelMat = mat(labelArr).transpose()
+    svInd = nonzero(alphas.A > 0)[0]
+    sVs = datMat[svInd]
+    labelSV = labelMat[svInd]
+    print("there are %d Support Vectors" % shape(sVs)[0])
+    m, n = shape(datMat)
+    errorCount = 0
+    for i in range(m):
+        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
+        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
+        if sign(predict) != sign(labelArr[i]): errorCount += 1
+    print("the training error rate is: %f" % (float(errorCount) / m))
+    dataArr, labelArr = loadImages('testDigits')
+    errorCount = 0
+    datMat = mat(dataArr)
+    labelMat = mat(labelArr).transpose()
+    m, n = shape(datMat)
+    for i in range(m):
+        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
+        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
+        if sign(predict) != sign(labelArr[i]): errorCount += 1
+    print("the test error rate is: %f" % (float(errorCount) / m))
+