更新完 6.SVM支持向量机

2026-02-13 15:26:28 +08:00 · 2017-04-19 23:28:38 +08:00
parent 338cb7ecb1
commit 90154ca374
4 changed files with 238 additions and 129 deletions
--- a/src/python/6.SVM/svm-complete.py
+++ b/src/python/6.SVM/svm-complete.py
@@ -101,7 +101,7 @@ def calcEk(oS, k):
        k   具体的某一行

    Returns:
-
+        Ek  预测结果与真实结果比对，计算误差Ek
    """
    fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
@@ -142,7 +142,7 @@ def selectJ(i, oS, Ei):  # this is the second choice -heurstic, and calcs Ej
    maxK = -1
    maxDeltaE = 0
    Ej = 0
-    # # 首先将输入值Ei在缓存中设置成为有效的。这里的有效意味着它已经计算好了。
+    # 首先将输入值Ei在缓存中设置成为有效的。这里的有效意味着它已经计算好了。
    oS.eCache[i] = [1, Ei]

    # print 'oS.eCache[%s]=%s' % (i, oS.eCache[i])
@@ -515,8 +515,8 @@ if __name__ == "__main__":
    # plotfig_SVM(dataArr, labelArr, ws, b, alphas)

    # # 有核函数的测试
-    # testRbf(1)
+    testRbf(0.8)

    # 项目实战
    # 示例：手写识别问题回顾
-    testDigits(('rbf', 20))
+    # testDigits(('rbf', 20))
--- a/src/python/6.SVM/svm-complete_Non-Kernel.py
+++ b/src/python/6.SVM/svm-complete_Non-Kernel.py
@@ -11,6 +11,18 @@ from numpy import *
 import matplotlib.pyplot as plt


+class optStruct:
+    def __init__(self, dataMatIn, classLabels, C, toler):  # Initialize the structure with the parameters
+        self.X = dataMatIn
+        self.labelMat = classLabels
+        self.C = C
+        self.tol = toler
+        self.m = shape(dataMatIn)[0]
+        self.alphas = mat(zeros((self.m, 1)))
+        self.b = 0
+        self.eCache = mat(zeros((self.m, 2)))  # first column is valid flag
+
+
 def loadDataSet(fileName):
    """loadDataSet（对文件进行逐行解析，从而得到第行的类标签和整个数据矩阵）

@@ -61,6 +73,222 @@ def clipAlpha(aj, H, L):
    return aj


+def calcEk(oS, k):
+    """calcEk（求 Ek误差：预测值-真实值的差）
+
+    该过程在完整版的SMO算法中陪出现次数较多，因此将其单独作为一个方法
+    Args:
+        oS  optStruct对象
+        k   具体的某一行
+
+    Returns:
+        Ek  预测结果与真实结果比对，计算误差Ek
+    """
+    fXk = float(multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k, :].T)) + oS.b
+    Ek = fXk - float(oS.labelMat[k])
+    return Ek
+
+
+def selectJ(i, oS, Ei):  # this is the second choice -heurstic, and calcs Ej
+    """selectJ（返回最优的j和Ej）
+
+    内循环的启发式方法。
+    选择第二个(内循环)alpha的alpha值
+    这里的目标是选择合适的第二个alpha值以保证每次优化中采用最大步长。
+    该函数的误差与第一个alpha值Ei和下标i有关。
+    Args:
+        i   具体的第i一行
+        oS  optStruct对象
+        Ei  预测结果与真实结果比对，计算误差Ei
+
+    Returns:
+        j  随机选出的第j一行
+        Ej 预测结果与真实结果比对，计算误差Ej
+    """
+    maxK = -1
+    maxDeltaE = 0
+    Ej = 0
+    # 首先将输入值Ei在缓存中设置成为有效的。这里的有效意味着它已经计算好了。
+    oS.eCache[i] = [1, Ei]
+
+    # print 'oS.eCache[%s]=%s' % (i, oS.eCache[i])
+    # print 'oS.eCache[:, 0].A=%s' % oS.eCache[:, 0].A.T
+    # """
+    # # 返回非0的：行列值
+    # nonzero(oS.eCache[:, 0].A)= (
+    #     行： array([ 0,  2,  4,  5,  8, 10, 17, 18, 20, 21, 23, 25, 26, 29, 30, 39, 46,52, 54, 55, 62, 69, 70, 76, 79, 82, 94, 97]), 
+    #     列： array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0])
+    # )
+    # """
+    # print 'nonzero(oS.eCache[:, 0].A)=', nonzero(oS.eCache[:, 0].A)
+    # # 取行的list
+    # print 'nonzero(oS.eCache[:, 0].A)[0]=', nonzero(oS.eCache[:, 0].A)[0]
+    # 非零E值的行的list列表，所对应的alpha值
+    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
+    if (len(validEcacheList)) > 1:
+        for k in validEcacheList:  # 在所有的值上进行循环，并选择其中使得改变最大的那个值
+            if k == i:
+                continue  # don't calc for i, waste of time
+
+            # 求 Ek误差：预测值-真实值的差
+            Ek = calcEk(oS, k)
+            deltaE = abs(Ei - Ek)
+            if (deltaE > maxDeltaE):
+                maxK = k
+                maxDeltaE = deltaE
+                Ej = Ek
+        return maxK, Ej
+    else:  # 如果是第一次循环，则随机选择一个alpha值
+        j = selectJrand(i, oS.m)
+
+        # 求 Ek误差：预测值-真实值的差
+        Ej = calcEk(oS, j)
+    return j, Ej
+
+
+def updateEk(oS, k):  # after any alpha has changed update the new value in the cache
+    """updateEk（计算误差值并存入缓存中。）
+
+    在对alpha值进行优化之后会用到这个值。
+    Args:
+        oS  optStruct对象
+        k   某一列的行号
+    """
+
+    # 求 误差：预测值-真实值的差
+    Ek = calcEk(oS, k)
+    oS.eCache[k] = [1, Ek]
+
+
+def innerL(i, oS):
+    """innerL
+    内循环代码
+    Args:
+        i   具体的某一行
+        oS  optStruct对象
+
+    Returns:
+        0   找不到最优的值
+        1   找到了最优的值，并且oS.Cache到缓存中
+    """
+
+    # 求 Ek误差：预测值-真实值的差
+    Ei = calcEk(oS, i)
+
+    # 约束条件 (KKT条件是解决最优化问题的时用到的一种方法。我们这里提到的最优化问题通常是指对于给定的某一函数，求其在指定作用域上的全局最小值。)
+    # 0<=alphas[i]<=C，但由于0和C是边界值，我们无法进行优化，因为需要增加一个alphas和降低一个alphas。
+    # 表示发生错误的概率：labelMat[i]*Ei 如果超出了 toler， 才需要优化。至于正负号，我们考虑绝对值就对了。
+    if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
+        # 选择最大的误差对应的j进行优化。效果更明显
+        j, Ej = selectJ(i, oS, Ei)
+        alphaIold = oS.alphas[i].copy()
+        alphaJold = oS.alphas[j].copy()
+
+        # L和H用于将alphas[j]调整到0-C之间。如果L==H，就不做任何改变，直接return 0
+        if (oS.labelMat[i] != oS.labelMat[j]):
+            L = max(0, oS.alphas[j] - oS.alphas[i])
+            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
+        else:
+            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
+            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
+        if L == H:
+            print("L==H")
+            return 0
+
+        # eta是alphas[j]的最优修改量，如果eta==0，需要退出for循环的当前迭代过程
+        # 如果ETA为0，那么计算新的alphas[j]就比较麻烦了, 为什么呢？ 因为2个值一样。
+        # 2ab <= a^2 + b^2
+        eta = 2.0 * oS.X[i, :] * oS.X[j, :].T - oS.X[i, :] * oS.X[i, :].T - oS.X[j, :] * oS.X[j, :].T
+        if eta >= 0:
+            print("eta>=0")
+            return 0
+
+         # 计算出一个新的alphas[j]值
+        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
+        # 并使用辅助函数，以及L和H对其进行调整
+        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
+        # 更新误差缓存
+        updateEk(oS, j)
+
+        # 检查alpha[j]是否只是轻微的改变，如果是的话，就退出for循环。
+        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
+            print("j not moving enough")
+            return 0
+
+        # 然后alphas[i]和alphas[j]同样进行改变，虽然改变的大小一样，但是改变的方向正好相反
+        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])
+        # 更新误差缓存
+        updateEk(oS, i)
+
+        # 在对alpha[i], alpha[j] 进行优化之后，给这两个alpha值设置一个常数b。
+        # w= Σ[1~n] ai*yi*xi => b = yj Σ[1~n] ai*yi(xi*xj)
+        # 所以：  b1 - b = (y1-y) - Σ[1~n] yi*(a1-a)*(xi*x1)
+        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[i, :].T - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.X[i, :] * oS.X[j, :].T
+        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[j, :].T - oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.X[j, :] * oS.X[j, :].T
+        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
+            oS.b = b1
+        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
+            oS.b = b2
+        else:
+            oS.b = (b1 + b2) / 2.0
+        return 1
+    else:
+        return 0
+
+
+def smoP(dataMatIn, classLabels, C, toler, maxIter):
+    """
+    完整SMO算法外循环，与smoSimple有些类似，但这里的循环退出条件更多一些
+    Args:
+        dataMatIn    数据集
+        classLabels  类别标签
+        C   松弛变量(常量值)，允许有些数据点可以处于分隔面的错误一侧。
+            控制最大化间隔和保证大部分的函数间隔小于1.0这两个目标的权重。
+            可以通过调节该参数达到不同的结果。
+        toler   容错率
+        maxIter 退出前最大的循环次数
+    Returns:
+        b       模型的常量值
+        alphas  拉格朗日乘子
+    """
+
+    # 创建一个 optStruct 对象
+    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler)
+    iter = 0
+    entireSet = True
+    alphaPairsChanged = 0
+
+    # 循环遍历：循环maxIter次 并且 （alphaPairsChanged存在可以改变 or 所有行遍历一遍）
+    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
+        alphaPairsChanged = 0
+
+        #  当entireSet=true or 非边界alpha对没有了；就开始寻找 alpha对，然后决定是否要进行else。
+        if entireSet:
+            # 在数据集上遍历所有可能的alpha
+            for i in range(oS.m):
+                # 是否存在alpha对，存在就+1
+                alphaPairsChanged += innerL(i, oS)
+                print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
+            iter += 1
+
+        # 对已存在 alpha对，选出非边界的alpha值，进行优化。
+        else:
+            # 遍历所有的非边界alpha值，也就是不在边界0或C上的值。
+            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
+            for i in nonBoundIs:
+                alphaPairsChanged += innerL(i, oS)
+                print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
+            iter += 1
+
+        # 如果找到alpha对，就优化非边界alpha值，否则，就重新进行寻找，如果寻找一遍 遍历所有的行还是没找到，就退出循环。
+        if entireSet:
+            entireSet = False  # toggle entire set loop
+        elif (alphaPairsChanged == 0):
+            entireSet = True
+        print("iteration number: %d" % iter)
+    return oS.b, oS.alphas
+
+
 def calcWs(alphas, dataArr, classLabels):
    """
    基于alpha计算w值
@@ -81,126 +309,6 @@ def calcWs(alphas, dataArr, classLabels):
    return w


-'''
-#######********************************
-Non-Kernel VErsions below
-#######********************************
-'''
-
-class optStruct:
-    def __init__(self, dataMatIn, classLabels, C, toler):  # Initialize the structure with the parameters
-        self.X = dataMatIn
-        self.labelMat = classLabels
-        self.C = C
-        self.tol = toler
-        self.m = shape(dataMatIn)[0]
-        self.alphas = mat(zeros((self.m, 1)))
-        self.b = 0
-        self.eCache = mat(zeros((self.m, 2)))  # first column is valid flag
-
-
-def calcEk(oS, k):
-    fXk = float(multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k, :].T)) + oS.b
-    Ek = fXk - float(oS.labelMat[k])
-    return Ek
-
-
-def selectJ(i, oS, Ei):  # this is the second choice -heurstic, and calcs Ej
-    maxK = -1
-    maxDeltaE = 0
-    Ej = 0
-    oS.eCache[i] = [1, Ei]  # set valid #choose the alpha that gives the maximum delta E
-    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
-    if (len(validEcacheList)) > 1:
-        for k in validEcacheList:  # loop through valid Ecache values and find the one that maximizes delta E
-            if k == i: continue  # don't calc for i, waste of time
-            Ek = calcEk(oS, k)
-            deltaE = abs(Ei - Ek)
-            if (deltaE > maxDeltaE):
-                maxK = k
-                maxDeltaE = deltaE
-                Ej = Ek
-        return maxK, Ej
-    else:  # in this case (first time around) we don't have any valid eCache values
-        j = selectJrand(i, oS.m)
-        Ej = calcEk(oS, j)
-    return j, Ej
-
-
-def updateEk(oS, k):  # after any alpha has changed update the new value in the cache
-    Ek = calcEk(oS, k)
-    oS.eCache[k] = [1, Ek]
-
-
-def innerL(i, oS):
-    Ei = calcEk(oS, i)
-    if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or (
-        (oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
-        j, Ej = selectJ(i, oS, Ei)  # this has been changed from selectJrand
-        alphaIold = oS.alphas[i].copy()
-        alphaJold = oS.alphas[j].copy()
-        if (oS.labelMat[i] != oS.labelMat[j]):
-            L = max(0, oS.alphas[j] - oS.alphas[i])
-            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
-        else:
-            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
-            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
-        if L == H:
-            print("L==H")
-            return 0
-        eta = 2.0 * oS.X[i, :] * oS.X[j, :].T - oS.X[i, :] * oS.X[i, :].T - oS.X[j, :] * oS.X[j, :].T
-        if eta >= 0:
-            print("eta>=0")
-            return 0
-        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
-        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
-        updateEk(oS, j)  # added this for the Ecache
-        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
-            print("j not moving enough")
-            return 0
-        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])  # update i by the same amount as j
-        updateEk(oS, i)  # added this for the Ecache                    #the update is in the oppostie direction
-        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[i, :].T - oS.labelMat[j] * (
-        oS.alphas[j] - alphaJold) * oS.X[i, :] * oS.X[j, :].T
-        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[j, :].T - oS.labelMat[j] * (
-        oS.alphas[j] - alphaJold) * oS.X[j, :] * oS.X[j, :].T
-        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
-            oS.b = b1
-        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
-            oS.b = b2
-        else:
-            oS.b = (b1 + b2) / 2.0
-        return 1
-    else:
-        return 0
-
-
-def smoP(dataMatIn, classLabels, C, toler, maxIter):  # full Platt SMO
-    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler)
-    iter = 0
-    entireSet = True
-    alphaPairsChanged = 0
-    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
-        alphaPairsChanged = 0
-        if entireSet:  # go over all
-            for i in range(oS.m):
-                alphaPairsChanged += innerL(i, oS)
-                print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
-            iter += 1
-        else:  # go over non-bound (railed) alphas
-            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
-            for i in nonBoundIs:
-                alphaPairsChanged += innerL(i, oS)
-                print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
-            iter += 1
-        if entireSet:
-            entireSet = False  # toggle entire set loop
-        elif (alphaPairsChanged == 0):
-            entireSet = True
-        print("iteration number: %d" % iter)
-    return oS.b, oS.alphas
-
-
 def plotfig_SVM(xArr, yArr, ws, b, alphas):
    """
    参考地址：
--- a/src/python/6.SVM/svm-simple.py
+++ b/src/python/6.SVM/svm-simple.py
@@ -145,7 +145,7 @@ def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
                # 然后alphas[i]和alphas[j]同样进行改变，虽然改变的大小一样，但是改变的方向正好相反
                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])
                # 在对alpha[i], alpha[j] 进行优化之后，给这两个alpha值设置一个常数b。
-                # w= Σ[1~n] ai*yi*xi => b = yi- Σ[1~n] ai*yi(xi*xj)
+                # w= Σ[1~n] ai*yi*xi => b = yj- Σ[1~n] ai*yi(xi*xj)
                # 所以：  b1 - b = (y1-y) - Σ[1~n] yi*(a1-a)*(xi*x1)
                b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
                b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
@@ -158,6 +158,7 @@ def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
                alphaPairsChanged += 1
                print("iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
        # 在for循环外，检查alpha值是否做了更新，如果在更新则将iter设为0后继续运行程序
+        # 知道更新完毕后，iter次循环无变化，才推出循环。
        if (alphaPairsChanged == 0):
            iter += 1
        else: