From cdd436d5b8c6dc469c7c51368b53bd1462af8307 Mon Sep 17 00:00:00 2001 From: hello19883 Date: Tue, 21 Mar 2017 21:40:25 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E6=9B=B4=E6=96=B0svdRec.py=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E7=A4=BA=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/14.SVD/svdRec.py | 76 +++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/src/python/14.SVD/svdRec.py b/src/python/14.SVD/svdRec.py index f705e37e..5566816b 100644 --- a/src/python/14.SVD/svdRec.py +++ b/src/python/14.SVD/svdRec.py @@ -1,30 +1,74 @@ -# coding=utf-8 -def loadExData(): - return[[1,1,1,0,0], - [2,2,2,0,0], - [1,1,1,0,0], - [5,5,5,0,0], - [1,1,0,2,2], - [0,0,0,3,3], - [0,0,0,1,1]] +#!/usr/bin/python +# encoding: utf-8 -from numpy import * +from numpy import * from numpy import linalg as la + + +def loadExData(): + return[[1, 1, 1, 0, 0], + [2, 2, 2, 0, 0], + [1, 1, 1, 0, 0], + [5, 5, 5, 0, 0], + [1, 1, 0, 2, 2], + [0, 0, 0, 3, 3], + [0, 0, 0, 1, 1]] + + # 欧氏距离相似度,假定inA和inB 都是列向量 # 计算向量的第二范式,相当于计算了欧氏距离 -def ecludSim(inA,inB): +def ecludSim(inA, inB): return 1.0/(1.0 + la.norm(inA - inB)) + # pearsSim()函数会检查是否存在3个或更多的点。 # corrcoef直接计算皮尔逊相关系数 -def pearsSim(inA,inB): +def pearsSim(inA, inB): # 如果不存在,该函数返回1.0,此时两个向量完全相关。 - if len(inA)< 3 :return 1.0 - return 0.5 + 0.5*corrcoef(inA,inB,rowvar = 0)[0][1] + if len(inA) < 3: + return 1.0 + return 0.5 + 0.5*corrcoef(inA, inB, rowvar=0)[0][1] + # 计算余弦相似度 -def cosSim(inA,inB): +def cosSim(inA, inB): num = float(inA.T*inB) denom = la.norm(inA)*la.norm(inB) - return 0.5 +0.5*(num/denom) + return 0.5 + 0.5*(num/denom) + +# 基于物品相似度的推荐引擎 +# standEst()函数,用来计算在给定相似度计算方法的条件下,用户对物品的估计评分值。 + # standEst()函数的参数包括数据矩阵、用户编号、物品编号和相似度计算方法 +def standEst(dataMat, user, simMeas, item): + n = shape(dataMat)[1] + simTotal = 0.0 + ratSimTotal = 0.0 + for j in range(n): + userRating = dataMat[user, j] + if userRating == 0: + continue + # 寻找两个用户都评级的物品 + overLap = nonzero(logical_and(dataMat[:, item].A>0, dataMat[:, j].A>0))[0] + if len(overLap) == 0:similarity =0 + else: similarity = simMeas(dataMat[overLap,item], \ + dataMat[overLap,j]) + #print 'the %d and %d similarity is : %f'(iten,j,similarity) + simTotal += similarity + ratSimTotal += similarity * userRating + if simTotal == 0: return 0 + else: return ratSimTotal/simTotal + + +#recommend()函数,就是推荐引擎,它会调用standEst()函数。 +def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst): + # 寻找未评级的物品 + unratedItems = nonzero(dataMat[user, :].A == 0)[1] + if len(unratedItems) == 0: + return 'you rated everything' + itemScores = [] + for item in unratedItems: + estimatedScore = estMethod(dataMat, user, simMeas, item) + # 寻找前N个未评级物品 + itemScores.append((item, estimatedScore)) + return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N] From 92336d8212d8cc0a356254c57ca02a41e84bcd2d Mon Sep 17 00:00:00 2001 From: geekidentity Date: Tue, 21 Mar 2017 23:18:35 +0800 Subject: [PATCH 2/3] SVM smoSimple --- src/python/06.SVM/svmMLiA.py | 118 +++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 src/python/06.SVM/svmMLiA.py diff --git a/src/python/06.SVM/svmMLiA.py b/src/python/06.SVM/svmMLiA.py new file mode 100644 index 00000000..8cc51a4a --- /dev/null +++ b/src/python/06.SVM/svmMLiA.py @@ -0,0 +1,118 @@ +""" +Created on Nov 4, 2010 +Update on 2017-03-21 +Chapter 5 source file for Machine Learing in Action +@author: Peter/geekidentity +""" +from numpy import * +from time import sleep + +def loadDataSet(fileName): + """ + 对文件进行逐行解析,从而得到第行的类标签和整个数据矩阵 + Args: + fileName: testSet.txt + + Returns: + 数据矩阵, 类标签 + """ + dataMat = []; labelMat = [] + fr = open(fileName) + for line in fr.readlines(): + lineArr = line.strip().split('\t') + dataMat.append([float(lineArr[0]), float(lineArr[1])]) + labelMat.append(float(lineArr[2])) + return dataMat,labelMat + +def selectJrand(i,m): + """ + 随机选择一个整数 + Args: + i: 第一个alpha的下标 + m: 所有alpha的数目 + + Returns: + + """ + j=i #we want to select any J not equal to i + while (j==i): + j = int(random.uniform(0,m)) + return j + +def clipAlpha(aj,H,L): + """ + 用于调整大于H或小于L的alpha值 + Args: + aj: + H: + L: + + Returns: + + """ + if aj > H: + aj = H + if L > aj: + aj = L + return aj + +def smoSimple(dataMatIn, classLabels, C, toler, maxIter): + """ + SVM SMO算法的简单实现: + 创建一个alpha向量并将其初始化为0向量 + 当迭代次数据小于最大迭代次数时(外循环) + 对数据集中的每个数据向量(内循环): + 如果该数据向量可以被优化: + 随机选择另外一个数据向量 + 同时优化这两个向量 + 如果两个向量都不能被优化,退出内循环 + 如果所有向量都没有被优化,增加迭代数目,继续下一次循环 + Args: + dataMatIn: 数据集 + classLabels: 类别标签 + C: 常数C + toler: 容错率 + maxIter: 退出前最大的循环次数 + + Returns: + + """ + dataMatrix = mat(dataMatIn); labelMat = mat(classLabels).transpose() + b = 0; m,n = shape(dataMatrix) + alphas = mat(zeros((m,1))) + iter = 0 + while (iter < maxIter): + alphaPairsChanged = 0 + for i in range(m): + fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b + Ei = fXi - float(labelMat[i])#if checks if an example violates KKT conditions + if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)): + j = selectJrand(i,m) + fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b + Ej = fXj - float(labelMat[j]) + alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy() + if (labelMat[i] != labelMat[j]): + L = max(0, alphas[j] - alphas[i]) + H = min(C, C + alphas[j] - alphas[i]) + else: + L = max(0, alphas[j] + alphas[i] - C) + H = min(C, alphas[j] + alphas[i]) + if L==H: print("L==H"); continue + eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T + if eta >= 0: print("eta>=0"); continue + alphas[j] -= labelMat[j]*(Ei - Ej)/eta + alphas[j] = clipAlpha(alphas[j],H,L) + if (abs(alphas[j] - alphaJold) < 0.00001): print("j not moving enough"); continue + alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])#update i by the same amount as j + #the update is in the oppostie direction + b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T + b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T + if (0 < alphas[i]) and (C > alphas[i]): b = b1 + elif (0 < alphas[j]) and (C > alphas[j]): b = b2 + else: b = (b1 + b2)/2.0 + alphaPairsChanged += 1 + print("iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged)) + if (alphaPairsChanged == 0): iter += 1 + else: iter = 0 + print("iteration number: %d" % iter) + return b,alphas \ No newline at end of file From 2af181545b74e6e90d7661d4bcc05bbfcb282d66 Mon Sep 17 00:00:00 2001 From: hello19883 Date: Thu, 23 Mar 2017 19:27:53 +0800 Subject: [PATCH 3/3] =?UTF-8?q?14=E7=AB=A0=E5=9F=BA=E4=BA=8ESVD=E7=9A=84?= =?UTF-8?q?=E8=AF=84=E5=88=86=E4=BC=B0=E8=AE=A1=E7=9A=84=E4=B8=A4=E4=B8=AA?= =?UTF-8?q?=E7=A4=BA=E4=BE=8B=E5=B7=B2=E7=BB=8F=E6=B5=8B=E8=AF=95=E5=AE=8C?= =?UTF-8?q?=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/14.SVD/svdRec.py | 95 ++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/src/python/14.SVD/svdRec.py b/src/python/14.SVD/svdRec.py index 5566816b..1cb6b482 100644 --- a/src/python/14.SVD/svdRec.py +++ b/src/python/14.SVD/svdRec.py @@ -6,6 +6,29 @@ from numpy import linalg as la def loadExData(): + # 利用SVD提高推荐效果,菜肴矩阵 + return[[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5], + [0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0], + [3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0], + [5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0], + [4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5], + [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4], + [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0], + [0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0], + [1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]] +""" + # 推荐引擎示例矩阵 + return[[4, 4, 0, 2, 2], + [4, 0, 0, 3, 3], + [4, 0, 0, 1, 1], + [1, 1, 1, 2, 0], + [2, 2, 2, 0, 0], + [1, 1, 1, 0, 0], + [5, 5, 5, 0, 0]] + + 原矩阵 return[[1, 1, 1, 0, 0], [2, 2, 2, 0, 0], [1, 1, 1, 0, 0], @@ -13,6 +36,7 @@ def loadExData(): [1, 1, 0, 2, 2], [0, 0, 0, 3, 3], [0, 0, 0, 1, 1]] +""" # 欧氏距离相似度,假定inA和inB 都是列向量 @@ -39,36 +63,93 @@ def cosSim(inA, inB): # 基于物品相似度的推荐引擎 # standEst()函数,用来计算在给定相似度计算方法的条件下,用户对物品的估计评分值。 - # standEst()函数的参数包括数据矩阵、用户编号、物品编号和相似度计算方法 +# standEst()函数的参数包括数据矩阵、用户编号、物品编号和相似度计算方法 def standEst(dataMat, user, simMeas, item): + # 得到数据集中的物品数目 n = shape(dataMat)[1] + # 初始化两个评分值 simTotal = 0.0 ratSimTotal = 0.0 + # 遍历行中的每个物品(对用户评过分的物品进行遍历,并将它与其他物品进行比较) for j in range(n): userRating = dataMat[user, j] + # 如果某个物品的评分值为0,则跳过这个物品 if userRating == 0: continue # 寻找两个用户都评级的物品 + # 变量overLap 给出的是两个物品当中已经被评分的那个元素 overLap = nonzero(logical_and(dataMat[:, item].A>0, dataMat[:, j].A>0))[0] + # 如果相似度为0,则两着没有任何重合元素,终止本次循环 if len(overLap) == 0:similarity =0 + # 如果存在重合的物品,则基于这些重合物重新计算相似度。 else: similarity = simMeas(dataMat[overLap,item], \ dataMat[overLap,j]) - #print 'the %d and %d similarity is : %f'(iten,j,similarity) + # print 'the %d and %d similarity is : %f'(iten,j,similarity) + # 相似度会不断累加,每次计算时还考虑相似度和当前用户评分的乘积 + # similarity 用户相似度, userRating 用户评分 simTotal += similarity ratSimTotal += similarity * userRating - if simTotal == 0: return 0 - else: return ratSimTotal/simTotal + if simTotal == 0: + return 0 + # 通过除以所有的评分总和,对上述相似度评分的乘积进行归一化,使得最后评分在0~5之间,这些评分用来对预测值进行排序 + else: + return ratSimTotal/simTotal -#recommend()函数,就是推荐引擎,它会调用standEst()函数。 +# recommend()函数,就是推荐引擎,它会调用standEst()函数,产生了最高的N个推荐结果。 +# 如果不指定N的大小,则默认值为3。该函数另外的参数还包括相似度计算方法和估计方法 def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst): # 寻找未评级的物品 + # 对给定的用户建立一个未评分的物品列表 unratedItems = nonzero(dataMat[user, :].A == 0)[1] + # 如果不存在未评分物品,那么就退出函数 if len(unratedItems) == 0: return 'you rated everything' + # 在所有的未评分物品上进行循环 itemScores = [] for item in unratedItems: estimatedScore = estMethod(dataMat, user, simMeas, item) - # 寻找前N个未评级物品 + # 寻找前N个未评级物品,调用standEst()来产生该物品的预测得分,该物品的编号和估计值会放在一个元素列表itemScores中 itemScores.append((item, estimatedScore)) - return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N] + # 按照估计得分,对该列表进行排序并返回。列表逆排序,第一个值就是最大值 + return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N] + + +# 基于SVD的评分估计 +# 在recommend() 中,这个函数用于替换对standEst()的调用,该函数对给定用户给定物品构建了一个评分估计值 +def svdEst(dataMat, user, simMeas, item): + n = shape(dataMat)[1] + # 对数据集进行SVD分解 + simTotal = 0.0 + ratSimTotal = 0.0 + # 在SVD分解之后,我们只利用包含了90%能量值的奇异值,这些奇异值会以NumPy数组的形式得以保存 + U, Sigma, VT = la.svd(dataMat) + # 如果要进行矩阵运算,就必须要用这些奇异值构建出一个对角矩阵 + Sig4 = mat(eye(4) * Sigma[: 4]) + # 利用U矩阵将物品转换到低维空间中,构建转换后的物品 + xformedItems = dataMat.T * U[:, :4] * Sig4.I + # 对于给定的用户,for循环在用户对应行的元素上进行遍历, + # 这和standEst()函数中的for循环的目的一样,只不过这里的相似度计算时在低维空间下进行的。 + for j in range(n): + userRating = dataMat[user, j] + if userRating == 0 or j == item: + continue + # 相似度的计算方法也会作为一个参数传递给该函数 + similarity = simMeas(xformedItems[item, :].T,xformedItems[j, :].T) + # for 循环中加入了一条print语句,以便了解相似度计算的进展情况。如果觉得累赘,可以去掉 + print 'the %d and %d similarity is: %f' % (item, j, similarity) + # 对相似度求和 + simTotal += similarity + # 对相似度及对应评分值的乘积求和 + ratSimTotal += similarity * userRating + if simTotal == 0: + return 0 + else: + # 计算估计评分 + return ratSimTotal/simTotal + + +if __name__ == "__main__": + myMat = mat(loadExData()) + print myMat + print recommend(myMat, 1, estMethod=svdEst)