diff --git a/src/python/16.RecommenderSystems/itemcf.py b/src/python/16.RecommenderSystems/RS-itemcf.py
similarity index 95%
rename from src/python/16.RecommenderSystems/itemcf.py
rename to src/python/16.RecommenderSystems/RS-itemcf.py
index eed6ed30..46d55d38 100644
--- a/src/python/16.RecommenderSystems/itemcf.py
+++ b/src/python/16.RecommenderSystems/RS-itemcf.py
@@ -65,7 +65,8 @@ class ItemBasedCF():
 
         for line in self.loadfile(filename):
             # user ID, movie name, rating, timestamp
-            user, movie, rating, _ = line.split('::')
+            # user, movie, rating, _ = line.split('::')
+            user, movie, rating, _ = line.split('\t')
 
             # compare random() against pivot, then initialize the user and the corresponding values
             if (random.random() < pivot):
@@ -203,7 +204,8 @@ class ItemBasedCF():
 
 
 if __name__ == '__main__':
-    ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
+    # ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
+    ratingfile = 'input/16.RecommenderSystems/ml-100k/u.data'
 
     # create the ItemCF object
     itemcf = ItemBasedCF()
@@ -212,4 +214,8 @@ if __name__ == '__main__':
     # compute the similarity between movies
     itemcf.calc_movie_sim()
     # evaluate the recommendation quality
-    itemcf.evaluate()
+    # itemcf.evaluate()
+    # inspect the recommendations for one user
+    user = "2"
+    print "recommendations", itemcf.recommend(user)
+    print "---", itemcf.testset.get(user, {})
diff --git a/src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py b/src/python/16.RecommenderSystems/RS-sklearn-rating.py
similarity index 54%
rename from src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py
rename to src/python/16.RecommenderSystems/RS-sklearn-rating.py
index 5a5bf4c7..f20a6784 100644
--- a/src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py
+++ b/src/python/16.RecommenderSystems/RS-sklearn-rating.py
@@ -1,7 +1,9 @@
 #!/usr/bin/python
 # coding:utf8
 
-from math import sqrt
+import sys
+import math
+from operator import itemgetter
 
 import numpy as np
 import pandas as pd
@@ -36,9 +38,23 @@ def calc_similarity(n_users, n_items, train_data, test_data):
     # use sklearn's pairwise_distances to compute cosine similarity
     print "1:", np.shape(train_data_matrix)    # rows: users, columns: movies
     print "2:", np.shape(train_data_matrix.T)  # rows: movies, columns: users
+
     user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
     item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
-    return train_data_matrix, test_data_matrix, user_similarity, item_similarity
+
+    print >> sys.stderr, 'start counting popular items...'
+    item_popular = {}
+    # count, across all users, how many times each movie appears
+    for i_index in range(n_items):
+        if np.sum(train_data_matrix[:, i_index]) != 0:
+            item_popular[i_index] = np.sum(train_data_matrix[:, i_index]!=0)
+            # print "pop=", i_index, self.item_popular[i_index]
+
+    # save the total number of items
+    item_count = len(item_popular)
+    print >> sys.stderr, 'total number of popular items = %d' % item_count
+
+    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
 
 
 def predict(rating, similarity, type='user'):
@@ -60,7 +76,7 @@ def predict(rating, similarity, type='user'):
         # mean rating + user-user distance (943, 943) * user-movie rating diff (943, 1682) = user-movie matrix (each user's aggregate score for every movie) (943, 1682), then divide by the user's total distance to all other users = final user-movie score
         pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
     elif type == 'item':
-        # aggregate score: user-movie ratings (943, 1682) * movie-movie distance 1682, 1682) = user-movie matrix (each movie's aggregate score for the same movie) (943, 1682), then divide by the movie's total distance to all other movies = final user-movie score
+        # aggregate score: user-movie ratings (943, 1682) * movie-movie distance (1682, 1682) = user-movie matrix (each movie's aggregate score for the same movie) (943, 1682), then divide by the movie's total distance to all other movies = final user-movie score
         pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
     return pred
 
@@ -68,11 +84,51 @@ def predict(rating, similarity, type='user'):
 def rmse(prediction, ground_truth):
     prediction = prediction[ground_truth.nonzero()].flatten()
     ground_truth = ground_truth[ground_truth.nonzero()].flatten()
-    return sqrt(mean_squared_error(prediction, ground_truth))
+    return math.sqrt(mean_squared_error(prediction, ground_truth))
+
+
+def evaluate(prediction, item_popular, name):
+    hit = 0
+    rec_count = 0
+    test_count = 0
+    popular_sum = 0
+    all_rec_items = set()
+    for u_index in range(n_users):
+        items = np.where(train_data_matrix[u_index, :] == 0)[0]
+        pre_items = sorted(dict(zip(items, prediction[u_index, items])).items(), key=itemgetter(1), reverse=True)[: 20]
+        test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
+
+        # compare the recommended set against the test set
+        for item, w in pre_items:
+            if item in test_items:
+                hit += 1
+            all_rec_items.add(item)
+
+            # accumulate the sum of log(1 + occurrence count) for the recommended movies
+            if item in item_popular:
+                popular_sum += math.log(1 + item_popular[item])
+
+        rec_count += len(pre_items)
+        test_count += len(test_items)
+
+    precision = hit / (1.0 * rec_count)
+    recall = hit / (1.0 * test_count)
+    coverage = len(all_rec_items) / (1.0 * len(item_popular))
+    popularity = popular_sum / (1.0 * rec_count)
+    print >> sys.stderr, '%s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (name, precision, recall, coverage, popularity)
+
+
+def recommend(u_index, prediction):
+    items = np.where(train_data_matrix[u_index, :] == 0)[0]
+    pre_items = sorted(dict(zip(items, prediction[u_index, items])).items(), key=itemgetter(1), reverse=True)[: 10]
+    test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
+
+    print 'original items:', test_items
+    print 'recommended items:', [key for key, value in pre_items]
 
 
 if __name__ == "__main__":
-    # model-based collaborative filtering
+    # memory-based collaborative filtering
     # ...
     # split the dataset
     # http://files.grouplens.org/datasets/movielens/ml-100k.zip
@@ -80,22 +136,37 @@ if __name__ == "__main__":
     df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
 
     # compute the similarity
-    train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data)
+    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(n_users, n_items, train_data, test_data)
 
-    user_prediction = predict(train_data_matrix, user_similarity, type='user')
     item_prediction = predict(train_data_matrix, item_similarity, type='item')
+    user_prediction = predict(train_data_matrix, user_similarity, type='user')
 
     # evaluation: root mean squared error
-    print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
     print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
+    print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
 
     # model-based collaborative filtering
     # ...
-    # compute the sparsity of the MovieLens dataset
+    # compute the sparsity of the MovieLens dataset (n_users and n_items are constants, so less user behaviour data means less information; the sparser the matrix, the more room there is for improvement)
     sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
     print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%'
 
-    u, s, vt = svds(train_data_matrix, k=20)
+    # compute the largest k singular values/vectors of the sparse matrix
+    u, s, vt = svds(train_data_matrix, k=15)
     s_diag_matrix = np.diag(s)
-    x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
-    print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix))
+    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
+    print 'Model based CF RMSE: ' + str(rmse(svd_prediction, test_data_matrix))
+
+    """
+    With the same amount of information, the smaller the matrix, the more reliable the information it carries.
+    Hence user-cf performs better than item-cf; after the SVD, 15 dimensions already recover more than 90% of the effect, so the information is more reliable and the results are better still.
+    item-cf: 1682
+    user-cf: 943
+    svd: 15
+    """
+    evaluate(item_prediction, item_popular, 'item')
+    evaluate(user_prediction, item_popular, 'user')
+    evaluate(svd_prediction, item_popular, 'svd')
+
+    # recommendation results
+    recommend(1, svd_prediction)
diff --git a/src/python/16.RecommenderSystems/usercf.py b/src/python/16.RecommenderSystems/RS-usercf.py
similarity index 97%
rename from src/python/16.RecommenderSystems/usercf.py
rename to src/python/16.RecommenderSystems/RS-usercf.py
index fbe19640..ce84bc3d 100644
--- a/src/python/16.RecommenderSystems/usercf.py
+++ b/src/python/16.RecommenderSystems/RS-usercf.py
@@ -65,7 +65,8 @@ class UserBasedCF():
 
         for line in self.loadfile(filename):
             # user ID, movie name, rating, timestamp
-            user, movie, rating, timestamp = line.split('::')
+            # user, movie, rating, timestamp = line.split('::')
+            user, movie, rating, timestamp = line.split('\t')
 
             # compare random() against pivot, then initialize the user and the corresponding values
             if (random.random() < pivot):
@@ -220,7 +221,8 @@ class UserBasedCF():
 
 
 if __name__ == '__main__':
-    ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
+    # ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
+    ratingfile = 'input/16.RecommenderSystems/ml-100k/u.data'
 
     # create the UserCF object
     usercf = UserBasedCF()
diff --git a/src/python/16.RecommenderSystems/python/Recommender.py b/src/python/16.RecommenderSystems/python/Recommender.py
new file mode 100644
index 00000000..40acbb04
--- /dev/null
+++ b/src/python/16.RecommenderSystems/python/Recommender.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+
+# custom Jaccard similarity coefficient, only valid for 0-1 matrices
+def Jaccard(a, b):
+    return 1.0*(a*b).sum()/(a+b-a*b).sum()
+
+
+class Recommender():
+
+    # similarity matrix
+    sim = None
+
+    # compute the similarity matrix
+    def similarity(self, x, distance):
+        y = np.ones((len(x), len(x)))
+        for i in range(len(x)):
+            for j in range(len(x)):
+                y[i, j] = distance(x[i], x[j])
+        return y
+
+    # training function
+    def fit(self, x, distance=Jaccard):
+        self.sim = self.similarity(x, distance)
+
+    # recommendation function
+    def recommend(self, a):
+        return np.dot(self.sim, a)*(1-a)
diff --git a/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item-test.py b/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item-test.py
new file mode 100644
index 00000000..52a7699a
--- /dev/null
+++ b/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item-test.py
@@ -0,0 +1,185 @@
+#!/usr/bin/python
+# coding:utf8
+
+'''
+Created on 2015-06-22
+Update on 2017-05-16
+@author: Lockvictor/片刻
+Collaborative filtering code from the book 《推荐系统实践》 (Recommender System Practice)
+Reference: https://github.com/Lockvictor/MovieLens-RecSys
+Updated version: https://github.com/apachecn/MachineLearning
+'''
+import math
+import random
+import sys
+from operator import itemgetter
+
+import numpy as np
+import pandas as pd
+from sklearn import cross_validation as cv
+from sklearn.metrics.pairwise import pairwise_distances
+
+print(__doc__)
+# purpose: make the random numbers reproducible
+random.seed(0)
+
+
+class ItemBasedCF():
+    ''' TopN recommendation - ItemBasedCF '''
+    def __init__(self):
+        # the split dataset
+        self.train_mat = {}
+        self.test_mat = {}
+
+        # total number of users and items
+        self.n_users = 0
+        self.n_items = 0
+
+        # n_sim_item: top 20 similar items, n_rec_item: top 10 recommended results
+        self.n_sim_item = 20
+        self.n_rec_item = 10
+
+        # item_mat_similarity: similarity between movies, item_popular: occurrence count of each movie, item_count: total number of movies
+        self.item_mat_similarity = {}
+        self.item_popular = {}
+        self.item_count = 0
+
+        print >> sys.stderr, 'Similar item number = %d' % self.n_sim_item
+        print >> sys.stderr, 'Recommended item number = %d' % self.n_rec_item
+
+    def splitData(self, dataFile, test_size):
+        # load the dataset
+        header = ['user_id', 'item_id', 'rating', 'timestamp']
+        df = pd.read_csv(dataFile, sep='\t', names=header)
+
+        self.n_users = df.user_id.unique().shape[0]
+        self.n_items = df.item_id.unique().shape[0]
+
+        print 'Number of users = ' + str(self.n_users) + ' | Number of items = ' + str(self.n_items)
+
+        # split the dataset: users + movies
+        self.train_data, self.test_data = cv.train_test_split(df, test_size=test_size)
+        print >> sys.stderr, 'train/test split finished successfully'
+        print >> sys.stderr, 'len(train) = %s' % np.shape(self.train_data)[0]
+        print >> sys.stderr, 'len(test) = %s' % np.shape(self.test_data)[0]
+
+    def calc_similarity(self):
+        # build the user-item matrices: one for the training data and one for the test data
+        self.train_mat = np.zeros((self.n_users, self.n_items))
+        for line in self.train_data.itertuples():
+            self.train_mat[int(line.user_id)-1, int(line.item_id)-1] = float(line.rating)
+        self.test_mat = np.zeros((self.n_users, self.n_items))
+        for line in self.test_data.itertuples():
+            # print "line", line.user_id-1, line.item_id-1, line.rating
+            self.test_mat[int(line.user_id)-1, int(line.item_id)-1] = float(line.rating)
+
+        # use sklearn's pairwise_distances to compute cosine similarity
+        print "1:", np.shape(np.mat(self.train_mat).T)  # rows: movies, columns: users
+        # movie-movie distance (1682, 1682)
+        self.item_mat_similarity = pairwise_distances(np.mat(self.train_mat).T, metric='cosine')
+        print >> sys.stderr, 'item_mat_similarity=', np.shape(self.item_mat_similarity)
+
+        print >> sys.stderr, 'start counting popular items...'
+
+        # count, across all users, how many times each movie appears
+        for i_index in range(self.n_items):
+            if np.sum(self.train_mat[:, i_index]) != 0:
+                self.item_popular[i_index] = np.sum(self.train_mat[:, i_index]!=0)
+                # print "pop=", i_index, self.item_popular[i_index]
+
+        # save the total number of items
+        self.item_count = len(self.item_popular)
+        print >> sys.stderr, 'total number of popular items = %d' % self.item_count
+
+    # @profile
+    def recommend(self, u_index):
+        """recommend(find the top K similar movies, rank them by their summed similarity, and return the top N)
+
+        Args:
+            u_index  user_ID - 1 = user index
+        Returns:
+            rec_item  list of recommended movies, sorted by similarity in descending order
+        """
+        ''' Find K similar items and recommend N items. '''
+        K = self.n_sim_item
+        N = self.n_rec_item
+        rank = {}
+        i_items = np.where(self.train_mat[u_index, :] != 0)[0]
+        # print "i_items=", i_items
+        watched_items = dict(zip(i_items, self.train_mat[u_index, i_items]))
+
+        # compute the similarity contribution of the top K movies
+        # rating = movie rating, w = similarity weight between the two movies
+        # profiling: 98.2% of the time is spent on line 154
+        for i_item, rating in watched_items.iteritems():
+            i_other_items = np.where(self.item_mat_similarity[i_item, :] != 0)[0]
+            for related_item, w in sorted(dict(zip(i_other_items, self.item_mat_similarity[i_item, i_other_items])).items(), key=itemgetter(1), reverse=True)[0:K]:
+                if related_item in watched_items:
+                    continue
+                rank.setdefault(related_item, 0)
+                rank[related_item] += w * rating
+
+        # return the N best items
+        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
+
+    def evaluate(self):
+        ''' return precision, recall, coverage and popularity '''
+        print >> sys.stderr, 'Evaluation start...'
+
+        # variables for precision and recall
+        # hit: number of hits (+1 whenever a recommended item also appears in the test set), rec_count: number of recommendations per user, test_count: number of movies in each user's test set
+        hit = 0
+        rec_count = 0
+        test_count = 0
+        # variables for coverage
+        all_rec_items = set()
+        # variables for popularity
+        popular_sum = 0
+
+        # enumerate turns a sequence into an indexed sequence, so index and value can be obtained together
+        # reference: http://blog.csdn.net/churximi/article/details/51648388
+        for u_index in range(50):
+            if u_index > 0 and u_index % 10 == 0:
+                print >> sys.stderr, 'recommended for %d users' % u_index
+            print "u_index", u_index
+
+            # compare the recommended set against the test set
+            rec_items = self.recommend(u_index)
+            print "rec_items=", rec_items
+            for item, w in rec_items:
+                # print 'test_mat[u_index, item]=', item, self.test_mat[u_index, item]
+
+                if self.test_mat[u_index, item] != 0:
+                    hit += 1
+                    print "self.test_mat[%d, %d]=%s" % (u_index, item, self.test_mat[u_index, item])
+                # accumulate the sum of log(1 + occurrence count) for the recommended movies
+                if item in self.item_popular:
+                    popular_sum += math.log(1 + self.item_popular[item])
+
+            rec_count += len(rec_items)
+            test_count += np.sum(self.test_mat[u_index, :] != 0)
+            # print "test_count=", np.sum(self.test_mat[u_index, :] != 0), np.sum(self.train_mat[u_index, :] != 0)
+
+        print("-------", hit, rec_count)
+        precision = hit / (1.0 * rec_count)
+        recall = hit / (1.0 * test_count)
+        coverage = len(all_rec_items) / (1.0 * self.item_count)
+        popularity = popular_sum / (1.0 * rec_count)
+
+        print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity)
+
+
+if __name__ == '__main__':
+    dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
+
+    # create the ItemCF object
+    itemcf = ItemBasedCF()
+    # split the data 7:3 into a training set and a test set, stored in itemcf's train_data and test_data
+    itemcf.splitData(dataFile, test_size=0.3)
+    # compute the similarity between movies
+    itemcf.calc_similarity()
+    # evaluate the recommendation quality
+    # itemcf.evaluate()
+    # inspect the recommendations for one user
+    print "recommendations", itemcf.recommend(u_index=1)
+    print "---", np.where(itemcf.test_mat[1, :] != 0)[0]
diff --git a/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item.py b/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item.py
deleted file mode 100644
index b9285f13..00000000
--- a/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/python
-# coding:utf8
-
-from math import sqrt
-
-import numpy as np
-import pandas as pd
-from scipy.sparse.linalg import svds
-from sklearn import cross_validation as cv
-from sklearn.metrics import mean_squared_error
-from sklearn.metrics.pairwise import pairwise_distances
-
-
-def splitData(dataFile, test_size):
-    # load the dataset
-    header = ['user_id', 'item_id', 'rating', 'timestamp']
-    df = pd.read_csv(dataFile, sep='\t', names=header)
-
-    n_users = df.user_id.unique().shape[0]
-    n_items = df.item_id.unique().shape[0]
-
-    print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
-    train_data, test_data = cv.train_test_split(df, test_size=test_size)
-    return df, n_users, n_items, train_data, test_data
-
-
-def calc_similarity(n_users, n_items, train_data, test_data):
-    # build the user-item matrices: one for the training data and one for the test data
-    train_data_matrix = np.zeros((n_users, n_items))
-    for line in train_data.itertuples():
-        train_data_matrix[line[1]-1, line[2]-1] = line[3]
-    test_data_matrix = np.zeros((n_users, n_items))
-    for line in test_data.itertuples():
-        test_data_matrix[line[1]-1, line[2]-1] = line[3]
-
-    # use sklearn's pairwise_distances to compute cosine similarity
-    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
-    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
-    return train_data_matrix, test_data_matrix, user_similarity, item_similarity
-
-
-def predict(rating, similarity, type='user'):
-    if type == 'user':
-        mean_user_rating = rating.mean(axis=1)
-        rating_diff = (rating - mean_user_rating[:, np.newaxis])
-        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
-    elif type == 'item':
-        pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
-    return pred
-
-
-def rmse(prediction, ground_truth):
-    prediction = prediction[ground_truth.nonzero()].flatten()
-    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
-    return sqrt(mean_squared_error(prediction, ground_truth))
-
-
-if __name__ == "__main__":
-    # model-based collaborative filtering
-    # ...
-    # split the dataset
-    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
-    dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
-    df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
-
-    # compute the similarity
-    train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data)
-
-    user_prediction = predict(train_data_matrix, user_similarity, type='user')
-    item_prediction = predict(train_data_matrix, item_similarity, type='item')
-
-    # evaluation: root mean squared error
-    print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
-    print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
-
-    # model-based collaborative filtering
-    # ...
-    # compute the sparsity of the MovieLens dataset
-    sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
-    print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%'
-
-    u, s, vt = svds(train_data_matrix, k=20)
-    s_diag_matrix = np.diag(s)
-    x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
-    print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix))
diff --git a/src/python/16.RecommenderSystems/test_graph-based.py b/src/python/16.RecommenderSystems/test_graph-based.py
index 2dca9e48..12d22403 100644
--- a/src/python/16.RecommenderSystems/test_graph-based.py
+++ b/src/python/16.RecommenderSystems/test_graph-based.py
@@ -14,4 +14,3 @@ def PersonalRank(G, alpha, root):
                     tmp[j] += 1 - alpha
         rank = tmp
     return rank
-
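
For reference, below is a minimal, self-contained sketch of the two memory-based prediction formulas that RS-sklearn-rating.py applies above (user-based and item-based). It is an illustrative Python 3 example under stated assumptions, not code from this patch: the toy rating matrix, the cosine_similarity helper, and the predict signature are invented for the example, and similarity is computed directly with NumPy instead of via sklearn's pairwise_distances (which returns cosine distances, i.e. 1 - similarity).

import numpy as np


def cosine_similarity(m):
    # row-by-row cosine similarity; rows that are entirely zero get zero similarity
    norm = np.linalg.norm(m, axis=1, keepdims=True)
    norm[norm == 0] = 1.0
    unit = m / norm
    return unit.dot(unit.T)


def predict(rating, similarity, kind='user'):
    if kind == 'user':
        # per-user mean rating plus a similarity-weighted average of the other users'
        # deviations from their own means, normalised by the total similarity mass
        mean_user = rating.mean(axis=1, keepdims=True)
        diff = rating - mean_user
        return mean_user + similarity.dot(diff) / np.abs(similarity).sum(axis=1, keepdims=True)
    # item-based: similarity-weighted average over the item (column) axis
    return rating.dot(similarity) / np.abs(similarity).sum(axis=1)


if __name__ == '__main__':
    rng = np.random.default_rng(0)
    ratings = rng.integers(0, 6, size=(6, 8)).astype(float)  # toy matrix: 6 users x 8 items
    user_pred = predict(ratings, cosine_similarity(ratings), kind='user')
    item_pred = predict(ratings, cosine_similarity(ratings.T), kind='item')
    print(user_pred.shape, item_pred.shape)  # both (6, 8)

The model-based branch in the patch follows the same pattern: svds(train_data_matrix, k=15) followed by np.dot(np.dot(u, np.diag(s)), vt) gives a dense low-rank reconstruction that is scored with the same rmse helper.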