diff --git a/src/python/16.RecommenderSystems/itemcf.py b/src/python/16.RecommenderSystems/itemcf.py index 1ba67b95..eed6ed30 100644 --- a/src/python/16.RecommenderSystems/itemcf.py +++ b/src/python/16.RecommenderSystems/itemcf.py @@ -89,6 +89,7 @@ class ItemBasedCF(): print >> sys.stderr, 'counting movies number and popularity...' + # 统计在所有的用户中,不同电影的总出现次数 for user, movies in self.trainset.iteritems(): for movie in movies: # count item popularity @@ -175,6 +176,8 @@ class ItemBasedCF(): # varables for popularity popular_sum = 0 + # enumerate将其组成一个索引序列,利用它可以同时获得索引和值 + # 参考地址:http://blog.csdn.net/churximi/article/details/51648388 for i, user in enumerate(self.trainset): if i > 0 and i % 500 == 0: print >> sys.stderr, 'recommended for %d users' % i diff --git a/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item.py b/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item.py new file mode 100644 index 00000000..b9285f13 --- /dev/null +++ b/src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item.py @@ -0,0 +1,85 @@ +#!/usr/bin/python +# coding:utf8 + +from math import sqrt + +import numpy as np +import pandas as pd +from scipy.sparse.linalg import svds +from sklearn import cross_validation as cv +from sklearn.metrics import mean_squared_error +from sklearn.metrics.pairwise import pairwise_distances + + +def splitData(dataFile, test_size): + # 加载数据集 + header = ['user_id', 'item_id', 'rating', 'timestamp'] + df = pd.read_csv(dataFile, sep='\t', names=header) + + n_users = df.user_id.unique().shape[0] + n_items = df.item_id.unique().shape[0] + + print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items) + train_data, test_data = cv.train_test_split(df, test_size=test_size) + return df, n_users, n_items, train_data, test_data + + +def calc_similarity(n_users, n_items, train_data, test_data): + # 创建用户产品矩阵,针对测试数据和训练数据,创建两个矩阵: + train_data_matrix = np.zeros((n_users, n_items)) + for line in train_data.itertuples(): + train_data_matrix[line[1]-1, line[2]-1] = line[3] + test_data_matrix = np.zeros((n_users, n_items)) + for line in test_data.itertuples(): + test_data_matrix[line[1]-1, line[2]-1] = line[3] + + # 使用sklearn的pairwise_distances函数来计算余弦相似性。 + user_similarity = pairwise_distances(train_data_matrix, metric="cosine") + item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine") + return train_data_matrix, test_data_matrix, user_similarity, item_similarity + + +def predict(rating, similarity, type='user'): + if type == 'user': + mean_user_rating = rating.mean(axis=1) + rating_diff = (rating - mean_user_rating[:, np.newaxis]) + pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T + elif type == 'item': + pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)]) + return pred + + +def rmse(prediction, ground_truth): + prediction = prediction[ground_truth.nonzero()].flatten() + ground_truth = ground_truth[ground_truth.nonzero()].flatten() + return sqrt(mean_squared_error(prediction, ground_truth)) + + +if __name__ == "__main__": + # 基于模型的协同过滤 + # ... + # 拆分数据集 + # http://files.grouplens.org/datasets/movielens/ml-100k.zip + dataFile = 'input/16.RecommenderSystems/ml-100k/u.data' + df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25) + + # 计算相似度 + train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data) + + user_prediction = predict(train_data_matrix, user_similarity, type='user') + item_prediction = predict(train_data_matrix, item_similarity, type='item') + + # 评估:均方根误差 + print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)) + print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)) + + # 基于模型的协同过滤 + # ... + # 计算MovieLens数据集的稀疏度 + sparsity = round(1.0 - len(df)/float(n_users*n_items), 3) + print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%' + + u, s, vt = svds(train_data_matrix, k=20) + s_diag_matrix = np.diag(s) + x_pred = np.dot(np.dot(u, s_diag_matrix), vt) + print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix)) diff --git a/src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py b/src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py index f060d076..5a5bf4c7 100644 --- a/src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py +++ b/src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py @@ -10,59 +10,92 @@ from sklearn import cross_validation as cv from sklearn.metrics import mean_squared_error from sklearn.metrics.pairwise import pairwise_distances -# 加载数据集 -header = ['user_id', 'item_id', 'rating', 'timestamp'] -# http://files.grouplens.org/datasets/movielens/ml-100k.zip -dataFile = 'input/16.RecommenderSystems/ml-100k/u.data' -df = pd.read_csv(dataFile, sep='\t', names=header) -n_users = df.user_id.unique().shape[0] -n_items = df.item_id.unique().shape[0] -print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items) +def splitData(dataFile, test_size): + # 加载数据集 + header = ['user_id', 'item_id', 'rating', 'timestamp'] + df = pd.read_csv(dataFile, sep='\t', names=header) -# 拆分数据集 -train_data, test_data = cv.train_test_split(df, test_size=0.25) + n_users = df.user_id.unique().shape[0] + n_items = df.item_id.unique().shape[0] -# 创建用户产品矩阵,针对测试数据和训练数据,创建两个矩阵: -train_data_matrix = np.zeros((n_users, n_items)) -for line in train_data.itertuples(): - train_data_matrix[line[1]-1, line[2]-1] = line[3] -test_data_matrix = np.zeros((n_users, n_items)) -for line in test_data.itertuples(): - test_data_matrix[line[1]-1, line[2]-1] = line[3] -# 使用sklearn的pairwise_distances函数来计算余弦相似性。 -user_similarity = pairwise_distances(train_data_matrix, metric="cosine") -item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine") + print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items) + train_data, test_data = cv.train_test_split(df, test_size=test_size) + return df, n_users, n_items, train_data, test_data + + +def calc_similarity(n_users, n_items, train_data, test_data): + # 创建用户产品矩阵,针对测试数据和训练数据,创建两个矩阵: + train_data_matrix = np.zeros((n_users, n_items)) + for line in train_data.itertuples(): + train_data_matrix[line[1]-1, line[2]-1] = line[3] + test_data_matrix = np.zeros((n_users, n_items)) + for line in test_data.itertuples(): + test_data_matrix[line[1]-1, line[2]-1] = line[3] + + # 使用sklearn的pairwise_distances函数来计算余弦相似性。 + print "1:", np.shape(train_data_matrix) # 行:人,列:电影 + print "2:", np.shape(train_data_matrix.T) # 行:电影,列:人 + user_similarity = pairwise_distances(train_data_matrix, metric="cosine") + item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine") + return train_data_matrix, test_data_matrix, user_similarity, item_similarity def predict(rating, similarity, type='user'): + print type + print "rating=", np.shape(rating) + print "similarity=", np.shape(similarity) if type == 'user': + # 求出每一个用户,所有电影的综合评分(axis=0 表示对列操作, 1表示对行操作) + # print "rating=", np.shape(rating) mean_user_rating = rating.mean(axis=1) + # np.newaxis参考地址: http://blog.csdn.net/xtingjie/article/details/72510834 + # print "mean_user_rating=", np.shape(mean_user_rating) + # print "mean_user_rating.newaxis=", np.shape(mean_user_rating[:, np.newaxis]) rating_diff = (rating - mean_user_rating[:, np.newaxis]) + # print "rating=", rating[:3, :3] + # print "mean_user_rating[:, np.newaxis]=", mean_user_rating[:, np.newaxis][:3, :3] + # print "rating_diff=", rating_diff[:3, :3] + + # 均分 + 人-人-距离(943, 943)*人-电影-评分diff(943, 1682)=结果-人-电影(每个人对同一电影的综合得分)(943, 1682) 再除以 个人与其他人总的距离 = 人-电影综合得分 pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T elif type == 'item': - pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) + # 综合打分: 人-电影-评分(943, 1682)*电影-电影-距离1682, 1682)=结果-人-电影(各个电影对同一电影的综合得分)(943, 1682) / 再除以 电影与其他电影总的距离 = 人-电影综合得分 + pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)]) return pred -user_prediction = predict(train_data_matrix, user_similarity, type='user') -item_prediction = predict(train_data_matrix, item_similarity, type='item') - - def rmse(prediction, ground_truth): prediction = prediction[ground_truth.nonzero()].flatten() ground_truth = ground_truth[ground_truth.nonzero()].flatten() return sqrt(mean_squared_error(prediction, ground_truth)) -print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)) -print 'Item based CF RMSe: ' + str(rmse(item_prediction, test_data_matrix)) +if __name__ == "__main__": + # 基于模型的协同过滤 + # ... + # 拆分数据集 + # http://files.grouplens.org/datasets/movielens/ml-100k.zip + dataFile = 'input/16.RecommenderSystems/ml-100k/u.data' + df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25) -sparsity = round(1.0 - len(df)/float(n_users*n_items), 3) -print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%' + # 计算相似度 + train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data) + user_prediction = predict(train_data_matrix, user_similarity, type='user') + item_prediction = predict(train_data_matrix, item_similarity, type='item') -u, s, vt = svds(train_data_matrix, k=20) -s_diag_matrix = np.diag(s) -x_pred = np.dot(np.dot(u, s_diag_matrix), vt) -print 'User-based CF MSE: ' + str(rmse(x_pred, test_data_matrix)) + # 评估:均方根误差 + print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)) + print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)) + + # 基于模型的协同过滤 + # ... + # 计算MovieLens数据集的稀疏度 + sparsity = round(1.0 - len(df)/float(n_users*n_items), 3) + print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%' + + u, s, vt = svds(train_data_matrix, k=20) + s_diag_matrix = np.diag(s) + x_pred = np.dot(np.dot(u, s_diag_matrix), vt) + print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix)) diff --git a/src/python/16.RecommenderSystems/usercf.py b/src/python/16.RecommenderSystems/usercf.py index fc528292..fbe19640 100644 --- a/src/python/16.RecommenderSystems/usercf.py +++ b/src/python/16.RecommenderSystems/usercf.py @@ -92,6 +92,8 @@ class UserBasedCF(): print >> sys.stderr, 'building movie-users inverse table...' movie2users = dict() + # 同一个电影中,收集用户的集合 + # 统计在所有的用户中,不同电影的总出现次数 for user, movies in self.trainset.iteritems(): for movie in movies: # inverse table for item-users @@ -155,16 +157,24 @@ class UserBasedCF(): watched_movies = self.trainset[user] # 计算top K 用户的相似度 - # v=similar user, wuv=不同用户同时出现的次数 + # v=similar user, wuv=不同用户同时出现的次数,根据wuv倒序从大到小选出K个用户进行排列 # 耗时分析:50.4%的时间在 line-160行 for v, wuv in sorted(self.user_sim_mat[user].items(), key=itemgetter(1), reverse=True)[0:K]: - for movie in self.trainset[v]: + for movie, rating in self.trainset[v].iteritems(): if movie in watched_movies: continue # predict the user's "interest" for each movie rank.setdefault(movie, 0) - rank[movie] += wuv + rank[movie] += wuv * rating # return the N best movies + + """ + wuv + precision=0.3766 recall=0.0759 coverage=0.3183 popularity=6.9194 + + wuv * rating + precision=0.3865 recall=0.0779 coverage=0.2681 popularity=7.0116 + """ return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N] def evaluate(self): @@ -183,6 +193,8 @@ class UserBasedCF(): # varables for popularity popular_sum = 0 + # enumerate将其组成一个索引序列,利用它可以同时获得索引和值 + # 参考地址:http://blog.csdn.net/churximi/article/details/51648388 for i, user in enumerate(self.trainset): if i > 0 and i % 500 == 0: print >> sys.stderr, 'recommended for %d users' % i