From fa72c038ca5e0186a6316c6d01795d36ef67436b Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Tue, 16 May 2017 23:36:51 +0800 Subject: [PATCH] =?UTF-8?q?=E5=9F=BA=E4=BA=8E=E7=94=A8=E6=88=B7=E7=9A=84?= =?UTF-8?q?=E5=8D=8F=E5=90=8C=E8=BF=87=E6=BB=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/16.RecommendedSystem/usercf.py | 131 ++++++++++++++-------- 1 file changed, 85 insertions(+), 46 deletions(-) diff --git a/src/python/16.RecommendedSystem/usercf.py b/src/python/16.RecommendedSystem/usercf.py index 7ab51261..82f5b865 100644 --- a/src/python/16.RecommendedSystem/usercf.py +++ b/src/python/16.RecommendedSystem/usercf.py @@ -1,13 +1,20 @@ -#-*- coding: utf-8 -*- +#!/usr/bin/python +# coding:utf8 + ''' Created on 2015-06-22 - -@author: Lockvictor +Update on 2017-05-16 +@author: Lockvictor/片刻 +《推荐系统实践》协同过滤算法源代码 +参考地址:https://github.com/Lockvictor/MovieLens-RecSys +更新地址:https://github.com/apachecn/MachineLearning ''' -import sys, random, math +import sys +import math +import random from operator import itemgetter - - +print(__doc__) +# 作用:使得随机数据可预测 random.seed(0) @@ -17,59 +24,75 @@ class UserBasedCF(): self.trainset = {} self.testset = {} + # n_sim_user: top 20个用户, n_rec_movie: top 10个推荐结果 self.n_sim_user = 20 self.n_rec_movie = 10 + # user_sim_mat: 用户之间的相似度, movie_popular: 电影的出现次数, movie_count: 总电影数量 self.user_sim_mat = {} self.movie_popular = {} self.movie_count = 0 - print >> sys.stderr, 'Similar user number = %d' % self.n_sim_user + print >> sys.stderr, 'similar user number = %d' % self.n_sim_user print >> sys.stderr, 'recommended movie number = %d' % self.n_rec_movie - @staticmethod def loadfile(filename): - ''' load a file, return a generator. ''' + """loadfile(加载文件,返回一个生成器) + + Args: + filename 文件名 + Returns: + line 行数据,去空格 + """ fp = open(filename, 'r') - for i,line in enumerate(fp): + for i, line in enumerate(fp): yield line.strip('\r\n') - if i%100000 == 0: + if i > 0 and i % 100000 == 0: print >> sys.stderr, 'loading %s(%s)' % (filename, i) fp.close() - print >> sys.stderr, 'load %s succ' % filename - + print >> sys.stderr, 'load %s success' % filename def generate_dataset(self, filename, pivot=0.7): - ''' load rating data and split it to training set and test set ''' + """loadfile(加载文件,将数据集按照7:3 进行随机拆分) + + Args: + filename 文件名 + pivot 拆分比例 + """ trainset_len = 0 testset_len = 0 for line in self.loadfile(filename): + # 用户ID,电影名称,评分,时间戳 user, movie, rating, timestamp = line.split('::') - # split the data by pivot + # 通过pivot和随机函数比较,然后初始化用户和对应的值 if (random.random() < pivot): - self.trainset.setdefault(user,{}) + + # dict.setdefault(key, default=None) + # key -- 查找的键值 + # default -- 键不存在时,设置的默认键值 + self.trainset.setdefault(user, {}) self.trainset[user][movie] = int(rating) trainset_len += 1 else: - self.testset.setdefault(user,{}) + self.testset.setdefault(user, {}) self.testset[user][movie] = int(rating) testset_len += 1 - print >> sys.stderr, 'split training set and test set succ' + print >> sys.stderr, '分离训练集和测试集成功' print >> sys.stderr, 'train set = %s' % trainset_len - print >> sys.stderr, 'test set = %s' % testset_len - + print >> sys.stderr, 'test set = %s' % testset_len def calc_user_sim(self): - ''' calculate user similarity matrix ''' + """calc_user_sim(计算用户之间的相似度)""" + # build inverse table for item-users # key=movieID, value=list of userIDs who have seen this movie print >> sys.stderr, 'building movie-users inverse table...' movie2users = dict() - for user,movies in self.trainset.iteritems(): + for user, movies in self.trainset.iteritems(): for movie in movies: # inverse table for item-users if movie not in movie2users: @@ -79,67 +102,76 @@ class UserBasedCF(): if movie not in self.movie_popular: self.movie_popular[movie] = 0 self.movie_popular[movie] += 1 - print >> sys.stderr, 'build movie-users inverse table succ' + print >> sys.stderr, 'build movie-users inverse table success' # save the total movie number, which will be used in evaluation self.movie_count = len(movie2users) print >> sys.stderr, 'total movie number = %d' % self.movie_count - # count co-rated items between users usersim_mat = self.user_sim_mat + # 统计在相同电影时,用户同时出现的次数 print >> sys.stderr, 'building user co-rated movies matrix...' - for movie,users in movie2users.iteritems(): + for movie, users in movie2users.iteritems(): for u in users: for v in users: - if u == v: continue - usersim_mat.setdefault(u,{}) - usersim_mat[u].setdefault(v,0) + if u == v: + continue + usersim_mat.setdefault(u, {}) + usersim_mat[u].setdefault(v, 0) usersim_mat[u][v] += 1 - print >> sys.stderr, 'build user co-rated movies matrix succ' + print >> sys.stderr, 'build user co-rated movies matrix success' - # calculate similarity matrix + # calculate similarity matrix print >> sys.stderr, 'calculating user similarity matrix...' simfactor_count = 0 PRINT_STEP = 2000000 - for u,related_users in usersim_mat.iteritems(): - for v,count in related_users.iteritems(): - usersim_mat[u][v] = count / math.sqrt( - len(self.trainset[u]) * len(self.trainset[v])) + for u, related_users in usersim_mat.iteritems(): + for v, count in related_users.iteritems(): + # 余弦相似度 + usersim_mat[u][v] = count / math.sqrt(len(self.trainset[u]) * len(self.trainset[v])) simfactor_count += 1 + # 打印进度条 if simfactor_count % PRINT_STEP == 0: print >> sys.stderr, 'calculating user similarity factor(%d)' % simfactor_count - print >> sys.stderr, 'calculate user similarity matrix(similarity factor) succ' - print >> sys.stderr, 'Total similarity factor number = %d' %simfactor_count - + print >> sys.stderr, 'calculate user similarity matrix(similarity factor) success' + print >> sys.stderr, 'Total similarity factor number = %d' % simfactor_count def recommend(self, user): + """recommend(推荐top K的用户,所看过的电影,对电影进行相似度sum的排序,取出top N的电影数) + + Args: + user 用户 + Returns: + rec_movie 电影推荐列表,按照相似度从大到小的排序 + """ ''' Find K similar users and recommend N movies. ''' K = self.n_sim_user N = self.n_rec_movie rank = dict() watched_movies = self.trainset[user] + # 找出top 10的用户和相似度 # v=similar user, wuv=similarity factor - for v, wuv in sorted(self.user_sim_mat[user].items(), - key=itemgetter(1), reverse=True)[0:K]: + for v, wuv in sorted(self.user_sim_mat[user].items(), key=itemgetter(1), reverse=True)[0:K]: for movie in self.trainset[v]: if movie in watched_movies: continue # predict the user's "interest" for each movie - rank.setdefault(movie,0) + rank.setdefault(movie, 0) rank[movie] += wuv # return the N best movies return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N] - def evaluate(self): ''' return precision, recall, coverage and popularity ''' print >> sys.stderr, 'Evaluation start...' + # 返回top 10的推荐结果 N = self.n_rec_movie - # varables for precision and recall + # varables for precision and recall + # hit表示命中(测试集和推荐集相同+1),rec_count 每个用户的推荐数, test_count 每个用户对应的测试数据集的电影数 hit = 0 rec_count = 0 test_count = 0 @@ -149,14 +181,17 @@ class UserBasedCF(): popular_sum = 0 for i, user in enumerate(self.trainset): - if i % 500 == 0: + if i > 0 and i % 500 == 0: print >> sys.stderr, 'recommended for %d users' % i test_movies = self.testset.get(user, {}) rec_movies = self.recommend(user) + + # 对比测试集和推荐集的差异 for movie, w in rec_movies: if movie in test_movies: hit += 1 all_rec_movies.add(movie) + # 计算用户对应的电影出现次数log值的sum加和 popular_sum += math.log(1 + self.movie_popular[movie]) rec_count += N test_count += len(test_movies) @@ -166,13 +201,17 @@ class UserBasedCF(): coverage = len(all_rec_movies) / (1.0*self.movie_count) popularity = popular_sum / (1.0*rec_count) - print >> sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' % \ - (precision, recall, coverage, popularity) + print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity) if __name__ == '__main__': ratingfile = 'input/16.RecommendedSystem/ml-1m/ratings.dat' + + # 创建UserCF对象 usercf = UserBasedCF() - usercf.generate_dataset(ratingfile) + # 将数据按照 7:3的比例,拆分成:训练集和测试集,存储在usercf的trainset河testset中 + usercf.generate_dataset(ratingfile, pivot=0.7) + # 计算用户之间的相似度 usercf.calc_user_sim() + # 评估推荐效果 usercf.evaluate()