From 5a7c0892eb61efdb7c221b7b2204bfda73d30630 Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Wed, 17 May 2017 21:19:44 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=85=A8=E6=8E=A8=E8=8D=90=E7=B3=BB?= =?UTF-8?q?=E7=BB=9F=E7=9A=84python=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/16.RecommendedSystem/itemcf.py | 120 ++++++++++++------ ...tion_model.py => test_evaluation_model.py} | 0 .../{graph-based.py => test_graph-based.py} | 0 .../{lfm.py => test_lfm.py} | 0 .../{基于物品.py => test_基于物品.py} | 0 .../{基于用户.py => test_基于用户.py} | 0 src/python/16.RecommendedSystem/usercf.py | 15 ++- 7 files changed, 89 insertions(+), 46 deletions(-) rename src/python/16.RecommendedSystem/{evaluation_model.py => test_evaluation_model.py} (100%) rename src/python/16.RecommendedSystem/{graph-based.py => test_graph-based.py} (100%) rename src/python/16.RecommendedSystem/{lfm.py => test_lfm.py} (100%) rename src/python/16.RecommendedSystem/{基于物品.py => test_基于物品.py} (100%) rename src/python/16.RecommendedSystem/{基于用户.py => test_基于用户.py} (100%) diff --git a/src/python/16.RecommendedSystem/itemcf.py b/src/python/16.RecommendedSystem/itemcf.py index cf732753..849b7a69 100644 --- a/src/python/16.RecommendedSystem/itemcf.py +++ b/src/python/16.RecommendedSystem/itemcf.py @@ -1,13 +1,20 @@ -#-*- coding: utf-8 -*- +#!/usr/bin/python +# coding:utf8 + ''' Created on 2015-06-22 - -@author: Lockvictor +Update on 2017-05-16 +@author: Lockvictor/片刻 +《推荐系统实践》协同过滤算法源代码 +参考地址:https://github.com/Lockvictor/MovieLens-RecSys +更新地址:https://github.com/apachecn/MachineLearning ''' -import sys, random, math +import sys +import math +import random from operator import itemgetter - - +print(__doc__) +# 作用:使得随机数据可预测 random.seed(0) @@ -17,9 +24,11 @@ class ItemBasedCF(): self.trainset = {} self.testset = {} + # n_sim_user: top 20个用户, n_rec_movie: top 10个推荐结果 self.n_sim_movie = 20 self.n_rec_movie = 10 + # user_sim_mat: 电影之间的相似度, movie_popular: 电影的出现次数, movie_count: 总电影数量 self.movie_sim_mat = {} self.movie_popular = {} self.movie_count = 0 @@ -27,28 +36,42 @@ class ItemBasedCF(): print >> sys.stderr, 'Similar movie number = %d' % self.n_sim_movie print >> sys.stderr, 'Recommended movie number = %d' % self.n_rec_movie - @staticmethod def loadfile(filename): - ''' load a file, return a generator. ''' + """loadfile(加载文件,返回一个生成器) + + Args: + filename 文件名 + Returns: + line 行数据,去空格 + """ fp = open(filename, 'r') for i, line in enumerate(fp): yield line.strip('\r\n') - if i % 100000 == 0: + if i > 0 and i % 100000 == 0: print >> sys.stderr, 'loading %s(%s)' % (filename, i) fp.close() - print >> sys.stderr, 'load %s succ' % filename - + print >> sys.stderr, 'load %s success' % filename def generate_dataset(self, filename, pivot=0.7): - ''' load rating data and split it to training set and test set ''' + """loadfile(加载文件,将数据集按照7:3 进行随机拆分) + + Args: + filename 文件名 + pivot 拆分比例 + """ trainset_len = 0 testset_len = 0 for line in self.loadfile(filename): + # 用户ID,电影名称,评分,时间戳 user, movie, rating, _ = line.split('::') - # split the data by pivot + # 通过pivot和随机函数比较,然后初始化用户和对应的值 if (random.random() < pivot): + + # dict.setdefault(key, default=None) + # key -- 查找的键值 + # default -- 键不存在时,设置的默认键值 self.trainset.setdefault(user, {}) self.trainset[user][movie] = int(rating) trainset_len += 1 @@ -57,83 +80,93 @@ class ItemBasedCF(): self.testset[user][movie] = int(rating) testset_len += 1 - print >> sys.stderr, 'split training set and test set succ' + print >> sys.stderr, '分离训练集和测试集成功' print >> sys.stderr, 'train set = %s' % trainset_len print >> sys.stderr, 'test set = %s' % testset_len - def calc_movie_sim(self): - ''' calculate movie similarity matrix ''' + """calc_movie_sim(计算用户之间的相似度)""" + print >> sys.stderr, 'counting movies number and popularity...' for user, movies in self.trainset.iteritems(): for movie in movies: - # count item popularity + # count item popularity if movie not in self.movie_popular: self.movie_popular[movie] = 0 self.movie_popular[movie] += 1 - print >> sys.stderr, 'count movies number and popularity succ' + print >> sys.stderr, 'count movies number and popularity success' # save the total number of movies self.movie_count = len(self.movie_popular) print >> sys.stderr, 'total movie number = %d' % self.movie_count - # count co-rated users between items + # 统计在相同用户时,不同电影同时出现的次数 itemsim_mat = self.movie_sim_mat print >> sys.stderr, 'building co-rated users matrix...' for user, movies in self.trainset.iteritems(): for m1 in movies: for m2 in movies: - if m1 == m2: continue - itemsim_mat.setdefault(m1,{}) - itemsim_mat[m1].setdefault(m2,0) + if m1 == m2: + continue + itemsim_mat.setdefault(m1, {}) + itemsim_mat[m1].setdefault(m2, 0) itemsim_mat[m1][m2] += 1 + print >> sys.stderr, 'build co-rated users matrix success' - print >> sys.stderr, 'build co-rated users matrix succ' - - # calculate similarity matrix + # calculate similarity matrix print >> sys.stderr, 'calculating movie similarity matrix...' simfactor_count = 0 PRINT_STEP = 2000000 - for m1, related_movies in itemsim_mat.iteritems(): for m2, count in related_movies.iteritems(): - itemsim_mat[m1][m2] = count / math.sqrt( - self.movie_popular[m1] * self.movie_popular[m2]) + # 余弦相似度 + itemsim_mat[m1][m2] = count / math.sqrt(self.movie_popular[m1] * self.movie_popular[m2]) simfactor_count += 1 + # 打印进度条 if simfactor_count % PRINT_STEP == 0: print >> sys.stderr, 'calculating movie similarity factor(%d)' % simfactor_count - print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) succ' - print >> sys.stderr, 'Total similarity factor number = %d' %simfactor_count - + print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) success' + print >> sys.stderr, 'Total similarity factor number = %d' % simfactor_count + # @profile def recommend(self, user): + """recommend(找出top K的电影,对电影进行相似度sum的排序,取出top N的电影数) + + Args: + user 用户 + Returns: + rec_movie 电影推荐列表,按照相似度从大到小的排序 + """ ''' Find K similar movies and recommend N movies. ''' K = self.n_sim_movie N = self.n_rec_movie rank = {} watched_movies = self.trainset[user] + # 计算top K 电影的相似度 + # rating=电影评分, w=不同电影出现的次数 + # 耗时分析:98.2%的时间在 line-154行 for movie, rating in watched_movies.iteritems(): - for related_movie, w in sorted(self.movie_sim_mat[movie].items(), - key=itemgetter(1), reverse=True)[:K]: + for related_movie, w in sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1), reverse=True)[0:K]: if related_movie in watched_movies: continue rank.setdefault(related_movie, 0) rank[related_movie] += w * rating # return the N best movies - return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N] - + return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N] def evaluate(self): ''' return precision, recall, coverage and popularity ''' print >> sys.stderr, 'Evaluation start...' + # 返回top N的推荐结果 N = self.n_rec_movie - # varables for precision and recall + # varables for precision and recall + # hit表示命中(测试集和推荐集相同+1),rec_count 每个用户的推荐数, test_count 每个用户对应的测试数据集的电影数 hit = 0 rec_count = 0 test_count = 0 @@ -143,14 +176,17 @@ class ItemBasedCF(): popular_sum = 0 for i, user in enumerate(self.trainset): - if i % 500 == 0: + if i > 0 and i % 500 == 0: print >> sys.stderr, 'recommended for %d users' % i test_movies = self.testset.get(user, {}) rec_movies = self.recommend(user) + + # 对比测试集和推荐集的差异 for movie, w in rec_movies: if movie in test_movies: hit += 1 all_rec_movies.add(movie) + # 计算用户对应的电影出现次数log值的sum加和 popular_sum += math.log(1 + self.movie_popular[movie]) rec_count += N test_count += len(test_movies) @@ -160,13 +196,17 @@ class ItemBasedCF(): coverage = len(all_rec_movies) / (1.0 * self.movie_count) popularity = popular_sum / (1.0 * rec_count) - print >> sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' \ - % (precision, recall, coverage, popularity) + print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity) if __name__ == '__main__': ratingfile = 'input/16.RecommendedSystem/ml-1m/ratings.dat' + + # 创建ItemCF对象 itemcf = ItemBasedCF() - itemcf.generate_dataset(ratingfile) + # 将数据按照 7:3的比例,拆分成:训练集和测试集,存储在usercf的trainset和testset中 + itemcf.generate_dataset(ratingfile, pivot=0.7) + # 计算用户之间的相似度 itemcf.calc_movie_sim() + # 评估推荐效果 itemcf.evaluate() diff --git a/src/python/16.RecommendedSystem/evaluation_model.py b/src/python/16.RecommendedSystem/test_evaluation_model.py similarity index 100% rename from src/python/16.RecommendedSystem/evaluation_model.py rename to src/python/16.RecommendedSystem/test_evaluation_model.py diff --git a/src/python/16.RecommendedSystem/graph-based.py b/src/python/16.RecommendedSystem/test_graph-based.py similarity index 100% rename from src/python/16.RecommendedSystem/graph-based.py rename to src/python/16.RecommendedSystem/test_graph-based.py diff --git a/src/python/16.RecommendedSystem/lfm.py b/src/python/16.RecommendedSystem/test_lfm.py similarity index 100% rename from src/python/16.RecommendedSystem/lfm.py rename to src/python/16.RecommendedSystem/test_lfm.py diff --git a/src/python/16.RecommendedSystem/基于物品.py b/src/python/16.RecommendedSystem/test_基于物品.py similarity index 100% rename from src/python/16.RecommendedSystem/基于物品.py rename to src/python/16.RecommendedSystem/test_基于物品.py diff --git a/src/python/16.RecommendedSystem/基于用户.py b/src/python/16.RecommendedSystem/test_基于用户.py similarity index 100% rename from src/python/16.RecommendedSystem/基于用户.py rename to src/python/16.RecommendedSystem/test_基于用户.py diff --git a/src/python/16.RecommendedSystem/usercf.py b/src/python/16.RecommendedSystem/usercf.py index 82f5b865..0f780431 100644 --- a/src/python/16.RecommendedSystem/usercf.py +++ b/src/python/16.RecommendedSystem/usercf.py @@ -102,6 +102,7 @@ class UserBasedCF(): if movie not in self.movie_popular: self.movie_popular[movie] = 0 self.movie_popular[movie] += 1 + print >> sys.stderr, 'build movie-users inverse table success' # save the total movie number, which will be used in evaluation @@ -109,7 +110,7 @@ class UserBasedCF(): print >> sys.stderr, 'total movie number = %d' % self.movie_count usersim_mat = self.user_sim_mat - # 统计在相同电影时,用户同时出现的次数 + # 统计在相同电影时,不同用户同时出现的次数 print >> sys.stderr, 'building user co-rated movies matrix...' for movie, users in movie2users.iteritems(): @@ -138,8 +139,9 @@ class UserBasedCF(): print >> sys.stderr, 'calculate user similarity matrix(similarity factor) success' print >> sys.stderr, 'Total similarity factor number = %d' % simfactor_count + # @profile def recommend(self, user): - """recommend(推荐top K的用户,所看过的电影,对电影进行相似度sum的排序,取出top N的电影数) + """recommend(找出top K的用户,所看过的电影,对电影进行相似度sum的排序,取出top N的电影数) Args: user 用户 @@ -152,8 +154,9 @@ class UserBasedCF(): rank = dict() watched_movies = self.trainset[user] - # 找出top 10的用户和相似度 - # v=similar user, wuv=similarity factor + # 计算top K 用户的相似度 + # v=similar user, wuv=不同用户同时出现的次数 + # 耗时分析:50.4%的时间在 line-160行 for v, wuv in sorted(self.user_sim_mat[user].items(), key=itemgetter(1), reverse=True)[0:K]: for movie in self.trainset[v]: if movie in watched_movies: @@ -168,7 +171,7 @@ class UserBasedCF(): ''' return precision, recall, coverage and popularity ''' print >> sys.stderr, 'Evaluation start...' - # 返回top 10的推荐结果 + # 返回top N的推荐结果 N = self.n_rec_movie # varables for precision and recall # hit表示命中(测试集和推荐集相同+1),rec_count 每个用户的推荐数, test_count 每个用户对应的测试数据集的电影数 @@ -209,7 +212,7 @@ if __name__ == '__main__': # 创建UserCF对象 usercf = UserBasedCF() - # 将数据按照 7:3的比例,拆分成:训练集和测试集,存储在usercf的trainset河testset中 + # 将数据按照 7:3的比例,拆分成:训练集和测试集,存储在usercf的trainset和testset中 usercf.generate_dataset(ratingfile, pivot=0.7) # 计算用户之间的相似度 usercf.calc_user_sim()