From 5a7c0892eb61efdb7c221b7b2204bfda73d30630 Mon Sep 17 00:00:00 2001
From: jiangzhonglian <jiang-s@163.com>
Date: Wed, 17 May 2017 21:19:44 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=85=A8=E6=8E=A8=E8=8D=90=E7=B3=BB?=
 =?UTF-8?q?=E7=BB=9F=E7=9A=84python=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/python/16.RecommendedSystem/itemcf.py     | 120 ++++++++++++------
 ...tion_model.py => test_evaluation_model.py} |   0
 .../{graph-based.py => test_graph-based.py}   |   0
 .../{lfm.py => test_lfm.py}                   |   0
 .../{基于物品.py => test_基于物品.py}         |   0
 .../{基于用户.py => test_基于用户.py}         |   0
 src/python/16.RecommendedSystem/usercf.py     |  15 ++-
 7 files changed, 89 insertions(+), 46 deletions(-)
 rename src/python/16.RecommendedSystem/{evaluation_model.py => test_evaluation_model.py} (100%)
 rename src/python/16.RecommendedSystem/{graph-based.py => test_graph-based.py} (100%)
 rename src/python/16.RecommendedSystem/{lfm.py => test_lfm.py} (100%)
 rename src/python/16.RecommendedSystem/{基于物品.py => test_基于物品.py} (100%)
 rename src/python/16.RecommendedSystem/{基于用户.py => test_基于用户.py} (100%)

diff --git a/src/python/16.RecommendedSystem/itemcf.py b/src/python/16.RecommendedSystem/itemcf.py
index cf732753..849b7a69 100644
--- a/src/python/16.RecommendedSystem/itemcf.py
+++ b/src/python/16.RecommendedSystem/itemcf.py
@@ -1,13 +1,20 @@
-#-*- coding: utf-8 -*-
+#!/usr/bin/python
+# coding:utf8
+
 '''
 Created on 2015-06-22
-
-@author: Lockvictor
+Update  on 2017-05-16
+@author: Lockvictor/片刻
+《推荐系统实践》协同过滤算法源代码
+参考地址：https://github.com/Lockvictor/MovieLens-RecSys
+更新地址：https://github.com/apachecn/MachineLearning
 '''
-import sys, random, math
+import sys
+import math
+import random
 from operator import itemgetter
-
-
+print(__doc__)
+# 作用：使得随机数据可预测
 random.seed(0)
 
 
@@ -17,9 +24,11 @@ class ItemBasedCF():
         self.trainset = {}
         self.testset = {}
 
+        # n_sim_movie: top 20个相似电影， n_rec_movie: top 10个推荐结果
         self.n_sim_movie = 20
         self.n_rec_movie = 10
 
+        # movie_sim_mat: 电影之间的相似度， movie_popular: 电影的出现次数， movie_count: 总电影数量
         self.movie_sim_mat = {}
         self.movie_popular = {}
         self.movie_count = 0
@@ -27,28 +36,42 @@ class ItemBasedCF():
         print >> sys.stderr, 'Similar movie number = %d' % self.n_sim_movie
         print >> sys.stderr, 'Recommended movie number = %d' % self.n_rec_movie
 
-
     @staticmethod
     def loadfile(filename):
-        ''' load a file, return a generator. '''
+        """loadfile(加载文件，返回一个生成器)
+
+        Args:
+            filename   文件名
+        Returns:
+            line       行数据，去除首尾的回车换行符
+        """
         fp = open(filename, 'r')
         for i, line in enumerate(fp):
             yield line.strip('\r\n')
-            if i % 100000 == 0:
+            if i > 0 and i % 100000 == 0:
                 print >> sys.stderr, 'loading %s(%s)' % (filename, i)
         fp.close()
-        print >> sys.stderr, 'load %s succ' % filename
-
+        print >> sys.stderr, 'load %s success' % filename
 
     def generate_dataset(self, filename, pivot=0.7):
-        ''' load rating data and split it to training set and test set '''
+        """generate_dataset(加载文件，将数据集按照pivot比例(默认7:3)随机拆分为训练集和测试集)
+
+        Args:
+            filename   文件名
+            pivot      拆分比例
+        """
         trainset_len = 0
         testset_len = 0
 
         for line in self.loadfile(filename):
+            # 用户ID，电影ID，评分，时间戳
             user, movie, rating, _ = line.split('::')
-            # split the data by pivot
+            # 通过pivot和随机函数比较，然后初始化用户和对应的值
             if (random.random() < pivot):
+
+                # dict.setdefault(key, default=None)
+                # key -- 查找的键值
+                # default -- 键不存在时，设置的默认键值
                 self.trainset.setdefault(user, {})
                 self.trainset[user][movie] = int(rating)
                 trainset_len += 1
@@ -57,83 +80,93 @@ class ItemBasedCF():
                 self.testset[user][movie] = int(rating)
                 testset_len += 1
 
-        print >> sys.stderr, 'split training set and test set succ'
+        print >> sys.stderr, '分离训练集和测试集成功'
         print >> sys.stderr, 'train set = %s' % trainset_len
         print >> sys.stderr, 'test set = %s' % testset_len
 
-
     def calc_movie_sim(self):
-        ''' calculate movie similarity matrix '''
+        """calc_movie_sim(计算电影之间的相似度)"""
+
         print >> sys.stderr, 'counting movies number and popularity...'
 
         for user, movies in self.trainset.iteritems():
             for movie in movies:
-                # count item popularity 
+                # count item popularity
                 if movie not in self.movie_popular:
                     self.movie_popular[movie] = 0
                 self.movie_popular[movie] += 1
 
-        print >> sys.stderr, 'count movies number and popularity succ'
+        print >> sys.stderr, 'count movies number and popularity success'
 
         # save the total number of movies
         self.movie_count = len(self.movie_popular)
         print >> sys.stderr, 'total movie number = %d' % self.movie_count
 
-        # count co-rated users between items
+        # 统计在相同用户时，不同电影同时出现的次数
         itemsim_mat = self.movie_sim_mat
         print >> sys.stderr, 'building co-rated users matrix...'
 
         for user, movies in self.trainset.iteritems():
             for m1 in movies:
                 for m2 in movies:
-                    if m1 == m2: continue
-                    itemsim_mat.setdefault(m1,{})
-                    itemsim_mat[m1].setdefault(m2,0)
+                    if m1 == m2:
+                        continue
+                    itemsim_mat.setdefault(m1, {})
+                    itemsim_mat[m1].setdefault(m2, 0)
                     itemsim_mat[m1][m2] += 1
+        print >> sys.stderr, 'build co-rated users matrix success'
 
-        print >> sys.stderr, 'build co-rated users matrix succ'
-
-        # calculate similarity matrix 
+        # calculate similarity matrix
         print >> sys.stderr, 'calculating movie similarity matrix...'
         simfactor_count = 0
         PRINT_STEP = 2000000
-
         for m1, related_movies in itemsim_mat.iteritems():
             for m2, count in related_movies.iteritems():
-                itemsim_mat[m1][m2] = count / math.sqrt(
-                        self.movie_popular[m1] * self.movie_popular[m2])
+                # 余弦相似度
+                itemsim_mat[m1][m2] = count / math.sqrt(self.movie_popular[m1] * self.movie_popular[m2])
                 simfactor_count += 1
+                # 打印进度条
                 if simfactor_count % PRINT_STEP == 0:
                     print >> sys.stderr, 'calculating movie similarity factor(%d)' % simfactor_count
 
-        print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) succ'
-        print >> sys.stderr, 'Total similarity factor number = %d' %simfactor_count
-
+        print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) success'
+        print >> sys.stderr, 'Total similarity factor number = %d' % simfactor_count
 
+    # @profile
     def recommend(self, user):
+        """recommend(找出top K的电影，对电影进行相似度sum的排序，取出top N的电影数)
+
+        Args:
+            user       用户
+        Returns:
+            rec_movie  电影推荐列表，按照相似度从大到小的排序
+        """
         ''' Find K similar movies and recommend N movies. '''
         K = self.n_sim_movie
         N = self.n_rec_movie
         rank = {}
         watched_movies = self.trainset[user]
 
+        # 计算top K 电影的相似度
+        # rating=电影评分, w=电影之间的相似度
+        # 耗时分析：98.2%的时间在 line-154行
         for movie, rating in watched_movies.iteritems():
-            for related_movie, w in sorted(self.movie_sim_mat[movie].items(),
-                    key=itemgetter(1), reverse=True)[:K]:
+            for related_movie, w in sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1), reverse=True)[0:K]:
                 if related_movie in watched_movies:
                     continue
                 rank.setdefault(related_movie, 0)
                 rank[related_movie] += w * rating
         # return the N best movies
-        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]
-
+        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
 
     def evaluate(self):
         ''' return precision, recall, coverage and popularity '''
         print >> sys.stderr, 'Evaluation start...'
 
+        # 返回top N的推荐结果
         N = self.n_rec_movie
-        #  varables for precision and recall 
+        # variables for precision and recall
+        # hit表示命中(测试集和推荐集相同+1)，rec_count 每个用户的推荐数， test_count 每个用户对应的测试数据集的电影数
         hit = 0
         rec_count = 0
         test_count = 0
@@ -143,14 +176,17 @@ class ItemBasedCF():
         popular_sum = 0
 
         for i, user in enumerate(self.trainset):
-            if i % 500 == 0:
+            if i > 0 and i % 500 == 0:
                 print >> sys.stderr, 'recommended for %d users' % i
             test_movies = self.testset.get(user, {})
             rec_movies = self.recommend(user)
+
+            # 对比测试集和推荐集的差异
             for movie, w in rec_movies:
                 if movie in test_movies:
                     hit += 1
                 all_rec_movies.add(movie)
+                # 累加推荐电影出现次数(加1后)的log值，用于计算平均流行度
                 popular_sum += math.log(1 + self.movie_popular[movie])
             rec_count += N
             test_count += len(test_movies)
@@ -160,13 +196,17 @@ class ItemBasedCF():
         coverage = len(all_rec_movies) / (1.0 * self.movie_count)
         popularity = popular_sum / (1.0 * rec_count)
 
-        print >> sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' \
-                % (precision, recall, coverage, popularity)
+        print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity)
 
 
 if __name__ == '__main__':
     ratingfile = 'input/16.RecommendedSystem/ml-1m/ratings.dat'
+
+    # 创建ItemCF对象
     itemcf = ItemBasedCF()
-    itemcf.generate_dataset(ratingfile)
+    # 将数据按照 7:3的比例，拆分成：训练集和测试集，存储在itemcf的trainset和testset中
+    itemcf.generate_dataset(ratingfile, pivot=0.7)
+    # 计算电影之间的相似度
     itemcf.calc_movie_sim()
+    # 评估推荐效果
     itemcf.evaluate()
diff --git a/src/python/16.RecommendedSystem/evaluation_model.py b/src/python/16.RecommendedSystem/test_evaluation_model.py
similarity index 100%
rename from src/python/16.RecommendedSystem/evaluation_model.py
rename to src/python/16.RecommendedSystem/test_evaluation_model.py
diff --git a/src/python/16.RecommendedSystem/graph-based.py b/src/python/16.RecommendedSystem/test_graph-based.py
similarity index 100%
rename from src/python/16.RecommendedSystem/graph-based.py
rename to src/python/16.RecommendedSystem/test_graph-based.py
diff --git a/src/python/16.RecommendedSystem/lfm.py b/src/python/16.RecommendedSystem/test_lfm.py
similarity index 100%
rename from src/python/16.RecommendedSystem/lfm.py
rename to src/python/16.RecommendedSystem/test_lfm.py
diff --git a/src/python/16.RecommendedSystem/基于物品.py b/src/python/16.RecommendedSystem/test_基于物品.py
similarity index 100%
rename from src/python/16.RecommendedSystem/基于物品.py
rename to src/python/16.RecommendedSystem/test_基于物品.py
diff --git a/src/python/16.RecommendedSystem/基于用户.py b/src/python/16.RecommendedSystem/test_基于用户.py
similarity index 100%
rename from src/python/16.RecommendedSystem/基于用户.py
rename to src/python/16.RecommendedSystem/test_基于用户.py
diff --git a/src/python/16.RecommendedSystem/usercf.py b/src/python/16.RecommendedSystem/usercf.py
index 82f5b865..0f780431 100644
--- a/src/python/16.RecommendedSystem/usercf.py
+++ b/src/python/16.RecommendedSystem/usercf.py
@@ -102,6 +102,7 @@ class UserBasedCF():
                 if movie not in self.movie_popular:
                     self.movie_popular[movie] = 0
                 self.movie_popular[movie] += 1
+
         print >> sys.stderr, 'build movie-users inverse table success'
 
         # save the total movie number, which will be used in evaluation
@@ -109,7 +110,7 @@ class UserBasedCF():
         print >> sys.stderr, 'total movie number = %d' % self.movie_count
 
         usersim_mat = self.user_sim_mat
-        # 统计在相同电影时，用户同时出现的次数
+        # 统计在相同电影时，不同用户同时出现的次数
         print >> sys.stderr, 'building user co-rated movies matrix...'
 
         for movie, users in movie2users.iteritems():
@@ -138,8 +139,9 @@ class UserBasedCF():
         print >> sys.stderr, 'calculate user similarity matrix(similarity factor) success'
         print >> sys.stderr, 'Total similarity factor number = %d' % simfactor_count
 
+    # @profile
     def recommend(self, user):
-        """recommend(推荐top K的用户，所看过的电影，对电影进行相似度sum的排序，取出top N的电影数)
+        """recommend(找出top K的用户，所看过的电影，对电影进行相似度sum的排序，取出top N的电影数)
 
         Args:
             user       用户
@@ -152,8 +154,9 @@ class UserBasedCF():
         rank = dict()
         watched_movies = self.trainset[user]
 
-        # 找出top 10的用户和相似度
-        # v=similar user, wuv=similarity factor
+        # 计算top K 用户的相似度
+        # v=similar user, wuv=用户之间的相似度(similarity factor)
+        # 耗时分析：50.4%的时间在 line-160行
         for v, wuv in sorted(self.user_sim_mat[user].items(), key=itemgetter(1), reverse=True)[0:K]:
             for movie in self.trainset[v]:
                 if movie in watched_movies:
@@ -168,7 +171,7 @@ class UserBasedCF():
         ''' return precision, recall, coverage and popularity '''
         print >> sys.stderr, 'Evaluation start...'
 
-        # 返回top 10的推荐结果
+        # 返回top N的推荐结果
         N = self.n_rec_movie
         # varables for precision and recall
         # hit表示命中(测试集和推荐集相同+1)，rec_count 每个用户的推荐数， test_count 每个用户对应的测试数据集的电影数
@@ -209,7 +212,7 @@ if __name__ == '__main__':
 
     # 创建UserCF对象
     usercf = UserBasedCF()
-    # 将数据按照 7:3的比例，拆分成：训练集和测试集，存储在usercf的trainset河testset中
+    # 将数据按照 7:3的比例，拆分成：训练集和测试集，存储在usercf的trainset和testset中
     usercf.generate_dataset(ratingfile, pivot=0.7)
     # 计算用户之间的相似度
     usercf.calc_user_sim()