Files
ailearning/src/python/16.RecommenderSystems/itemcf.py
jiangzhonglian 9448063ab2 更新文件名
2017-06-05 11:50:16 +08:00

213 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/python
# coding:utf8
'''
Created on 2015-06-22
Update on 2017-05-16
@author: Lockvictor/片刻
《推荐系统实践》协同过滤算法源代码
参考地址https://github.com/Lockvictor/MovieLens-RecSys
更新地址https://github.com/apachecn/MachineLearning
'''
import sys
import math
import random
from operator import itemgetter
print(__doc__)
# 作用:使得随机数据可预测
random.seed(0)
class ItemBasedCF():
''' TopN recommendation - ItemBasedCF '''
def __init__(self):
self.trainset = {}
self.testset = {}
# n_sim_user: top 20个用户 n_rec_movie: top 10个推荐结果
self.n_sim_movie = 20
self.n_rec_movie = 10
# user_sim_mat: 电影之间的相似度, movie_popular: 电影的出现次数, movie_count: 总电影数量
self.movie_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print >> sys.stderr, 'Similar movie number = %d' % self.n_sim_movie
print >> sys.stderr, 'Recommended movie number = %d' % self.n_rec_movie
@staticmethod
def loadfile(filename):
"""loadfile(加载文件,返回一个生成器)
Args:
filename 文件名
Returns:
line 行数据,去空格
"""
fp = open(filename, 'r')
for i, line in enumerate(fp):
yield line.strip('\r\n')
if i > 0 and i % 100000 == 0:
print >> sys.stderr, 'loading %s(%s)' % (filename, i)
fp.close()
print >> sys.stderr, 'load %s success' % filename
def generate_dataset(self, filename, pivot=0.7):
"""loadfile(加载文件将数据集按照7:3 进行随机拆分)
Args:
filename 文件名
pivot 拆分比例
"""
trainset_len = 0
testset_len = 0
for line in self.loadfile(filename):
# 用户ID电影名称评分时间戳
user, movie, rating, _ = line.split('::')
# 通过pivot和随机函数比较然后初始化用户和对应的值
if (random.random() < pivot):
# dict.setdefault(key, default=None)
# key -- 查找的键值
# default -- 键不存在时,设置的默认键值
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
print >> sys.stderr, '分离训练集和测试集成功'
print >> sys.stderr, 'train set = %s' % trainset_len
print >> sys.stderr, 'test set = %s' % testset_len
def calc_movie_sim(self):
"""calc_movie_sim(计算用户之间的相似度)"""
print >> sys.stderr, 'counting movies number and popularity...'
for user, movies in self.trainset.iteritems():
for movie in movies:
# count item popularity
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print >> sys.stderr, 'count movies number and popularity success'
# save the total number of movies
self.movie_count = len(self.movie_popular)
print >> sys.stderr, 'total movie number = %d' % self.movie_count
# 统计在相同用户时,不同电影同时出现的次数
itemsim_mat = self.movie_sim_mat
print >> sys.stderr, 'building co-rated users matrix...'
for user, movies in self.trainset.iteritems():
for m1 in movies:
for m2 in movies:
if m1 == m2:
continue
itemsim_mat.setdefault(m1, {})
itemsim_mat[m1].setdefault(m2, 0)
itemsim_mat[m1][m2] += 1
print >> sys.stderr, 'build co-rated users matrix success'
# calculate similarity matrix
print >> sys.stderr, 'calculating movie similarity matrix...'
simfactor_count = 0
PRINT_STEP = 2000000
for m1, related_movies in itemsim_mat.iteritems():
for m2, count in related_movies.iteritems():
# 余弦相似度
itemsim_mat[m1][m2] = count / math.sqrt(self.movie_popular[m1] * self.movie_popular[m2])
simfactor_count += 1
# 打印进度条
if simfactor_count % PRINT_STEP == 0:
print >> sys.stderr, 'calculating movie similarity factor(%d)' % simfactor_count
print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) success'
print >> sys.stderr, 'Total similarity factor number = %d' % simfactor_count
# @profile
def recommend(self, user):
"""recommend(找出top K的电影对电影进行相似度sum的排序取出top N的电影数)
Args:
user 用户
Returns:
rec_movie 电影推荐列表,按照相似度从大到小的排序
"""
''' Find K similar movies and recommend N movies. '''
K = self.n_sim_movie
N = self.n_rec_movie
rank = {}
watched_movies = self.trainset[user]
# 计算top K 电影的相似度
# rating=电影评分, w=不同电影出现的次数
# 耗时分析98.2%的时间在 line-154行
for movie, rating in watched_movies.iteritems():
for related_movie, w in sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1), reverse=True)[0:K]:
if related_movie in watched_movies:
continue
rank.setdefault(related_movie, 0)
rank[related_movie] += w * rating
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
''' return precision, recall, coverage and popularity '''
print >> sys.stderr, 'Evaluation start...'
# 返回top N的推荐结果
N = self.n_rec_movie
# varables for precision and recall
# hit表示命中(测试集和推荐集相同+1)rec_count 每个用户的推荐数, test_count 每个用户对应的测试数据集的电影数
hit = 0
rec_count = 0
test_count = 0
# varables for coverage
all_rec_movies = set()
# varables for popularity
popular_sum = 0
for i, user in enumerate(self.trainset):
if i > 0 and i % 500 == 0:
print >> sys.stderr, 'recommended for %d users' % i
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
# 对比测试集和推荐集的差异
for movie, w in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
# 计算用户对应的电影出现次数log值的sum加和
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity)
if __name__ == '__main__':
ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
# 创建ItemCF对象
itemcf = ItemBasedCF()
# 将数据按照 7:3的比例拆分成训练集和测试集存储在usercf的trainset和testset中
itemcf.generate_dataset(ratingfile, pivot=0.7)
# 计算用户之间的相似度
itemcf.calc_movie_sim()
# 评估推荐效果
itemcf.evaluate()