Files
ailearning/src/py3.x/ml/16.RecommenderSystems/RS-usercf.py
2020-05-18 22:12:42 +08:00

239 lines
9.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/python
# coding:utf8
'''
Created on 2015-06-22
Update on 2017-05-16
Author: Lockvictor/片刻
《推荐系统实践》协同过滤算法源代码
参考地址: https://github.com/Lockvictor/MovieLens-RecSys
更新地址: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
import sys
import math
import random
from operator import itemgetter
print(__doc__)
# 作用: 使得随机数据可预测
random.seed(0)
class UserBasedCF():
''' TopN recommendation - UserBasedCF '''
def __init__(self):
self.trainset = {}
self.testset = {}
# n_sim_user: top 20个用户 n_rec_movie: top 10个推荐结果
self.n_sim_user = 20
self.n_rec_movie = 10
# user_sim_mat: 用户之间的相似度, movie_popular: 电影的出现次数, movie_count: 总电影数量
self.user_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print('similar user number = %d' % self.n_sim_user, file=sys.stderr)
print('recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)
@staticmethod
def loadfile(filename):
"""loadfile(加载文件,返回一个生成器)
Args:
filename 文件名
Returns:
line 行数据,去空格
"""
fp = open(filename, 'r')
for i, line in enumerate(fp):
yield line.strip('\r\n')
if i > 0 and i % 100000 == 0:
print('loading %s(%s)' % (filename, i), file=sys.stderr)
fp.close()
print('load %s success' % filename, file=sys.stderr)
def generate_dataset(self, filename, pivot=0.7):
"""loadfile(加载文件将数据集按照7:3 进行随机拆分)
Args:
filename 文件名
pivot 拆分比例
"""
trainset_len = 0
testset_len = 0
for line in self.loadfile(filename):
# 用户ID电影名称评分时间戳timestamp
# user, movie, rating, timestamp = line.split('::')
user, movie, rating, _ = line.split('\t')
# 通过pivot和随机函数比较然后初始化用户和对应的值
if (random.random() < pivot):
# dict.setdefault(key, default=None)
# key -- 查找的键值
# default -- 键不存在时,设置的默认键值
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
print('分离训练集和测试集成功', file=sys.stderr)
print('train set = %s' % trainset_len, file=sys.stderr)
print('test set = %s' % testset_len, file=sys.stderr)
def calc_user_sim(self):
"""calc_user_sim(计算用户之间的相似度)"""
# build inverse table for item-users
# key=movieID, value=list of userIDs who have seen this movie
print('building movie-users inverse table...', file=sys.stderr)
movie2users = dict()
# 同一个电影中,收集用户的集合
# 统计在所有的用户中,不同电影的总出现次数
for user, movies in self.trainset.items():
for movie in movies:
# inverse table for item-users
if movie not in movie2users:
movie2users[movie] = set()
movie2users[movie].add(user)
# count item popularity at the same time
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print('build movie-users inverse table success', file=sys.stderr)
# save the total movie number, which will be used in evaluation
self.movie_count = len(movie2users)
print('total movie number = %d' % self.movie_count, file=sys.stderr)
usersim_mat = self.user_sim_mat
# 统计在相同电影时,不同用户同时出现的次数
print('building user co-rated movies matrix...', file=sys.stderr)
for movie, users in movie2users.items():
for u in users:
for v in users:
if u == v:
continue
usersim_mat.setdefault(u, {})
usersim_mat[u].setdefault(v, 0)
usersim_mat[u][v] += 1
print('build user co-rated movies matrix success', file=sys.stderr)
# calculate similarity matrix
print('calculating user similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for u, related_users in usersim_mat.items():
for v, count in related_users.iteritems():
# 余弦相似度
usersim_mat[u][v] = count / math.sqrt(
len(self.trainset[u]) * len(self.trainset[v]))
simfactor_count += 1
# 打印进度条
if simfactor_count % PRINT_STEP == 0:
print('calculating user similarity factor(%d)' % simfactor_count, file=sys.stderr)
print('calculate user similarity matrix(similarity factor) success', file=sys.stderr)
print('Total similarity factor number = %d' % simfactor_count, file=sys.stderr)
# @profile
def recommend(self, user):
"""recommend(找出top K的用户所看过的电影对电影进行相似度sum的排序取出top N的电影数)
Args:
user 用户
Returns:
rec_movie 电影推荐列表,按照相似度从大到小的排序
"""
''' Find K similar users and recommend N movies. '''
K = self.n_sim_user
N = self.n_rec_movie
rank = dict()
watched_movies = self.trainset[user]
# 计算top K 用户的相似度
# v=similar user, wuv=不同用户同时出现的次数根据wuv倒序从大到小选出K个用户进行排列
# 耗时分析: 50.4%的时间在 line-160行
for v, wuv in sorted(
self.user_sim_mat[user].items(), key=itemgetter(1),
reverse=True)[0:K]:
for movie, rating in self.trainset[v].iteritems():
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
rank.setdefault(movie, 0)
rank[movie] += wuv * rating
# return the N best movies
"""
wuv
precision=0.3766 recall=0.0759 coverage=0.3183 popularity=6.9194
wuv * rating
precision=0.3865 recall=0.0779 coverage=0.2681 popularity=7.0116
"""
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
''' return precision, recall, coverage and popularity '''
print('Evaluation start...', file=sys.stderr)
# 返回top N的推荐结果
N = self.n_rec_movie
# varables for precision and recall
# hit表示命中(测试集和推荐集相同+1)rec_count 每个用户的推荐数, test_count 每个用户对应的测试数据集的电影数
hit = 0
rec_count = 0
test_count = 0
# varables for coverage
all_rec_movies = set()
# varables for popularity
popular_sum = 0
# enumerate将其组成一个索引序列利用它可以同时获得索引和值
# 参考地址: http://blog.csdn.net/churximi/article/details/51648388
for i, user in enumerate(self.trainset):
if i > 0 and i % 500 == 0:
print('recommended for %d users' % i, file=sys.stderr)
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
# 对比测试集和推荐集的差异 movie, w
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
# 计算用户对应的电影出现次数log值的sum加和
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
# ratingfile = 'data/16.RecommenderSystems/ml-1m/ratings.dat'
ratingfile = 'data/16.RecommenderSystems/ml-100k/u.data'
# 创建UserCF对象
usercf = UserBasedCF()
# 将数据按照 7:3的比例拆分成: 训练集和测试集存储在usercf的trainset和testset中
usercf.generate_dataset(ratingfile, pivot=0.7)
# 计算用户之间的相似度
usercf.calc_user_sim()
# 评估推荐效果
usercf.evaluate()