mirror of
https://github.com/apachecn/ailearning.git
synced 2026-04-24 18:42:25 +08:00
239 lines
9.1 KiB
Python
239 lines
9.1 KiB
Python
#!/usr/bin/python
|
||
# coding:utf8
|
||
'''
|
||
Created on 2015-06-22
|
||
Update on 2017-05-16
|
||
Author: Lockvictor/片刻
|
||
《推荐系统实践》协同过滤算法源代码
|
||
参考地址: https://github.com/Lockvictor/MovieLens-RecSys
|
||
更新地址: https://github.com/apachecn/AiLearning
|
||
'''
|
||
from __future__ import print_function
|
||
import sys
|
||
import math
|
||
import random
|
||
from operator import itemgetter
|
||
print(__doc__)
|
||
# 作用: 使得随机数据可预测
|
||
random.seed(0)
|
||
|
||
|
||
class UserBasedCF():
|
||
''' TopN recommendation - UserBasedCF '''
|
||
|
||
def __init__(self):
|
||
self.trainset = {}
|
||
self.testset = {}
|
||
|
||
# n_sim_user: top 20个用户, n_rec_movie: top 10个推荐结果
|
||
self.n_sim_user = 20
|
||
self.n_rec_movie = 10
|
||
|
||
# user_sim_mat: 用户之间的相似度, movie_popular: 电影的出现次数, movie_count: 总电影数量
|
||
self.user_sim_mat = {}
|
||
self.movie_popular = {}
|
||
self.movie_count = 0
|
||
|
||
print('similar user number = %d' % self.n_sim_user, file=sys.stderr)
|
||
print('recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)
|
||
|
||
@staticmethod
|
||
def loadfile(filename):
|
||
"""loadfile(加载文件,返回一个生成器)
|
||
|
||
Args:
|
||
filename 文件名
|
||
Returns:
|
||
line 行数据,去空格
|
||
"""
|
||
fp = open(filename, 'r')
|
||
for i, line in enumerate(fp):
|
||
yield line.strip('\r\n')
|
||
if i > 0 and i % 100000 == 0:
|
||
print('loading %s(%s)' % (filename, i), file=sys.stderr)
|
||
fp.close()
|
||
print('load %s success' % filename, file=sys.stderr)
|
||
|
||
def generate_dataset(self, filename, pivot=0.7):
|
||
"""loadfile(加载文件,将数据集按照7:3 进行随机拆分)
|
||
|
||
Args:
|
||
filename 文件名
|
||
pivot 拆分比例
|
||
"""
|
||
trainset_len = 0
|
||
testset_len = 0
|
||
|
||
for line in self.loadfile(filename):
|
||
# 用户ID,电影名称,评分,时间戳timestamp
|
||
# user, movie, rating, timestamp = line.split('::')
|
||
user, movie, rating, _ = line.split('\t')
|
||
# 通过pivot和随机函数比较,然后初始化用户和对应的值
|
||
if (random.random() < pivot):
|
||
|
||
# dict.setdefault(key, default=None)
|
||
# key -- 查找的键值
|
||
# default -- 键不存在时,设置的默认键值
|
||
self.trainset.setdefault(user, {})
|
||
self.trainset[user][movie] = int(rating)
|
||
trainset_len += 1
|
||
else:
|
||
self.testset.setdefault(user, {})
|
||
self.testset[user][movie] = int(rating)
|
||
testset_len += 1
|
||
|
||
print('分离训练集和测试集成功', file=sys.stderr)
|
||
print('train set = %s' % trainset_len, file=sys.stderr)
|
||
print('test set = %s' % testset_len, file=sys.stderr)
|
||
|
||
def calc_user_sim(self):
|
||
"""calc_user_sim(计算用户之间的相似度)"""
|
||
|
||
# build inverse table for item-users
|
||
# key=movieID, value=list of userIDs who have seen this movie
|
||
print('building movie-users inverse table...', file=sys.stderr)
|
||
movie2users = dict()
|
||
|
||
# 同一个电影中,收集用户的集合
|
||
# 统计在所有的用户中,不同电影的总出现次数
|
||
for user, movies in self.trainset.items():
|
||
for movie in movies:
|
||
# inverse table for item-users
|
||
if movie not in movie2users:
|
||
movie2users[movie] = set()
|
||
movie2users[movie].add(user)
|
||
# count item popularity at the same time
|
||
if movie not in self.movie_popular:
|
||
self.movie_popular[movie] = 0
|
||
self.movie_popular[movie] += 1
|
||
|
||
print('build movie-users inverse table success', file=sys.stderr)
|
||
|
||
# save the total movie number, which will be used in evaluation
|
||
self.movie_count = len(movie2users)
|
||
print('total movie number = %d' % self.movie_count, file=sys.stderr)
|
||
|
||
usersim_mat = self.user_sim_mat
|
||
# 统计在相同电影时,不同用户同时出现的次数
|
||
print('building user co-rated movies matrix...', file=sys.stderr)
|
||
|
||
for movie, users in movie2users.items():
|
||
for u in users:
|
||
for v in users:
|
||
if u == v:
|
||
continue
|
||
usersim_mat.setdefault(u, {})
|
||
usersim_mat[u].setdefault(v, 0)
|
||
usersim_mat[u][v] += 1
|
||
print('build user co-rated movies matrix success', file=sys.stderr)
|
||
|
||
# calculate similarity matrix
|
||
print('calculating user similarity matrix...', file=sys.stderr)
|
||
simfactor_count = 0
|
||
PRINT_STEP = 2000000
|
||
for u, related_users in usersim_mat.items():
|
||
for v, count in related_users.iteritems():
|
||
# 余弦相似度
|
||
usersim_mat[u][v] = count / math.sqrt(
|
||
len(self.trainset[u]) * len(self.trainset[v]))
|
||
simfactor_count += 1
|
||
# 打印进度条
|
||
if simfactor_count % PRINT_STEP == 0:
|
||
print('calculating user similarity factor(%d)' % simfactor_count, file=sys.stderr)
|
||
|
||
print('calculate user similarity matrix(similarity factor) success', file=sys.stderr)
|
||
print('Total similarity factor number = %d' % simfactor_count, file=sys.stderr)
|
||
|
||
# @profile
|
||
def recommend(self, user):
|
||
"""recommend(找出top K的用户,所看过的电影,对电影进行相似度sum的排序,取出top N的电影数)
|
||
|
||
Args:
|
||
user 用户
|
||
Returns:
|
||
rec_movie 电影推荐列表,按照相似度从大到小的排序
|
||
"""
|
||
''' Find K similar users and recommend N movies. '''
|
||
K = self.n_sim_user
|
||
N = self.n_rec_movie
|
||
rank = dict()
|
||
watched_movies = self.trainset[user]
|
||
|
||
# 计算top K 用户的相似度
|
||
# v=similar user, wuv=不同用户同时出现的次数,根据wuv倒序从大到小选出K个用户进行排列
|
||
# 耗时分析: 50.4%的时间在 line-160行
|
||
for v, wuv in sorted(
|
||
self.user_sim_mat[user].items(), key=itemgetter(1),
|
||
reverse=True)[0:K]:
|
||
for movie, rating in self.trainset[v].iteritems():
|
||
if movie in watched_movies:
|
||
continue
|
||
# predict the user's "interest" for each movie
|
||
rank.setdefault(movie, 0)
|
||
rank[movie] += wuv * rating
|
||
# return the N best movies
|
||
"""
|
||
wuv
|
||
precision=0.3766 recall=0.0759 coverage=0.3183 popularity=6.9194
|
||
|
||
wuv * rating
|
||
precision=0.3865 recall=0.0779 coverage=0.2681 popularity=7.0116
|
||
"""
|
||
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
|
||
|
||
def evaluate(self):
|
||
''' return precision, recall, coverage and popularity '''
|
||
print('Evaluation start...', file=sys.stderr)
|
||
|
||
# 返回top N的推荐结果
|
||
N = self.n_rec_movie
|
||
# varables for precision and recall
|
||
# hit表示命中(测试集和推荐集相同+1),rec_count 每个用户的推荐数, test_count 每个用户对应的测试数据集的电影数
|
||
hit = 0
|
||
rec_count = 0
|
||
test_count = 0
|
||
# varables for coverage
|
||
all_rec_movies = set()
|
||
# varables for popularity
|
||
popular_sum = 0
|
||
|
||
# enumerate将其组成一个索引序列,利用它可以同时获得索引和值
|
||
# 参考地址: http://blog.csdn.net/churximi/article/details/51648388
|
||
for i, user in enumerate(self.trainset):
|
||
if i > 0 and i % 500 == 0:
|
||
print('recommended for %d users' % i, file=sys.stderr)
|
||
test_movies = self.testset.get(user, {})
|
||
rec_movies = self.recommend(user)
|
||
|
||
# 对比测试集和推荐集的差异 movie, w
|
||
for movie, _ in rec_movies:
|
||
if movie in test_movies:
|
||
hit += 1
|
||
all_rec_movies.add(movie)
|
||
# 计算用户对应的电影出现次数log值的sum加和
|
||
popular_sum += math.log(1 + self.movie_popular[movie])
|
||
rec_count += N
|
||
test_count += len(test_movies)
|
||
|
||
precision = hit / (1.0 * rec_count)
|
||
recall = hit / (1.0 * test_count)
|
||
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
|
||
popularity = popular_sum / (1.0 * rec_count)
|
||
|
||
print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
|
||
precision, recall, coverage, popularity), file=sys.stderr)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
# ratingfile = 'data/16.RecommenderSystems/ml-1m/ratings.dat'
|
||
ratingfile = 'data/16.RecommenderSystems/ml-100k/u.data'
|
||
|
||
# 创建UserCF对象
|
||
usercf = UserBasedCF()
|
||
# 将数据按照 7:3的比例,拆分成: 训练集和测试集,存储在usercf的trainset和testset中
|
||
usercf.generate_dataset(ratingfile, pivot=0.7)
|
||
# 计算用户之间的相似度
|
||
usercf.calc_user_sim()
|
||
# 评估推荐效果
|
||
usercf.evaluate()
|