mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-13 07:15:26 +08:00
完全推荐系统的python代码
This commit is contained in:
@@ -1,13 +1,20 @@
|
||||
#-*- coding: utf-8 -*-
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
'''
|
||||
Created on 2015-06-22
|
||||
|
||||
@author: Lockvictor
|
||||
Update on 2017-05-16
|
||||
@author: Lockvictor/片刻
|
||||
《推荐系统实践》协同过滤算法源代码
|
||||
参考地址:https://github.com/Lockvictor/MovieLens-RecSys
|
||||
更新地址:https://github.com/apachecn/MachineLearning
|
||||
'''
|
||||
import sys, random, math
|
||||
import sys
|
||||
import math
|
||||
import random
|
||||
from operator import itemgetter
|
||||
|
||||
|
||||
print(__doc__)
|
||||
# 作用:使得随机数据可预测
|
||||
random.seed(0)
|
||||
|
||||
|
||||
@@ -17,9 +24,11 @@ class ItemBasedCF():
|
||||
self.trainset = {}
|
||||
self.testset = {}
|
||||
|
||||
# n_sim_movie: top 20个相似电影, n_rec_movie: top 10个推荐结果
|
||||
self.n_sim_movie = 20
|
||||
self.n_rec_movie = 10
|
||||
|
||||
# movie_sim_mat: 电影之间的相似度, movie_popular: 电影的出现次数, movie_count: 总电影数量
|
||||
self.movie_sim_mat = {}
|
||||
self.movie_popular = {}
|
||||
self.movie_count = 0
|
||||
@@ -27,28 +36,42 @@ class ItemBasedCF():
|
||||
print >> sys.stderr, 'Similar movie number = %d' % self.n_sim_movie
|
||||
print >> sys.stderr, 'Recommended movie number = %d' % self.n_rec_movie
|
||||
|
||||
|
||||
@staticmethod
|
||||
def loadfile(filename):
|
||||
''' load a file, return a generator. '''
|
||||
"""loadfile(加载文件,返回一个生成器)
|
||||
|
||||
Args:
|
||||
filename 文件名
|
||||
Returns:
|
||||
line 行数据,去除首尾的回车/换行符
|
||||
"""
|
||||
fp = open(filename, 'r')
|
||||
for i, line in enumerate(fp):
|
||||
yield line.strip('\r\n')
|
||||
if i % 100000 == 0:
|
||||
if i > 0 and i % 100000 == 0:
|
||||
print >> sys.stderr, 'loading %s(%s)' % (filename, i)
|
||||
fp.close()
|
||||
print >> sys.stderr, 'load %s succ' % filename
|
||||
|
||||
print >> sys.stderr, 'load %s success' % filename
|
||||
|
||||
def generate_dataset(self, filename, pivot=0.7):
|
||||
''' load rating data and split it to training set and test set '''
|
||||
"""generate_dataset(加载文件,将数据集按照 pivot 比例(默认 7:3)进行随机拆分)
|
||||
|
||||
Args:
|
||||
filename 文件名
|
||||
pivot 拆分比例
|
||||
"""
|
||||
trainset_len = 0
|
||||
testset_len = 0
|
||||
|
||||
for line in self.loadfile(filename):
|
||||
# 用户ID,电影ID,评分,时间戳
|
||||
user, movie, rating, _ = line.split('::')
|
||||
# split the data by pivot
|
||||
# 通过pivot和随机函数比较,然后初始化用户和对应的值
|
||||
if (random.random() < pivot):
|
||||
|
||||
# dict.setdefault(key, default=None)
|
||||
# key -- 查找的键值
|
||||
# default -- 键不存在时,设置的默认键值
|
||||
self.trainset.setdefault(user, {})
|
||||
self.trainset[user][movie] = int(rating)
|
||||
trainset_len += 1
|
||||
@@ -57,83 +80,93 @@ class ItemBasedCF():
|
||||
self.testset[user][movie] = int(rating)
|
||||
testset_len += 1
|
||||
|
||||
print >> sys.stderr, 'split training set and test set succ'
|
||||
print >> sys.stderr, '分离训练集和测试集成功'
|
||||
print >> sys.stderr, 'train set = %s' % trainset_len
|
||||
print >> sys.stderr, 'test set = %s' % testset_len
|
||||
|
||||
|
||||
def calc_movie_sim(self):
|
||||
''' calculate movie similarity matrix '''
|
||||
"""calc_movie_sim(计算电影之间的相似度)"""
|
||||
|
||||
print >> sys.stderr, 'counting movies number and popularity...'
|
||||
|
||||
for user, movies in self.trainset.iteritems():
|
||||
for movie in movies:
|
||||
# count item popularity
|
||||
# count item popularity
|
||||
if movie not in self.movie_popular:
|
||||
self.movie_popular[movie] = 0
|
||||
self.movie_popular[movie] += 1
|
||||
|
||||
print >> sys.stderr, 'count movies number and popularity succ'
|
||||
print >> sys.stderr, 'count movies number and popularity success'
|
||||
|
||||
# save the total number of movies
|
||||
self.movie_count = len(self.movie_popular)
|
||||
print >> sys.stderr, 'total movie number = %d' % self.movie_count
|
||||
|
||||
# count co-rated users between items
|
||||
# 统计在相同用户时,不同电影同时出现的次数
|
||||
itemsim_mat = self.movie_sim_mat
|
||||
print >> sys.stderr, 'building co-rated users matrix...'
|
||||
|
||||
for user, movies in self.trainset.iteritems():
|
||||
for m1 in movies:
|
||||
for m2 in movies:
|
||||
if m1 == m2: continue
|
||||
itemsim_mat.setdefault(m1,{})
|
||||
itemsim_mat[m1].setdefault(m2,0)
|
||||
if m1 == m2:
|
||||
continue
|
||||
itemsim_mat.setdefault(m1, {})
|
||||
itemsim_mat[m1].setdefault(m2, 0)
|
||||
itemsim_mat[m1][m2] += 1
|
||||
print >> sys.stderr, 'build co-rated users matrix success'
|
||||
|
||||
print >> sys.stderr, 'build co-rated users matrix succ'
|
||||
|
||||
# calculate similarity matrix
|
||||
# calculate similarity matrix
|
||||
print >> sys.stderr, 'calculating movie similarity matrix...'
|
||||
simfactor_count = 0
|
||||
PRINT_STEP = 2000000
|
||||
|
||||
for m1, related_movies in itemsim_mat.iteritems():
|
||||
for m2, count in related_movies.iteritems():
|
||||
itemsim_mat[m1][m2] = count / math.sqrt(
|
||||
self.movie_popular[m1] * self.movie_popular[m2])
|
||||
# 余弦相似度
|
||||
itemsim_mat[m1][m2] = count / math.sqrt(self.movie_popular[m1] * self.movie_popular[m2])
|
||||
simfactor_count += 1
|
||||
# 打印进度条
|
||||
if simfactor_count % PRINT_STEP == 0:
|
||||
print >> sys.stderr, 'calculating movie similarity factor(%d)' % simfactor_count
|
||||
|
||||
print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) succ'
|
||||
print >> sys.stderr, 'Total similarity factor number = %d' %simfactor_count
|
||||
|
||||
print >> sys.stderr, 'calculate movie similarity matrix(similarity factor) success'
|
||||
print >> sys.stderr, 'Total similarity factor number = %d' % simfactor_count
|
||||
|
||||
# @profile
|
||||
def recommend(self, user):
|
||||
"""recommend(找出top K的电影,对电影进行相似度sum的排序,取出top N的电影数)
|
||||
|
||||
Args:
|
||||
user 用户
|
||||
Returns:
|
||||
rec_movie 电影推荐列表,按照相似度从大到小的排序
|
||||
"""
|
||||
''' Find K similar movies and recommend N movies. '''
|
||||
K = self.n_sim_movie
|
||||
N = self.n_rec_movie
|
||||
rank = {}
|
||||
watched_movies = self.trainset[user]
|
||||
|
||||
# 计算top K 电影的相似度
|
||||
# rating=电影评分, w=电影之间的相似度
|
||||
# 耗时分析:98.2%的时间在 line-154行
|
||||
for movie, rating in watched_movies.iteritems():
|
||||
for related_movie, w in sorted(self.movie_sim_mat[movie].items(),
|
||||
key=itemgetter(1), reverse=True)[:K]:
|
||||
for related_movie, w in sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1), reverse=True)[0:K]:
|
||||
if related_movie in watched_movies:
|
||||
continue
|
||||
rank.setdefault(related_movie, 0)
|
||||
rank[related_movie] += w * rating
|
||||
# return the N best movies
|
||||
return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]
|
||||
|
||||
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
|
||||
|
||||
def evaluate(self):
|
||||
''' return precision, recall, coverage and popularity '''
|
||||
print >> sys.stderr, 'Evaluation start...'
|
||||
|
||||
# 返回top N的推荐结果
|
||||
N = self.n_rec_movie
|
||||
# variables for precision and recall
|
||||
# variables for precision and recall
|
||||
# hit表示命中(测试集和推荐集相同+1),rec_count 每个用户的推荐数, test_count 每个用户对应的测试数据集的电影数
|
||||
hit = 0
|
||||
rec_count = 0
|
||||
test_count = 0
|
||||
@@ -143,14 +176,17 @@ class ItemBasedCF():
|
||||
popular_sum = 0
|
||||
|
||||
for i, user in enumerate(self.trainset):
|
||||
if i % 500 == 0:
|
||||
if i > 0 and i % 500 == 0:
|
||||
print >> sys.stderr, 'recommended for %d users' % i
|
||||
test_movies = self.testset.get(user, {})
|
||||
rec_movies = self.recommend(user)
|
||||
|
||||
# 对比测试集和推荐集的差异
|
||||
for movie, w in rec_movies:
|
||||
if movie in test_movies:
|
||||
hit += 1
|
||||
all_rec_movies.add(movie)
|
||||
# 计算用户对应的电影出现次数log值的sum加和
|
||||
popular_sum += math.log(1 + self.movie_popular[movie])
|
||||
rec_count += N
|
||||
test_count += len(test_movies)
|
||||
@@ -160,13 +196,17 @@ class ItemBasedCF():
|
||||
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
|
||||
popularity = popular_sum / (1.0 * rec_count)
|
||||
|
||||
print >> sys.stderr, 'precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' \
|
||||
% (precision, recall, coverage, popularity)
|
||||
print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
ratingfile = 'input/16.RecommendedSystem/ml-1m/ratings.dat'
|
||||
|
||||
# 创建ItemCF对象
|
||||
itemcf = ItemBasedCF()
|
||||
itemcf.generate_dataset(ratingfile)
|
||||
# 将数据按照 7:3的比例,拆分成:训练集和测试集,存储在itemcf的trainset和testset中
|
||||
itemcf.generate_dataset(ratingfile, pivot=0.7)
|
||||
# 计算电影之间的相似度
|
||||
itemcf.calc_movie_sim()
|
||||
# 评估推荐效果
|
||||
itemcf.evaluate()
|
||||
|
||||
@@ -102,6 +102,7 @@ class UserBasedCF():
|
||||
if movie not in self.movie_popular:
|
||||
self.movie_popular[movie] = 0
|
||||
self.movie_popular[movie] += 1
|
||||
|
||||
print >> sys.stderr, 'build movie-users inverse table success'
|
||||
|
||||
# save the total movie number, which will be used in evaluation
|
||||
@@ -109,7 +110,7 @@ class UserBasedCF():
|
||||
print >> sys.stderr, 'total movie number = %d' % self.movie_count
|
||||
|
||||
usersim_mat = self.user_sim_mat
|
||||
# 统计在相同电影时,用户同时出现的次数
|
||||
# 统计在相同电影时,不同用户同时出现的次数
|
||||
print >> sys.stderr, 'building user co-rated movies matrix...'
|
||||
|
||||
for movie, users in movie2users.iteritems():
|
||||
@@ -138,8 +139,9 @@ class UserBasedCF():
|
||||
print >> sys.stderr, 'calculate user similarity matrix(similarity factor) success'
|
||||
print >> sys.stderr, 'Total similarity factor number = %d' % simfactor_count
|
||||
|
||||
# @profile
|
||||
def recommend(self, user):
|
||||
"""recommend(推荐top K的用户,所看过的电影,对电影进行相似度sum的排序,取出top N的电影数)
|
||||
"""recommend(找出top K的用户,所看过的电影,对电影进行相似度sum的排序,取出top N的电影数)
|
||||
|
||||
Args:
|
||||
user 用户
|
||||
@@ -152,8 +154,9 @@ class UserBasedCF():
|
||||
rank = dict()
|
||||
watched_movies = self.trainset[user]
|
||||
|
||||
# 找出top 10的用户和相似度
|
||||
# v=similar user, wuv=similarity factor
|
||||
# 计算top K 用户的相似度
|
||||
# v=similar user, wuv=用户之间的相似度(similarity factor)
|
||||
# 耗时分析:50.4%的时间在 line-160行
|
||||
for v, wuv in sorted(self.user_sim_mat[user].items(), key=itemgetter(1), reverse=True)[0:K]:
|
||||
for movie in self.trainset[v]:
|
||||
if movie in watched_movies:
|
||||
@@ -168,7 +171,7 @@ class UserBasedCF():
|
||||
''' return precision, recall, coverage and popularity '''
|
||||
print >> sys.stderr, 'Evaluation start...'
|
||||
|
||||
# 返回top 10的推荐结果
|
||||
# 返回top N的推荐结果
|
||||
N = self.n_rec_movie
|
||||
# variables for precision and recall
|
||||
# hit表示命中(测试集和推荐集相同+1),rec_count 每个用户的推荐数, test_count 每个用户对应的测试数据集的电影数
|
||||
@@ -209,7 +212,7 @@ if __name__ == '__main__':
|
||||
|
||||
# 创建UserCF对象
|
||||
usercf = UserBasedCF()
|
||||
# 将数据按照 7:3的比例,拆分成:训练集和测试集,存储在usercf的trainset河testset中
|
||||
# 将数据按照 7:3的比例,拆分成:训练集和测试集,存储在usercf的trainset和testset中
|
||||
usercf.generate_dataset(ratingfile, pivot=0.7)
|
||||
# 计算用户之间的相似度
|
||||
usercf.calc_user_sim()
|
||||
|
||||
Reference in New Issue
Block a user