Optimization complete - recommender system

jiangzhonglian
2017-07-08 22:49:58 +08:00
parent 75fbb3866c
commit 28b26de4bb
7 changed files with 309 additions and 103 deletions

View File

@@ -65,7 +65,8 @@ class ItemBasedCF():
for line in self.loadfile(filename):
# user ID, movie title, rating, timestamp
user, movie, rating, _ = line.split('::')
# user, movie, rating, _ = line.split('::')
user, movie, rating, _ = line.split('\t')
# Compare a random number against pivot to split the data, then initialize the user's entries
if (random.random() < pivot):
@@ -203,7 +204,8 @@ class ItemBasedCF():
if __name__ == '__main__':
ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
# ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
ratingfile = 'input/16.RecommenderSystems/ml-100k/u.data'
# Create the ItemCF object
itemcf = ItemBasedCF()
@@ -212,4 +214,8 @@ if __name__ == '__main__':
# Compute similarity between movies
itemcf.calc_movie_sim()
# Evaluate recommendation performance
itemcf.evaluate()
# itemcf.evaluate()
# Inspect recommendation results for a given user
user = "2"
print "Recommendation result", itemcf.recommend(user)
print "---", itemcf.testset.get(user, {})

View File

@@ -1,7 +1,9 @@
#!/usr/bin/python
# coding:utf8
from math import sqrt
import sys
import math
from operator import itemgetter
import numpy as np
import pandas as pd
@@ -36,9 +38,23 @@ def calc_similarity(n_users, n_items, train_data, test_data):
# Use sklearn's pairwise_distances function to compute cosine similarity.
print "1:", np.shape(train_data_matrix) # rows: users, columns: movies
print "2:", np.shape(train_data_matrix.T) # rows: movies, columns: users
user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
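# Note: with metric="cosine", sklearn's pairwise_distances returns cosine *distance*
# (1 - cosine similarity), so these matrices hold distances despite their names;
# a similarity matrix in the usual sense would be 1 - pairwise_distances(...).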
return train_data_matrix, test_data_matrix, user_similarity, item_similarity
print >> sys.stderr, 'Counting popular items...'
item_popular = {}
# Count, across all users, how many times each movie appears
for i_index in range(n_items):
if np.sum(train_data_matrix[:, i_index]) != 0:
item_popular[i_index] = np.sum(train_data_matrix[:, i_index]!=0)
# print "pop=", i_index, self.item_popular[i_index]
# save the total number of items
item_count = len(item_popular)
print >> sys.stderr, 'Total number of popular items = %d' % item_count
return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
def predict(rating, similarity, type='user'):
@@ -60,7 +76,7 @@ def predict(rating, similarity, type='user'):
# mean rating + user-user distance (943, 943) * user-movie rating diff (943, 1682) = user-movie matrix (each user's aggregate score per movie) (943, 1682), then divided by each user's total distance to all other users = aggregate user-movie score
pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
elif type == 'item':
# Aggregate score: user-movie ratings (943, 1682) * movie-movie distance 1682, 1682) = user-movie matrix (each movie's aggregate contribution per target movie) (943, 1682), then divided by the movie's total distance to all other movies = aggregate user-movie score
# Aggregate score: user-movie ratings (943, 1682) * movie-movie distance (1682, 1682) = user-movie matrix (each movie's aggregate contribution per target movie) (943, 1682), then divided by the movie's total distance to all other movies = aggregate user-movie score
pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
return pred
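In formula form, the two branches above compute (with $s$ the matrix passed in as similarity, $\bar r_u$ user $u$'s mean rating, and $r_{u,j}$ the known ratings):

$$\hat r_{u,i} = \bar r_u + \frac{\sum_v s_{uv}\,(r_{v,i} - \bar r_v)}{\sum_v |s_{uv}|} \;\text{(user-based)}, \qquad \hat r_{u,i} = \frac{\sum_j r_{u,j}\, s_{j,i}}{\sum_j |s_{i,j}|} \;\text{(item-based)}$$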
@@ -68,11 +84,51 @@ def predict(rating, similarity, type='user'):
def rmse(prediction, ground_truth):
prediction = prediction[ground_truth.nonzero()].flatten()
ground_truth = ground_truth[ground_truth.nonzero()].flatten()
return sqrt(mean_squared_error(prediction, ground_truth))
return math.sqrt(mean_squared_error(prediction, ground_truth))
def evaluate(prediction, item_popular, name):
hit = 0
rec_count = 0
test_count = 0
popular_sum = 0
all_rec_items = set()
for u_index in range(n_users):
items = np.where(train_data_matrix[u_index, :] == 0)[0]
pre_items = sorted(dict(zip(items, prediction[u_index, items])).items(), key=itemgetter(1), reverse=True)[: 20]
test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
# Compare the test set against the recommended set
for item, w in pre_items:
if item in test_items:
hit += 1
all_rec_items.add(item)
# Accumulate log(1 + occurrence count) over the user's recommended movies
if item in item_popular:
popular_sum += math.log(1 + item_popular[item])
rec_count += len(pre_items)
test_count += len(test_items)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_items) / (1.0 * len(item_popular))
popularity = popular_sum / (1.0 * rec_count)
print >> sys.stderr, '%s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (name, precision, recall, coverage, popularity)
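The four numbers printed above are the standard top-N evaluation metrics. With $R(u)$ the 20 highest-scored unseen items recommended to user $u$, $T(u)$ the user's test items, and $\mathrm{pop}(i)$ item $i$'s occurrence count from item_popular, they are

$$\mathrm{precision} = \frac{\mathrm{hit}}{\sum_u |R(u)|}, \quad \mathrm{recall} = \frac{\mathrm{hit}}{\sum_u |T(u)|}, \quad \mathrm{coverage} = \frac{|\bigcup_u R(u)|}{|I_{\mathrm{popular}}|}, \quad \mathrm{popularity} = \frac{\sum_u \sum_{i \in R(u)} \log(1 + \mathrm{pop}(i))}{\sum_u |R(u)|}$$

where hit counts recommended items that also appear in the test set.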
def recommend(u_index, prediction):
items = np.where(train_data_matrix[u_index, :] == 0)[0]
pre_items = sorted(dict(zip(items, prediction[u_index, items])).items(), key=itemgetter(1), reverse=True)[: 10]
test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
print 'Ground truth:', test_items
print 'Recommended:', [key for key, value in pre_items]
if __name__ == "__main__":
# Model-based collaborative filtering
# Memory-based collaborative filtering
# ...
# Split the dataset
# http://files.grouplens.org/datasets/movielens/ml-100k.zip
@@ -80,22 +136,37 @@ if __name__ == "__main__":
df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
# Compute similarity
train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data)
train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(n_users, n_items, train_data, test_data)
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')
# Evaluation: root mean squared error (RMSE)
print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
# Model-based collaborative filtering
# ...
# Compute the sparsity of the MovieLens dataset
# Compute the sparsity of the MovieLens dataset (n_users and n_items are constants, so less user-behavior data means less information; the sparser the matrix, the more room there is for improvement)
sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%'
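As a quick check on this computation: ml-100k contains 100,000 ratings over 943 users and 1682 movies, so sparsity = 1 - 100000/(943 * 1682) ≈ 1 - 0.063 = 0.937, which rounds to the printed 93.7%.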
u, s, vt = svds(train_data_matrix, k=20)
# Compute the k largest singular values/vectors of the sparse matrix
u, s, vt = svds(train_data_matrix, k=15)
s_diag_matrix = np.diag(s)
x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix))
svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
print 'Model based CF RMSE: ' + str(rmse(svd_prediction, test_data_matrix))
"""
在信息量相同的情况下矩阵越小那么携带的信息越可靠
所以user-cf 推荐效果高于 item-cf 而svd分解后发现15个维度效果就能达到90%以上所以信息更可靠效果也更好
item-cf: 1682
user-cf: 943
svd: 15
"""
evaluate(item_prediction, item_popular, 'item')
evaluate(user_prediction, item_popular, 'user')
evaluate(svd_prediction, item_popular, 'svd')
# Recommendation results
recommend(1, svd_prediction)

View File

@@ -65,7 +65,8 @@ class UserBasedCF():
for line in self.loadfile(filename):
# user ID, movie title, rating, timestamp
user, movie, rating, timestamp = line.split('::')
# user, movie, rating, timestamp = line.split('::')
user, movie, rating, timestamp = line.split('\t')
# Compare a random number against pivot to split the data, then initialize the user's entries
if (random.random() < pivot):
@@ -220,7 +221,8 @@ class UserBasedCF():
if __name__ == '__main__':
ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
# ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
ratingfile = 'input/16.RecommenderSystems/ml-100k/u.data'
# Create the UserCF object
usercf = UserBasedCF()

View File

@@ -0,0 +1,28 @@
import numpy as np
# Custom Jaccard similarity coefficient (only valid for 0-1 matrices)
def Jaccard(a, b):
return 1.0*(a*b).sum()/(a+b-a*b).sum()
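For 0-1 vectors a and b this is exactly the Jaccard index of the corresponding sets: the element-wise product counts the intersection and a + b - a*b counts the union, so

$$J(a, b) = \frac{|A \cap B|}{|A \cup B|} = \frac{\sum_k a_k b_k}{\sum_k (a_k + b_k - a_k b_k)}$$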
class Recommender():
# similarity matrix
sim = None
# compute the similarity matrix
def similarity(self, x, distance):
y = np.ones((len(x), len(x)))
for i in range(len(x)):
for j in range(len(x)):
y[i, j] = distance(x[i], x[j])
return y
# training function
def fit(self, x, distance=Jaccard):
self.sim = self.similarity(x, distance)
# recommendation function
def recommend(self, a):
return np.dot(self.sim, a)*(1-a)
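A small usage sketch for the class above, with a made-up 0-1 consumption matrix (rows are items, columns are users, so fit builds an item-item similarity matrix; the data is invented for illustration):

import numpy as np

# rows = items, columns = users; 1 means the user consumed the item
x = np.array([[1, 0, 1, 0],
              [0, 1, 0, 1],
              [1, 1, 0, 0]])
r = Recommender()
r.fit(x)
# score the items for a user who has consumed only item 0;
# the (1 - a) factor in recommend masks out items the user already has
a = np.array([1, 0, 0])
print r.recommend(a)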

View File

@@ -0,0 +1,185 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2015-06-22
Updated on 2017-05-16
@author: Lockvictor/片刻
Collaborative filtering source code for《推荐系统实践》(Recommender Systems in Practice)
Reference: https://github.com/Lockvictor/MovieLens-RecSys
Updated version: https://github.com/apachecn/MachineLearning
'''
import math
import random
import sys
from operator import itemgetter
import numpy as np
import pandas as pd
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
print(__doc__)
# Make the random numbers reproducible
random.seed(0)
class ItemBasedCF():
''' TopN recommendation - ItemBasedCF '''
def __init__(self):
# Split dataset: train and test matrices
self.train_mat = {}
self.test_mat = {}
# total number of users
self.n_users = 0
self.n_items = 0
# n_sim_item: top 20 similar items; n_rec_item: top 10 recommendations
self.n_sim_item = 20
self.n_rec_item = 10
# item_mat_similarity: movie-movie similarity; item_popular: per-movie occurrence count; item_count: total number of movies
self.item_mat_similarity = {}
self.item_popular = {}
self.item_count = 0
print >> sys.stderr, 'Similar item number = %d' % self.n_sim_item
print >> sys.stderr, 'Recommended item number = %d' % self.n_rec_item
def splitData(self, dataFile, test_size):
# Load the dataset
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(dataFile, sep='\t', names=header)
self.n_users = df.user_id.unique().shape[0]
self.n_items = df.item_id.unique().shape[0]
print 'Number of users = ' + str(self.n_users) + ' | Number of items = ' + str(self.n_items)
# Split the dataset: users + movies
self.train_data, self.test_data = cv.train_test_split(df, test_size=test_size)
print >> sys.stderr, 'Train/test split succeeded'
print >> sys.stderr, 'len(train) = %s' % np.shape(self.train_data)[0]
print >> sys.stderr, 'len(test) = %s' % np.shape(self.test_data)[0]
def calc_similarity(self):
# Build the user-item matrices: one for training data, one for test data
self.train_mat = np.zeros((self.n_users, self.n_items))
for line in self.train_data.itertuples():
self.train_mat[int(line.user_id)-1, int(line.item_id)-1] = float(line.rating)
self.test_mat = np.zeros((self.n_users, self.n_items))
for line in self.test_data.itertuples():
# print "line", line.user_id-1, line.item_id-1, line.rating
self.test_mat[int(line.user_id)-1, int(line.item_id)-1] = float(line.rating)
# Use sklearn's pairwise_distances function to compute cosine similarity.
print "1:", np.shape(np.mat(self.train_mat).T) # rows: movies, columns: users
# movie-movie distance (1682, 1682)
self.item_mat_similarity = pairwise_distances(np.mat(self.train_mat).T, metric='cosine')
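# Note: pairwise_distances with metric='cosine' returns cosine *distance*
# (1 - cosine similarity), so despite its name item_mat_similarity holds
# distances; a true similarity matrix would be 1 - pairwise_distances(...).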
print >> sys.stderr, 'item_mat_similarity=', np.shape(self.item_mat_similarity)
print >> sys.stderr, 'Counting popular items...'
# Count, across all users, how many times each movie appears
for i_index in range(self.n_items):
if np.sum(self.train_mat[:, i_index]) != 0:
self.item_popular[i_index] = np.sum(self.train_mat[:, i_index]!=0)
# print "pop=", i_index, self.item_popular[i_index]
# save the total number of items
self.item_count = len(self.item_popular)
print >> sys.stderr, 'Total number of popular items = %d' % self.item_count
# @profile
def recommend(self, u_index):
"""recommend(找出top K的电影对电影进行相似度sum的排序取出top N的电影数)
Args:
u_index 用户_ID-1=用户index
Returns:
rec_item 电影推荐列表,按照相似度从大到小的排序
"""
''' Find K similar items and recommend N items. '''
K = self.n_sim_item
N = self.n_rec_item
rank = {}
i_items = np.where(self.train_mat[u_index, :] != 0)[0]
# print "i_items=", i_items
watched_items = dict(zip(i_items, self.train_mat[u_index, i_items]))
# Compute similarity over the top-K movies
# rating = the user's rating of the movie, w = similarity weight between movies
# Profiling: 98.2% of the time is spent on line 154
for i_item, rating in watched_items.iteritems():
i_other_items = np.where(self.item_mat_similarity[i_item, :] != 0)[0]
for related_item, w in sorted(dict(zip(i_other_items, self.item_mat_similarity[i_item, i_other_items])).items(), key=itemgetter(1), reverse=True)[0:K]:
if related_item in watched_items:
continue
rank.setdefault(related_item, 0)
rank[related_item] += w * rating
# return the N best items
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
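The score accumulated in rank above is the standard ItemCF formula from the referenced book: for a candidate movie i the user has not yet watched,

$$\mathrm{rank}(u, i) = \sum_{j \in N(u)} w_{ji}\, r_{uj}$$

where N(u) is the set of movies user u has rated, r_{uj} the user's rating of movie j, and w_{ji} the weight between movies j and i, with each j contributing only to its K most similar movies.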
def evaluate(self):
''' return precision, recall, coverage and popularity '''
print >> sys.stderr, 'Evaluation start...'
# variables for precision and recall
# hit: number of hits (+1 when a recommended movie also appears in the test set); rec_count: number of recommendations per user; test_count: number of test-set movies per user
hit = 0
rec_count = 0
test_count = 0
# variables for coverage
all_rec_items = set()
# variables for popularity
popular_sum = 0
# enumerate builds an indexed sequence, yielding index and value together
# Reference: http://blog.csdn.net/churximi/article/details/51648388
for u_index in range(50):
if u_index > 0 and u_index % 10 == 0:
print >> sys.stderr, 'recommended for %d users' % u_index
print "u_index", u_index
# Compare the test set against the recommended set
rec_items = self.recommend(u_index)
print "rec_items=", rec_items
for item, w in rec_items:
# print 'test_mat[u_index, item]=', item, self.test_mat[u_index, item]
if self.test_mat[u_index, item] != 0:
hit += 1
print "self.test_mat[%d, %d]=%s" % (u_index, item, self.test_mat[u_index, item])
# Accumulate log(1 + occurrence count) over the user's recommended movies
if item in self.item_popular:
popular_sum += math.log(1 + self.item_popular[item])
rec_count += len(rec_items)
test_count += np.sum(self.test_mat[u_index, :] != 0)
# print "test_count=", np.sum(self.test_mat[u_index, :] != 0), np.sum(self.train_mat[u_index, :] != 0)
print("-------", hit, rec_count)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_items) / (1.0 * self.item_count)
popularity = popular_sum / (1.0 * rec_count)
print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity)
if __name__ == '__main__':
dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
# Create the ItemCF object
itemcf = ItemBasedCF()
# Split the data 7:3 into a training set and a test set
itemcf.splitData(dataFile, test_size=0.3)
# Compute similarity between movies
itemcf.calc_similarity()
# Evaluate recommendation performance
# itemcf.evaluate()
# Inspect recommendation results for a given user
print "Recommendation result", itemcf.recommend(u_index=1)
print "---", np.where(itemcf.test_mat[1, :] != 0)[0]

View File

@@ -1,85 +0,0 @@
#!/usr/bin/python
# coding:utf8
from math import sqrt
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn import cross_validation as cv
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
def splitData(dataFile, test_size):
# Load the dataset
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(dataFile, sep='\t', names=header)
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
train_data, test_data = cv.train_test_split(df, test_size=test_size)
return df, n_users, n_items, train_data, test_data
def calc_similarity(n_users, n_items, train_data, test_data):
# Build the user-item matrices: one for training data, one for test data
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
train_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
test_data_matrix[line[1]-1, line[2]-1] = line[3]
# Use sklearn's pairwise_distances function to compute cosine similarity.
user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
return train_data_matrix, test_data_matrix, user_similarity, item_similarity
def predict(rating, similarity, type='user'):
if type == 'user':
mean_user_rating = rating.mean(axis=1)
rating_diff = (rating - mean_user_rating[:, np.newaxis])
pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
elif type == 'item':
pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
return pred
def rmse(prediction, ground_truth):
prediction = prediction[ground_truth.nonzero()].flatten()
ground_truth = ground_truth[ground_truth.nonzero()].flatten()
return sqrt(mean_squared_error(prediction, ground_truth))
if __name__ == "__main__":
# Model-based collaborative filtering
# ...
# Split the dataset
# http://files.grouplens.org/datasets/movielens/ml-100k.zip
dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
# Compute similarity
train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data)
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')
# Evaluation: root mean squared error (RMSE)
print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
# Model-based collaborative filtering
# ...
# Compute the sparsity of the MovieLens dataset
sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%'
u, s, vt = svds(train_data_matrix, k=20)
s_diag_matrix = np.diag(s)
x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix))

View File

@@ -14,4 +14,3 @@ def PersonalRank(G, alpha, root):
tmp[j] += 1 - alpha
rank = tmp
return rank
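For context on this last fragment: PersonalRank is a random walk with restart over the user-item bipartite graph. Stated from the standard formulation of the algorithm (only part of the function appears in this hunk), each iteration updates

$$\mathrm{rank}(v) = \alpha \sum_{u \to v} \frac{\mathrm{rank}(u)}{|\mathrm{out}(u)|} + (1 - \alpha)\,\mathbb{1}[v = \mathrm{root}]$$

which matches the tmp[j] += 1 - alpha restart term applied at the root node above.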