mirror of https://github.com/apachecn/ailearning.git

Commit: recommender system optimization (优化成功-推荐系统)
@@ -65,7 +65,8 @@ class ItemBasedCF():
 
         for line in self.loadfile(filename):
             # user ID, movie title, rating, timestamp
-            user, movie, rating, _ = line.split('::')
+            # user, movie, rating, _ = line.split('::')
+            user, movie, rating, _ = line.split('\t')
            # compare a random number against pivot to split records into train and test
             if (random.random() < pivot):
 
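Note on this change: MovieLens ml-1m ships ratings.dat with '::'-separated records, while ml-100k ships u.data with tab-separated records, so switching datasets also means switching the delimiter. Illustrative record layouts (the values shown are examples only):

    # ml-1m ratings.dat: UserID::MovieID::Rating::Timestamp
    1::1193::5::978300760
    # ml-100k u.data: user_id<TAB>item_id<TAB>rating<TAB>timestamp
    196	242	3	881250949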
@@ -203,7 +204,8 @@ class ItemBasedCF():
 
 
 if __name__ == '__main__':
-    ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
+    # ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
+    ratingfile = 'input/16.RecommenderSystems/ml-100k/u.data'
 
     # create the ItemCF object
     itemcf = ItemBasedCF()
@@ -212,4 +214,8 @@ if __name__ == '__main__':
     # compute similarity between movies
     itemcf.calc_movie_sim()
     # evaluate recommendation quality
-    itemcf.evaluate()
+    # itemcf.evaluate()
+    # inspect the recommendations for one user
+    user = "2"
+    print "推荐结果", itemcf.recommend(user)
+    print "---", itemcf.testset.get(user, {})
@@ -1,7 +1,9 @@
 #!/usr/bin/python
 # coding:utf8
 
-from math import sqrt
+import sys
+import math
+from operator import itemgetter
 
 import numpy as np
 import pandas as pd
@@ -36,9 +38,23 @@ def calc_similarity(n_users, n_items, train_data, test_data):
     # use sklearn's pairwise_distances to compute cosine similarity
+    print "1:", np.shape(train_data_matrix)    # rows: users, cols: movies
+    print "2:", np.shape(train_data_matrix.T)  # rows: movies, cols: users
 
     user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
     item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
-    return train_data_matrix, test_data_matrix, user_similarity, item_similarity
 
+    print >> sys.stderr, '开始统计流行item的数量...'
+    item_popular = {}
+    # count, across all users, how many times each movie occurs in the training set
+    for i_index in range(n_items):
+        if np.sum(train_data_matrix[:, i_index]) != 0:
+            item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)
+            # print "pop=", i_index, self.item_popular[i_index]
+
+    # save the total number of items
+    item_count = len(item_popular)
+    print >> sys.stderr, '总共流行item数量 = %d' % item_count
+
+    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
 
 
 def predict(rating, similarity, type='user'):
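A caveat on the comment above: sklearn's pairwise_distances(..., metric="cosine") actually returns cosine distances (1 - cosine similarity), so user_similarity and item_similarity hold distances despite their names. A true similarity matrix would be obtained with, for example:

    # hypothetical variant, not part of the commit
    user_similarity = 1 - pairwise_distances(train_data_matrix, metric="cosine")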
@@ -60,7 +76,7 @@ def predict(rating, similarity, type='user'):
         # mean rating + user-user distances (943, 943) dot user-movie rating diffs (943, 1682) = combined user-movie scores (943, 1682), then divide by each user's summed distance to everyone else = final user-movie score
         pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
     elif type == 'item':
-        # combined score: user-movie ratings (943, 1682) dot movie-movie distances 1682, 1682) = combined user-movie scores (943, 1682), then divide by each movie's summed distance to all other movies = final user-movie score
+        # combined score: user-movie ratings (943, 1682) dot movie-movie distances (1682, 1682) = combined user-movie scores (943, 1682), then divide by each movie's summed distance to all other movies = final user-movie score
         pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
     return pred
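In formula form, the two branches compute, for user u and item i (with sim read off the precomputed matrix):

    user-based:  \hat{r}_{u,i} = \bar{r}_u + \frac{\sum_v \mathrm{sim}(u,v)\,(r_{v,i} - \bar{r}_v)}{\sum_v |\mathrm{sim}(u,v)|}

    item-based:  \hat{r}_{u,i} = \frac{\sum_j r_{u,j}\,\mathrm{sim}(j,i)}{\sum_j |\mathrm{sim}(j,i)|}

i.e. a mean-offset weighted average over all users, and a weighted average over the items each user has rated.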
@@ -68,11 +84,51 @@ def predict(rating, similarity, type='user'):
 def rmse(prediction, ground_truth):
     prediction = prediction[ground_truth.nonzero()].flatten()
     ground_truth = ground_truth[ground_truth.nonzero()].flatten()
-    return sqrt(mean_squared_error(prediction, ground_truth))
+    return math.sqrt(mean_squared_error(prediction, ground_truth))
+
+
+def evaluate(prediction, item_popular, name):
+    hit = 0
+    rec_count = 0
+    test_count = 0
+    popular_sum = 0
+    all_rec_items = set()
+    for u_index in range(n_users):
+        items = np.where(train_data_matrix[u_index, :] == 0)[0]
+        pre_items = sorted(dict(zip(items, prediction[u_index, items])).items(), key=itemgetter(1), reverse=True)[: 20]
+        test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
+
+        # compare the recommendation list against the test set
+        for item, w in pre_items:
+            if item in test_items:
+                hit += 1
+            all_rec_items.add(item)
+
+            # accumulate the log of each recommended movie's occurrence count
+            if item in item_popular:
+                popular_sum += math.log(1 + item_popular[item])
+
+        rec_count += len(pre_items)
+        test_count += len(test_items)
+
+    precision = hit / (1.0 * rec_count)
+    recall = hit / (1.0 * test_count)
+    coverage = len(all_rec_items) / (1.0 * len(item_popular))
+    popularity = popular_sum / (1.0 * rec_count)
+    print >> sys.stderr, '%s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (name, precision, recall, coverage, popularity)
+
+
+def recommend(u_index, prediction):
+    items = np.where(train_data_matrix[u_index, :] == 0)[0]
+    pre_items = sorted(dict(zip(items, prediction[u_index, items])).items(), key=itemgetter(1), reverse=True)[: 10]
+    test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
+
+    print '原始结果:', test_items
+    print '推荐结果:', [key for key, value in pre_items]
+
+
 if __name__ == "__main__":
-    # model-based collaborative filtering
+    # memory-based collaborative filtering
     # ...
     # split the dataset
     # http://files.grouplens.org/datasets/movielens/ml-100k.zip
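For reference, the four numbers printed by evaluate() amount to, with R(u) the top-20 list for user u, T(u) the user's test items, and I the set of items seen in training:

    precision  = \frac{\sum_u |R(u) \cap T(u)|}{\sum_u |R(u)|}
    recall     = \frac{\sum_u |R(u) \cap T(u)|}{\sum_u |T(u)|}
    coverage   = \frac{|\bigcup_u R(u)|}{|I|}
    popularity = \frac{\sum_u \sum_{i \in R(u)} \log(1 + \mathrm{pop}(i))}{\sum_u |R(u)|}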
@@ -80,22 +136,37 @@ if __name__ == "__main__":
     df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
 
     # compute the similarity matrices
-    train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data)
+    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(n_users, n_items, train_data, test_data)
 
-    user_prediction = predict(train_data_matrix, user_similarity, type='user')
     item_prediction = predict(train_data_matrix, item_similarity, type='item')
+    user_prediction = predict(train_data_matrix, user_similarity, type='user')
 
     # evaluation: root mean squared error
-    print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
     print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
+    print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
 
     # model-based collaborative filtering
     # ...
-    # compute the sparsity of the MovieLens dataset
+    # compute the sparsity of the MovieLens dataset (n_users and n_items are constants, so less user-behavior data means less information; the sparser the matrix, the more room there is for optimization)
     sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
     print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%'
 
-    u, s, vt = svds(train_data_matrix, k=20)
+    # compute the largest k singular values/vectors of the sparse matrix
+    u, s, vt = svds(train_data_matrix, k=15)
     s_diag_matrix = np.diag(s)
-    x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
-    print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix))
+    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
+    print 'Model based CF RMSE: ' + str(rmse(svd_prediction, test_data_matrix))
+
+    """
+    For the same total amount of information, a smaller matrix carries more reliable information.
+    Hence user-cf beats item-cf here; and after the SVD factorization, 15 dimensions already reach 90%+ of the effect, so the information is more reliable and the results are better still.
+    item-cf: 1682
+    user-cf: 943
+    svd:     15
+    """
+    evaluate(item_prediction, item_popular, 'item')
+    evaluate(user_prediction, item_popular, 'user')
+    evaluate(svd_prediction, item_popular, 'svd')
+
+    # show recommendations
+    recommend(1, svd_prediction)
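The docstring's claim that 15 dimensions already reach 90%+ can be sanity-checked against the singular-value spectrum. A minimal sketch, not part of the commit, assuming the train_data_matrix built above:

    import numpy as np
    from scipy.sparse.linalg import svds

    def energy_ratio(matrix, k):
        # fraction of the total squared singular-value "energy" kept by the top k factors
        _, s_k, _ = svds(matrix, k=k)
        s_all = np.linalg.svd(matrix, compute_uv=False)
        return (s_k ** 2).sum() / (s_all ** 2).sum()

    # e.g. print energy_ratio(train_data_matrix, 15)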
@@ -65,7 +65,8 @@ class UserBasedCF():
 
         for line in self.loadfile(filename):
             # user ID, movie title, rating, timestamp
-            user, movie, rating, timestamp = line.split('::')
+            # user, movie, rating, timestamp = line.split('::')
+            user, movie, rating, timestamp = line.split('\t')
            # compare a random number against pivot to split records into train and test
             if (random.random() < pivot):
 
@@ -220,7 +221,8 @@ class UserBasedCF():
 
 
 if __name__ == '__main__':
-    ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
+    # ratingfile = 'input/16.RecommenderSystems/ml-1m/ratings.dat'
+    ratingfile = 'input/16.RecommenderSystems/ml-100k/u.data'
 
     # create the UserCF object
     usercf = UserBasedCF()
src/python/16.RecommenderSystems/python/Recommender.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+import numpy as np
+
+
+# custom Jaccard similarity coefficient; only valid for 0-1 matrices
+def Jaccard(a, b):
+    return 1.0*(a*b).sum()/(a+b-a*b).sum()
+
+
+class Recommender():
+
+    # similarity matrix
+    sim = None
+
+    # build the pairwise similarity matrix
+    def similarity(self, x, distance):
+        y = np.ones((len(x), len(x)))
+        for i in range(len(x)):
+            for j in range(len(x)):
+                y[i, j] = distance(x[i], x[j])
+        return y
+
+    # training: precompute the similarity matrix
+    def fit(self, x, distance=Jaccard):
+        self.sim = self.similarity(x, distance)
+
+    # recommend: score unseen entries by similarity to seen ones
+    def recommend(self, a):
+        return np.dot(self.sim, a)*(1-a)
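A quick, illustrative way to exercise the new class (the toy matrix below is not from the commit; reading the code, the rows of x are items and a is one user's 0-1 column over those items):

    import numpy as np

    # toy 0-1 matrix: 4 items (rows) x 3 users (columns)
    x = np.array([[1, 0, 1],
                  [1, 1, 0],
                  [0, 1, 1],
                  [0, 0, 1]])

    r = Recommender()
    r.fit(x)                 # 4x4 item-item Jaccard similarity
    a = x[:, 0]              # user 0 has seen items 0 and 1
    print(r.recommend(a))    # nonzero scores only for the unseen items 2 and 3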
src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item-test.py (new file, 185 lines)
@@ -0,0 +1,185 @@
+#!/usr/bin/python
+# coding:utf8
+
+'''
+Created on 2015-06-22
+Update  on 2017-05-16
+@author: Lockvictor/片刻
+Source code for the collaborative filtering algorithms in 《推荐系统实践》 (Recommender Systems in Practice)
+Reference: https://github.com/Lockvictor/MovieLens-RecSys
+Updates:   https://github.com/apachecn/MachineLearning
+'''
+import math
+import random
+import sys
+from operator import itemgetter
+
+import numpy as np
+import pandas as pd
+from sklearn import cross_validation as cv
+from sklearn.metrics.pairwise import pairwise_distances
+
+print(__doc__)
+# make the random numbers reproducible
+random.seed(0)
+
+
+class ItemBasedCF():
+    ''' TopN recommendation - ItemBasedCF '''
+    def __init__(self):
+        # train/test matrices
+        self.train_mat = {}
+        self.test_mat = {}
+
+        # total number of users and of items
+        self.n_users = 0
+        self.n_items = 0
+
+        # n_sim_item: top 20 similar items, n_rec_item: top 10 recommendations
+        self.n_sim_item = 20
+        self.n_rec_item = 10
+
+        # item_mat_similarity: movie-movie similarity, item_popular: per-movie occurrence count, item_count: total number of movies
+        self.item_mat_similarity = {}
+        self.item_popular = {}
+        self.item_count = 0
+
+        print >> sys.stderr, 'Similar item number = %d' % self.n_sim_item
+        print >> sys.stderr, 'Recommended item number = %d' % self.n_rec_item
+
+    def splitData(self, dataFile, test_size):
+        # load the dataset
+        header = ['user_id', 'item_id', 'rating', 'timestamp']
+        df = pd.read_csv(dataFile, sep='\t', names=header)
+
+        self.n_users = df.user_id.unique().shape[0]
+        self.n_items = df.item_id.unique().shape[0]
+
+        print 'Number of users = ' + str(self.n_users) + ' | Number of items = ' + str(self.n_items)
+
+        # split the dataset: users + movies
+        self.train_data, self.test_data = cv.train_test_split(df, test_size=test_size)
+        print >> sys.stderr, '分离训练集和测试集成功'
+        print >> sys.stderr, 'len(train) = %s' % np.shape(self.train_data)[0]
+        print >> sys.stderr, 'len(test) = %s' % np.shape(self.test_data)[0]
+
+    def calc_similarity(self):
+        # build the user-item matrices for the training data and the test data
+        self.train_mat = np.zeros((self.n_users, self.n_items))
+        for line in self.train_data.itertuples():
+            self.train_mat[int(line.user_id)-1, int(line.item_id)-1] = float(line.rating)
+        self.test_mat = np.zeros((self.n_users, self.n_items))
+        for line in self.test_data.itertuples():
+            # print "line", line.user_id-1, line.item_id-1, line.rating
+            self.test_mat[int(line.user_id)-1, int(line.item_id)-1] = float(line.rating)
+
+        # use sklearn's pairwise_distances to compute cosine similarity
+        print "1:", np.shape(np.mat(self.train_mat).T)  # rows: movies, cols: users
+        # movie-movie distances (1682, 1682)
+        self.item_mat_similarity = pairwise_distances(np.mat(self.train_mat).T, metric='cosine')
+        print >> sys.stderr, 'item_mat_similarity=', np.shape(self.item_mat_similarity)
+
+        print >> sys.stderr, '开始统计流行item的数量...'
+
+        # count, across all users, how many times each movie occurs in the training set
+        for i_index in range(self.n_items):
+            if np.sum(self.train_mat[:, i_index]) != 0:
+                self.item_popular[i_index] = np.sum(self.train_mat[:, i_index] != 0)
+                # print "pop=", i_index, self.item_popular[i_index]
+
+        # save the total number of items
+        self.item_count = len(self.item_popular)
+        print >> sys.stderr, '总共流行item数量 = %d' % self.item_count
+
+    # @profile
+    def recommend(self, u_index):
+        """recommend(find the top-K similar movies, rank candidates by summed similarity, return the top N)
+
+        Args:
+            u_index   user_ID - 1 = user index
+        Returns:
+            rec_item  recommended movies, sorted by score in descending order
+        """
+        ''' Find K similar items and recommend N items. '''
+        K = self.n_sim_item
+        N = self.n_rec_item
+        rank = {}
+        i_items = np.where(self.train_mat[u_index, :] != 0)[0]
+        # print "i_items=", i_items
+        watched_items = dict(zip(i_items, self.train_mat[u_index, i_items]))
+
+        # score candidates via the top-K most similar movies
+        # rating = the user's rating, w = the movie-movie weight
+        # profiling: 98.2% of the time is spent on line 154
+        for i_item, rating in watched_items.iteritems():
+            i_other_items = np.where(self.item_mat_similarity[i_item, :] != 0)[0]
+            for related_item, w in sorted(dict(zip(i_other_items, self.item_mat_similarity[i_item, i_other_items])).items(), key=itemgetter(1), reverse=True)[0:K]:
+                if related_item in watched_items:
+                    continue
+                rank.setdefault(related_item, 0)
+                rank[related_item] += w * rating
+
+        # return the N best items
+        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
+
+    def evaluate(self):
+        ''' return precision, recall, coverage and popularity '''
+        print >> sys.stderr, 'Evaluation start...'
+
+        # variables for precision and recall
+        # hit: a recommended movie also appears in the test set; rec_count: recommendations per user; test_count: test-set movies per user
+        hit = 0
+        rec_count = 0
+        test_count = 0
+        # variables for coverage
+        all_rec_items = set()
+        # variables for popularity
+        popular_sum = 0
+
+        # enumerate yields index/value pairs from a sequence
+        # reference: http://blog.csdn.net/churximi/article/details/51648388
+        for u_index in range(50):
+            if u_index > 0 and u_index % 10 == 0:
+                print >> sys.stderr, 'recommended for %d users' % u_index
+            print "u_index", u_index
+
+            # compare the recommendation list against the test set
+            rec_items = self.recommend(u_index)
+            print "rec_items=", rec_items
+            for item, w in rec_items:
+                # print 'test_mat[u_index, item]=', item, self.test_mat[u_index, item]
+                if self.test_mat[u_index, item] != 0:
+                    hit += 1
+                    print "self.test_mat[%d, %d]=%s" % (u_index, item, self.test_mat[u_index, item])
+                all_rec_items.add(item)  # record every recommended movie for the coverage metric
+                # accumulate the log of each recommended movie's occurrence count
+                if item in self.item_popular:
+                    popular_sum += math.log(1 + self.item_popular[item])
+
+            rec_count += len(rec_items)
+            test_count += np.sum(self.test_mat[u_index, :] != 0)
+            # print "test_count=", np.sum(self.test_mat[u_index, :] != 0), np.sum(self.train_mat[u_index, :] != 0)
+
+        print("-------", hit, rec_count)
+        precision = hit / (1.0 * rec_count)
+        recall = hit / (1.0 * test_count)
+        coverage = len(all_rec_items) / (1.0 * self.item_count)
+        popularity = popular_sum / (1.0 * rec_count)
+
+        print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity)
+
+
+if __name__ == '__main__':
+    dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
+
+    # create the ItemCF object
+    itemcf = ItemBasedCF()
+    # split the data 7:3 into a training set and a test set, stored on itemcf as train_data/test_data
+    itemcf.splitData(dataFile, test_size=0.3)
+    # compute similarity between movies
+    itemcf.calc_similarity()
+    # evaluate recommendation quality
+    # itemcf.evaluate()
+    # inspect the recommendations for one user
+    print "推荐结果", itemcf.recommend(u_index=1)
+    print "---", np.where(itemcf.test_mat[1, :] != 0)[0]
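Note that both new files target Python 2 (print statements, print >> sys.stderr, dict.iteritems()), and sklearn's cross_validation module has since been replaced by sklearn.model_selection. Purely as an illustration, the same constructs under Python 3 would read:

    import sys

    print("推荐结果", itemcf.recommend(u_index=1))                 # print statement -> print() function
    print('总共流行item数量 = %d' % item_count, file=sys.stderr)    # print >> sys.stderr -> file=sys.stderr
    for i_item, rating in watched_items.items():                   # dict.iteritems() -> dict.items()
        pass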
@@ -1,85 +0,0 @@
-#!/usr/bin/python
-# coding:utf8
-
-from math import sqrt
-
-import numpy as np
-import pandas as pd
-from scipy.sparse.linalg import svds
-from sklearn import cross_validation as cv
-from sklearn.metrics import mean_squared_error
-from sklearn.metrics.pairwise import pairwise_distances
-
-
-def splitData(dataFile, test_size):
-    # load the dataset
-    header = ['user_id', 'item_id', 'rating', 'timestamp']
-    df = pd.read_csv(dataFile, sep='\t', names=header)
-
-    n_users = df.user_id.unique().shape[0]
-    n_items = df.item_id.unique().shape[0]
-
-    print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
-    train_data, test_data = cv.train_test_split(df, test_size=test_size)
-    return df, n_users, n_items, train_data, test_data
-
-
-def calc_similarity(n_users, n_items, train_data, test_data):
-    # build the user-item matrices for the training data and the test data
-    train_data_matrix = np.zeros((n_users, n_items))
-    for line in train_data.itertuples():
-        train_data_matrix[line[1]-1, line[2]-1] = line[3]
-    test_data_matrix = np.zeros((n_users, n_items))
-    for line in test_data.itertuples():
-        test_data_matrix[line[1]-1, line[2]-1] = line[3]
-
-    # use sklearn's pairwise_distances to compute cosine similarity
-    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
-    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
-    return train_data_matrix, test_data_matrix, user_similarity, item_similarity
-
-
-def predict(rating, similarity, type='user'):
-    if type == 'user':
-        mean_user_rating = rating.mean(axis=1)
-        rating_diff = (rating - mean_user_rating[:, np.newaxis])
-        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
-    elif type == 'item':
-        pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
-    return pred
-
-
-def rmse(prediction, ground_truth):
-    prediction = prediction[ground_truth.nonzero()].flatten()
-    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
-    return sqrt(mean_squared_error(prediction, ground_truth))
-
-
-if __name__ == "__main__":
-    # model-based collaborative filtering
-    # ...
-    # split the dataset
-    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
-    dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
-    df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
-
-    # compute the similarity matrices
-    train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data)
-
-    user_prediction = predict(train_data_matrix, user_similarity, type='user')
-    item_prediction = predict(train_data_matrix, item_similarity, type='item')
-
-    # evaluation: root mean squared error
-    print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
-    print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
-
-    # model-based collaborative filtering
-    # ...
-    # compute the sparsity of the MovieLens dataset
-    sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
-    print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%'
-
-    u, s, vt = svds(train_data_matrix, k=20)
-    s_diag_matrix = np.diag(s)
-    x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
-    print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix))
@@ -14,4 +14,3 @@ def PersonalRank(G, alpha, root):
                 tmp[j] += 1 - alpha
         rank = tmp
     return rank
-