Mirror of https://github.com/apachecn/ailearning.git
Update recommender system code
@@ -89,6 +89,7 @@ class ItemBasedCF():
         print >> sys.stderr, 'counting movies number and popularity...'
 
+        # count the total number of appearances of each movie across all users
         for user, movies in self.trainset.iteritems():
             for movie in movies:
                 # count item popularity
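The popularity tally documented by this hunk is just a co-occurrence count over the training set. A minimal standalone sketch of the same loop (hypothetical trainset data and dict name; Python 3 syntax, while the repository code targets Python 2):

    # count item popularity: how many users have rated each movie (hypothetical data)
    trainset = {
        'u1': {'m1': 5, 'm2': 3},
        'u2': {'m1': 4, 'm3': 2},
    }
    movie_popular = {}
    for user, movies in trainset.items():
        for movie in movies:
            movie_popular[movie] = movie_popular.get(movie, 0) + 1
    print(movie_popular)  # {'m1': 2, 'm2': 1, 'm3': 1}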
@@ -175,6 +176,8 @@ class ItemBasedCF():
         # variables for popularity
         popular_sum = 0
 
+        # enumerate pairs each element with an index, so the index and the value are obtained together
+        # reference: http://blog.csdn.net/churximi/article/details/51648388
         for i, user in enumerate(self.trainset):
             if i > 0 and i % 500 == 0:
                 print >> sys.stderr, 'recommended for %d users' % i
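The commit also adds a new 85-line standalone demo, sklearn-RS-demo-cf-item.py, which packages the whole pipeline (splitting MovieLens 100K, building user-item matrices, computing cosine similarities, predicting, and evaluating) into functions, shown in full below.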
src/python/16.RecommenderSystems/sklearn-RS-demo-cf-item.py (new file, 85 lines added)
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+# coding:utf8
+
+from math import sqrt
+
+import numpy as np
+import pandas as pd
+from scipy.sparse.linalg import svds
+from sklearn import cross_validation as cv
+from sklearn.metrics import mean_squared_error
+from sklearn.metrics.pairwise import pairwise_distances
+
+
+def splitData(dataFile, test_size):
+    # load the dataset
+    header = ['user_id', 'item_id', 'rating', 'timestamp']
+    df = pd.read_csv(dataFile, sep='\t', names=header)
+
+    n_users = df.user_id.unique().shape[0]
+    n_items = df.item_id.unique().shape[0]
+
+    print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
+    train_data, test_data = cv.train_test_split(df, test_size=test_size)
+    return df, n_users, n_items, train_data, test_data
+
+
+def calc_similarity(n_users, n_items, train_data, test_data):
+    # build the user-item matrix, one copy for the training data and one for the test data:
+    train_data_matrix = np.zeros((n_users, n_items))
+    for line in train_data.itertuples():
+        train_data_matrix[line[1]-1, line[2]-1] = line[3]
+    test_data_matrix = np.zeros((n_users, n_items))
+    for line in test_data.itertuples():
+        test_data_matrix[line[1]-1, line[2]-1] = line[3]
+
+    # use sklearn's pairwise_distances to compute cosine similarity
+    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
+    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
+    return train_data_matrix, test_data_matrix, user_similarity, item_similarity
+
+
+def predict(rating, similarity, type='user'):
+    if type == 'user':
+        mean_user_rating = rating.mean(axis=1)
+        rating_diff = (rating - mean_user_rating[:, np.newaxis])
+        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
+    elif type == 'item':
+        pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
+    return pred
+
+
+def rmse(prediction, ground_truth):
+    prediction = prediction[ground_truth.nonzero()].flatten()
+    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
+    return sqrt(mean_squared_error(prediction, ground_truth))
+
+
+if __name__ == "__main__":
+    # memory-based collaborative filtering
+    # ...
+    # split the dataset
+    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
+    dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
+    df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
+
+    # compute similarities
+    train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data)
+
+    user_prediction = predict(train_data_matrix, user_similarity, type='user')
+    item_prediction = predict(train_data_matrix, item_similarity, type='item')
+
+    # evaluation: root mean squared error
+    print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
+    print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
+
+    # model-based collaborative filtering
+    # ...
+    # compute the sparsity of the MovieLens dataset
+    sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
+    print 'The sparsity level of MovieLens100K is ' + str(sparsity * 100) + '%'
+
+    u, s, vt = svds(train_data_matrix, k=20)
+    s_diag_matrix = np.diag(s)
+    x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
+    print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix))
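A note on the similarity matrices in the listing above: sklearn's pairwise_distances with metric="cosine" returns cosine distance, i.e. 1 - cosine similarity, so user_similarity and item_similarity are really distance matrices that the demo then uses as weights (and in modern scikit-learn the cross_validation module it imports has been renamed sklearn.model_selection). The user-based branch of predict implements the classic mean-centered formula pred(u, i) = mean(u) + sum_v sim(u, v) * (r(v, i) - mean(v)) / sum_v |sim(u, v)|. A minimal sketch of that step on a toy matrix (hypothetical ratings; everything else as in the script):

    import numpy as np
    from sklearn.metrics.pairwise import pairwise_distances

    # hypothetical 3-user x 4-movie ratings; 0 means "not rated"
    rating = np.array([[5., 3., 0., 1.],
                       [4., 0., 0., 1.],
                       [1., 1., 5., 4.]])
    similarity = pairwise_distances(rating, metric="cosine")   # (3, 3) cosine *distances*

    mean_user_rating = rating.mean(axis=1)                     # (3,) per-user mean
    rating_diff = rating - mean_user_rating[:, np.newaxis]     # center each user's ratings
    pred = (mean_user_rating[:, np.newaxis]
            + similarity.dot(rating_diff) / np.abs(similarity).sum(axis=1)[:, np.newaxis])
    print(pred.shape)  # (3, 4): one score per (user, movie) pair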
@@ -10,59 +10,92 @@ from sklearn import cross_validation as cv
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics.pairwise import pairwise_distances
 
-# load the dataset
-header = ['user_id', 'item_id', 'rating', 'timestamp']
-# http://files.grouplens.org/datasets/movielens/ml-100k.zip
-dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
-df = pd.read_csv(dataFile, sep='\t', names=header)
-
-n_users = df.user_id.unique().shape[0]
-n_items = df.item_id.unique().shape[0]
-print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
+def splitData(dataFile, test_size):
+    # load the dataset
+    header = ['user_id', 'item_id', 'rating', 'timestamp']
+    df = pd.read_csv(dataFile, sep='\t', names=header)
 
-# split the dataset
-train_data, test_data = cv.train_test_split(df, test_size=0.25)
+    n_users = df.user_id.unique().shape[0]
+    n_items = df.item_id.unique().shape[0]
 
-# build the user-item matrix, one copy for the training data and one for the test data:
-train_data_matrix = np.zeros((n_users, n_items))
-for line in train_data.itertuples():
-    train_data_matrix[line[1]-1, line[2]-1] = line[3]
-test_data_matrix = np.zeros((n_users, n_items))
-for line in test_data.itertuples():
-    test_data_matrix[line[1]-1, line[2]-1] = line[3]
-# use sklearn's pairwise_distances to compute cosine similarity
-user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
-item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
+    print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
+    train_data, test_data = cv.train_test_split(df, test_size=test_size)
+    return df, n_users, n_items, train_data, test_data
+
+
+def calc_similarity(n_users, n_items, train_data, test_data):
+    # build the user-item matrix, one copy for the training data and one for the test data:
+    train_data_matrix = np.zeros((n_users, n_items))
+    for line in train_data.itertuples():
+        train_data_matrix[line[1]-1, line[2]-1] = line[3]
+    test_data_matrix = np.zeros((n_users, n_items))
+    for line in test_data.itertuples():
+        test_data_matrix[line[1]-1, line[2]-1] = line[3]
+
+    # use sklearn's pairwise_distances to compute cosine similarity
+    print "1:", np.shape(train_data_matrix)    # rows: users, columns: movies
+    print "2:", np.shape(train_data_matrix.T)  # rows: movies, columns: users
+    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
+    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
+    return train_data_matrix, test_data_matrix, user_similarity, item_similarity
 
 
 def predict(rating, similarity, type='user'):
+    print type
+    print "rating=", np.shape(rating)
+    print "similarity=", np.shape(similarity)
     if type == 'user':
+        # compute each user's mean rating over all movies (axis=0 operates on columns, axis=1 on rows)
+        # print "rating=", np.shape(rating)
         mean_user_rating = rating.mean(axis=1)
+        # np.newaxis reference: http://blog.csdn.net/xtingjie/article/details/72510834
+        # print "mean_user_rating=", np.shape(mean_user_rating)
+        # print "mean_user_rating.newaxis=", np.shape(mean_user_rating[:, np.newaxis])
         rating_diff = (rating - mean_user_rating[:, np.newaxis])
+        # print "rating=", rating[:3, :3]
+        # print "mean_user_rating[:, np.newaxis]=", mean_user_rating[:, np.newaxis][:3, :3]
+        # print "rating_diff=", rating_diff[:3, :3]
+
+        # mean rating + user-user distances (943, 943) . user-movie rating diffs (943, 1682)
+        # = aggregate user-movie scores (943, 1682), then divided by each user's total
+        # distance to all other users = final user-movie scores
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
     elif type == 'item':
-        pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
+        # aggregate score: user-movie ratings (943, 1682) . movie-movie distances (1682, 1682)
+        # = aggregate user-movie scores (943, 1682), then divided by each movie's total
+        # distance to all other movies = final user-movie scores
+        pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
     return pred
 
-user_prediction = predict(train_data_matrix, user_similarity, type='user')
-item_prediction = predict(train_data_matrix, item_similarity, type='item')
 
 def rmse(prediction, ground_truth):
     prediction = prediction[ground_truth.nonzero()].flatten()
     ground_truth = ground_truth[ground_truth.nonzero()].flatten()
     return sqrt(mean_squared_error(prediction, ground_truth))
 
-print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
-print 'Item based CF RMSe: ' + str(rmse(item_prediction, test_data_matrix))
+if __name__ == "__main__":
+    # memory-based collaborative filtering
+    # ...
+    # split the dataset
+    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
+    dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
+    df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
 
-sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
-print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%'
+    # compute similarities
+    train_data_matrix, test_data_matrix, user_similarity, item_similarity = calc_similarity(n_users, n_items, train_data, test_data)
+
+    user_prediction = predict(train_data_matrix, user_similarity, type='user')
+    item_prediction = predict(train_data_matrix, item_similarity, type='item')
 
-u, s, vt = svds(train_data_matrix, k=20)
-s_diag_matrix = np.diag(s)
-x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
-print 'User-based CF MSE: ' + str(rmse(x_pred, test_data_matrix))
+    # evaluation: root mean squared error
+    print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
+    print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
+
+    # model-based collaborative filtering
+    # ...
+    # compute the sparsity of the MovieLens dataset
+    sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
+    print 'The sparsity level of MovieLens100K is ' + str(sparsity * 100) + '%'
+
+    u, s, vt = svds(train_data_matrix, k=20)
+    s_diag_matrix = np.diag(s)
+    x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
+    print 'Model based CF RMSE: ' + str(rmse(x_pred, test_data_matrix))
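The model-based branch in both scripts is truncated SVD: scipy's svds keeps the k=20 largest singular triplets and the reconstruction U . diag(s) . Vt serves as the dense prediction matrix (the old version mislabeled its output "User-based CF MSE", which this commit fixes). The sparsity print follows directly from the dataset shape: MovieLens 100K holds 100,000 ratings from 943 users on 1682 movies, so density is about 100000/1586126, roughly 6.3%, and sparsity is roughly 93.7%. A self-contained sketch of the SVD step (a random matrix stands in for the real ratings):

    import numpy as np
    from scipy.sparse.linalg import svds

    rng = np.random.default_rng(0)
    train_data_matrix = rng.random((943, 1682))  # stand-in for the MovieLens matrix

    u, s, vt = svds(train_data_matrix, k=20)     # 20 largest singular triplets
    x_pred = u @ np.diag(s) @ vt                 # rank-20 approximation, same shape
    print(x_pred.shape)                          # (943, 1682)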
@@ -92,6 +92,8 @@ class UserBasedCF():
         print >> sys.stderr, 'building movie-users inverse table...'
         movie2users = dict()
 
+        # for each movie, collect the set of users who have rated it
+        # count the total number of appearances of each movie across all users
         for user, movies in self.trainset.iteritems():
             for movie in movies:
                 # inverse table for item-users
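The movie-to-users inverse table annotated in this hunk is the standard first step for user-user co-occurrence: once each movie maps to the set of users who rated it, co-rated counts can be accumulated per user pair. A minimal sketch (hypothetical trainset; Python 3 syntax):

    # inverse table for item-users (hypothetical data)
    trainset = {
        'u1': {'m1': 5, 'm2': 3},
        'u2': {'m1': 4, 'm3': 2},
    }
    movie2users = dict()
    for user, movies in trainset.items():
        for movie in movies:
            movie2users.setdefault(movie, set()).add(user)
    print(movie2users)  # {'m1': {'u1', 'u2'}, 'm2': {'u1'}, 'm3': {'u2'}} (set order may vary)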
@@ -155,16 +157,24 @@ class UserBasedCF():
         watched_movies = self.trainset[user]
 
         # compute similarity for the top-K users
-        # v = similar user, wuv = number of co-occurrences of the two users
+        # v = similar user, wuv = number of co-occurrences of the two users; take the K users with the largest wuv, in descending order
+        # profiling: 50.4% of the runtime is spent on line 160 below
         for v, wuv in sorted(self.user_sim_mat[user].items(), key=itemgetter(1), reverse=True)[0:K]:
-            for movie in self.trainset[v]:
+            for movie, rating in self.trainset[v].iteritems():
                 if movie in watched_movies:
                     continue
                 # predict the user's "interest" for each movie
                 rank.setdefault(movie, 0)
-                rank[movie] += wuv
+                rank[movie] += wuv * rating
         # return the N best movies
+
+        """
+        wuv
+        precision=0.3766 recall=0.0759 coverage=0.3183 popularity=6.9194
+
+        wuv * rating
+        precision=0.3865 recall=0.0779 coverage=0.2681 popularity=7.0116
+        """
         return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
 
     def evaluate(self):
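The substantive change in the recommend() hunk above is the score update: a candidate movie's rank used to grow by the raw co-occurrence weight wuv, and now grows by wuv * rating, so a similar user's actual rating scales their vote; the docstring records the measured trade-off (precision 0.3766 to 0.3865, coverage 0.3183 down to 0.2681). A minimal sketch of the reweighted loop (hypothetical similarity table and trainset; K and N chosen for the example):

    from operator import itemgetter

    user_sim_mat = {'u1': {'u2': 0.8, 'u3': 0.3}}   # user -> {similar user: wuv}
    trainset = {'u1': {'m1': 5},
                'u2': {'m1': 4, 'm2': 5},
                'u3': {'m2': 2, 'm3': 4}}
    user, K, N = 'u1', 2, 3
    watched_movies = trainset[user]

    rank = {}
    for v, wuv in sorted(user_sim_mat[user].items(), key=itemgetter(1), reverse=True)[0:K]:
        for movie, rating in trainset[v].items():
            if movie in watched_movies:
                continue
            # interest = similarity weight scaled by the similar user's rating
            rank.setdefault(movie, 0)
            rank[movie] += wuv * rating
    print(sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N])
    # [('m2', 4.6), ('m3', 1.2)]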
@@ -183,6 +193,8 @@ class UserBasedCF():
         # variables for popularity
         popular_sum = 0
 
+        # enumerate pairs each element with an index, so the index and the value are obtained together
+        # reference: http://blog.csdn.net/churximi/article/details/51648388
         for i, user in enumerate(self.trainset):
             if i > 0 and i % 500 == 0:
                 print >> sys.stderr, 'recommended for %d users' % i