mirror of
https://github.com/apachecn/ailearning.git
synced 2026-04-02 18:19:44 +08:00
Update recommender system code and comments
@@ -1,190 +0,0 @@
#!/usr/bin/python
# coding:utf8

from __future__ import print_function
import sys
import math
from operator import itemgetter

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn import cross_validation as cv
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances


def splitData(dataFile, test_size):
    # Load the dataset
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=header)

    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    print('Number of users = ' + str(n_users) + ' | Number of movies = ' +
          str(n_items))
    train_data, test_data = cv.train_test_split(df, test_size=test_size)
    print("Train/test sizes: ", len(train_data), len(test_data))
    return df, n_users, n_items, train_data, test_data


def calc_similarity(n_users, n_items, train_data, test_data):
    # Build the user-item matrices, one for the training data and one for the test data:
    train_data_matrix = np.zeros((n_users, n_items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    test_data_matrix = np.zeros((n_users, n_items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    # Use sklearn's pairwise_distances function to compute the cosine distance.
    print("1:", np.shape(train_data_matrix))    # rows: users, columns: movies
    print("2:", np.shape(train_data_matrix.T))  # rows: movies, columns: users

    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")

    print('Counting popular items...', file=sys.stderr)
    item_popular = {}
    # Count, over all users, how many times each movie was rated
    for i_index in range(n_items):
        if np.sum(train_data_matrix[:, i_index]) != 0:
            item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)
            # print("pop=", i_index, item_popular[i_index])

    # save the total number of items
    item_count = len(item_popular)
    print('Total number of popular items = %d' % item_count, file=sys.stderr)

    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular


def predict(rating, similarity, type='user'):
    print(type)
    print("rating=", np.shape(rating))
    print("similarity=", np.shape(similarity))
    if type == 'user':
        # Average each user's ratings over all movies (axis=0 acts on columns, axis=1 on rows)
        # print("rating=", np.shape(rating))
        mean_user_rating = rating.mean(axis=1)
        # On np.newaxis see: http://blog.csdn.net/xtingjie/article/details/72510834
        # print("mean_user_rating=", np.shape(mean_user_rating))
        # print("mean_user_rating.newaxis=", np.shape(mean_user_rating[:, np.newaxis]))
        rating_diff = (rating - mean_user_rating[:, np.newaxis])
        # print("rating=", rating[:3, :3])
        # print("mean_user_rating[:, np.newaxis]=", mean_user_rating[:, np.newaxis][:3, :3])
        # print("rating_diff=", rating_diff[:3, :3])

        # mean rating + user-user distances (943, 943) * user-movie rating diffs (943, 1682)
        # = user-movie aggregate scores (943, 1682), then divide by each user's total
        # distance to everyone else = combined user-movie scores
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(
            rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        # aggregate score: user-movie ratings (943, 1682) * item-item distances (1682, 1682)
        # = user-movie aggregate scores (943, 1682), then divide by each movie's total
        # distance to all other movies = combined user-movie scores
        pred = rating.dot(similarity) / np.array(
            [np.abs(similarity).sum(axis=1)])
    return pred
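
# A minimal sanity check (not part of the original script) of the user-based
# branch above: with two toy users and two movies, mean-centering removes each
# user's rating scale before the distance-weighted average is added back.
def _demo_user_predict():
    r = np.array([[4.0, 0.0], [5.0, 3.0]])        # toy user-movie ratings
    sim = pairwise_distances(r, metric="cosine")  # 2x2 user-user distances
    mean = r.mean(axis=1)
    diff = r - mean[:, np.newaxis]
    # same formula as predict(..., type='user'); returns shape (2, 2)
    return mean[:, np.newaxis] + sim.dot(diff) / np.array([np.abs(sim).sum(axis=1)]).T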


def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return math.sqrt(mean_squared_error(prediction, ground_truth))


def evaluate(prediction, item_popular, name):
    hit = 0
    rec_count = 0
    test_count = 0
    popular_sum = 0
    all_rec_items = set()
    for u_index in range(n_users):
        items = np.where(train_data_matrix[u_index, :] == 0)[0]
        pre_items = sorted(
            dict(zip(items, prediction[u_index, items])).items(),
            key=itemgetter(1),
            reverse=True)[:20]
        test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

        # Compare the recommended set against the test set: item, w
        for item, _ in pre_items:
            if item in test_items:
                hit += 1
            all_rec_items.add(item)

            # Sum the log of each recommended movie's popularity count
            if item in item_popular:
                popular_sum += math.log(1 + item_popular[item])

        rec_count += len(pre_items)
        test_count += len(test_items)

    precision = hit / (1.0 * rec_count)
    recall = hit / (1.0 * test_count)
    coverage = len(all_rec_items) / (1.0 * len(item_popular))
    popularity = popular_sum / (1.0 * rec_count)
    print('%s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
        name, precision, recall, coverage, popularity), file=sys.stderr)


def recommend(u_index, prediction):
    items = np.where(train_data_matrix[u_index, :] == 0)[0]
    pre_items = sorted(
        dict(zip(items, prediction[u_index, items])).items(),
        key=itemgetter(1),
        reverse=True)[:10]
    test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

    print('Ground truth: ', test_items)
    print('Recommended: ', [key for key, value in pre_items])


if __name__ == "__main__":

    # Memory-based collaborative filtering
    # ...
    # Split the dataset
    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
    dataFile = 'data/16.RecommenderSystems/ml-100k/u.data'
    df, n_users, n_items, train_data, test_data = splitData(
        dataFile, test_size=0.25)

    # Compute the similarities
    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
        n_users, n_items, train_data, test_data)

    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')

    # Evaluation: root mean squared error
    print(
        'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
    print(
        'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

    # Model-based collaborative filtering
    # ...
    # Compute the sparsity of the MovieLens dataset (n_users and n_items are fixed,
    # so less user-behavior data means less information; the sparser the matrix,
    # the more room there is for improvement)
    sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
    print('The sparsity level of MovieLens100K is ' + str(sparsity * 100) + '%')

    # Compute the largest k singular values/vectors of the sparse matrix
    u, s, vt = svds(train_data_matrix, k=15)
    s_diag_matrix = np.diag(s)
    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    print("svd-shape:", np.shape(svd_prediction))
    print(
        'Model based CF RMSE: ' + str(rmse(svd_prediction, test_data_matrix)))
    """
    For the same amount of information, a smaller matrix carries more reliable
    information per entry.
    Hence user-cf tends to beat item-cf here; and after the SVD, about 15
    dimensions already capture most of the signal, so the information is more
    reliable and the results better.
    item-cf: 1682
    user-cf: 943
    svd: 15
    """
    evaluate(item_prediction, item_popular, 'item')
    evaluate(user_prediction, item_popular, 'user')
    evaluate(svd_prediction, item_popular, 'svd')

    # Recommendation results
    recommend(1, svd_prediction)
259
tutorials/RecommenderSystems/rs_rating_demo.py
Normal file
@@ -0,0 +1,259 @@
#!/usr/bin/python
# coding:utf8
from __future__ import print_function
import sys
import math
from operator import itemgetter

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn import model_selection as cv
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from middleware.utils import TimeStat, Chart  # project-local helpers (not used below)
"""
Recommender systems: comparing Item CF, User CF, and SVD
"""


def splitData(dataFile, test_size):
    # Load the dataset (user id, movie id, rating, timestamp)
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=header)

    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    print('>>> This dataset contains: total users = %s | total movies = %s' % (n_users, n_items))
    train_data, test_data = cv.train_test_split(df, test_size=test_size)
    print(">>> train:test = %s:%s = %s:%s" % (len(train_data), len(test_data), 1 - test_size, test_size))
    return df, n_users, n_items, train_data, test_data


def calc_similarity(n_users, n_items, train_data, test_data):
    # Build the user-item matrices, one for the training data and one for the test data:
    """
    line: Pandas(Index=93661, user_id=624, item_id=750, rating=4, timestamp=891961163)
    """
    train_data_matrix = np.zeros((n_users, n_items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    test_data_matrix = np.zeros((n_users, n_items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    print("1:", np.shape(train_data_matrix))    # rows: users | columns: movies
    print("2:", np.shape(train_data_matrix.T))  # rows: movies | columns: users

    # Use sklearn's pairwise_distances to compute vector distances; metric="cosine"
    # gives the cosine distance, so smaller values mean more similar
    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
    # print("<<< %s \n %s" % (np.shape(user_similarity), user_similarity))
    # print("<<< %s \n %s" % (np.shape(item_similarity), item_similarity))

    print('Counting popular items...', file=sys.stderr)
    item_popular = {}
    # For each movie, count how many users watched it (i.e., its popularity!)
    for i_index in range(n_items):
        if np.sum(train_data_matrix[:, i_index]) != 0:
            item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)

    # save the total number of items
    item_count = len(item_popular)
    print('Total number of popular items = %d' % item_count, file=sys.stderr)
    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
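
# A minimal sketch (not part of the original script) of what pairwise_distances
# returns above: the cosine *distance*, i.e. 1 - cosine similarity, so rows
# pointing the same way score 0.0 and orthogonal rows score 1.0.
def _demo_cosine_distance():
    m = np.array([[5.0, 0.0], [5.0, 0.0], [0.0, 5.0]])
    d = pairwise_distances(m, metric="cosine")
    # d[0, 1] == 0.0 (same direction), d[0, 2] == 1.0 (orthogonal)
    return d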


def predict(rating, similarity, type='user'):
    """
    :param rating: training data matrix
    :param similarity: distance matrix
    :return: prediction matrix
    """
    print("+++ %s" % type)
    print("    rating=", np.shape(rating))
    print("    similarity=", np.shape(similarity))
    if type == 'item':
        """
        Aggregate scoring:
        rating.dot(similarity) means:
            user 1's movie ratings x item-item distances (column 1 holds the
            distances between movie 1 and every other movie), giving user 1's
            aggregate score for movies 1/2/3/...: 1*n
            user 2's movie ratings x item-item distances, likewise: 1*n
            ...
            user n's movie ratings x item-item distances, likewise: 1*n
        = user-movie ratings (943, 1682) * item-item distances (1682, 1682)
        = user-movie aggregate scores (943, 1682)

        np.array([np.abs(similarity).sum(axis=1)]) sums across each row (axis=1):
            column 1: movie A's total distance to all movies
            column 2: movie B's total distance to all movies
            ...
            column n: movie N's total distance to all movies
        = total distance per movie (1, 1682)

        pred = user-movie average scores (943, 1682)
        """
        pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    elif type == 'user':
        # Subtracting each user's mean rating removes the shared baseline and
        # highlights individual differences.

        # Average each user's ratings over all movies
        # (axis=1 averages across the columns of each row)
        mean_user_rating = rating.mean(axis=1)
        # numpy's newaxis adds a dimension to the original array
        rating_diff = (rating - mean_user_rating[:, np.newaxis])

        # mean rating +
        # user-user distances (943, 943) * user-movie rating diffs (943, 1682)
        # = aggregate user-movie scores (943, 1682), then divide by each user's
        # total distance to all other users = combined user-movie scores
        """
        Aggregate scoring:
        similarity.dot(rating_diff) means:
            row 1: user 1's distances to everyone x the centered ratings,
            giving user 1's aggregate score for movies 1/2/3/...: 1*n
            row 2: user 2's distances to everyone x the centered ratings: 1*n
            ...
            row n: user n's distances to everyone x the centered ratings: 1*n
        = user-user distances (943, 943) * user-movie ratings (943, 1682)
        = user-movie aggregate scores (943, 1682)

        np.array([np.abs(similarity).sum(axis=1)]) sums across each row (axis=1):
            column 1: user A's total distance to all users
            column 2: user B's total distance to all users
            ...
            column n: user N's total distance to all users
        = total distance per user (1, 943)

        pred = mean + user-movie average scores (943, 1682)
        """
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T

    return pred
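
# A minimal sanity check (not part of the original script) of the item-based
# branch: each predicted entry is a weight-normalized combination of the
# user's existing ratings, so the output keeps the (users, items) shape.
def _demo_item_predict():
    r = np.array([[4.0, 0.0, 5.0], [5.0, 3.0, 0.0]])  # 2 users x 3 movies
    sim = pairwise_distances(r.T, metric="cosine")     # 3x3 item-item distances
    pred = r.dot(sim) / np.array([np.abs(sim).sum(axis=1)])
    return pred  # shape (2, 3)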


def rmse(prediction, ground_truth):
    # Only score positions that actually carry a rating in the test matrix
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return math.sqrt(mean_squared_error(prediction, ground_truth))
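
# A minimal example (not part of the original script): unrated test entries
# (zeros) are masked out, so only the two rated cells contribute here.
def _demo_rmse():
    pred = np.array([[3.0, 1.0], [4.0, 2.0]])
    truth = np.array([[4.0, 0.0], [0.0, 2.0]])  # zeros mean "not rated"
    return rmse(pred, truth)  # sqrt(((3-4)**2 + (2-2)**2) / 2) ~= 0.7071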


def evaluate(prediction, item_popular, name):
    hit = 0
    rec_count = 0
    test_count = 0
    popular_sum = 0
    all_rec_items = set()
    for u_index in range(n_users):
        items = np.where(train_data_matrix[u_index, :] == 0)[0]
        pre_items = sorted(
            dict(zip(items, prediction[u_index, items])).items(),
            key=itemgetter(1),
            reverse=True)[:20]
        test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

        # Compare the recommended set against the test set: item, w
        for item, _ in pre_items:
            if item in test_items:
                hit += 1
            all_rec_items.add(item)

            # popular_sum accumulates the popularity of every recommended item
            if item in item_popular:
                popular_sum += math.log(1 + item_popular[item])

        rec_count += len(pre_items)
        test_count += len(test_items)

    precision = hit / (1.0 * rec_count)
    # Recall, relative to the test set
    recall = hit / (1.0 * test_count)
    # Coverage, relative to the training set
    coverage = len(all_rec_items) / (1.0 * len(item_popular))
    popularity = popular_sum / (1.0 * rec_count)
    print('--- %s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
        name, precision, recall, coverage, popularity), file=sys.stderr)
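
# A worked example (not from the original script) of the four metrics above:
# with 2 users, a top-20 list each (rec_count = 40), 4 total hits against
# 10 test items per user (test_count = 20), and 30 distinct recommended items
# out of 100 popular ones:
#   precision = 4 / 40   = 0.10  (hits / recommendations made)
#   recall    = 4 / 20   = 0.20  (hits / items in the test set)
#   coverage  = 30 / 100 = 0.30  (distinct recommended items / popular items)
#   popularity = mean of log(1 + popularity_count) over all recommendations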


def recommend(u_index, prediction):
    items = np.where(train_data_matrix[u_index, :] == 0)[0]
    pre_items = sorted(
        dict(zip(items, prediction[u_index, items])).items(),
        key=itemgetter(1),
        reverse=True)[:10]
    test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

    result = [key for key, value in pre_items]
    result.sort(reverse=False)
    print('Ground truth (%s): %s' % (len(test_items), test_items))
    print('Recommended (%s): %s' % (len(result), result))


def main():
    global n_users, train_data_matrix, test_data_matrix
    # Memory-based collaborative filtering
    # ...
    # Split the dataset
    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
    path_root = "/Users/jiangzl/work/data/机器学习"
    dataFile = '%s/16.RecommenderSystems/ml-100k/u.data' % path_root

    df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)

    # Compute the similarities
    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
        n_users, n_items, train_data, test_data)

    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')

    # Evaluation: root mean squared error
    print('>>> Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
    print('>>> User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

    # Model-based collaborative filtering
    # ...
    # Compute the sparsity of the MovieLens dataset (n_users and n_items are fixed,
    # so less user-behavior data means less information; the sparser the matrix,
    # the more room there is for improvement)
    sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
    print('\nSparsity of MovieLens100K: %s%%\n' % (sparsity * 100))
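
    # A quick check of the arithmetic (not in the original script): ml-100k has
    # 100,000 ratings over 943 users x 1682 movies = 1,586,126 cells, so
    # sparsity = 1 - 100000 / 1586126 ~= 0.937, i.e. about 93.7% of cells are empty.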

    # # Compute the largest k singular values/vectors of the sparse matrix,
    # # searching for the k with the lowest RMSE:
    # minrmse = math.inf
    # index = 1
    # for k in range(1, 30, 1):
    #     u, s, vt = svds(train_data_matrix, k=k)
    #     # print(">>> ", s)
    #     s_diag_matrix = np.diag(s)
    #     svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    #     r_rmse = rmse(svd_prediction, test_data_matrix)
    #     if r_rmse < minrmse:
    #         index = k
    #         minrmse = r_rmse

    # Best k and RMSE cached from a previous run of the search above
    index = 11
    minrmse = 2.6717213264389765
    u, s, vt = svds(train_data_matrix, k=index)
    # print(">>> ", s)
    s_diag_matrix = np.diag(s)
    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    r_rmse = rmse(svd_prediction, test_data_matrix)
    print("+++ k=%s, svd-shape: %s" % (index, np.shape(svd_prediction)))
    print('>>> Model based CF RMSE: %s\n' % r_rmse)
    # """
    # For the same amount of information, a smaller matrix carries more reliable
    # information per entry.
    # Hence user-cf tends to beat item-cf; and after the SVD, around 15 dimensions
    # already capture most of the signal, so the information is more reliable and
    # the results better.
    # item-cf: 1682
    # user-cf: 943
    # svd: 15
    # """
    evaluate(item_prediction, item_popular, 'item')
    evaluate(user_prediction, item_popular, 'user')
    evaluate(svd_prediction, item_popular, 'svd')

    # Recommendation results
    # recommend(1, item_prediction)
    # recommend(1, user_prediction)
    recommend(1, svd_prediction)


if __name__ == "__main__":
    main()