mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-13 07:15:26 +08:00
更新文档数据
This commit is contained in:
68
src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py
Normal file
68
src/python/16.RecommenderSystems/sklearn-RS-demo-cf.py
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/python
# coding:utf8
"""Memory-based collaborative filtering demo on the MovieLens 100k dataset.

Loads the u.data ratings file, splits it into train/test sets, builds dense
user-item rating matrices, and computes cosine similarity between users and
between items.
"""

from math import sqrt

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# sklearn.model_selection provides the same train_test_split under the alias
# used below, so downstream code is unchanged.
from sklearn import model_selection as cv
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances

# Load the dataset
header = ['user_id', 'item_id', 'rating', 'timestamp']
# http://files.grouplens.org/datasets/movielens/ml-100k.zip
dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
df = pd.read_csv(dataFile, sep='\t', names=header)

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
# Parenthesized single-argument print works identically on Python 2 and 3.
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

# Split the dataset into train and test parts
train_data, test_data = cv.train_test_split(df, test_size=0.25)

# Build one user x item rating matrix for the training data and one for the
# test data; ids in u.data are 1-based, hence the -1 when indexing.
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

# Cosine similarity via sklearn's pairwise_distances (note: this is actually
# cosine *distance* = 1 - cosine similarity, as in the original demo).
user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
def predict(rating, similarity, type='user'):
    """Predict ratings with memory-based collaborative filtering.

    Parameters
    ----------
    rating : ndarray, shape (n_users, n_items)
        Training rating matrix (0 marks "not rated").
    similarity : ndarray
        user x user similarity matrix for type='user', or
        item x item similarity matrix for type='item'.
    type : str
        'user' for user-based CF, 'item' for item-based CF.
        (Name shadows the builtin but is kept for caller compatibility.)

    Returns
    -------
    ndarray of predicted ratings with the same shape as `rating`.

    Raises
    ------
    ValueError
        If `type` is neither 'user' nor 'item' (previously this fell
        through to a confusing UnboundLocalError on `return pred`).
    """
    if type == 'user':
        # Remove each user's mean-rating bias, blend the residuals of
        # similar users, then add the bias back.
        mean_user_rating = rating.mean(axis=1)
        rating_diff = (rating - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        # Weighted average of the user's own ratings over similar items.
        pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    else:
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))
    return pred
# Run both memory-based CF variants over the training matrix; the two
# calls are independent of each other.
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')
def rmse(prediction, ground_truth):
    """Root-mean-squared error restricted to the rated (non-zero) entries."""
    rated = ground_truth.nonzero()
    return sqrt(mean_squared_error(prediction[rated].flatten(),
                                   ground_truth[rated].flatten()))
|
||||
# Evaluate both memory-based models against the held-out test ratings.
# Parenthesized single-argument print works on both Python 2 and 3
# (the original py2 print statements are syntax errors on py3).
print('User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
# Fixed label typo: was 'RMSe'.
print('Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

# Fraction of the user-item matrix that carries no rating at all.
sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
print('The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%')

# Model-based CF: reconstruct the rating matrix from a rank-20 truncated SVD.
u, s, vt = svds(train_data_matrix, k=20)
s_diag_matrix = np.diag(s)
x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
# Fixed label: the value printed is an RMSE (computed by rmse()), not an MSE.
print('User-based CF RMSE: ' + str(rmse(x_pred, test_data_matrix)))
30
src/python/16.RecommenderSystems/sklearn-RS-demo-item.py
Normal file
30
src/python/16.RecommenderSystems/sklearn-RS-demo-item.py
Normal file
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/python
# coding:utf8
"""Plot the 2-D NMF item factors of a small toy rating matrix."""

import numpy as np
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt

# 4 users x 6 items toy rating matrix (0 = not rated).
RATE_MATRIX = np.array(
    [[5, 5, 3, 0, 5, 5],
     [5, 0, 4, 0, 4, 4],
     [0, 3, 0, 5, 4, 5],
     [5, 4, 3, 3, 5, 5]]
)

# Factorize into 2 latent topics: users x topics and topics x items.
nmf = NMF(n_components=2)
user_distribution = nmf.fit_transform(RATE_MATRIX)
item_distribution = nmf.components_

# Transpose so each row is one item's coordinates in topic space.
item_distribution = item_distribution.T
plt.plot(item_distribution[:, 0], item_distribution[:, 1], "b*")
plt.xlim((-1, 3))
plt.ylim((-1, 3))

plt.title(u'the distribution of items (NMF)')
# Label items 1..n; enumerate replaces the original manual counter.
for count, item in enumerate(item_distribution, start=1):
    plt.text(item[0], item[1], 'item '+str(count), bbox=dict(facecolor='red', alpha=0.2),)

plt.show()
31
src/python/16.RecommenderSystems/sklearn-RS-demo-user.py
Normal file
31
src/python/16.RecommenderSystems/sklearn-RS-demo-user.py
Normal file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/python
# coding:utf8
"""Plot the 2-D NMF user factors of a small toy rating matrix."""

import numpy as np
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt

# 4 users x 6 items toy rating matrix (0 = not rated).
RATE_MATRIX = np.array(
    [[5, 5, 3, 0, 5, 5],
     [5, 0, 4, 0, 4, 4],
     [0, 3, 0, 5, 4, 5],
     [5, 4, 3, 3, 5, 5]]
)

# Factorize into 2 latent topics.
nmf = NMF(n_components=2)
user_distribution = nmf.fit_transform(RATE_MATRIX)
item_distribution = nmf.components_

# Pair each user's name with their coordinates in topic space.
users = ['Ben', 'Tom', 'John', 'Fred']
zip_data = zip(users, user_distribution)

plt.title(u'the distribution of users (NMF)')
plt.xlim((-1, 3))
plt.ylim((-1, 4))
# Tuple unpacking replaces the original item[0]/item[1] indexing.
for user_name, data in zip_data:
    plt.plot(data[0], data[1], "b*")
    plt.text(data[0], data[1], user_name, bbox=dict(facecolor='red', alpha=0.2),)

plt.show()
22
src/python/16.RecommenderSystems/sklearn-RS-demo.py
Normal file
22
src/python/16.RecommenderSystems/sklearn-RS-demo.py
Normal file
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/python
# coding:utf8
"""Decompose a toy rating matrix with NMF and print the latent-topic factors."""

import numpy as np
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt

# 4 users x 6 items toy rating matrix (0 = not rated).
RATE_MATRIX = np.array(
    [[5, 5, 3, 0, 5, 5],
     [5, 0, 4, 0, 4, 4],
     [0, 3, 0, 5, 4, 5],
     [5, 4, 3, 3, 5, 5]]
)

nmf = NMF(n_components=2)  # assume 2 latent topics
user_distribution = nmf.fit_transform(RATE_MATRIX)
item_distribution = nmf.components_

# The original py2 print statements are syntax errors on Python 3; the
# parenthesized single-argument form prints identically on both versions.
# Output strings ("用户的主题分布:" = user topic distribution,
# "物品的主题分布:" = item topic distribution) are preserved verbatim.
print('用户的主题分布:')
print(user_distribution)
print('物品的主题分布:')
print(item_distribution)
Reference in New Issue
Block a user