mirror of
https://github.com/apachecn/ailearning.git
synced 2026-04-15 02:41:18 +08:00
修复原来删除 nlp的内容,迁移到 docs/nlp_old 下面
This commit is contained in:
318
tutorials/RecommenderSystems/rs_content_demo.py
Normal file
318
tutorials/RecommenderSystems/rs_content_demo.py
Normal file
@@ -0,0 +1,318 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf-8
|
||||
# -------------------------------------------------------------------------------
|
||||
# Name: 推荐系统
|
||||
# Purpose: 基于内容推荐
|
||||
# Author: jiangzhonglian
|
||||
# Create_time: 2020年10月15日
|
||||
# Update_time: 2020年10月21日
|
||||
# -------------------------------------------------------------------------------
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# 自定义库
|
||||
import config.set_content as setting
|
||||
from middleware.utils import pd_load, pd_like, pd_save, pd_rename, get_days
|
||||
|
||||
|
||||
def data_converting(infile, outfile):
    """Convert raw user transaction records into (user_id, item_id, rating) rows.

    The raw data holds user account / fund code / signed amount / time; the
    output aggregates the absolute traded amount per (user, fund) pair.
    """
    print("Loading user daliy data...")
    raw = pd_load(infile)
    # Trading direction is irrelevant here - only traded volume matters.
    raw["money"] = raw["变动金额"].apply(abs)
    grouped = (
        raw.groupby(["用户账号", "证券代码"], as_index=False)
        .agg({"money": np.sum})
        .sort_values("money", ascending=True)
    )
    pd_rename(grouped, ["user_id", "item_id", "rating"])
    pd_save(grouped, outfile)
|
||||
|
||||
|
||||
def create_user2item(infile, outfile):
    """Build the dense user-item rating matrix and save it.

    Reads (user_id, item_id, rating) rows and writes a DataFrame indexed by
    user_id with one column per item_id; unobserved pairs stay 0.
    """
    print("Loading user daliy data...")
    df_user_item = pd_load(infile)

    user_ids = sorted(df_user_item['user_id'].unique(), reverse=False)
    item_ids = sorted(df_user_item['item_id'].unique(), reverse=False)
    rating_matrix = pd.DataFrame(
        np.zeros([len(user_ids), len(item_ids)]), index=user_ids, columns=item_ids)

    print("Converting data...")
    count = 0
    user_num = len(user_ids)
    for uid in user_ids:
        user_rating = df_user_item[df_user_item['user_id'] == uid].drop(['user_id'], axis=1)
        user_rated_num = len(user_rating)
        for row in range(0, user_rated_num):
            # Fix: the original reused the name `item_id` here, shadowing the
            # list of all item ids built above; renamed to `iid`.
            iid = user_rating['item_id'].iloc[row]
            # row (user), column (item), value (rating)
            rating_matrix.loc[uid, iid] = user_rating['rating'].iloc[row]

        count += 1
        if count % 10 == 0:
            completed_percentage = round(float(count) / user_num * 100)
            print("Completed %s" % completed_percentage + "%")

    rating_matrix.index.name = 'user_id'
    pd_save(rating_matrix, outfile, index=True)
|
||||
|
||||
|
||||
def create_item2feature(infile, outfile):
    """Build the item -> feature matrix (one-hot for the categorical columns)."""
    print("Loading item feature data...")
    df_item_info = pd_load(infile, header=1)
    items_num = df_item_info.shape[0]

    # Plain columns are copied through; the two categorical columns are
    # expanded into one-hot indicator columns.
    categorical = ["info_type", "info_investype"]
    new_cols = [c for c in df_item_info.columns.tolist() if c not in categorical]
    new_cols.append({
        "info_type": sorted(df_item_info["info_type"].unique(), reverse=False),
        "info_investype": sorted(df_item_info["info_investype"].unique(), reverse=False),
    })

    def get_new_columns(cols):
        """Flatten the mixed list of plain names and {column: categories} dicts."""
        flat = []
        for col in cols:
            if isinstance(col, dict):
                for _, values in col.items():
                    flat += values
            else:
                flat.append(col)
        return flat

    new_columns = get_new_columns(new_cols)

    def deal_line(line, cols):
        """Turn one source row into passthrough values plus 0/1 one-hot flags."""
        result = []
        for col in cols:
            if isinstance(col, str):
                result.append(line[col])
            elif isinstance(col, dict):
                for key, values in col.items():
                    for value in values:
                        result.append(1 if value == line[key] else 0)
            else:
                print("类型错误")
                sys.exit(1)
        return result

    df = df_item_info.apply(lambda line: deal_line(line, new_cols), axis=1, result_type="expand")
    pd_rename(df, new_columns)

    # Age of the fund in days, relative to a fixed reference date.
    end_time = "2020-10-19"
    df["days"] = df["info_creattime"].apply(lambda str_time: get_days(str_time, end_time))
    df.drop(['info_name', 'info_foundlevel', 'info_creattime'], axis=1, inplace=True)
    pd_save(df, outfile)
|
||||
|
||||
|
||||
def rs_1_data_preprocess():
    """Prepare and load the two matrices used by the recommender.

    Returns (user_feature, item_feature): the user-item rating matrix indexed
    by user_id and the item feature matrix indexed by item_id. Intermediate
    files are regenerated only when missing.
    """
    # Raw transaction log.
    data_infile = setting.PATH_CONFIG["user_daily"]
    # user-item-rating triples and the dense rating matrix built from them.
    user_infile = setting.PATH_CONFIG["user_item"]
    user_outfile = setting.PATH_CONFIG["matrix_user_item2rating"]
    # Item feature files.
    item_infile = setting.PATH_CONFIG["item_info"]
    item_outfile = setting.PATH_CONFIG["matrix_item2feature"]

    if not os.path.exists(user_infile):
        # Rebuild everything from the raw transactions.
        data_converting(data_infile, user_infile)
        create_user2item(user_infile, user_outfile)
    else:
        if not os.path.exists(user_outfile):
            # Triples exist but the dense rating matrix does not.
            create_user2item(user_infile, user_outfile)

    if not os.path.exists(item_outfile):
        # Build the item-feature indicator matrix.
        create_item2feature(item_infile, item_outfile)

    user_feature = pd_load(user_outfile)
    item_feature = pd_load(item_outfile)
    user_feature.set_index('user_id', inplace=True)
    item_feature.set_index('item_id', inplace=True)
    return user_feature, item_feature
|
||||
|
||||
|
||||
def cos_measure(item_feature_vector, user_rated_items_matrix):
    """Cosine similarity between one item vector and each row of a matrix.

    :param item_feature_vector: 1 x d np.matrix of the target item's features
    :param user_rated_items_matrix: n x d np.matrix of rated items' features
    :return: 1 x n matrix of cosine similarities
    """
    # A tiny epsilon keeps the ratio defined when the dot product is zero.
    dot = (item_feature_vector * user_rated_items_matrix.T) + 0.0000001
    norm_item = np.sqrt(item_feature_vector * item_feature_vector.T)
    norm_rows = np.sqrt((user_rated_items_matrix * user_rated_items_matrix.T).diagonal())
    return dot / (norm_item * norm_rows)
|
||||
|
||||
|
||||
def comp_user_feature(user_rated_vector, item_feature_matrix):
    """Derive a user's taste vector from their ratings.

    Items rated at or above the user's mean rating count as "liked", the rest
    as "disliked". Each group's item features are averaged with rating-based
    weights, and the dislike profile is subtracted from the like profile.

    :param user_rated_vector: pd.Series of the user's item ratings
    :param item_feature_matrix: DataFrame of item features indexed by item id
    :return: 1 x d np.matrix taste vector
    """
    threshold = user_rated_vector.mean()
    user_like_item = user_rated_vector.loc[user_rated_vector >= threshold]
    user_unlike_item = user_rated_vector.loc[user_rated_vector < threshold]

    print("user_like_item: \n", user_like_item)
    print("user_unlike_item: \n", user_unlike_item)

    # Item ids of the liked / disliked groups.
    like_index = list(map(int, user_like_item.index.values))
    unlike_index = list(map(int, user_unlike_item.index.values))
    # Their rating values.
    like_ratings = np.matrix(user_like_item.values)
    unlike_ratings = np.matrix(user_unlike_item.values)

    # Feature matrices of the liked / disliked items.
    like_features = np.matrix(item_feature_matrix.loc[like_index, :].values)
    unlike_features = np.matrix(item_feature_matrix.loc[unlike_index, :].values)

    # Normalise ratings into weights.
    weight_of_like = like_ratings / like_ratings.sum()
    weight_of_unlike = unlike_ratings / unlike_ratings.sum()

    print("weight_of_like: ", weight_of_like)
    print("weight_of_unlike: ", weight_of_unlike)

    # Taste = weighted like profile minus weighted dislike profile.
    return weight_of_like * like_features - weight_of_unlike * unlike_features
|
||||
|
||||
|
||||
def rs_2_cb_recommend(user_feature, item_feature_matrix, K=20):
    """Recommend the Top-K items most similar to the user's taste profile.

    :param user_feature: the user's rating vector over items (pd.Series)
    :param item_feature_matrix: feature matrix of all items
    :param K: number of items to recommend
    :return: array of the item ids of the K most similar items
    """
    # Items the user has already rated (rating > 0).
    user_rated_vector = user_feature.loc[user_feature > 0]
    # Already-bought items may be recommended again, so every item is a candidate.
    user_unrated_vector = user_feature

    # The user's aggregated taste vector.
    user_item_feature_tol = comp_user_feature(user_rated_vector, item_feature_matrix)
    print(">>> 用户调性", user_item_feature_tol)

    # Feature matrix of the candidate items.
    candidate_index = list(map(int, user_unrated_vector.index.values))
    candidate_features = np.matrix(item_feature_matrix.loc[candidate_index, :].values)

    # Cosine similarity between the taste vector and every candidate, sorted
    # descending so the best matches come first.
    similarity = list(np.array(cos_measure(user_item_feature_tol, candidate_features))[0])
    item_sim_df = pd.DataFrame({
        'item_index': list(user_unrated_vector.index.values),
        'similarity': similarity,
    })
    item_sim_df.sort_values('similarity', ascending=False, inplace=True)
    return item_sim_df.iloc[:K, 0].values
|
||||
|
||||
|
||||
def estimate_rate(user_rated_vector, similarity):
    """Estimate a user's rating for an item.

    The estimate is the similarity-weighted average of the user's existing
    ratings.

    :param user_rated_vector: 1 x n matrix of the user's known ratings
    :param similarity: 1 x n matrix of similarities to the target item
    :return: scalar rating estimate
    """
    weighted_sum = user_rated_vector * similarity.T
    rate_hat = weighted_sum / similarity.sum()
    return rate_hat[0, 0]
|
||||
|
||||
|
||||
def rs_2_cb_recommend_estimate(user_feature, item_feature_matrix, item):
    """Estimate the user's rating for one item via content-based filtering.

    :param user_feature: the user's rating vector over items (pd.Series)
    :param item_feature_matrix: feature matrix of all items
    :param item: id of the item to score
    :return: estimated rating for `item`
    """
    # Index over every item the user has a rating slot for.
    user_item_index = user_feature.index

    # Ratings and ids of the items the user actually rated.
    user_rated_vector = np.matrix(user_feature.loc[user_feature > 0].values)
    user_rated_items = list(map(int, user_item_index[user_feature > 0].values))
    user_rated_items_matrix = np.matrix(item_feature_matrix.loc[user_rated_items, :].values)

    # Feature vector of the item to score (`item` is its id).
    item_feature_vector = np.matrix(item_feature_matrix.loc[item].values)

    # Similarities between the target item and each rated item, then the
    # similarity-weighted estimate of the user's rating.
    cos_xc = cos_measure(item_feature_vector, user_rated_items_matrix)
    return estimate_rate(user_rated_vector, cos_xc)
|
||||
|
||||
|
||||
def main():
    """Demo: content-based fund recommendations for one fixed user."""
    # Demo user and number of recommendations.
    user_id = 20200930
    K = 10
    user_feature, item_feature = rs_1_data_preprocess()
    # Pick the single user's rating row from the user-item matrix.
    user_feature = user_feature.loc[user_id, :]
    print(">>> 1 \n", user_feature)
    # Known weaknesses: little transaction data, sparse fund features, and
    # the like/dislike threshold could be tuned.
    result = rs_2_cb_recommend(user_feature, item_feature, K)
    print(result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the demo only when executed as a script.
    main()
|
||||
262
tutorials/RecommenderSystems/rs_rating_demo.py
Normal file
262
tutorials/RecommenderSystems/rs_rating_demo.py
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf-8
|
||||
# -------------------------------------------------------------------------------
|
||||
# Name: 推荐系统
|
||||
# Purpose: 推荐系统: Item CF/User CF/SVD 对比
|
||||
# Author: jiangzhonglian
|
||||
# Create_time: 2020年9月21日
|
||||
# Update_time: 2020年9月21日
|
||||
# -------------------------------------------------------------------------------
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import math
|
||||
from operator import itemgetter
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.sparse.linalg import svds
|
||||
from sklearn import model_selection as cv
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from middleware.utils import TimeStat, Chart
|
||||
|
||||
|
||||
def splitData(dataFile, test_size):
    """Load the MovieLens ratings file and split it into train/test sets.

    :param dataFile: path to the tab-separated u.data file
    :param test_size: fraction of rows held out for testing
    :return: (df, n_users, n_items, train_data, test_data)
    """
    # Columns: user id, movie id, rating, timestamp.
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=columns)

    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]
    print('>>> 本数据集包含: 总用户数 = %s | 总电影数 = %s' % (n_users, n_items) )

    train_data, test_data = cv.train_test_split(df, test_size=test_size)
    print(">>> 训练:测试 = %s:%s = %s:%s" % (len(train_data), len(test_data), 1-test_size, test_size))
    return df, n_users, n_items, train_data, test_data
|
||||
|
||||
|
||||
def calc_similarity(n_users, n_items, train_data, test_data):
    """Build dense rating matrices, similarity matrices and popularity counts.

    :return: (train_data_matrix, test_data_matrix, user_similarity,
              item_similarity, item_popular)
    """
    # Dense user x item rating matrices; ids in the raw data are 1-based.
    # itertuples row: Pandas(Index=..., user_id=..., item_id=..., rating=..., timestamp=...)
    train_data_matrix = np.zeros((n_users, n_items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    test_data_matrix = np.zeros((n_users, n_items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    print("1:", np.shape(train_data_matrix))    # rows: users | cols: movies
    print("2:", np.shape(train_data_matrix.T))  # rows: movies | cols: users

    # Cosine distance between rows; smaller means more similar.
    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")

    print('开始统计流行item的数量...', file=sys.stderr)
    item_popular = {}
    # Popularity of a movie = number of users who rated it in the training set.
    for i_index in range(n_items):
        if np.sum(train_data_matrix[:, i_index]) != 0:
            item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)

    item_count = len(item_popular)
    print('总共流行 item 数量 = %d' % item_count, file=sys.stderr)
    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
|
||||
|
||||
|
||||
def predict(rating, similarity, type='user'):
    """Predict ratings from the training matrix and a similarity matrix.

    :param rating: (users x items) training rating matrix
    :param similarity: item-item or user-user similarity matrix
    :param type: 'item' or 'user' collaborative filtering
    :return: (users x items) matrix of predicted ratings
    """
    print("+++ %s" % type)
    print(" rating=", np.shape(rating))
    print(" similarity=", np.shape(similarity))
    if type == 'item':
        # (users x items) . (items x items), normalised by each item's total
        # similarity mass -> a weighted-average score per (user, item).
        pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    elif type == 'user':
        # Subtract each user's mean rating so only per-item deviations are
        # propagated through the user-user similarities, then add it back.
        mean_user_rating = rating.mean(axis=1)
        # np.newaxis adds a dimension so the subtraction broadcasts per row.
        rating_diff = rating - mean_user_rating[:, np.newaxis]
        # (users x users) . (users x items), normalised by each user's total
        # similarity mass.
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T

    return pred
|
||||
|
||||
|
||||
def rmse(prediction, ground_truth):
    """Root-mean-square error over the observed (nonzero) entries only.

    Entries where ground_truth is 0 are treated as unobserved and excluded.

    :param prediction: predicted rating matrix
    :param ground_truth: true rating matrix (0 = unobserved)
    :return: scalar RMSE
    """
    mask = ground_truth.nonzero()
    prediction = prediction[mask].flatten()
    ground_truth = ground_truth[mask].flatten()
    # Equivalent to sklearn's mean_squared_error, without the extra dependency.
    return math.sqrt(np.mean((prediction - ground_truth) ** 2))
|
||||
|
||||
|
||||
def evaluate(prediction, item_popular, name):
    """Report precision / recall / coverage / popularity for Top-20 lists.

    NOTE: relies on module globals n_users, train_data_matrix and
    test_data_matrix, which main() assigns before calling this.
    """
    hit = 0
    rec_count = 0
    test_count = 0
    popular_sum = 0
    all_rec_items = set()
    for u_index in range(n_users):
        # Candidate items: ones this user has not rated in the training set.
        unrated = np.where(train_data_matrix[u_index, :] == 0)[0]
        top_items = sorted(
            dict(zip(unrated, prediction[u_index, unrated])).items(),
            key=itemgetter(1),
            reverse=True)[:20]
        truth = np.where(test_data_matrix[u_index, :] != 0)[0]

        # Compare the recommended set against the test set.
        for item, _ in top_items:
            if item in truth:
                hit += 1
            all_rec_items.add(item)
            # Accumulate log-popularity of everything recommended.
            if item in item_popular:
                popular_sum += math.log(1 + item_popular[item])

        rec_count += len(top_items)
        test_count += len(truth)

    precision = hit / (1.0 * rec_count)
    # Recall: relative to the held-out test interactions.
    recall = hit / (1.0 * test_count)
    # Coverage: relative to all items seen in training.
    coverage = len(all_rec_items) / (1.0 * len(item_popular))
    popularity = popular_sum / (1.0 * rec_count)
    print('--- %s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
        name, precision, recall, coverage, popularity), file=sys.stderr)
|
||||
|
||||
|
||||
def recommend(u_index, prediction):
    """Print the Top-10 predicted items for one user versus their test items.

    NOTE: relies on module globals train_data_matrix / test_data_matrix
    assigned in main().
    """
    # Candidate items: unrated in the training set, ranked by prediction.
    unrated = np.where(train_data_matrix[u_index, :] == 0)[0]
    top_items = sorted(
        dict(zip(unrated, prediction[u_index, unrated])).items(),
        key=itemgetter(1),
        reverse=True)[:10]
    truth = np.where(test_data_matrix[u_index, :] != 0)[0]

    result = sorted(key for key, value in top_items)
    print('原始结果(%s): %s' % (len(truth), truth) )
    print('推荐结果(%s): %s' % (len(result), result) )
|
||||
|
||||
|
||||
def main():
    """Compare Item-CF, User-CF and SVD recommenders on MovieLens-100K."""
    global n_users, train_data_matrix, test_data_matrix

    # --- memory-based collaborative filtering ---
    # Dataset: http://files.grouplens.org/datasets/movielens/ml-100k.zip
    path_root = "/Users/jiangzl/work/data/机器学习"
    dataFile = '%s/16.RecommenderSystems/ml-100k/u.data' % path_root

    df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)

    # Similarity matrices and item popularity counts.
    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
        n_users, n_items, train_data, test_data)

    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')

    # Evaluation: root-mean-square error.
    print('>>> Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
    print('>>> User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

    # --- model-based collaborative filtering ---
    # Sparsity of the data: the sparser the matrix, the more room SVD has.
    sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
    print('\nMovieLen100K的稀疏度: %s%%\n' % (sparsity * 100))

    # k=11 and its RMSE were found by a previous grid search over k in 1..29.
    index = 11
    minrmse = 2.6717213264389765
    u, s, vt = svds(train_data_matrix, k=index)
    s_diag_matrix = np.diag(s)
    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    r_rmse = rmse(svd_prediction, test_data_matrix)
    print("+++ k=%s, svd-shape: %s" % (index, np.shape(svd_prediction)) )
    print('>>> Model based CF RMSE: %s\n' % minrmse)

    # Smaller matrices carry more reliable information per entry, which is
    # why svd (k=11) tends to beat user-cf (943) and item-cf (1682).
    evaluate(item_prediction, item_popular, 'item')
    evaluate(user_prediction, item_popular, 'user')
    evaluate(svd_prediction, item_popular, 'svd')

    # Show recommendations for one user from the SVD model.
    recommend(1, svd_prediction)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the comparison only when executed as a script.
    main()
|
||||
0
tutorials/keras/__init__.py
Normal file
0
tutorials/keras/__init__.py
Normal file
121
tutorials/keras/brat_tag.py
Normal file
121
tutorials/keras/brat_tag.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
数据格式转化
|
||||
"""
|
||||
import os
|
||||
import emoji
|
||||
from middleware.utils import get_catalog_files
|
||||
from config.setting import Config
|
||||
|
||||
# Mapping from the Chinese brat annotation labels to BIO tag suffixes.
tag_dic = {
    "实体对象": "ORG",
    "正向观点": "Po_VIEW",
    "中性观点": "Mi_VIEW",
    "负向观点": "Ne_VIEW",
}
|
||||
|
||||
|
||||
# 转换成可训练的格式,最后以"END O"结尾
|
||||
def from_ann2dic(r_ann_path, r_txt_path, w_path):
    """Convert one brat .ann/.txt pair into per-character BIO training lines.

    Output format: one "<char> <tag>" line per character, sentence breaks
    preserved, terminated by an "END O" sentinel line.
    """
    q_dic = {}
    print("开始读取文件:%s" % r_ann_path)
    with open(r_ann_path, "r", encoding="utf-8") as f:
        for line in f.readlines():
            line_arr = line.split()
            cls = tag_dic[line_arr[1]]
            start_index = int(line_arr[2])
            end_index = int(line_arr[3])
            # First character of an annotated span gets B-, the rest I-.
            for offset in range(end_index - start_index):
                q_dic[start_index + offset] = ("B-%s" % cls) if offset == 0 else ("I-%s" % cls)

    # q_dic maps char offset -> BIO tag, e.g. {23: 'B-Ne_VIEW', 24: 'I-Ne_VIEW'}
    print("q_dic: ", q_dic)

    print("开始读取文件内容: %s" % r_txt_path)
    with open(r_txt_path, "r", encoding="utf-8") as f:
        content_str = f.read()

    print("开始写入文本%s" % w_path)
    with open(w_path, "w", encoding="utf-8") as w:
        for i, strA in enumerate(content_str):
            if strA == "\n":
                w.write("\n")
            else:
                # Untagged characters fall back to capital letter O.
                tag = q_dic[i] if i in q_dic else "O"
                w.write('%s %s\n' % (strA, tag))
        w.write('%s\n' % "END O")
|
||||
|
||||
|
||||
# 生成train.txt、dev.txt、test.txt
|
||||
# 除8,9-new.txt分别用于dev和test外,剩下的合并成train.txt
|
||||
def create_train_data(data_root_dir, w_path):
    """Merge the per-document converted files into the training file.

    Files ending in 8-new.txt / 9-new.txt are renamed to dev.txt / test.txt
    instead of being merged; every other file is appended to w_path up to,
    but excluding, its trailing "END O" sentinel line.
    """
    if os.path.exists(w_path):
        os.remove(w_path)
    for file in os.listdir(data_root_dir):
        path = os.path.join(data_root_dir, file)
        if file.endswith("8-new.txt"):
            # Held out as the dev split.
            os.rename(path, os.path.join(data_root_dir, "dev.txt"))
            continue
        if file.endswith("9-new.txt"):
            # Held out as the test split.
            os.rename(path, os.path.join(data_root_dir, "test.txt"))
            continue

        q_list = []
        print("开始读取文件:%s" % file)
        with open(path, "r", encoding="utf-8") as f:
            for line in f.readlines():
                line = line.rstrip()
                if line == "END O":
                    break
                q_list.append(line)

        # q_list holds '<char> <tag>' lines, e.g. ['尺 B-ORG', '码 I-ORG', ...]
        print("开始写入文本: %s" % w_path)
        with open(w_path, "a", encoding="utf-8") as f:
            for item in q_list:
                f.write('%s\n' % item)
|
||||
|
||||
|
||||
def brat_1_format_origin(catalog):
    """Normalise the raw corpus before brat tagging.

    Emoji occupy 2 characters in brat but 1 in Python, which breaks the
    annotation offsets, so each emoji is replaced with its :name: alias.

    Bug fix: the original body referenced an undefined global `path_root`
    (NameError at runtime); it now uses the `catalog` parameter.
    """
    with open('%s/origin/origin.txt' % catalog, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # Rewrite the original text with every emoji demojized.
    with open('%s/tag_befer/befer.txt' % catalog, "w", encoding="utf-8") as f:
        for line in lines:
            text = emoji.demojize(line)
            f.write('%s' % text)
    # Create an empty annotation file for brat to fill in.
    with open('%s/tag_befer/befer.ann' % catalog, "w", encoding="utf-8") as f:
        pass
|
||||
|
||||
def brat_2_create_train_data(catalog):
    """Convert all tagged .ann/.txt pairs under tag_after/ and build the splits."""
    paths = get_catalog_files("%s/tag_after" % catalog, status=-1, str1=".DS_Store")
    # Reduce full paths to unique base names (no directory, no extension).
    names = list(set([i.split("/")[-1].split(".")[0] for i in paths]))
    print(names)
    for base in names:
        r_ann_path = os.path.join(catalog, "tag_after/%s.ann" % base)
        r_txt_path = os.path.join(catalog, "tag_after/%s.txt" % base)
        w_path = os.path.join(catalog, "new/%s-new.txt" % base)
        print("filename", r_ann_path, r_txt_path, w_path)
        from_ann2dic(r_ann_path, r_txt_path, w_path)
    # Build train.txt / dev.txt / test.txt from the converted files.
    create_train_data("%s/new" % catalog, "%s/new/train.txt" % catalog)
|
||||
|
||||
|
||||
def main():
    """Entry point: build NER training data from the brat annotations."""
    catalog = Config.nlp_ner.path_root
    # Step 1 is run once, manually: brat_1_format_origin(catalog)
    brat_2_create_train_data(catalog)
|
||||
165
tutorials/keras/text_NER.py
Normal file
165
tutorials/keras/text_NER.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import platform
|
||||
from collections import Counter
|
||||
import keras
|
||||
from keras.models import Sequential
|
||||
from keras.layers import Embedding, Bidirectional, LSTM, Dropout
|
||||
from keras_contrib.layers import CRF
|
||||
from keras_contrib.losses import crf_loss
|
||||
from keras_contrib.metrics import crf_viterbi_accuracy
|
||||
"""
|
||||
# padding: pre(默认) 向前补充0 post 向后补充0
|
||||
# truncating: 文本超过 pad_num, pre(默认) 删除前面 post 删除后面
|
||||
# x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post")
|
||||
# print("--- ", x_train[0][:20])
|
||||
|
||||
使用keras_bert、keras_contrib的crf时bug记录
|
||||
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match
|
||||
解决方案, 修改crf.py 516行:
|
||||
mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
|
||||
为:
|
||||
mask2 = K.cast(K.concatenate([mask, K.cast(K.zeros_like(mask[:, :1]), mask.dtype)], axis=1),
|
||||
"""
|
||||
from keras.preprocessing.sequence import pad_sequences
|
||||
from config.setting import Config
|
||||
|
||||
|
||||
def load_data():
    """Load, encode and persist the NER train/test data.

    :return: (train, test, (vocab, chunk_tags)) where train/test are the
             (x, y) arrays produced by _process_data
    """
    train = _parse_data(Config.nlp_ner.path_train)
    test = _parse_data(Config.nlp_ner.path_test)
    print("--- init 数据加载解析完成 ---")

    # Keep only characters seen at least twice in the training set,
    # e.g. Counter({'的': 8, '中': 7, ...}).
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = Config.nlp_ner.chunk_tags

    # Persist vocab + tags so inference can rebuild the same encoding.
    with open(Config.nlp_ner.path_config, 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)
    print("--- init 配置文件保存成功 ---")

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    print("--- init 对数据进行编码,生成训练需要的数据格式 ---")
    return train, test, (vocab, chunk_tags)
|
||||
|
||||
|
||||
def _parse_data(filename):
|
||||
"""
|
||||
以单下划线开头(_foo)的代表不能直接访问的类属性
|
||||
用于解析数据,用于模型训练
|
||||
:param filename: 文件地址
|
||||
:return: data: 解析数据后的结果
|
||||
[[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ['国', 'I-ORG']]]
|
||||
"""
|
||||
with open(filename, 'rb') as fn:
|
||||
split_text = '\n'
|
||||
# 主要是分句: split_text 默认每个句子都是一行,所以原来换行就需要 两个split_text
|
||||
texts = fn.read().decode('utf-8').strip().split(split_text + split_text)
|
||||
# 对于每个字需要 split_text, 而字的内部需要用空格分隔
|
||||
# len(row) > 0 避免连续2个换行,导致 row 数据为空
|
||||
# row.split() 会删除空格或特殊符号,导致空格数据缺失!
|
||||
data = [[[" ", "O"] if len(row.split()) != 2 else row.split() for row in text.split(split_text) if len(row) > 0] for text in texts]
|
||||
# data = [[row.split() for row in text.split(split_text) if len(row.split()) == 2] for text in texts]
|
||||
return data
|
||||
|
||||
|
||||
def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    """Encode parsed [[char, tag], ...] samples into padded index arrays.

    :param data: output of _parse_data
    :param vocab: list of known characters
    :param chunk_tags: list of tag names; labels are their indices
    :param maxlen: pad/truncate length, defaults to the longest sample
    :param onehot: if True, one-hot encode the labels
    :return: (x, y_chunk) arrays ready for model.fit
    """
    if maxlen is None:
        maxlen = max(len(s) for s in data)

    # Character -> integer id; unknown characters fall back to id 1 ("unk").
    char_to_id = {w: i for i, w in enumerate(vocab)}
    x = [[char_to_id.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding
    # Label padding uses -1 so it can be told apart from real tag indices.
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        # One-hot: index into an identity matrix, one row per tag.
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        # np.expand_dims adds a trailing axis: (samples, maxlen) -> (samples, maxlen, 1)
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk
|
||||
|
||||
|
||||
def process_data(data, vocab, maxlen=100):
    """Encode one raw text string for prediction.

    :param data: the text to encode (iterable of characters)
    :param vocab: list of known characters
    :param maxlen: pad/truncate length
    :return: (x, length) padded index array and the original text length
    """
    char_to_id = {w: i for i, w in enumerate(vocab)}
    # Unknown characters fall back to id 1 ("unk").
    x = [char_to_id.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length
|
||||
|
||||
|
||||
def create_model(len_vocab, len_chunk_tags):
    """Build and compile the BiLSTM-CRF sequence-labelling model.

    :param len_vocab: vocabulary size for the embedding layer
    :param len_chunk_tags: number of output tag classes
    :return: compiled keras Sequential model
    """
    model = Sequential()
    # Random embedding; mask_zero lets padding be ignored downstream.
    model.add(Embedding(len_vocab, Config.nlp_ner.EMBED_DIM, mask_zero=True))
    model.add(Bidirectional(LSTM(Config.nlp_ner.BiLSTM_UNITS // 2, return_sequences=True)))
    model.add(Dropout(0.25))
    crf = CRF(len_chunk_tags, sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
    return model
|
||||
|
||||
|
||||
def train():
    """Train the BiLSTM-CRF model and persist it.

    Loads the preprocessed train/test splits, fits the model, and saves
    it to ``Config.nlp_ner.path_model``.
    """
    (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    model = create_model(len(vocab), len(chunk_tags))
    # train model
    # NOTE: validation_data must be a tuple; tf.keras rejects a list here.
    model.fit(train_x, train_y,
              batch_size=16,
              epochs=Config.nlp_ner.EPOCHS,
              validation_data=(test_x, test_y))
    model.save(Config.nlp_ner.path_model)
|
||||
|
||||
|
||||
def test():
    """Run NER prediction over every line of the raw input file.

    For each sentence the model predicts one BIO tag per character;
    characters belonging to the same entity type are concatenated and the
    resulting ``{entity_type: text}`` dict is printed per sentence.
    """
    with open(Config.nlp_ner.path_config, 'rb') as inp:
        (vocab, chunk_tags) = pickle.load(inp)
    model = create_model(len(vocab), len(chunk_tags))
    # FIX: load the trained weights once — the original reloaded the same
    # weight file inside the per-line loop (loop-invariant work).
    model.load_weights(Config.nlp_ner.path_model)
    # predict_text = '造型独特,尺码偏大,估计是钉子头圆的半径的缘故'
    with open(Config.nlp_ner.path_origin, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for predict_text in lines:
        content = predict_text.strip()
        text_EMBED, length = process_data(content, vocab)
        # Sequences are left-padded, so the real tokens are the last `length`.
        raw = model.predict(text_EMBED)[0][-length:]
        pre_result = [np.argmax(row) for row in raw]
        result_tags = [chunk_tags[i] for i in pre_result]

        # Collect the entities/opinions found in this sentence.
        result = {}
        # chunk_tags looks like ["O", "B-x", "I-x", ...]: drop "O" so the
        # remaining tags pair up as (B-x, I-x) — presumably; confirm against
        # the tag scheme used at training time.
        tag_list = [i for i in chunk_tags if i not in ["O"]]
        for word, t in zip(content, result_tags):
            if t not in tag_list:
                continue
            for i in range(0, len(tag_list), 2):
                if t in tag_list[i:i + 2]:
                    tag = tag_list[i].split("-")[-1]
                    if tag not in result:
                        result[tag] = ""
                    # A B- tag starts a new span, separated by a space.
                    result[tag] += ' ' + word if t == tag_list[i] else word
        print(result)
|
||||
|
||||
|
||||
def main():
    """Entry point: train the model, then run prediction on the raw file."""
    train()
    test()
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# train()
|
||||
113
tutorials/test.ipynb
Normal file
113
tutorials/test.ipynb
Normal file
@@ -0,0 +1,113 @@
|
||||
{
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.3-final"
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"kernelspec": {
|
||||
"name": "python_defaultSpec_1599819467604",
|
||||
"display_name": "Python 3.6.3 64-bit ('python3.6': virtualenv)"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2,
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from numpy import linalg as la\n",
|
||||
"from numpy import *"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def loadExData3():\n",
|
||||
" # 利用SVD提高推荐效果,菜肴矩阵\n",
|
||||
" return[[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],\n",
|
||||
" [3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],\n",
|
||||
" [5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],\n",
|
||||
" [4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],\n",
|
||||
" [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],\n",
|
||||
" [0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0],\n",
|
||||
" [1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]]\n",
|
||||
"\n",
|
||||
"myMat = mat(loadExData3())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "matrix([[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],\n [0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],\n [3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],\n [5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],\n [4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],\n [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],\n [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],\n [0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0],\n [1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]])"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 3
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"myMat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):\n",
|
||||
"    \"\"\"recommend( )\n",
||||
" Args:\n",
|
||||
" dataMat 训练数据集\n",
|
||||
" user 用户编号\n",
|
||||
" simMeas 相似度计算方法\n",
|
||||
" estMethod 使用的推荐算法\n",
|
||||
" Returns:\n",
|
||||
" 返回最终 N 个推荐结果\n",
|
||||
" \"\"\"\n",
|
||||
" # 寻找未评级的物品\n",
|
||||
" # 对给定的用户建立一个未评分的物品列表\n",
|
||||
" \n",
|
||||
" unratedItems = nonzero(dataMat[user, :].A == 0)[1]\n",
|
||||
" # 如果不存在未评分物品,那么就退出函数\n",
|
||||
" if len(unratedItems) == 0:\n",
|
||||
" return 'you rated everything'\n",
|
||||
" # 物品的编号和评分值\n",
|
||||
" itemScores = []\n",
|
||||
" # 在未评分物品上进行循环\n",
|
||||
" for item in unratedItems:\n",
|
||||
" # 获取 item 该物品的评分\n",
|
||||
" estimatedScore = estMethod(dataMat, user, simMeas, item)\n",
|
||||
" itemScores.append((item, estimatedScore))\n",
|
||||
" # 按照评分得分 进行逆排序,获取前N个未评级物品进行推荐\n",
|
||||
" return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(recommend(myMat, 1, estMethod=svdEst))"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user