Restore the previously removed NLP content and move it under docs/nlp_old

This commit is contained in:
片刻小哥哥
2022-12-30 16:37:47 +08:00
parent 5d89832ce2
commit 71899ca1e7
53 changed files with 1653 additions and 73 deletions

View File

@@ -0,0 +1,318 @@
#!/usr/bin/python
# coding:utf-8
# -------------------------------------------------------------------------------
# Name:         Recommender system
# Purpose:      Content-based recommendation
# Author:       jiangzhonglian
# Create_time:  2020-10-15
# Update_time:  2020-10-21
# -------------------------------------------------------------------------------
import os
import sys

import numpy as np
import pandas as pd

# project-local helpers
import config.set_content as setting
from middleware.utils import pd_load, pd_like, pd_save, pd_rename, get_days
def data_converting(infile, outfile):
    """
    Convert the raw transaction records of the form
        user account, fund code, change amount, time
    into rating triples of the form
        user_id, item_id, purchase amount (used as the rating), time
    """
    print("Loading user daily data...")
    df = pd_load(infile)
    df["money"] = df["变动金额"].apply(lambda line: abs(line))
    df_user_item = df.groupby(["用户账号", "证券代码"], as_index=False).agg({
        "money": np.sum
    }).sort_values("money", ascending=True)
    pd_rename(df_user_item, ["user_id", "item_id", "rating"])
    pd_save(df_user_item, outfile)
def create_user2item(infile, outfile):
    """Build the user-item rating matrix."""
    print("Loading user daily data...")
    df_user_item = pd_load(infile)
    user_ids = sorted(df_user_item['user_id'].unique(), reverse=False)
    item_ids = sorted(df_user_item['item_id'].unique(), reverse=False)
    # print("+++ user_ids:", user_ids)
    # print("+++ item_ids:", item_ids)
    rating_matrix = np.zeros([len(user_ids), len(item_ids)])
    rating_matrix = pd.DataFrame(rating_matrix, index=user_ids, columns=item_ids)

    print("Converting data...")
    count = 0
    user_num = len(user_ids)
    for uid in user_ids:
        user_rating = df_user_item[df_user_item['user_id'] == uid].drop(['user_id'], axis=1)
        user_rated_num = len(user_rating)
        for row in range(0, user_rated_num):
            # local name; the original shadowed the item_ids list here
            iid = user_rating['item_id'].iloc[row]
            # row (user), column (item), value (rating)
            rating_matrix.loc[uid, iid] = user_rating['rating'].iloc[row]
        count += 1
        if count % 10 == 0:
            completed_percentage = round(float(count) / user_num * 100)
            print("Completed %s" % completed_percentage + "%")
    rating_matrix.index.name = 'user_id'
    pd_save(rating_matrix, outfile, index=True)
def create_item2feature(infile, outfile):
    """Build the item-feature (one-hot presence) matrix."""
    print("Loading item feature data...")
    df_item_info = pd_load(infile, header=1)
    items_num = df_item_info.shape[0]
    columns = df_item_info.columns.tolist()
    new_cols = [col for col in columns if col not in ["info_type", "info_investype"]]
    info_types = sorted(df_item_info["info_type"].unique(), reverse=False)
    info_investypes = sorted(df_item_info["info_investype"].unique(), reverse=False)
    dict_n_cols = {"info_type": info_types, "info_investype": info_investypes}
    new_cols.append(dict_n_cols)

    # expand the column spec into the final flat column names
    def get_new_columns(new_cols):
        new_columns = []
        for col in new_cols:
            if isinstance(col, dict):
                for k, vs in col.items():
                    new_columns += vs
            else:
                new_columns.append(col)
        return new_columns

    new_columns = get_new_columns(new_cols)
    # print(new_cols)
    # print(new_columns)
    # ['item_id', 'info_name', 'info_trackerror', 'info_manafeeratioo', 'info_custfeeratioo', 'info_salefeeratioo', 'info_foundsize', 'info_foundlevel', 'info_creattime', 'info_unitworth',
    #  {'info_type': ['QDII-ETF', '混合型', '股票指数', 'ETF-场内'], 'info_investype': ['契约型开放式', '契约型封闭式']}]

    def deal_line(line, new_cols):
        result = []
        for col in new_cols:
            if isinstance(col, str):
                result.append(line[col])
            elif isinstance(col, dict):
                # one-hot encode the categorical columns
                for k, vs in col.items():
                    for v in vs:
                        if v == line[k]:
                            result.append(1)
                        else:
                            result.append(0)
            else:
                print("unexpected column type")
                sys.exit(1)
        return result

    df = df_item_info.apply(lambda line: deal_line(line, new_cols), axis=1, result_type="expand")
    pd_rename(df, new_columns)
    # turn the creation date into an age in days
    end_time = "2020-10-19"
    df["days"] = df["info_creattime"].apply(lambda str_time: get_days(str_time, end_time))
    # print(df.head(5))
    df.drop(['info_name', 'info_foundlevel', 'info_creattime'], axis=1, inplace=True)
    pd_save(df, outfile)
def rs_1_data_preprocess():
    # raw transaction data
    data_infile = setting.PATH_CONFIG["user_daily"]
    # user-item-rating
    user_infile = setting.PATH_CONFIG["user_item"]
    user_outfile = setting.PATH_CONFIG["matrix_user_item2rating"]
    # item-feature
    item_infile = setting.PATH_CONFIG["item_info"]
    item_outfile = setting.PATH_CONFIG["matrix_item2feature"]
    # regenerate the intermediate files if they do not exist yet
    if not os.path.exists(user_infile):
        # data preparation: preprocess the raw user data
        data_converting(data_infile, user_infile)
        # build the user-item rating matrix
        create_user2item(user_infile, user_outfile)
    else:
        if not os.path.exists(user_outfile):
            # build the user-item rating matrix
            create_user2item(user_infile, user_outfile)
    if not os.path.exists(item_outfile):
        # build the item-feature presence matrix
        create_item2feature(item_infile, item_outfile)

    user_feature = pd_load(user_outfile)
    item_feature = pd_load(item_outfile)
    user_feature.set_index('user_id', inplace=True)
    item_feature.set_index('item_id', inplace=True)
    return user_feature, item_feature
def cos_measure(item_feature_vector, user_rated_items_matrix):
    """
    Cosine similarity between one item and a set of items.
    :param item_feature_vector: feature vector of the item to score
    :param user_rated_items_matrix: feature matrix of the items the user has already rated
    :return: vector of cosine similarities between the item and each rated item
    """
    # the small epsilon keeps the ratio well-defined when a dot product is zero
    x_c = (item_feature_vector * user_rated_items_matrix.T) + 0.0000001
    mod_x = np.sqrt(item_feature_vector * item_feature_vector.T)
    mod_c = np.sqrt((user_rated_items_matrix * user_rated_items_matrix.T).diagonal())
    cos_xc = x_c / (mod_x * mod_c)
    return cos_xc
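
# Quick sanity check (toy values, not project data): with
#   v = np.matrix([1.0, 0, 1]) and M = np.matrix([[1.0, 0, 1], [0, 1, 0]]),
# cos_measure(v, M) is approximately matrix([[1.0, 0.0]]): the first row of M
# is parallel to v, the second is orthogonal to it.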
def comp_user_feature(user_rated_vector, item_feature_matrix):
    """
    Derive the user's preference profile from the ratings.
    :param user_rated_vector : the user's rating vector
    :param item_feature_matrix: the item feature matrix
    :return: the user's preference feature vector
    """
    # mean of the user's ratings
    user_rating_mean = user_rated_vector.mean()
    # split the rated items into liked / disliked, using the user's own mean as the threshold
    user_like_item = user_rated_vector.loc[user_rated_vector >= user_rating_mean]
    user_unlike_item = user_rated_vector.loc[user_rated_vector < user_rating_mean]
    print("user_like_item: \n", user_like_item)
    print("user_unlike_item: \n", user_unlike_item)
    # indexes of the bought (liked) and sold (disliked) items
    user_like_item_index = list(map(int, user_like_item.index.values))
    user_unlike_item_index = list(map(int, user_unlike_item.index.values))
    # the corresponding rating values
    user_like_item_rating = np.matrix(user_like_item.values)
    user_unlike_item_rating = np.matrix(user_unlike_item.values)
    # feature matrices of the liked and disliked items
    user_like_item_feature_matrix = np.matrix(item_feature_matrix.loc[user_like_item_index, :].values)
    user_unlike_item_feature_matrix = np.matrix(item_feature_matrix.loc[user_unlike_item_index, :].values)
    # the user's ratings act as the weights of the preference features
    weight_of_like = user_like_item_rating / user_like_item_rating.sum()
    weight_of_unlike = user_unlike_item_rating / user_unlike_item_rating.sum()
    print("weight_of_like: ", weight_of_like)
    print("weight_of_unlike: ", weight_of_unlike)
    # liked features minus disliked features gives the overall profile
    user_like_feature = weight_of_like * user_like_item_feature_matrix
    user_unlike_feature = weight_of_unlike * user_unlike_item_feature_matrix
    user_feature_tol = user_like_feature - user_unlike_feature
    return user_feature_tol
def rs_2_cb_recommend(user_feature, item_feature_matrix, K=20):
    """
    Recommend the Top-K items most similar to the user's profile.
    :param user_feature: the user's rating vector over the items
    :param item_feature_matrix: feature matrix of all items
    :param K: number of items to recommend
    :return: ids of the Top-K most similar items
    """
    # items the user has already rated
    user_rated_vector = user_feature.loc[user_feature > 0]
    # print("rated >>> \n", user_rated_vector)
    # user_unrated_vector = user_feature.loc[user_feature == 0]
    # print("unrated >>> \n", user_unrated_vector)
    # items already bought may be recommended again, so score everything
    user_unrated_vector = user_feature
    # print(">>> \n", user_unrated_vector)
    # the user's overall preference profile
    user_item_feature_tol = comp_user_feature(user_rated_vector, item_feature_matrix)
    print(">>> user profile", user_item_feature_tol)
    # feature matrix of the candidate items
    user_unrated_item_index = list(map(int, user_unrated_vector.index.values))
    user_unrated_item_feature_matrix = np.matrix(item_feature_matrix.loc[user_unrated_item_index, :].values)
    # compute the similarities and sort them
    similarity = list(np.array(cos_measure(user_item_feature_tol, user_unrated_item_feature_matrix))[0])
    key = {'item_index': list(user_unrated_vector.index.values), 'similarity': similarity}
    item_sim_df = pd.DataFrame(key)
    item_sim_df.sort_values('similarity', ascending=False, inplace=True)
    # print(item_sim_df.head(100))
    return item_sim_df.iloc[:K, 0].values
def estimate_rate(user_rated_vector, similarity):
    """
    Estimate the user's rating for an item.
    :param user_rated_vector: the user's existing item ratings
    :param similarity: similarities between the target item and the rated items
    :return: the estimated rating, a similarity-weighted average
    """
    rate_hat = (user_rated_vector * similarity.T) / similarity.sum()
    # print(">>> ", rate_hat)
    return rate_hat[0, 0]
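
# Worked example (made-up numbers): ratings [4, 2] with similarities [0.9, 0.1]
# give (4*0.9 + 2*0.1) / (0.9 + 0.1) = 3.8; the estimate leans toward the
# rating of the more similar item.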
def rs_2_cb_recommend_estimate(user_feature, item_feature_matrix, item):
    """
    Content-based estimate of the user's rating for one item.
    :param item_feature_matrix: feature matrix of all items
    :param user_feature: the user's rating vector over the items
    :param item: id of the item to estimate
    :return: the estimated rating
    """
    # # index and feature matrix of all items
    # item_index = item_feature_matrix.index
    # item_feature = item_feature_matrix.values
    # index of all items the user has rated
    user_item_index = user_feature.index
    # the user's existing ratings, their indexes, and the rated items' feature matrix
    user_rated_vector = np.matrix(user_feature.loc[user_feature > 0].values)
    user_rated_items = list(map(int, user_item_index[user_feature > 0].values))
    user_rated_items_matrix = np.matrix(item_feature_matrix.loc[user_rated_items, :].values)
    # feature vector of the target item (the function receives the item id)
    item_feature_vector = np.matrix(item_feature_matrix.loc[item].values)
    # cosine similarities between the target item and the rated items
    cos_xc = cos_measure(item_feature_vector, user_rated_items_matrix)
    # print(">>> similarity: %s" % cos_xc)
    # estimate the user's rating for the item
    rate_hat = estimate_rate(user_rated_vector, cos_xc)
    return rate_hat
def main():
    # initialization
    user_id = 20200930
    K = 10
    user_feature, item_feature = rs_1_data_preprocess()

    # content-based module: recommend K items for one specific user
    user_feature = user_feature.loc[user_id, :]  # one row of user-item-rating data
    print(">>> 1 \n", user_feature)
    # Results are weak for a few reasons:
    # 1. there is little transaction data
    # 2. the fund features are not comprehensive enough
    # 3. the like/dislike threshold needs tuning
    result = rs_2_cb_recommend(user_feature, item_feature, K)
    print(result)
    # for code in result:
    #     # estimate the rating of a recommended item for this user
    #     price = rs_2_cb_recommend_estimate(user_feature, item_feature, code)
    #     if price > 1000:
    #         print("--- buy fund %s: %s" % (code, abs(price)))
    #     elif price < -1000:
    #         print("--- sell fund %s: %s" % (code, abs(price)))
    #     else:
    #         print("--- no action")


if __name__ == "__main__":
    main()
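
A quick, self-contained illustration of the like/dislike split that comp_user_feature performs (toy ratings and hypothetical item ids, not project data):

import pandas as pd

ratings = pd.Series({101: 5.0, 102: 1.0, 103: 3.0})   # hypothetical item ids
mean = ratings.mean()                                  # 3.0
liked = ratings[ratings >= mean]                       # items 101 and 103
disliked = ratings[ratings < mean]                     # item 102
print(liked.index.tolist(), disliked.index.tolist())   # [101, 103] [102]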

View File

@@ -0,0 +1,262 @@
#!/usr/bin/python
# coding:utf-8
# -------------------------------------------------------------------------------
# Name:         Recommender system
# Purpose:      Comparing Item CF / User CF / SVD
# Author:       jiangzhonglian
# Create_time:  2020-09-21
# Update_time:  2020-09-21
# -------------------------------------------------------------------------------
from __future__ import print_function

import sys
import math
from operator import itemgetter

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn import model_selection as cv
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances

from middleware.utils import TimeStat, Chart
def splitData(dataFile, test_size):
    # load the dataset (user id, movie id, rating, timestamp)
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=header)
    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]
    print('>>> dataset: total users = %s | total movies = %s' % (n_users, n_items))
    train_data, test_data = cv.train_test_split(df, test_size=test_size)
    print(">>> train:test = %s:%s = %s:%s" % (len(train_data), len(test_data), 1 - test_size, test_size))
    return df, n_users, n_items, train_data, test_data
def calc_similarity(n_users, n_items, train_data, test_data):
    """
    Build the user-item matrices for the training and test splits.
    line: Pandas(Index=93661, user_id=624, item_id=750, rating=4, timestamp=891961163)
    """
    train_data_matrix = np.zeros((n_users, n_items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    test_data_matrix = np.zeros((n_users, n_items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    print("1:", np.shape(train_data_matrix))    # rows: users  | columns: movies
    print("2:", np.shape(train_data_matrix.T))  # rows: movies | columns: users
    # sklearn's pairwise_distances with metric="cosine" computes cosine distance;
    # the smaller the distance, the more similar the vectors
    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
    # print("<<< %s \n %s" % (np.shape(user_similarity), user_similarity))
    # print("<<< %s \n %s" % (np.shape(item_similarity), item_similarity))

    print('counting item popularity...', file=sys.stderr)
    item_popular = {}
    # popularity of an item = the number of users who watched it
    for i_index in range(n_items):
        if np.sum(train_data_matrix[:, i_index]) != 0:
            item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)
    # save the total number of items
    item_count = len(item_popular)
    print('total popular items = %d' % item_count, file=sys.stderr)
    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
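
# Note on the metric (toy vectors): pairwise_distances(..., metric="cosine")
# returns 1 - cosine similarity, so rows [1, 0] and [0, 1] get distance 1.0
# (orthogonal) and identical rows get 0.0; smaller means more similar.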
def predict(rating, similarity, type='user'):
    """
    :param rating: training matrix (users x items)
    :param similarity: pairwise distance matrix
    :return: predicted rating matrix
    """
    print("+++ %s" % type)
    print("  rating=", np.shape(rating))
    print("  similarity=", np.shape(similarity))
    if type == 'item':
        """
        Combined score:
        rating.dot(similarity) --
            each user's rating row (1 x n_items) is multiplied by the
            item-item weight matrix, so column j accumulates that user's
            ratings weighted by how close every item is to item j:
            user-item ratings (943, 1682) * item-item weights (1682, 1682)
            = user-item combined scores (943, 1682)
        np.array([np.abs(similarity).sum(axis=1)]) --
            row-wise sums (axis=1): entry j is item j's total weight against
            all items, shape (1, 1682), used to normalize each column.
        pred = normalized user-item scores (943, 1682)
        """
        pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    elif type == 'user':
        # subtracting each user's mean removes the shared offset and
        # highlights individual differences
        mean_user_rating = rating.mean(axis=1)
        # np.newaxis adds a dimension so the subtraction broadcasts row-wise
        rating_diff = (rating - mean_user_rating[:, np.newaxis])
        """
        Combined score:
        similarity.dot(rating_diff) --
            row u weighs every user's (mean-centered) ratings by how close
            that user is to user u:
            user-user weights (943, 943) * user-item diffs (943, 1682)
            = user-item combined scores (943, 1682)
        np.array([np.abs(similarity).sum(axis=1)]) --
            entry u is user u's total weight against all users, shape (1, 943),
            transposed so it normalizes each row.
        pred = user mean + normalized user-item scores (943, 1682)
        """
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    return pred
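
# Micro-example of the user-based formula (made-up numbers): two users with
# means [3, 4], distance matrix [[0, 0.5], [0.5, 0]], and rating_diff row
# [+1, -1] for user 2. For user 1: 3 + (0.5 * [+1, -1]) / 0.5 = [4, 2], so
# user 1 is pulled toward where user 2 sits relative to user 2's own mean.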
def rmse(prediction, ground_truth):
    # evaluate only the positions that have ground-truth ratings
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return math.sqrt(mean_squared_error(prediction, ground_truth))
def evaluate(prediction, item_popular, name):
    hit = 0
    rec_count = 0
    test_count = 0
    popular_sum = 0
    all_rec_items = set()

    for u_index in range(n_users):
        items = np.where(train_data_matrix[u_index, :] == 0)[0]
        pre_items = sorted(
            dict(zip(items, prediction[u_index, items])).items(),
            key=itemgetter(1),
            reverse=True)[:20]
        test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

        # compare the recommended set against the test set
        for item, _ in pre_items:
            if item in test_items:
                hit += 1
            all_rec_items.add(item)
            # popular_sum accumulates the popularity of every recommended item
            if item in item_popular:
                popular_sum += math.log(1 + item_popular[item])
        rec_count += len(pre_items)
        test_count += len(test_items)

    precision = hit / (1.0 * rec_count)
    # recall is measured against the test set
    recall = hit / (1.0 * test_count)
    # coverage is measured against the training set
    coverage = len(all_rec_items) / (1.0 * len(item_popular))
    popularity = popular_sum / (1.0 * rec_count)
    print('--- %s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
        name, precision, recall, coverage, popularity), file=sys.stderr)
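
# Metric sketch (toy numbers): recommending 20 items to each of 2 users gives
# rec_count = 40; if 8 recommendations appear in the test set (hit = 8) and the
# test set holds 50 items in total, then precision = 8/40 = 0.2 and
# recall = 8/50 = 0.16.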
def recommend(u_index, prediction):
    items = np.where(train_data_matrix[u_index, :] == 0)[0]
    pre_items = sorted(
        dict(zip(items, prediction[u_index, items])).items(),
        key=itemgetter(1),
        reverse=True)[:10]
    test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

    result = [key for key, value in pre_items]
    result.sort(reverse=False)
    print('ground truth (%s): %s' % (len(test_items), test_items))
    print('recommended  (%s): %s' % (len(result), result))
def main():
    global n_users, train_data_matrix, test_data_matrix
    # memory-based collaborative filtering
    # ...
    # split the dataset
    # http://files.grouplens.org/datasets/movielens/ml-100k.zip
    path_root = "/Users/jiangzl/work/data/机器学习"
    dataFile = '%s/16.RecommenderSystems/ml-100k/u.data' % path_root
    df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
    # compute the similarities
    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
        n_users, n_items, train_data, test_data)

    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')
    # evaluation: root mean squared error
    print('>>> Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
    print('>>> User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

    # model-based collaborative filtering
    # ...
    # sparsity of the MovieLens dataset (n_users * n_items is constant, so fewer
    # interactions mean less information; the sparser the matrix, the more room
    # there is for optimization)
    sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
    print('\nMovieLens100K sparsity: %s%%\n' % (sparsity * 100))

    # # search for the best k over the largest singular values/vectors
    # minrmse = math.inf
    # index = 1
    # for k in range(1, 30, 1):
    #     u, s, vt = svds(train_data_matrix, k=k)
    #     # print(">>> ", s)
    #     s_diag_matrix = np.diag(s)
    #     svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    #     r_rmse = rmse(svd_prediction, test_data_matrix)
    #     if r_rmse < minrmse:
    #         index = k
    #         minrmse = r_rmse
    index = 11
    minrmse = 2.6717213264389765
    u, s, vt = svds(train_data_matrix, k=index)
    # print(">>> ", s)
    s_diag_matrix = np.diag(s)
    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    r_rmse = rmse(svd_prediction, test_data_matrix)
    print("+++ k=%s, svd-shape: %s" % (index, np.shape(svd_prediction)))
    print('>>> Model based CF RMSE: %s\n' % minrmse)
    # """
    # For the same amount of information, a smaller matrix carries more reliable
    # signal. Hence user-cf beats item-cf here, and the SVD shows that about 15
    # dimensions already capture over 90% of the signal, so it does even better.
    # item-cf: 1682
    # user-cf: 943
    # svd:     15
    # """
    evaluate(item_prediction, item_popular, 'item')
    evaluate(user_prediction, item_popular, 'user')
    evaluate(svd_prediction, item_popular, 'svd')

    # recommendation results
    # recommend(1, item_prediction)
    # recommend(1, user_prediction)
    recommend(1, svd_prediction)


if __name__ == "__main__":
    main()
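
A note on choosing k: the commented-out sweep above picks k by test RMSE, while the closing comment talks about covering 90% of the signal. A common heuristic for the latter keeps enough singular values to retain 90% of the squared-singular-value energy; a standalone sketch with a random stand-in matrix (illustrative only, not the MovieLens data):

import numpy as np

rng = np.random.default_rng(0)
M = rng.random((50, 80))                    # stand-in for train_data_matrix
sigma = np.linalg.svd(M, compute_uv=False)  # singular values, descending
energy = np.cumsum(sigma ** 2) / np.sum(sigma ** 2)
k = int(np.searchsorted(energy, 0.90)) + 1  # smallest k with >= 90% energy
print("k retaining 90% of the energy:", k)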


121
tutorials/keras/brat_tag.py Normal file
View File

@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
Data format conversion (brat annotations -> sequence-tagging format)
"""
import os

import emoji

from middleware.utils import get_catalog_files
from config.setting import Config

# annotation label -> tag: entity object, positive / neutral / negative opinion
tag_dic = {"实体对象": "ORG",
           "正向观点": "Po_VIEW",
           "中性观点": "Mi_VIEW",
           "负向观点": "Ne_VIEW"}
# Convert to the trainable format; the output ends with an "END O" line
def from_ann2dic(r_ann_path, r_txt_path, w_path):
    q_dic = {}
    print("reading annotation file: %s" % r_ann_path)
    with open(r_ann_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            line_arr = line.split()
            # print(">>> ", line_arr)
            cls = tag_dic[line_arr[1]]
            start_index = int(line_arr[2])
            end_index = int(line_arr[3])
            length = end_index - start_index
            for r in range(length):
                q_dic[start_index + r] = ("B-%s" % cls) if r == 0 else ("I-%s" % cls)
    # maps character offsets to tags: {23: 'B-Ne_VIEW', 24: 'I-Ne_VIEW', 46: 'B-ORG', 47: 'I-ORG'}
    print("q_dic: ", q_dic)

    print("reading text file: %s" % r_txt_path)
    with open(r_txt_path, "r", encoding="utf-8") as f:
        content_str = f.read()

    print("writing output: %s" % w_path)
    with open(w_path, "w", encoding="utf-8") as w:
        for i, strA in enumerate(content_str):
            # print(">>> %s-%s" % (i, strA))
            if strA == "\n":
                w.write("\n")
            else:
                if i in q_dic:
                    tag = q_dic[i]
                else:
                    tag = "O"  # the capital letter O, not zero
                w.write('%s %s\n' % (strA, tag))
        w.write('%s\n' % "END O")
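
# Example (hypothetical annotation): the brat line "T1\t实体对象 46 48\t尺码"
# covers offsets 46..47, so q_dic becomes {46: 'B-ORG', 47: 'I-ORG'} and the
# output file contains the lines "尺 B-ORG" and "码 I-ORG"; every unannotated
# character gets the tag O.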
# Generate train.txt, dev.txt and test.txt: files ending in 8-new.txt / 9-new.txt
# become dev and test respectively; everything else is merged into train.txt
def create_train_data(data_root_dir, w_path):
    if os.path.exists(w_path):
        os.remove(w_path)
    for file in os.listdir(data_root_dir):
        path = os.path.join(data_root_dir, file)
        if file.endswith("8-new.txt"):
            # rename to dev.txt
            os.rename(path, os.path.join(data_root_dir, "dev.txt"))
            continue
        if file.endswith("9-new.txt"):
            # rename to test.txt
            os.rename(path, os.path.join(data_root_dir, "test.txt"))
            continue
        q_list = []
        print("reading file: %s" % file)
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()
            for line in lines:
                line = line.rstrip()
                if line == "END O":
                    break
                q_list.append(line)
        # q_list holds lines such as: ['美 O', '  O', '气 O', '质 O', '特 O', '别 O', '好 O', '', '造 O', '型 O', '独 O', '特 O', '  O', '尺 B-ORG', '码 I-ORG', '偏 B-Ne_VIEW', '大 I-Ne_VIEW', '  O']
        # print("q_list: ", q_list)
        print("appending to: %s" % w_path)
        with open(w_path, "a", encoding="utf-8") as f:
            for item in q_list:
                f.write('%s\n' % item)
def brat_1_format_origin(catalog):
    """
    Normalize the raw file by converting emoji (brat counts an emoji as 2
    characters while Python counts 1, which breaks the offsets)
    """
    # NOTE: the original referenced an undefined `path_root`; use the `catalog` argument
    with open('%s/origin/origin.txt' % catalog, "r", encoding="utf-8") as f:
        lines = f.readlines()
    with open('%s/tag_befer/befer.txt' % catalog, "w", encoding="utf-8") as f:
        # convert the raw file
        for line in lines:
            text = emoji.demojize(line)
            f.write('%s' % text)
    # create the empty annotation file
    with open('%s/tag_befer/befer.ann' % catalog, "w", encoding="utf-8") as f:
        pass
def brat_2_create_train_data(catalog):
    file_list = get_catalog_files("%s/tag_after" % catalog, status=-1, str1=".DS_Store")
    file_list = list(set([i.split("/")[-1].split(".")[0] for i in file_list]))
    print(file_list)
    for filename in file_list:
        r_ann_path = os.path.join(catalog, "tag_after/%s.ann" % filename)
        r_txt_path = os.path.join(catalog, "tag_after/%s.txt" % filename)
        w_path = os.path.join(catalog, "new/%s-new.txt" % filename)
        print("filename", r_ann_path, r_txt_path, w_path)
        from_ann2dic(r_ann_path, r_txt_path, w_path)
    # generate train.txt, dev.txt and test.txt
    create_train_data("%s/new" % catalog, "%s/new/train.txt" % catalog)


def main():
    catalog = Config.nlp_ner.path_root
    # brat_1_format_origin(catalog)
    brat_2_create_train_data(catalog)


if __name__ == "__main__":
    main()

165
tutorials/keras/text_NER.py Normal file
View File

@@ -0,0 +1,165 @@
import pickle
import platform
from collections import Counter

import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy

from config.setting import Config

"""
# padding:    pre (default) pads with 0 at the front; post pads at the end
# truncating: when the text exceeds pad_num, pre (default) cuts the front; post cuts the end
# x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post")
# print("--- ", x_train[0][:20])

Bug note for the CRF layer when combining keras_bert and keras_contrib:
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match
Workaround: change line 516 of crf.py from
    mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
to:
    mask2 = K.cast(K.concatenate([mask, K.cast(K.zeros_like(mask[:, :1]), mask.dtype)], axis=1),
"""
def load_data():
    train = _parse_data(Config.nlp_ner.path_train)
    test = _parse_data(Config.nlp_ner.path_test)
    print("--- init: data loaded and parsed ---")

    # e.g. Counter({'的': 8, '中': 7, '致': 7, '党': 7})
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = Config.nlp_ner.chunk_tags
    # persist the pruned vocab together with the matching chunk_tags
    with open(Config.nlp_ner.path_config, 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)
    print("--- init: config saved ---")

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    print("--- init: data encoded into the training format ---")
    return train, test, (vocab, chunk_tags)
def _parse_data(filename):
    """
    Parse the tagged data used for training (the leading underscore marks a
    module-private helper).
    :param filename: path of the data file
    :return: data, e.g.
        [[['', 'B-ORG'], ['', 'I-ORG']], [['', 'B-ORG'], ['', 'I-ORG']]]
    """
    with open(filename, 'rb') as fn:
        split_text = '\n'
        # sentence splitting: each token is on its own line, so a blank line
        # (two consecutive split_text) separates sentences
        texts = fn.read().decode('utf-8').strip().split(split_text + split_text)
        # each token line holds "char tag" separated by a space;
        # len(row) > 0 skips rows left empty by consecutive newlines;
        # row.split() drops whitespace tokens, so restore them as [" ", "O"]
        data = [[[" ", "O"] if len(row.split()) != 2 else row.split() for row in text.split(split_text) if len(row) > 0] for text in texts]
        # data = [[row.split() for row in text.split(split_text) if len(row.split()) == 2] for text in texts]
    return data
def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    # encode every character
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # characters not in the vocab get the unk value 1
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
    x = pad_sequences(x, maxlen)  # left padding
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
    if onehot:
        # return a one-hot encoded array
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        # np.expand_dims adds a trailing dimension, as the sparse CRF target expects
        # https://blog.csdn.net/hong615771420/article/details/83448878
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk
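
# Shape sketch (hypothetical sizes): with 1000 sentences and maxlen=100,
# _process_data returns x of shape (1000, 100) and y_chunk of shape
# (1000, 100, 1). Left padding fills short sentences with 0 at the front,
# which the Embedding layer masks out later (mask_zero=True).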
def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length
def create_model(len_vocab, len_chunk_tags):
    model = Sequential()
    model.add(Embedding(len_vocab, Config.nlp_ner.EMBED_DIM, mask_zero=True))  # random embedding
    model.add(Bidirectional(LSTM(Config.nlp_ner.BiLSTM_UNITS // 2, return_sequences=True)))
    model.add(Dropout(0.25))
    crf = CRF(len_chunk_tags, sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
    # model.compile('rmsprop', loss=crf_loss, metrics=[crf_viterbi_accuracy])
    # from keras.optimizers import Adam
    # adam_lr = 0.0001
    # adam_beta_1 = 0.5
    # model.compile(optimizer=Adam(lr=adam_lr, beta_1=adam_beta_1), loss=crf_loss, metrics=[crf_viterbi_accuracy])
    return model
def train():
    (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    model = create_model(len(vocab), len(chunk_tags))
    # train the model
    model.fit(train_x, train_y, batch_size=16, epochs=Config.nlp_ner.EPOCHS, validation_data=(test_x, test_y))
    model.save(Config.nlp_ner.path_model)
def test():
    with open(Config.nlp_ner.path_config, 'rb') as inp:
        (vocab, chunk_tags) = pickle.load(inp)
    model = create_model(len(vocab), len(chunk_tags))
    # load the trained weights once, outside the loop
    model.load_weights(Config.nlp_ner.path_model)
    # predict_text = '造型独特,尺码偏大,估计是钉子头圆的半径的缘故'
    with open(Config.nlp_ner.path_origin, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for predict_text in lines:
            content = predict_text.strip()
            text_EMBED, length = process_data(content, vocab)
            raw = model.predict(text_EMBED)[0][-length:]
            pre_result = [np.argmax(row) for row in raw]
            result_tags = [chunk_tags[i] for i in pre_result]
            # collect the entities and opinions of each sentence
            result = {}
            tag_list = [i for i in chunk_tags if i not in ["O"]]
            for word, t in zip(content, result_tags):
                # print(word, t)
                if t not in tag_list:
                    continue
                for i in range(0, len(tag_list), 2):
                    if t in tag_list[i:i + 2]:
                        # print("\n>>> %s---%s==%s" % (word, t, tag_list[i:i + 2]))
                        tag = tag_list[i].split("-")[-1]
                        if tag not in result:
                            result[tag] = ""
                        result[tag] += ' ' + word if t == tag_list[i] else word
            print(result)
def main():
    train()
    test()


if __name__ == "__main__":
    main()

113
tutorials/test.ipynb Normal file
View File

@@ -0,0 +1,113 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python_defaultSpec_1599819467604",
"display_name": "Python 3.6.3 64-bit ('python3.6': virtualenv)"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from numpy import linalg as la\n",
"from numpy import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def loadExData3():\n",
" # 利用SVD提高推荐效果菜肴矩阵\n",
" return[[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],\n",
" [0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],\n",
" [3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],\n",
" [5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],\n",
" [4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],\n",
" [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],\n",
" [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],\n",
" [0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0],\n",
" [1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]]\n",
"\n",
"myMat = mat(loadExData3())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "matrix([[2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5],\n [0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],\n [3, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],\n [5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],\n [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],\n [4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],\n [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],\n [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],\n [0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0],\n [1, 1, 2, 1, 1, 2, 1, 0, 4, 5, 0]])"
},
"metadata": {},
"execution_count": 3
}
],
"source": [
"myMat"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):\n",
" \"\"\"svdEst( )\n",
" Args:\n",
" dataMat 训练数据集\n",
" user 用户编号\n",
" simMeas 相似度计算方法\n",
" estMethod 使用的推荐算法\n",
" Returns:\n",
" 返回最终 N 个推荐结果\n",
" \"\"\"\n",
" # 寻找未评级的物品\n",
" # 对给定的用户建立一个未评分的物品列表\n",
" \n",
" unratedItems = nonzero(dataMat[user, :].A == 0)[1]\n",
" # 如果不存在未评分物品,那么就退出函数\n",
" if len(unratedItems) == 0:\n",
" return 'you rated everything'\n",
" # 物品的编号和评分值\n",
" itemScores = []\n",
" # 在未评分物品上进行循环\n",
" for item in unratedItems:\n",
" # 获取 item 该物品的评分\n",
" estimatedScore = estMethod(dataMat, user, simMeas, item)\n",
" itemScores.append((item, estimatedScore))\n",
" # 按照评分得分 进行逆排序获取前N个未评级物品进行推荐\n",
" return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[: N]\n",
"\n",
"\n",
"print(recommend(myMat, 1, estMethod=svdEst))"
]
}
]
}
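
Note: the notebook's final cell calls cosSim, standEst and svdEst without defining them; they come from the classic svdRec code in Machine Learning in Action (ch. 14), which this repository adapts. A minimal sketch of compatible definitions, assuming those classic implementations are the ones intended:

from numpy import linalg as la
from numpy import nonzero, logical_and, shape, mat, eye

def cosSim(inA, inB):
    # cosine similarity rescaled to [0, 1]
    num = float(inA.T * inB)
    denom = la.norm(inA) * la.norm(inB)
    return 0.5 + 0.5 * (num / denom)

def standEst(dataMat, user, simMeas, item):
    # similarity-weighted average over the raw item columns
    n = shape(dataMat)[1]
    simTotal, ratSimTotal = 0.0, 0.0
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0:
            continue
        # users who rated both item and j
        overLap = nonzero(logical_and(dataMat[:, item].A > 0, dataMat[:, j].A > 0))[0]
        similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j]) if len(overLap) > 0 else 0
        simTotal += similarity
        ratSimTotal += similarity * userRating
    return ratSimTotal / simTotal if simTotal != 0 else 0

def svdEst(dataMat, user, simMeas, item):
    # same idea, but similarities are computed in a low-rank SVD space
    n = shape(dataMat)[1]
    simTotal, ratSimTotal = 0.0, 0.0
    U, Sigma, VT = la.svd(dataMat)
    Sig4 = mat(eye(4) * Sigma[:4])  # keep the top 4 singular values
    xformedItems = dataMat.T * U[:, :4] * Sig4.I
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    return ratSimTotal / simTotal if simTotal != 0 else 0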