mirror of
https://github.com/apachecn/ailearning.git
synced 2026-04-10 05:58:21 +08:00
Merge branch 'master' of github.com:apachecn/AiLearning
This commit is contained in:
262
src/tutorials/RecommenderSystems/rs_rating_demo.py
Normal file
262
src/tutorials/RecommenderSystems/rs_rating_demo.py
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf-8
|
||||
# -------------------------------------------------------------------------------
|
||||
# Name: 推荐系统
|
||||
# Purpose: 推荐系统: Item CF/User CF/SVD 对比
|
||||
# Author: jiangzhonglian
|
||||
# Create_time: 2020年9月21日
|
||||
# Update_time: 2020年9月21日
|
||||
# -------------------------------------------------------------------------------
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import math
|
||||
from operator import itemgetter
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.sparse.linalg import svds
|
||||
from sklearn import model_selection as cv
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from middleware.utils import TimeStat, Chart
|
||||
|
||||
|
||||
def splitData(dataFile, test_size):
    """Load the MovieLens ratings file and split it into train/test sets.

    :param dataFile: path to a tab-separated file of
        (user_id, item_id, rating, timestamp) rows.
    :param test_size: fraction of rows held out for the test split.
    :return: (df, n_users, n_items, train_data, test_data)
    """
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(dataFile, sep='\t', names=columns)

    # Count distinct users/items so the rating matrices can be sized later.
    n_users = len(df.user_id.unique())
    n_items = len(df.item_id.unique())
    print('>>> 本数据集包含: 总用户数 = %s | 总电影数 = %s' % (n_users, n_items))

    train_data, test_data = cv.train_test_split(df, test_size=test_size)
    print(">>> 训练:测试 = %s:%s = %s:%s" % (len(train_data), len(test_data), 1-test_size, test_size))
    return df, n_users, n_items, train_data, test_data
|
||||
|
||||
|
||||
def calc_similarity(n_users, n_items, train_data, test_data):
    """Build dense user-item rating matrices and cosine-distance matrices.

    Each row from itertuples() looks like
    Pandas(Index=..., user_id=..., item_id=..., rating=..., timestamp=...);
    user/item ids are 1-based in the data set, hence the "- 1" below.

    :return: (train_data_matrix, test_data_matrix, user_similarity,
              item_similarity, item_popular)
    """
    train_data_matrix = np.zeros((n_users, n_items))
    test_data_matrix = np.zeros((n_users, n_items))
    for row in train_data.itertuples():
        train_data_matrix[row[1] - 1, row[2] - 1] = row[3]
    for row in test_data.itertuples():
        test_data_matrix[row[1] - 1, row[2] - 1] = row[3]

    print("1:", np.shape(train_data_matrix))    # rows: users | cols: movies
    print("2:", np.shape(train_data_matrix.T))  # rows: movies | cols: users

    # Cosine *distance* between rows (smaller means more similar).
    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
    # print("<<< %s \n %s" % (np.shape(user_similarity), user_similarity) )
    # print("<<< %s \n %s" % (np.shape(item_similarity), item_similarity) )

    print('开始统计流行item的数量...', file=sys.stderr)
    # Popularity of an item = number of users who rated it in training.
    item_popular = {}
    for col in range(n_items):
        column = train_data_matrix[:, col]
        if np.sum(column) != 0:
            item_popular[col] = np.sum(column != 0)

    item_count = len(item_popular)
    print('总共流行 item 数量 = %d' % item_count, file=sys.stderr)
    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
|
||||
|
||||
|
||||
def predict(rating, similarity, type='user'):
    """Predict ratings from a pairwise cosine-distance matrix.

    :param rating: (n_users, n_items) training rating matrix
    :param similarity: user-user or item-item distance matrix
    :param type: 'item' for item-based CF, 'user' for user-based CF
    :return: (n_users, n_items) matrix of predicted scores
    """
    print("+++ %s" % type)
    print("  rating=", np.shape(rating))
    print("  similarity=", np.shape(similarity))
    if type == 'item':
        # Each user's ratings weighted through the item-item matrix,
        # normalised by every item's total (absolute) similarity mass:
        # (users x items) . (items x items) / (1 x items)
        norm = np.array([np.abs(similarity).sum(axis=1)])
        pred = rating.dot(similarity) / norm
    elif type == 'user':
        # Subtract each user's mean rating so only per-user deviations are
        # propagated through the user-user matrix; add the mean back at the end.
        mean_user_rating = rating.mean(axis=1)
        # np.newaxis turns the 1-D means into a column for broadcasting.
        rating_diff = rating - mean_user_rating[:, np.newaxis]
        # (users x users) . (users x items) / (users x 1)
        norm = np.array([np.abs(similarity).sum(axis=1)]).T
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff) / norm
    return pred
|
||||
|
||||
|
||||
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the rated entries only.

    Entries where ``ground_truth`` is 0 (unrated) are ignored so the score
    reflects only items the user actually rated.

    :param prediction: predicted rating matrix
    :param ground_truth: true rating matrix (0 = unrated)
    :return: RMSE as a float
    """
    # Compute the non-zero mask once (the original recomputed it twice) and
    # use numpy directly instead of sklearn.metrics.mean_squared_error --
    # mathematically identical for these flattened 1-D vectors.
    mask = ground_truth.nonzero()
    diff = prediction[mask].flatten() - ground_truth[mask].flatten()
    return math.sqrt(np.mean(diff ** 2))
|
||||
|
||||
|
||||
def evaluate(prediction, item_popular, name):
    """Report top-20 recommendation quality for one prediction matrix.

    Relies on the module-level globals ``n_users``, ``train_data_matrix`` and
    ``test_data_matrix`` being set by ``main()`` before this is called.

    :param prediction: (n_users, n_items) matrix of predicted scores
    :param item_popular: dict {item index: number of users who rated it}
    :param name: label used in the printed report ('item' / 'user' / 'svd')
    """
    hit = 0            # recommended items that also appear in the test set
    rec_count = 0      # total number of recommendations made
    test_count = 0     # total number of held-out test items
    popular_sum = 0    # accumulated log-popularity of all recommendations
    all_rec_items = set()
    for u_index in range(n_users):
        # Candidate items: not rated by this user in the training matrix.
        items = np.where(train_data_matrix[u_index, :] == 0)[0]
        # Top-20 candidates by predicted score.
        pre_items = sorted(
            dict(zip(items, prediction[u_index, items])).items(),
            key=itemgetter(1),
            reverse=True)[:20]
        test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

        # Compare the recommendation list against the held-out items.
        for item, _ in pre_items:
            if item in test_items:
                hit += 1
            all_rec_items.add(item)

            # popular_sum accumulates log-damped popularity of every recommendation.
            if item in item_popular:
                popular_sum += math.log(1 + item_popular[item])

        rec_count += len(pre_items)
        test_count += len(test_items)

    precision = hit / (1.0 * rec_count)
    # Recall is measured against the test set.
    recall = hit / (1.0 * test_count)
    # Coverage: fraction of (popular) catalogue items ever recommended.
    coverage = len(all_rec_items) / (1.0 * len(item_popular))
    popularity = popular_sum / (1.0 * rec_count)
    print('--- %s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
        name, precision, recall, coverage, popularity), file=sys.stderr)
|
||||
|
||||
|
||||
def recommend(u_index, prediction):
    """Print the top-10 recommendations for one user next to their test items.

    Uses the module-level globals ``train_data_matrix`` and
    ``test_data_matrix`` (set in ``main()``).

    :param u_index: 0-based user row index
    :param prediction: (n_users, n_items) matrix of predicted scores
    """
    # Candidate items: not rated by this user in the training matrix.
    items = np.where(train_data_matrix[u_index, :] == 0)[0]
    # Top-10 candidates by predicted score.
    pre_items = sorted(
        dict(zip(items, prediction[u_index, items])).items(),
        key=itemgetter(1),
        reverse=True)[:10]
    test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

    # Sort recommended ids ascending so they are easy to eyeball against test_items.
    result = [key for key, value in pre_items]
    result.sort(reverse=False)
    print('原始结果(%s): %s' % (len(test_items), test_items) )
    print('推荐结果(%s): %s' % (len(result), result) )
|
||||
|
||||
|
||||
def main():
    """Compare Item-CF, User-CF and SVD recommenders on MovieLens 100K."""
    # Globals consumed by evaluate() / recommend().
    global n_users, train_data_matrix, test_data_matrix
    # --- Memory-based collaborative filtering -------------------------------
    # Split the data set.
    # Data: http://files.grouplens.org/datasets/movielens/ml-100k.zip
    # NOTE(review): hard-coded local path -- must exist on the machine running this.
    path_root = "/Users/jiangzl/work/data/机器学习"
    dataFile = '%s/16.RecommenderSystems/ml-100k/u.data' % path_root

    df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)

    # Build rating matrices and similarity matrices.
    train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
        n_users, n_items, train_data, test_data)

    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')

    # Evaluate: root-mean-squared error on the held-out ratings.
    print('>>> Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
    print('>>> User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

    # --- Model-based collaborative filtering --------------------------------
    # Sparsity of MovieLens (fewer interactions = less information, but more
    # room for a low-rank model to help).
    sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
    print('\nMovieLen100K的稀疏度: %s%%\n' % (sparsity * 100))

    # Search over k for the truncated SVD (kept for reproducibility; the
    # result of this search is hard-coded as index/minrmse below).
    # minrmse = math.inf
    # index = 1
    # for k in range(1, 30, 1):
    #     u, s, vt = svds(train_data_matrix, k=k)
    #     # print(">>> ", s)
    #     s_diag_matrix = np.diag(s)
    #     svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    #     r_rmse = rmse(svd_prediction, test_data_matrix)
    #     if r_rmse < minrmse:
    #         index = k
    #         minrmse = r_rmse

    # Best k and its RMSE, found by the (commented) search above.
    index = 11
    minrmse = 2.6717213264389765
    u, s, vt = svds(train_data_matrix, k=index)
    # print(">>> ", s)
    s_diag_matrix = np.diag(s)
    # Reconstruct the rating matrix from the truncated factors.
    svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
    r_rmse = rmse(svd_prediction, test_data_matrix)
    print("+++ k=%s, svd-shape: %s" % (index, np.shape(svd_prediction)) )
    print('>>> Model based CF RMSE: %s\n' % minrmse)
    # With the same information, a smaller matrix carries more reliable signal:
    # user-cf (943) beats item-cf (1682); the SVD needs only ~15 dimensions.
    evaluate(item_prediction, item_popular, 'item')
    evaluate(user_prediction, item_popular, 'user')
    evaluate(svd_prediction, item_popular, 'svd')

    # Show recommendations for one user.
    # recommend(1, item_prediction)
    # recommend(1, user_prediction)
    recommend(1, svd_prediction)


if __name__ == "__main__":
    main()
|
||||
0
src/tutorials/__init__.py
Normal file
0
src/tutorials/__init__.py
Normal file
0
src/tutorials/keras/__init__.py
Normal file
0
src/tutorials/keras/__init__.py
Normal file
121
src/tutorials/keras/brat_tag.py
Normal file
121
src/tutorials/keras/brat_tag.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
数据格式转化
|
||||
"""
|
||||
import os
|
||||
import emoji
|
||||
from middleware.utils import get_catalog_files
|
||||
from config.setting import Config
|
||||
|
||||
# Mapping from the Chinese annotation labels used in brat to BIO tag suffixes
# (entity / positive / neutral / negative opinion).
tag_dic = {"实体对象": "ORG",
           "正向观点": "Po_VIEW",
           "中性观点": "Mi_VIEW",
           "负向观点": "Ne_VIEW"}
|
||||
|
||||
|
||||
# 转换成可训练的格式,最后以"END O"结尾
|
||||
def from_ann2dic(r_ann_path, r_txt_path, w_path):
    """Merge a brat .ann/.txt pair into a per-character BIO training file.

    Every character of the text becomes a line "<char> <tag>", where the tag
    is B-/I-<class> inside an annotated span and "O" elsewhere; the file ends
    with an "END O" sentinel line.
    """
    # Character offset -> BIO tag, built from the annotation spans.
    offset_tags = {}
    print("开始读取文件:%s" % r_ann_path)
    with open(r_ann_path, "r", encoding="utf-8") as ann:
        for raw in ann.readlines():
            fields = raw.split()
            # print(">>> ", fields)
            label = tag_dic[fields[1]]
            begin = int(fields[2])
            end = int(fields[3])
            for offset in range(begin, end):
                prefix = "B" if offset == begin else "I"
                offset_tags[offset] = "%s-%s" % (prefix, label)

    # e.g. {23: 'B-Ne_VIEW', 24: 'I-Ne_VIEW', 46: 'B-ORG', 47: 'I-ORG'}
    print("q_dic: ", offset_tags)

    print("开始读取文件内容: %s" % r_txt_path)
    with open(r_txt_path, "r", encoding="utf-8") as txt:
        content_str = txt.read()

    print("开始写入文本%s" % w_path)
    with open(w_path, "w", encoding="utf-8") as out:
        for pos, ch in enumerate(content_str):
            # print(">>> %s-%s" % (pos, ch))
            if ch == "\n":
                out.write("\n")
            else:
                out.write('%s %s\n' % (ch, offset_tags.get(pos, "O")))
        out.write('%s\n' % "END O")
|
||||
|
||||
|
||||
# 生成train.txt、dev.txt、test.txt
|
||||
# 除8,9-new.txt分别用于dev和test外,剩下的合并成train.txt
|
||||
def create_train_data(data_root_dir, w_path):
    """Merge converted ``*-new.txt`` files into one training file.

    Files ending in 8-new.txt / 9-new.txt are renamed to dev.txt / test.txt;
    every other file is appended to ``w_path`` after dropping its trailing
    "END O" sentinel and anything after it.
    """
    # Start from a clean output file, since we append per input file below.
    if os.path.exists(w_path):
        os.remove(w_path)
    for fname in os.listdir(data_root_dir):
        path = os.path.join(data_root_dir, fname)
        # 8-new.txt becomes the dev split ...
        if fname.endswith("8-new.txt"):
            os.rename(path, os.path.join(data_root_dir, "dev.txt"))
            continue
        # ... and 9-new.txt the test split.
        if fname.endswith("9-new.txt"):
            os.rename(path, os.path.join(data_root_dir, "test.txt"))
            continue
        kept = []
        print("开始读取文件:%s" % fname)
        with open(path, "r", encoding="utf-8") as src:
            for raw in src.readlines():
                raw = raw.rstrip()
                if raw == "END O":
                    break
                kept.append(raw)

        # kept looks like: ['美 O', '! O', ..., '尺 B-ORG', '码 I-ORG', ...]
        # print("q_list: ", kept)
        print("开始写入文本: %s" % w_path)
        with open(w_path, "a", encoding="utf-8") as dst:
            for entry in kept:
                dst.write('%s\n' % entry)
|
||||
|
||||
|
||||
def brat_1_format_origin(catalog):
    """Normalise the raw corpus before annotating it in brat.

    Emoji are rewritten to their ``:name:`` text aliases because brat counts
    an emoji as 2 characters while Python counts 1, which would shift every
    annotation offset.

    Bug fix: the body previously referenced an undefined global ``path_root``
    (NameError at runtime); it now uses the ``catalog`` parameter.

    :param catalog: corpus root containing ``origin`` and ``tag_befer`` dirs
    """
    with open('%s/origin/origin.txt' % catalog, "r", encoding="utf-8") as f:
        lines = f.readlines()
    with open('%s/tag_befer/befer.txt' % catalog, "w", encoding="utf-8") as f:
        # Write the demojized corpus for annotation.
        for line in lines:
            text = emoji.demojize(line)
            f.write('%s' % text)
    # Create an empty .ann file for brat to store the annotations in.
    with open('%s/tag_befer/befer.ann' % catalog, "w", encoding="utf-8") as f:
        pass
|
||||
|
||||
def brat_2_create_train_data(catalog):
    """Convert every brat-annotated (.ann/.txt) pair under ``catalog`` into
    per-character BIO files, then merge them into train/dev/test sets.

    :param catalog: corpus root containing ``tag_after`` and ``new`` sub-dirs
    """
    # get_catalog_files is a project helper; presumably it lists file paths
    # under tag_after, skipping .DS_Store entries -- TODO confirm its contract.
    file_list = get_catalog_files("%s/tag_after" % catalog, status=-1, str1=".DS_Store")
    # Reduce to unique basenames: each annotation is a .ann/.txt pair.
    file_list = list(set([i.split("/")[-1].split(".")[0] for i in file_list]))
    print(file_list)
    for filename in file_list:
        r_ann_path = os.path.join(catalog, "tag_after/%s.ann" % filename)
        r_txt_path = os.path.join(catalog, "tag_after/%s.txt" % filename)
        w_path = os.path.join(catalog, "new/%s-new.txt" % filename)
        print("filename", r_ann_path, r_txt_path, w_path)
        from_ann2dic(r_ann_path, r_txt_path, w_path)
    # Build train.txt / dev.txt / test.txt from the converted files.
    create_train_data("%s/new" % catalog, "%s/new/train.txt" % catalog)
|
||||
|
||||
|
||||
def main():
    """Entry point: run the brat-to-training-data conversion pipeline."""
    # Corpus root comes from project configuration.
    catalog = Config.nlp_ner.path_root

    # Step 1 (run once, before manual annotation): normalise the raw corpus.
    # brat_1_format_origin(catalog)
    brat_2_create_train_data(catalog)
|
||||
165
src/tutorials/keras/text_NER.py
Normal file
165
src/tutorials/keras/text_NER.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import platform
|
||||
from collections import Counter
|
||||
import keras
|
||||
from keras.models import Sequential
|
||||
from keras.layers import Embedding, Bidirectional, LSTM, Dropout
|
||||
from keras_contrib.layers import CRF
|
||||
from keras_contrib.losses import crf_loss
|
||||
from keras_contrib.metrics import crf_viterbi_accuracy
|
||||
"""
|
||||
# padding: pre(默认) 向前补充0 post 向后补充0
|
||||
# truncating: 文本超过 pad_num, pre(默认) 删除前面 post 删除后面
|
||||
# x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post")
|
||||
# print("--- ", x_train[0][:20])
|
||||
|
||||
使用keras_bert、keras_contrib的crf时bug记录
|
||||
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match
|
||||
解决方案, 修改crf.py 516行:
|
||||
mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
|
||||
为:
|
||||
mask2 = K.cast(K.concatenate([mask, K.cast(K.zeros_like(mask[:, :1]), mask.dtype)], axis=1),
|
||||
"""
|
||||
from keras.preprocessing.sequence import pad_sequences
|
||||
from config.setting import Config
|
||||
|
||||
|
||||
def load_data():
    """Load, encode and persist the NER corpus for training.

    :return: (train, test, (vocab, chunk_tags)) where train/test are the
        (x, y) arrays produced by _process_data.
    """
    train = _parse_data(Config.nlp_ner.path_train)
    test = _parse_data(Config.nlp_ner.path_test)
    print("--- init 数据加载解析完成 ---")

    # Character frequencies, e.g. Counter({'的': 8, '中': 7, ...}).
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    # Keep only characters seen at least twice.
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = Config.nlp_ner.chunk_tags

    # Persist the pruned vocab and the tag set for later prediction runs.
    with open(Config.nlp_ner.path_config, 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)
    print("--- init 配置文件保存成功 ---")

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    print("--- init 对数据进行编码,生成训练需要的数据格式 ---")
    return train, test, (vocab, chunk_tags)
|
||||
|
||||
|
||||
def _parse_data(filename):
    """Parse a whitespace-separated tagging corpus into nested lists.

    Sentences are separated by a blank line; each non-empty line is
    "<char> <tag>". A line that does not split into exactly two fields
    (e.g. a bare space token, which str.split() would swallow) is mapped
    to [" ", "O"] so space characters survive.

    :param filename: path of the corpus file (UTF-8)
    :return: e.g. [[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ...]]
    """
    with open(filename, 'rb') as handle:
        raw = handle.read().decode('utf-8').strip()

    sentences = []
    # One sentence per line in the source format, so a blank line (two
    # consecutive '\n') separates sentences.
    for block in raw.split('\n' + '\n'):
        rows = []
        for line in block.split('\n'):
            if not line:
                continue
            fields = line.split()
            rows.append(fields if len(fields) == 2 else [" ", "O"])
        sentences.append(rows)
    return sentences
|
||||
|
||||
|
||||
def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    """Encode parsed [[char, tag], ...] sentences as padded index arrays.

    :param data: output of _parse_data
    :param vocab: vocabulary list; index 1 is used for out-of-vocabulary
    :param chunk_tags: list of BIO tag names
    :param maxlen: pad/truncate length; defaults to the longest sentence
    :param onehot: if True, one-hot encode the tag labels
    :return: (x, y_chunk) arrays ready for Keras
    """
    if maxlen is None:
        maxlen = max(len(s) for s in data)

    # Integer-encode each character.
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # Characters not in vocab map to the OOV index 1.
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding (Keras default 'pre')
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        # One-hot encode: (n, maxlen) -> (n, maxlen, n_tags).
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        # np.expand_dims adds a trailing axis: (n, maxlen) -> (n, maxlen, 1),
        # the shape the CRF layer with sparse_target=True consumes.
        # https://blog.csdn.net/hong615771420/article/details/83448878
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk
|
||||
|
||||
|
||||
def process_data(data, vocab, maxlen=100):
    """Encode one raw text for prediction.

    :param data: iterable of characters (a string)
    :param vocab: vocabulary list; unknown characters map to index 1
    :param maxlen: pad/truncate length
    :return: (x, length) -- a (1, maxlen) left-padded index array and the
        original (unpadded) length of the text
    """
    index_of = dict((w, i) for i, w in enumerate(vocab))
    encoded = [index_of.get(ch[0].lower(), 1) for ch in data]
    length = len(encoded)
    return pad_sequences([encoded], maxlen), length  # left padding
|
||||
|
||||
|
||||
def create_model(len_vocab, len_chunk_tags):
    """Build and compile the BiLSTM-CRF tagging model.

    :param len_vocab: vocabulary size for the embedding layer
    :param len_chunk_tags: number of BIO tags (CRF output size)
    :return: compiled Keras model
    """
    model = Sequential()
    # mask_zero=True: index 0 (padding) is masked through the whole stack.
    model.add(Embedding(len_vocab, Config.nlp_ner.EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(Bidirectional(LSTM(Config.nlp_ner.BiLSTM_UNITS // 2, return_sequences=True)))
    model.add(Dropout(0.25))
    # sparse_target=True matches the (n, maxlen, 1) integer labels from _process_data.
    crf = CRF(len_chunk_tags, sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
    # model.compile('rmsprop', loss=crf_loss, metrics=[crf_viterbi_accuracy])

    # Alternative optimizer settings kept for experimentation:
    # from keras.optimizers import Adam
    # adam_lr = 0.0001
    # adam_beta_1 = 0.5
    # model.compile(optimizer=Adam(lr=adam_lr, beta_1=adam_beta_1), loss=crf_loss, metrics=[crf_viterbi_accuracy])
    return model
|
||||
|
||||
|
||||
def train():
    """Train the BiLSTM-CRF model on the prepared corpus and save it."""
    (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    model = create_model(len(vocab), len(chunk_tags))
    # train model
    model.fit(train_x, train_y, batch_size=16, epochs=Config.nlp_ner.EPOCHS, validation_data=[test_x, test_y])
    model.save(Config.nlp_ner.path_model)
|
||||
|
||||
|
||||
def test():
    """Tag every line of the raw corpus with the trained model and print the
    extracted entity/opinion spans per line."""
    # Restore the vocab and tag set saved by load_data().
    with open(Config.nlp_ner.path_config, 'rb') as inp:
        (vocab, chunk_tags) = pickle.load(inp)
    model = create_model(len(vocab), len(chunk_tags))
    # predict_text = '造型独特,尺码偏大,估计是钉子头圆的半径的缘故'
    with open(Config.nlp_ner.path_origin, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for predict_text in lines:
            content = predict_text.strip()
            text_EMBED, length = process_data(content, vocab)
            # NOTE(review): weights are re-loaded on every line; loading once
            # before the loop would behave the same and be much faster.
            model.load_weights(Config.nlp_ner.path_model)
            # Keep only the last `length` positions (inputs are left-padded).
            raw = model.predict(text_EMBED)[0][-length:]
            pre_result = [np.argmax(row) for row in raw]
            result_tags = [chunk_tags[i] for i in pre_result]

            # Collect entity/opinion text per tag class for this line.
            result = {}
            tag_list = [i for i in chunk_tags if i not in ["O"]]
            for word, t in zip(content, result_tags):
                # print(word, t)
                if t not in tag_list:
                    continue
                # Assumes tag_list holds B-/I- pairs in adjacent slots -- TODO
                # confirm the ordering of Config.nlp_ner.chunk_tags.
                for i in range(0, len(tag_list), 2):
                    if t in tag_list[i:i+2]:
                        # print("\n>>> %s---%s==%s" % (word, t, tag_list[i:i+2]))
                        tag = tag_list[i].split("-")[-1]
                        if tag not in result:
                            result[tag] = ""
                        # A B- tag starts a new span (space-separated).
                        result[tag] += ' '+word if t==tag_list[i] else word
            print(result)
|
||||
|
||||
|
||||
def main():
    """Train the tagger, then run it over the raw corpus."""
    # print("--")
    train()
    test()


# if __name__ == "__main__":
#     train()
|
||||
124
src/tutorials/tool/DecisionTree_getInfoGain.py
Normal file
124
src/tutorials/tool/DecisionTree_getInfoGain.py
Normal file
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/python
|
||||
# coding: utf8
|
||||
|
||||
from math import log
|
||||
|
||||
|
||||
def calcShannonEnt(dataSet):
    """Shannon entropy of the class labels (last column) of *dataSet*.

    Args:
        dataSet: list of rows; the last element of each row is the class label.
    Returns:
        The entropy -sum(p * log2(p)) over the label distribution, as a float.
    """
    total = len(dataSet)

    # Frequency of each class label.
    labelCounts = {}
    for row in dataSet:
        label = row[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1

    # Entropy from the label proportions (log base 2).
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / total
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
|
||||
|
||||
|
||||
def splitDataSet(dataSet, axis, value):
    """Rows of *dataSet* whose *axis*-th column equals *value*, column removed.

    Args:
        dataSet: list of rows (feature values plus a trailing class label).
        axis: index of the column to filter on.
        value: required value of that column.
    Returns:
        New rows matching the filter, each without the *axis* column (the
        column already used for the split carries no further information).
    """
    # row[:axis] + row[axis+1:] drops exactly the split column.
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
|
||||
|
||||
|
||||
def getFeatureShannonEnt(dataSet, labels):
    """Choose the feature with the highest information gain.

    Args:
        dataSet: rows of feature values with the class label in the last column.
        labels: feature names, parallel to the feature columns.
    Returns:
        (label of the best feature, base entropy, entropy after the best
        split, best information gain). If no feature yields a positive gain
        (e.g. label-only rows), bestFeature stays -1 and labels[-1] is
        returned -- a quirk preserved from the original API.
    """
    # Number of feature columns (last column is the label).
    numFeatures = len(dataSet[0]) - 1
    # Entropy of the label distribution before any split.
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain, bestFeature, endEntropy = 0.0, -1, 0.0
    for i in range(numFeatures):
        # Distinct values of feature i define the partition.
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        # Weighted entropy of the partition induced by feature i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        # infoGain == 0: the split carries no information;
        # infoGain == baseEntropy: the split determines the class completely.
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            endEntropy = newEntropy
            bestInfoGain = infoGain
            bestFeature = i
        # NOTE(review): the original had a dead `else: if numFeatures < 0`
        # branch here; numFeatures can never be negative once the loop body
        # executes, so the branch was removed.

    return labels[bestFeature], baseEntropy, endEntropy, bestInfoGain
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Demo: information gain on three toy data sets.
    labels = ['no surfacing', 'flippers']
    # Label-only rows (no feature columns): the search loop never runs.
    dataSet1 = [['yes'], ['yes'], ['no'], ['no'], ['no']]
    dataSet2 = [['a', 1, 'yes'], ['a', 2, 'yes'], ['b', 3, 'no'], ['c', 4, 'no'], ['c', 5, 'no']]
    dataSet3 = [[1, 'yes'], [1, 'yes'], [1, 'no'], [3, 'no'], [3, 'no']]
    infoGain1 = getFeatureShannonEnt(dataSet1, labels)
    infoGain2 = getFeatureShannonEnt(dataSet2, labels)
    infoGain3 = getFeatureShannonEnt(dataSet3, labels)
    print('信息增益: \n\t%s, \n\t%s, \n\t%s' % (infoGain1, infoGain2, infoGain3))
|
||||
|
||||
53
src/tutorials/tool/python2libsvm.py
Normal file
53
src/tutorials/tool/python2libsvm.py
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import sklearn.datasets as datasets
|
||||
|
||||
|
||||
def get_data(file_input, separator='\t'):
    """Load a data file in libsvm format, converting it first if needed.

    :param file_input: path; if it does not contain 'libsvm' it is treated as
        a separator-delimited text file and converted via other2libsvm.
    :param separator: field separator for the text format.
    :return: (X, y) as returned by sklearn's load_svmlight_file.
    """
    if 'libsvm' not in file_input:
        file_input = other2libsvm(file_input, separator)
    data = datasets.load_svmlight_file(file_input)
    return data[0], data[1]
|
||||
|
||||
|
||||
def other2libsvm(file_name, separator='\t'):
    """Convert a plain separator-delimited text file to libsvm format.

    Each input line is ``f1<sep>f2<sep>...<sep>label``; the corresponding
    output line is ``label 1:f1 2:f2 ...``. The converted file is written
    next to the input, with ``.txt`` replaced by ``.libsvm_tmp``.

    :param file_name: path of the input text file (should contain '.txt').
    :param separator: field separator of the input file.
    :return: path of the generated libsvm-format file.
    """
    libsvm_name = file_name.replace('.txt', '.libsvm_tmp')
    # Context managers so both handles are closed even on error (the original
    # opened/closed them manually and leaked on an exception mid-loop).
    with open(file_name, 'r') as src, open(libsvm_name, 'w') as dst:
        for line in src:
            features = line.strip().split(separator)
            # Last field is the class label; the rest become 1-based
            # "index:value" pairs.
            class_data = features[-1]
            pairs = ''.join(" %d:%s" % (i + 1, v) for i, v in enumerate(features[:-1]))
            dst.write("%s%s\n" % (class_data, pairs))
    return libsvm_name
|
||||
|
||||
|
||||
def dump_data(x, y, file_output):
    """Write (x, y) to ``file_output`` in libsvm format, then delete the
    intermediate conversion file.

    NOTE(review): this assumes the temp file produced by other2libsvm is
    named ``file_output + '_tmp'`` -- that only holds when file_output is the
    original path with '.txt' replaced by '.libsvm'; verify against callers.
    """
    datasets.dump_svmlight_file(x, y, file_output)
    os.remove("%s_tmp" % file_output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demo: round-trip the AdaBoost horse-colic data through libsvm format.
    file_input = "data/7.AdaBoost/horseColicTest2.txt"
    file_output = "data/7.AdaBoost/horseColicTest2.libsvm"

    # Load the data set (converting the .txt to libsvm format on the fly).
    x, y = get_data(file_input, separator='\t')
    print(x[3, :])
    print(y)
    # Dump it back out as a .libsvm file.
    dump_data(x, y, file_output)
|
||||
Reference in New Issue
Block a user