2020-10-19 21:54:47

This commit is contained in:
wizardforcel
2020-10-19 21:54:47 +08:00
parent c6a01c691d
commit ba8b9cea69
9 changed files with 0 additions and 0 deletions

View File

@@ -0,0 +1,259 @@
#!/usr/bin/python
# coding:utf8
from __future__ import print_function
import sys
import math
from operator import itemgetter
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn import model_selection as cv
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from middleware.utils import TimeStat, Chart
"""
推荐系统: Item CF/User CF/SVD 对比
"""
def splitData(dataFile, test_size):
# Load the dataset (user ID, movie ID, rating, timestamp)
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(dataFile, sep='\t', names=header)
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('>>> Dataset contains: total users = %s | total movies = %s' % (n_users, n_items))
train_data, test_data = cv.train_test_split(df, test_size=test_size)
print(">>> 训练:测试 = %s:%s = %s:%s" % (len(train_data), len(test_data), 1-test_size, test_size))
return df, n_users, n_items, train_data, test_data
def calc_similarity(n_users, n_items, train_data, test_data):
# Build user-item matrices: one for the training data and one for the test data.
"""
line: Pandas(Index=93661, user_id=624, item_id=750, rating=4, timestamp=891961163)
"""
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
print("1:", np.shape(train_data_matrix)) # 行: 人 | 列: 电影
print("2:", np.shape(train_data_matrix.T)) # 行: 电影 | 列: 人
# 使用sklearn的 pairwise_distances 计算向量距离cosine来计算余弦距离越小越相似
user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
# print("<<< %s \n %s" % (np.shape(user_similarity), user_similarity) )
# print("<<< %s \n %s" % (np.shape(item_similarity), item_similarity) )
print('Start counting item popularity...', file=sys.stderr)
item_popular = {}
# Count how many users rated each movie (i.e. its popularity).
for i_index in range(n_items):
if np.sum(train_data_matrix[:, i_index]) != 0:
item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)
# save the total number of items
item_count = len(item_popular)
print('Total number of popular items = %d' % item_count, file=sys.stderr)
return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular
def predict(rating, similarity, type='user'):
"""
:param rating: 训练数据
:param similarity: 向量距离
:return:
"""
print("+++ %s" % type)
print(" rating=", np.shape(rating))
print(" similarity=", np.shape(similarity))
if type == 'item':
"""
综合打分:
rating.dot(similarity) 表示:
某1个人所有的电影组合 X ·电影*电影·距离第1列都是关于第1部电影和其他的电影的距离计算出 第一个人对第1/2/3部电影的 总评分 1*n
某2个人所有的电影组合 X ·电影*电影·距离第1列都是关于第1部电影和其他的电影的距离计算出 第一个人对第1/2/3部电影的 总评分 1*n
...
某n个人所有的电影组合 X ·电影*电影·距离第1列都是关于第1部电影和其他的电影的距离计算出 第一个人对第1/2/3部电影的 总评分 1*n
= 人-电影-评分(943, 1682) * 电影-电影-距离(1682, 1682)
= 人-电影-总评分距离(943, 1682)
np.array([np.abs(similarity).sum(axis=1)]) 表示: 横向求和: 1 表示某一行所有的列求和
第1列表示某个A电影对于所有电影计算出A的总距离
第2列表示某个B电影对于所有电影的综出B的总距离
...
第n列表示某个N电影对于所有电影的综出N的总距离
= 每一个电影的总距离 (1, 1682)
pred = 人-电影-平均评分 (943, 1682)
"""
pred = rating.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
elif type == 'user':
# Subtracting each user's mean rating removes the common baseline and highlights individual differences.
# Compute every user's mean rating over all movies.
# Row-wise mean: axis=1 averages over all columns of a row.
mean_user_rating = rating.mean(axis=1)
# np.newaxis adds a dimension so the mean vector can be broadcast against the rating matrix.
rating_diff = (rating - mean_user_rating[:, np.newaxis])
# mean rating +
# user-user distances (943, 943) * user-movie rating diffs (943, 1682) = user-movie aggregated scores (943, 1682), then divided by each user's total distance to all other users = user-movie combined score
"""
综合打分:
similarity.dot(rating_diff) 表示:
第1列第1个人与其他人的相似度 * 人与电影的相似度,得到 第1个人对第1/2/3列电影的 总得分 1*n
第2列第2个人与其他人的相似度 * 人与电影的相似度,得到 第2个人对第1/2/3列电影的 总得分 1*n
...
第n列第n个人与其他人的相似度 * 人与电影的相似度,得到 第n个人对第1/2/3列电影的 总得分 1*n
= 人-人-距离(943, 943) * 人-电影-评分(943, 1682)
= 人-电影-总评分距离(943, 1682)
np.array([np.abs(similarity).sum(axis=1)]) 表示: 横向求和: 1 表示某一行所有的列求和
第1列表示第A个人对于所有人计算出A的总距离
第2列表示第B个人对于所有人计算出B的总距离
...
第n列表示第N个人对于所有人计算出N的总距离
= 每一个电影的总距离 (1, 943)
pred = 均值 + 人-电影-平均评分 (943, 1682)
"""
pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
return pred
def rmse(prediction, ground_truth):
prediction = prediction[ground_truth.nonzero()].flatten()
ground_truth = ground_truth[ground_truth.nonzero()].flatten()
return math.sqrt(mean_squared_error(prediction, ground_truth))
def evaluate(prediction, item_popular, name):
hit = 0
rec_count = 0
test_count = 0
popular_sum = 0
all_rec_items = set()
for u_index in range(n_users):
items = np.where(train_data_matrix[u_index, :] == 0)[0]
pre_items = sorted(
dict(zip(items, prediction[u_index, items])).items(),
key=itemgetter(1),
reverse=True)[:20]
test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
# Compare the recommended items against the items in the test set.
for item, _ in pre_items:
if item in test_items:
hit += 1
all_rec_items.add(item)
# popular_sum accumulates the (log) popularity of every recommended item.
if item in item_popular:
popular_sum += math.log(1 + item_popular[item])
rec_count += len(pre_items)
test_count += len(test_items)
precision = hit / (1.0 * rec_count)
# Recall: hits relative to the items in the test set.
recall = hit / (1.0 * test_count)
# Coverage: distinct recommended items relative to all popular items in the training set.
coverage = len(all_rec_items) / (1.0 * len(item_popular))
popularity = popular_sum / (1.0 * rec_count)
print('--- %s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
name, precision, recall, coverage, popularity), file=sys.stderr)
def recommend(u_index, prediction):
items = np.where(train_data_matrix[u_index, :] == 0)[0]
pre_items = sorted(
dict(zip(items, prediction[u_index, items])).items(),
key=itemgetter(1),
reverse=True)[:10]
test_items = np.where(test_data_matrix[u_index, :] != 0)[0]
result = [key for key, value in pre_items]
result.sort(reverse=False)
print('Ground truth (%s): %s' % (len(test_items), test_items))
print('Recommendations (%s): %s' % (len(result), result))
def main():
global n_users, train_data_matrix, test_data_matrix
# Memory-based collaborative filtering
# ...
# Split the dataset
# http://files.grouplens.org/datasets/movielens/ml-100k.zip
path_root = "/Users/jiangzl/work/data/机器学习"
dataFile = '%s/16.RecommenderSystems/ml-100k/u.data' % path_root
df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
# Compute similarities
train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
n_users, n_items, train_data, test_data)
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')
# Evaluation: root mean squared error (RMSE)
print('>>> Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
print('>>> User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
# Model-based collaborative filtering
# ...
# Compute the sparsity of the MovieLens dataset (n_users and n_items are constants, so fewer user interactions mean less information; the sparser the matrix, the more room there is for optimization).
sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
print('\nSparsity of MovieLens 100K: %s%%\n' % (sparsity * 100))
# # Search over k for the largest singular values/vectors of the sparse matrix (grid search commented out below)
# minrmse = math.inf
# index = 1
# for k in range(1, 30, 1):
# u, s, vt = svds(train_data_matrix, k=k)
# # print(">>> ", s)
# s_diag_matrix = np.diag(s)
# svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
# r_rmse = rmse(svd_prediction, test_data_matrix)
# if r_rmse < minrmse:
# index = k
# minrmse = r_rmse
index = 11
minrmse = 2.6717213264389765
u, s, vt = svds(train_data_matrix, k=index)
# print(">>> ", s)
s_diag_matrix = np.diag(s)
svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
r_rmse = rmse(svd_prediction, test_data_matrix)
print("+++ k=%s, svd-shape: %s" % (index, np.shape(svd_prediction)) )
print('>>> Model based CF RMSE: %s\n' % minrmse)
# """
# 在信息量相同的情况下,矩阵越小,那么携带的信息越可靠。
# 所以: user-cf 推荐效果高于 item-cf 而svd分解后发现15个维度效果就能达到90%以上,所以信息更可靠,效果也更好。
# item-cf: 1682
# user-cf: 943
# svd: 15
# """
evaluate(item_prediction, item_popular, 'item')
evaluate(user_prediction, item_popular, 'user')
evaluate(svd_prediction, item_popular, 'svd')
# Recommendation results
# recommend(1, item_prediction)
# recommend(1, user_prediction)
recommend(1, svd_prediction)
if __name__ == "__main__":
main()
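For orientation, here is a minimal, self-contained sketch (with made-up 3-user by 3-item ratings, not taken from the script above) of the three prediction schemes being compared; the shapes mirror the (943, 1682) MovieLens case:

import numpy as np
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import pairwise_distances
# Hypothetical toy ratings; rows are users, columns are items.
R = np.array([[5.0, 3.0, 0.0], [4.0, 0.0, 1.0], [1.0, 1.0, 5.0]])
# Item-based CF: (users x items) . (items x items), normalized by each item's total distance.
item_sim = pairwise_distances(R.T, metric="cosine")
item_pred = R.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)])
# User-based CF: add back each user's mean after weighting the mean-centered ratings by user-user distance.
user_sim = pairwise_distances(R, metric="cosine")
mean_user = R.mean(axis=1)
diff = R - mean_user[:, np.newaxis]
user_pred = mean_user[:, np.newaxis] + user_sim.dot(diff) / np.array([np.abs(user_sim).sum(axis=1)]).T
# Model-based CF: rank-k SVD reconstruction of the rating matrix.
u, s, vt = svds(R, k=2)
svd_pred = np.dot(np.dot(u, np.diag(s)), vt)
print(item_pred.shape, user_pred.shape, svd_pred.shape)   # (3, 3) each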

View File

@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
数据格式转化
"""
import os
import emoji
from middleware.utils import get_catalog_files
from config.setting import Config
tag_dic = {"实体对象": "ORG",
"正向观点": "Po_VIEW",
"中性观点": "Mi_VIEW",
"负向观点": "Ne_VIEW"}
# Convert annotations into a trainable format; each file ends with "END O".
def from_ann2dic(r_ann_path, r_txt_path, w_path):
q_dic = {}
print("开始读取文件:%s" % r_ann_path)
with open(r_ann_path, "r", encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
line_arr = line.split()
# print(">>> ", line_arr)
cls = tag_dic[line_arr[1]]
start_index = int(line_arr[2])
end_index = int(line_arr[3])
length = end_index - start_index
for r in range(length):
q_dic[start_index+r] = ("B-%s" % cls) if r == 0 else ("I-%s" % cls)
# Maps character offsets to their tags, e.g. {23: 'B-Ne_VIEW', 24: 'I-Ne_VIEW', 46: 'B-ORG', 47: 'I-ORG'}
print("q_dic: ", q_dic)
print("开始读取文件内容: %s" % r_txt_path)
with open(r_txt_path, "r", encoding="utf-8") as f:
content_str = f.read()
print("开始写入文本%s" % w_path)
with open(w_path, "w", encoding="utf-8") as w:
for i, strA in enumerate(content_str):
# print(">>> %s-%s" % (i, strA))
if strA == "\n":
w.write("\n")
else:
if i in q_dic:
tag = q_dic[i]
else:
tag = "O" # 大写字母O
w.write('%s %s\n' % (strA, tag))
w.write('%s\n' % "END O")
# Generate train.txt, dev.txt and test.txt.
# Files ending in 8-new.txt and 9-new.txt are used as dev and test respectively; the rest are merged into train.txt.
def create_train_data(data_root_dir, w_path):
if os.path.exists(w_path):
os.remove(w_path)
for file in os.listdir(data_root_dir):
path = os.path.join(data_root_dir, file)
if file.endswith("8-new.txt"):
# Rename to dev.txt
os.rename(path, os.path.join(data_root_dir, "dev.txt"))
continue
if file.endswith("9-new.txt"):
# Rename to test.txt
os.rename(path, os.path.join(data_root_dir, "test.txt"))
continue
q_list = []
print("开始读取文件:%s" % file)
with open(path, "r", encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
line = line.rstrip()
if line == "END O":
break
q_list.append(line)
# Collected list, e.g.: ['美 O', ' O', '气 O', '质 O', '特 O', '别 O', '好 O', '', '造 O', '型 O', '独 O', '特 O', ' O', '尺 B-ORG', '码 I-ORG', '偏 B-Ne_VIEW', '大 I-Ne_VIEW', ' O']
# print("q_list: ", q_list)
print("开始写入文本: %s" % w_path)
with open(w_path, "a", encoding="utf-8") as f:
for item in q_list:
f.write('%s\n' % item)
def brat_1_format_origin(catalog):
"""
Normalize the raw text to remove the influence of emoji (brat counts an emoji as 2 characters while Python counts 1). Uses the catalog argument for all paths.
"""
with open('%s/origin/origin.txt' % catalog, "r", encoding="utf-8") as f:
lines = f.readlines()
with open('%s/tag_befer/befer.txt' % catalog, "w", encoding="utf-8") as f:
# Convert the raw file
for line in lines:
text = emoji.demojize(line)
f.write('%s' % text)
# Create the new (empty) annotation file
with open('%s/tag_befer/befer.ann' % catalog, "w", encoding="utf-8") as f:
pass
def brat_2_create_train_data(catalog):
file_list = get_catalog_files("%s/tag_after" % catalog, status=-1, str1=".DS_Store")
file_list = list(set([i.split("/")[-1].split(".")[0] for i in file_list]))
print(file_list)
for filename in file_list:
r_ann_path = os.path.join(catalog, "tag_after/%s.ann" % filename)
r_txt_path = os.path.join(catalog, "tag_after/%s.txt" % filename)
w_path = os.path.join(catalog, "new/%s-new.txt" % filename)
print("filename", r_ann_path, r_txt_path, w_path)
from_ann2dic(r_ann_path, r_txt_path, w_path)
# Generate train.txt, dev.txt and test.txt
create_train_data("%s/new" % catalog, "%s/new/train.txt" % catalog)
def main():
catalog = Config.nlp_ner.path_root
# brat_1_format_origin(catalog)
brat_2_create_train_data(catalog)
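As a reference point, a minimal sketch (with a made-up brat annotation line, not from the real data) of the span-to-BIO mapping that from_ann2dic builds before writing the per-character output:

# Made-up annotation line for illustration only.
tag_dic = {"实体对象": "ORG", "正向观点": "Po_VIEW", "中性观点": "Mi_VIEW", "负向观点": "Ne_VIEW"}
ann_line = "T1\t实体对象 13 15\t尺码"   # entity span covering character offsets 13 and 14
line_arr = ann_line.split()             # ['T1', '实体对象', '13', '15', '尺码']
cls = tag_dic[line_arr[1]]
start_index, end_index = int(line_arr[2]), int(line_arr[3])
q_dic = {start_index + r: ("B-%s" % cls) if r == 0 else ("I-%s" % cls) for r in range(end_index - start_index)}
print(q_dic)                            # {13: 'B-ORG', 14: 'I-ORG'}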

View File

@@ -0,0 +1,165 @@
import pickle
import numpy as np
import pandas as pd
import platform
from collections import Counter
import keras
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dropout
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
"""
# padding: pre(默认) 向前补充0 post 向后补充0
# truncating: 文本超过 pad_num, pre(默认) 删除前面 post 删除后面
# x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post")
# print("--- ", x_train[0][:20])
使用keras_bert、keras_contrib的crf时bug记录
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match
解决方案, 修改crf.py 516行
mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
为:
mask2 = K.cast(K.concatenate([mask, K.cast(K.zeros_like(mask[:, :1]), mask.dtype)], axis=1),
"""
from keras.preprocessing.sequence import pad_sequences
from config.setting import Config
def load_data():
train = _parse_data(Config.nlp_ner.path_train)
test = _parse_data(Config.nlp_ner.path_test)
print("--- init 数据加载解析完成 ---")
# Counter({'的': 8, '中': 7, '致': 7, '党': 7})
word_counts = Counter(row[0].lower() for sample in train for row in sample)
vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
chunk_tags = Config.nlp_ner.chunk_tags
# Persist the filtered vocab and the corresponding chunk_tags.
with open(Config.nlp_ner.path_config, 'wb') as outp:
pickle.dump((vocab, chunk_tags), outp)
print("--- init 配置文件保存成功 ---")
train = _process_data(train, vocab, chunk_tags)
test = _process_data(test, vocab, chunk_tags)
print("--- init: data encoded into the format needed for training ---")
return train, test, (vocab, chunk_tags)
def _parse_data(filename):
"""
以单下划线开头_foo的代表不能直接访问的类属性
用于解析数据,用于模型训练
:param filename: 文件地址
:return: data: 解析数据后的结果
[[['', 'B-ORG'], ['', 'I-ORG']], [['', 'B-ORG'], ['', 'I-ORG']]]
"""
with open(filename, 'rb') as fn:
split_text = '\n'
# Split into sentences: each token sits on its own line, so a sentence boundary (an originally blank line) corresponds to two consecutive split_text.
texts = fn.read().decode('utf-8').strip().split(split_text + split_text)
# Each character is on its own line (split_text); the character and its tag are separated by a space.
# len(row) > 0 skips empty rows caused by two consecutive newlines.
# row.split() would drop a literal space character and lose space tokens, so such rows fall back to [" ", "O"].
data = [[[" ", "O"] if len(row.split()) != 2 else row.split() for row in text.split(split_text) if len(row) > 0] for text in texts]
# data = [[row.split() for row in text.split(split_text) if len(row.split()) == 2] for text in texts]
return data
def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
if maxlen is None:
maxlen = max(len(s) for s in data)
# Encode each character as an integer index.
word2idx = dict((w, i) for i, w in enumerate(vocab))
# Characters not in the vocab get the UNK index 1.
x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
x = pad_sequences(x, maxlen) # left padding
y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
if onehot:
# Return a one-hot encoded array.
y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
else:
# np.expand_dims: add a trailing dimension to the label array.
# https://blog.csdn.net/hong615771420/article/details/83448878
y_chunk = np.expand_dims(y_chunk, 2)
return x, y_chunk
def process_data(data, vocab, maxlen=100):
word2idx = dict((w, i) for i, w in enumerate(vocab))
x = [word2idx.get(w[0].lower(), 1) for w in data]
length = len(x)
x = pad_sequences([x], maxlen) # left padding
return x, length
def create_model(len_vocab, len_chunk_tags):
model = Sequential()
model.add(Embedding(len_vocab, Config.nlp_ner.EMBED_DIM, mask_zero=True)) # Random embedding
model.add(Bidirectional(LSTM(Config.nlp_ner.BiLSTM_UNITS // 2, return_sequences=True)))
model.add(Dropout(0.25))
crf = CRF(len_chunk_tags, sparse_target=True)
model.add(crf)
model.summary()
model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
# model.compile('rmsprop', loss=crf_loss, metrics=[crf_viterbi_accuracy])
# from keras.optimizers import Adam
# adam_lr = 0.0001
# adam_beta_1 = 0.5
# model.compile(optimizer=Adam(lr=adam_lr, beta_1=adam_beta_1), loss=crf_loss, metrics=[crf_viterbi_accuracy])
return model
def train():
(train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
model = create_model(len(vocab), len(chunk_tags))
# train model
model.fit(train_x, train_y, batch_size=16, epochs=Config.nlp_ner.EPOCHS, validation_data=[test_x, test_y])
model.save(Config.nlp_ner.path_model)
def test():
with open(Config.nlp_ner.path_config, 'rb') as inp:
(vocab, chunk_tags) = pickle.load(inp)
model = create_model(len(vocab), len(chunk_tags))
# predict_text = '造型独特,尺码偏大,估计是钉子头圆的半径的缘故'
with open(Config.nlp_ner.path_origin, "r", encoding="utf-8") as f:
lines = f.readlines()
for predict_text in lines:
content = predict_text.strip()
text_EMBED, length = process_data(content, vocab)
model.load_weights(Config.nlp_ner.path_model)
raw = model.predict(text_EMBED)[0][-length:]
pre_result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in pre_result]
# Collect the entities and opinions found in each sentence.
result = {}
tag_list = [i for i in chunk_tags if i not in ["O"]]
for word, t in zip(content, result_tags):
# print(word, t)
if t not in tag_list:
continue
for i in range(0, len(tag_list), 2):
if t in tag_list[i:i+2]:
# print("\n>>> %s---%s==%s" % (word, t, tag_list[i:i+2]))
tag = tag_list[i].split("-")[-1]
if tag not in result:
result[tag] = ""
result[tag] += ' '+word if t==tag_list[i] else word
print(result)
def main():
# print("--")
train()
test()
# if __name__ == "__main__":
# train()
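For clarity, a small sketch of the encoding plus left padding that _process_data performs. The vocab below is hypothetical (in the real pipeline it is built from character frequencies in load_data); index 1 plays the UNK role, matching word2idx.get(..., 1) above:

# Hypothetical vocab for illustration only.
from keras.preprocessing.sequence import pad_sequences
vocab = ['<pad>', '<unk>', '尺', '码', '偏', '大']
word2idx = dict((w, i) for i, w in enumerate(vocab))
sentence = "尺码超大"
x = [word2idx.get(w, 1) for w in sentence]   # '超' is out of vocab, so it maps to 1
print(x)                                     # [2, 3, 1, 5]
print(pad_sequences([x], maxlen=6))          # [[0 0 2 3 1 5]]  -- zeros padded on the left by default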

View File

@@ -0,0 +1,124 @@
#!/usr/bin/python
# coding: utf8
from math import log
def calcShannonEnt(dataSet):
"""calcShannonEnt(calculate Shannon entropy 计算label分类标签的香农熵)
Args:
dataSet 数据集
Returns:
返回香农熵的计算值
Raises:
"""
# The length of the list is the number of training samples.
numEntries = len(dataSet)
# print(type(dataSet), 'numEntries: ', numEntries)
# Count how often each class label occurs.
labelCounts = {}
# count the unique labels and their occurrences
for featVec in dataSet:
currentLabel = featVec[-1]
if currentLabel not in labelCounts.keys():
labelCounts[currentLabel] = 0
labelCounts[currentLabel] += 1
# print('-----', featVec, labelCounts)
# Compute the Shannon entropy from the label proportions.
shannonEnt = 0.0
for key in labelCounts:
prob = float(labelCounts[key])/numEntries
# log base 2
shannonEnt -= prob * log(prob, 2)
# print('---', prob, prob * log(prob, 2), shannonEnt)
return shannonEnt
def splitDataSet(dataSet, axis, value):
"""splitDataSet(通过遍历dataSet数据集求出axis对应的colnum列的值为value的行)
Args:
dataSet 数据集
axis 表示每一行的axis列
value 表示axis列对应的value值
Returns:
axis列为value的数据集【该数据集需要排除axis列】
Raises:
"""
retDataSet = []
for featVec in dataSet:
# Keep rows whose column axis equals value, excluding column axis itself.
if featVec[axis] == value:
# chop out axis used for splitting
reducedFeatVec = featVec[:axis]
'''
Note the difference between list.extend and list.append here.
'''
reducedFeatVec.extend(featVec[axis+1:])
# Collect the resulting row (with column axis removed).
retDataSet.append(reducedFeatVec)
return retDataSet
def getFeatureShannonEnt(dataSet, labels):
"""chooseBestFeatureToSplit(选择最好的特征)
Args:
dataSet 数据集
Returns:
bestFeature 最优的特征列
Raises:
"""
# Number of features = number of columns in the first row minus the label column.
numFeatures = len(dataSet[0]) - 1
# Entropy of the labels.
baseEntropy = calcShannonEnt(dataSet)
# Best information gain, best feature index, and the entropy after the best split.
bestInfoGain, bestFeature, endEntropy = 0.0, -1, 0.0
# iterate over all the features
for i in range(numFeatures):
# create a list of all the values of this feature
featList = [example[i] for example in dataSet]
# get the set of unique values
uniqueVals = set(featList)
# temporary entropy accumulator for this feature
newEntropy = 0.0
# Iterate over the values of this column and compute its conditional entropy.
for value in uniqueVals:
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet)/float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
# The larger the information gain, the more information this feature provides about the class and the less uncertainty remains.
# gain = 0 means the split leaves the entropy unchanged, i.e. this feature tells us nothing new about the class.
# gain = baseEntropy means the conditional entropy is 0, i.e. this feature determines the class completely.
infoGain = baseEntropy - newEntropy
# print(infoGain)
if (infoGain > bestInfoGain):
endEntropy = newEntropy
bestInfoGain = infoGain
bestFeature = i
else:
if numFeatures < 0:
labels[bestFeature] = 'null'
return labels[bestFeature], baseEntropy, endEntropy, bestInfoGain
if __name__ == '__main__':
labels = ['no surfacing', 'flippers']
dataSet1 = [['yes'], ['yes'], ['no'], ['no'], ['no']]
dataSet2 = [['a', 1, 'yes'], ['a', 2, 'yes'], ['b', 3, 'no'], ['c', 4, 'no'], ['c', 5, 'no']]
dataSet3 = [[1, 'yes'], [1, 'yes'], [1, 'no'], [3, 'no'], [3, 'no']]
infoGain1 = getFeatureShannonEnt(dataSet1, labels)
infoGain2 = getFeatureShannonEnt(dataSet2, labels)
infoGain3 = getFeatureShannonEnt(dataSet3, labels)
print('Information gain: \n\t%s, \n\t%s, \n\t%s' % (infoGain1, infoGain2, infoGain3))
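A quick sanity check of the entropy calcShannonEnt would report for dataSet2 (labels: 2 x 'yes', 3 x 'no'):

from math import log
# p(yes) = 2/5, p(no) = 3/5  ->  H = -(0.4*log2(0.4) + 0.6*log2(0.6)) ≈ 0.971
probs = [2 / 5.0, 3 / 5.0]
print(-sum(p * log(p, 2) for p in probs))   # 0.9709505944546686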

View File

@@ -0,0 +1,53 @@
#!/usr/bin/python
# coding:utf8
from __future__ import print_function
import os
import sklearn.datasets as datasets
def get_data(file_input, separator='\t'):
if 'libsvm' not in file_input:
file_input = other2libsvm(file_input, separator)
data = datasets.load_svmlight_file(file_input)
return data[0], data[1]
def other2libsvm(file_name, separator='\t'):
libsvm_name = file_name.replace('.txt', '.libsvm_tmp')
libsvm_data = open(libsvm_name, 'w')
file_data = open(file_name, 'r')
for line in file_data.readlines():
features = line.strip().split(separator)
# print len(features)
class_data = features[-1]
svm_format = ''
for i in range(len(features)-1):
svm_format += " %d:%s" % (i+1, features[i])
# print svm_format
svm_format = "%s%s\n" % (class_data, svm_format)
# print svm_format
libsvm_data.write(svm_format)
file_data.close()
libsvm_data.close()
return libsvm_name
def dump_data(x, y, file_output):
datasets.dump_svmlight_file(x, y, file_output)
os.remove("%s_tmp" % file_output)
if __name__ == "__main__":
file_input = "data/7.AdaBoost/horseColicTest2.txt"
file_output = "data/7.AdaBoost/horseColicTest2.libsvm"
# Load the dataset
x, y = get_data(file_input, separator='\t')
print(x[3, :])
print(y)
# Dump the data in libsvm format
dump_data(x, y, file_output)
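For reference, a line-level sketch of the conversion other2libsvm applies; the row below is made up (two feature values followed by the class label):

line = "2.0\t3.5\t1"
features = line.strip().split('\t')          # ['2.0', '3.5', '1']
class_data = features[-1]
svm_format = ''.join(" %d:%s" % (i + 1, features[i]) for i in range(len(features) - 1))
print("%s%s" % (class_data, svm_format))     # 1 1:2.0 2:3.5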