diff --git a/README.md b/README.md index 2cbec3af..b0764d0c 100644 --- a/README.md +++ b/README.md @@ -271,6 +271,9 @@ * Python 自然语言处理 第二版: * 推荐一个[liuhuanyong大佬](https://github.com/liuhuanyong)整理的nlp全面知识体系: * 开源 - 词向量库集合: + * + * + * * * * diff --git a/src/py3.x/tensorflow2.x/EmotionData.xlsx b/src/py3.x/tensorflow2.x/EmotionData.xlsx new file mode 100644 index 00000000..9f4083a2 Binary files /dev/null and b/src/py3.x/tensorflow2.x/EmotionData.xlsx differ diff --git a/src/py3.x/tensorflow2.x/EmotionData的副本.xlsx b/src/py3.x/tensorflow2.x/EmotionData的副本.xlsx new file mode 100644 index 00000000..5ae205fd Binary files /dev/null and b/src/py3.x/tensorflow2.x/EmotionData的副本.xlsx differ diff --git a/src/py3.x/tensorflow2.x/config.py b/src/py3.x/tensorflow2.x/config.py index 22df209d..8d149749 100644 --- a/src/py3.x/tensorflow2.x/config.py +++ b/src/py3.x/tensorflow2.x/config.py @@ -8,7 +8,14 @@ class Config(object): poetry_file = 'poetry.txt' weight_file = 'poetry_model.h5' + data_file = 'EmotionData.xlsx' + model_file = 'EmotionModel.h5' + vocab_list = 'vocal_list.pkl' + word_index = 'word_index.pkl' # 根据前六个字预测第七个字 max_len = 6 batch_size = 512 learning_rate = 0.001 + pre_num = 3 + MAX_SEQUENCE_LENGTH = 1000 # 每个文本或者句子的截断长度,只保留1000个单词 + EMBEDDING_DIM = 60 # 词向量维度 diff --git a/src/py3.x/tensorflow2.x/test.ipynb b/src/py3.x/tensorflow2.x/test.ipynb new file mode 100644 index 00000000..bfe5f3a5 --- /dev/null +++ b/src/py3.x/tensorflow2.x/test.ipynb @@ -0,0 +1,190 @@ +{ + "nbformat": 4, + "nbformat_minor": 2, + "metadata": { + "language_info": { + "name": "python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "version": "3.6.3" + }, + "orig_nbformat": 2, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "npconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": "/Users/jiangzl/.virtualenvs/python3.6/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n from ._conv import register_converters as _register_converters\nUsing TensorFlow backend.\n" + } + ], + "source": [ + "import sys\n", + "# 加载自定义包(添加:中间件)\n", + "sys.path.append(\"src/py3.x/tensorflow2.x\")\n", + "from text_Emotion import *" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "outfile = \"/opt/data/开源词向量/gensim_word2vec_60/Word60.model\"\n", + "# 加载词向量\n", + "Word2VecModel = loadMyWord2Vec(outfile)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "空间的词向量(60 维): (60,) [ 2.2506642 -1.7324443 0.35593075 -3.7236977 -0.6317619 2.1253817\n -0.8911206 0.61192095 -2.5709946 5.6513844 2.3008282 -4.102604\n -0.61898416 -1.1190889 -6.060641 2.3529105 1.8131357 2.0764832\n -2.102738 -0.414962 -2.0553887 0.37966883 -2.015982 -1.4542716\n 3.191199 0.3265181 0.7307454 1.4761372 -2.2383723 0.925493\n 6.2617674 -1.3852879 0.6405419 -0.5601632 -1.084447 5.689829\n 0.46593904 -2.824275 4.2015862 -0.87934065 1.518804 -1.493514\n -1.9851282 -0.63166183 0.96814466 1.6375747 1.1566993 1.1981301\n 0.7950756 -3.0055897 1.2649575 1.2099069 1.9403213 1.3719954\n 2.6494706 1.8465079 -0.5507954 -2.3987298 -1.8990258 -4.651662 ]\n打印与空间最相近的5个词语: [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n" + } + ], + "source": [ + "embeddings_matrix = load_embeding()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "--: [[ 0. 0. 0. ... 0. 0.\n 0. ]\n [ 3.6153059 2.63272738 -0.98327219 ... 0.03685202 -0.78566265\n 1.06350613]\n [ 0.21444647 2.58100891 0.08306306 ... -0.43973923 -0.2102039\n -1.37015963]\n ...\n [-1.07420349 1.90465117 2.2614491 ... -1.90614116 -0.34697708\n -2.43622112]\n [ 1.53204441 0.60434735 -0.02905927 ... -0.04591536 -0.63762575\n 0.29778937]\n [ 0.20260553 0.03990031 -0.22745971 ... 
-0.17701624 0.16334218\n 0.06799572]]\n" + } + ], + "source": [ + "print('--: ', embeddings_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import os\n", + "import keras\n", + "import random\n", + "import gensim\n", + "import numpy as np\n", + "import pandas as pd\n", + "from keras import Model\n", + "from keras.models import load_model\n", + "from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input\n", + "from keras.optimizers import Adam\n", + "# 该目录下的 config.py文件, 数据文件是: poetry.txt\n", + "from config import Config\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "空间的词向量(60 维): (60,) [ 2.2506642 -1.7324443 0.35593075 -3.7236977 -0.6317619 2.1253817\n -0.8911206 0.61192095 -2.5709946 5.6513844 2.3008282 -4.102604\n -0.61898416 -1.1190889 -6.060641 2.3529105 1.8131357 2.0764832\n -2.102738 -0.414962 -2.0553887 0.37966883 -2.015982 -1.4542716\n 3.191199 0.3265181 0.7307454 1.4761372 -2.2383723 0.925493\n 6.2617674 -1.3852879 0.6405419 -0.5601632 -1.084447 5.689829\n 0.46593904 -2.824275 4.2015862 -0.87934065 1.518804 -1.493514\n -1.9851282 -0.63166183 0.96814466 1.6375747 1.1566993 1.1981301\n 0.7950756 -3.0055897 1.2649575 1.2099069 1.9403213 1.3719954\n 2.6494706 1.8465079 -0.5507954 -2.3987298 -1.8990258 -4.651662 ]\n打印与空间最相近的5个词语: [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n" + }, + { + "ename": "NameError", + "evalue": "name 'load_data' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mEmotionModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mConfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m'''训练模型'''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0membeddings_matrix\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mload_embeding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 125\u001b[0;31m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 126\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'load_data' is not defined" + ] + } + ], + "source": [ + "model = EmotionModel(Config)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
", + "text/plain": " label comment\n0 1 距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...\n1 1 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!\n2 1 早餐太差,无论去多少人,那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。\n3 1 宾馆在小街道上,不大好找,但还好北京热心同胞很多~宾馆设施跟介绍的差不多,房间很小,确实挺小...\n4 1 CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风\n5 1 总的来说,这样的酒店配这样的价格还算可以,希望他赶快装修,给我的客人留些好的印象\n6 1 价格比比较不错的酒店。这次免费升级了,感谢前台服务员。房子还好,地毯是新的,比上次的好些。早...\n7 1 不错,在同等档次酒店中应该是值得推荐的!\n8 1 入住丽晶,感觉很好。因为是新酒店,的确有淡淡的油漆味,房间内较新。房间大小合适,卫生间设备齐...\n9 1 1。酒店比较新,装潢和设施还不错,只是房间有些油漆味。2。早餐还可以,只是品种不是很多。3。..." + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_excel(\"src/py3.x/tensorflow2.x/EmotionData.xlsx\", header=0, error_bad_lines=False, encoding=\"utf_8_sig\")\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]" + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = df[\"label\"].tolist()\n", + "y[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def func(line, ngrams=[]):\n", + " # 加入我们的组合词,保证分词的准确性\n", + " \n", + " if ngrams != []:\n", + " for word in ngrams:\n", + " jieba.add_word(\"\".join(word.lower()))\n", + " # # 将文本 ['1, 2, 3', '1, 2, .., n'] 分解为: [[1, 2, 3], [1, 2, .., n]]\n", + " words = [word for word in jieba.cut(str(line).lower(), cut_all=False)]\n", + " # print(\">>> \", train)\n", + " return \" \".join(words)\n", + "x = df[\"comment\"].apply(lambda line: func(line))\n" + ] + } + ] +} \ No newline at end of file diff --git a/src/py3.x/tensorflow2.x/text_Emotion.py b/src/py3.x/tensorflow2.x/text_Emotion.py new file mode 100644 index 00000000..c44e6f39 --- /dev/null +++ b/src/py3.x/tensorflow2.x/text_Emotion.py @@ -0,0 +1,212 @@ +# *-* coding:utf-8 *-* +# 词向量: +# https://www.cnblogs.com/Darwin2000/p/5786984.html +# 数据集: +# https://blog.csdn.net/alip39/article/details/95891321 +# 参考代码: +# https://blog.csdn.net/u012052268/article/details/90238282 +import re +import os +import keras +import random +import gensim +import numpy as np +import pandas as pd +import jieba +from sklearn.model_selection import train_test_split +from keras import Model +from keras.models import load_model +from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input +from keras.preprocessing.sequence import pad_sequences +from keras.utils.np_utils import to_categorical +from keras.optimizers import Adam +from config import Config +import pickle + + +# 存储模型: 持久化 +def load_pkl(filename): + with open(filename, 'rb') as fr: + model = pickle.load(fr) + return model + + +def save_pkl(model, filename): + with open(filename, 'wb') as fw: + pickle.dump(model, fw) + + +## 训练自己的词向量,并保存。 +def trainWord2Vec(infile, outfile): + sentences = gensim.models.word2vec.LineSentence(infile) # 读取分词后的 文本 + model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=1, workers=4) # 训练模型 + model.save(outfile) + + +def loadMyWord2Vec(outfile): + # 导入 预训练的词向量 + Word2VecModel = gensim.models.Word2Vec.load(outfile) + return Word2VecModel + + +def load_embeding(): + # 训练词向量(用空格隔开的文本) + infile = "./CarCommentAll_cut.csv" + outfile = "gensim_word2vec_60/Word60.model" + # trainWord2Vec(infile, outfile) + # 加载词向量 + Word2VecModel = loadMyWord2Vec(outfile) + + print('空间的词向量(60 维):', Word2VecModel.wv['空间'].shape, Word2VecModel.wv['空间']) + print('打印与空间最相近的5个词语:', 
Word2VecModel.wv.most_similar('空间', topn=5)) + + ## 2 构造包含所有词语的 list,以及初始化 “词语-序号”字典 和 “词向量”矩阵 + vocab_list = [word for word, Vocab in Word2VecModel.wv.vocab.items()]# 存储 所有的 词语 + + word_index = {" ": 0}# 初始化 `[word : token]` ,后期 tokenize 语料库就是用该词典。 + word_vector = {} # 初始化`[word : vector]`字典 + + # 初始化存储所有向量的大矩阵,留意其中多一位(首行),词向量全为 0,用于 padding补零。 + # 行数 为 所有单词数+1 比如 10000+1 ; 列数为 词向量“维度”比如60。 + embeddings_matrix = np.zeros((len(vocab_list) + 1, Word2VecModel.vector_size)) + + ## 3 填充 上述 的字典 和 大矩阵 + for i in range(len(vocab_list)): + # print(i) + word = vocab_list[i] # 每个词语 + word_index[word] = i + 1 # 词语:序号 + word_vector[word] = Word2VecModel.wv[word] # 词语:词向量 + embeddings_matrix[i + 1] = Word2VecModel.wv[word] # 词向量矩阵 + print("加载词向量结束..") + return vocab_list, word_index, embeddings_matrix + + +class EmotionModel(object): + def __init__(self, config): + self.model = None + self.config = config + self.pre_num = self.config.pre_num + self.data_file = self.config.data_file + self.vocab_list = self.config.vocab_list + self.word_index = self.config.word_index + self.EMBEDDING_DIM = self.config.EMBEDDING_DIM + self.MAX_SEQUENCE_LENGTH = self.config.MAX_SEQUENCE_LENGTH + + # 如果模型文件存在则直接加载模型,否则开始训练 + if os.path.exists(self.config.model_file): + self.model = load_model(self.config.model_file) + self.model.summary() + else: + self.train() + + def build_model(self, embeddings_matrix): + ## 4 在 keras的Embedding层中使用 预训练词向量 + embedding_layer = Embedding( + input_dim = len(embeddings_matrix), # 字典长度 + output_dim = self.EMBEDDING_DIM, # 词向量 长度(60) + weights = [embeddings_matrix], # 重点:预训练的词向量系数 + input_length = self.MAX_SEQUENCE_LENGTH, # 每句话的 最大长度(必须padding) + trainable = False # 是否在 训练的过程中 更新词向量 + ) + # 如果不加载外界的,可以自己训练 + # 可以看出在使用 Keras的中Embedding层时候,不指定参数 weights=[embeddings_matrix] 即可自动生成词向量。 + # embedding_layer = Embedding( + # input_dim = len(word_index) + 1, # 由于 没有预训练,设置+1 + # output_dim = EMBEDDING_DIM, # 设置词向量的维度 + # input_length=MAX_SEQUENCE_LENGTH + # ) #设置句子的最大长度 + + print("开始训练模型.....") + # 使用 + sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32') # 返回一个张量,长度为1000,也就是模型的输入为batch_size*1000 + embedded_sequences = embedding_layer(sequence_input) # 返回batch_size*1000*100 + x = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences) + x = Dropout(0.6)(x) + x = Flatten()(x) + preds = Dense(self.pre_num, activation='softmax')(x) + self.model = Model(sequence_input, preds) + # 设置优化器 + optimizer = Adam(lr=self.config.learning_rate) + self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy']) + self.model.summary() + + def load_word2jieba(self): + vocab_list = load_pkl(self.vocab_list) + if vocab_list != []: + for word in vocab_list: + jieba.add_word(word) + + def predict(self, line): + '''预测''' + word_index = load_pkl(self.word_index) + STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"] + words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS] + indexs = [word_index.get(word, 0) for word in words] + x_pred = pad_sequences([indexs], maxlen=self.MAX_SEQUENCE_LENGTH) + res = self.model.predict(x_pred, verbose=0)[0] + return res + + def load_data(self, word_index, vocab_list, test_size=0.25): + STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"] + if vocab_list != []: + for word in vocab_list: + jieba.add_word(word) + + def func(line): + # 将文本 ['1, 2, 3', '1, 2, .., n'] 分解为: [[1, 2, 3], [1, 2, .., n]] + words = [word for word in jieba.cut(str(line), 
cut_all=False) if word not in STOPWORDS] + indexs = [word_index.get(word, 0) for word in words] + return indexs + + df = pd.read_excel(self.data_file, header=0, error_bad_lines=False, encoding="utf_8_sig") + x = df["comment"].apply(lambda line: func(line)).tolist() + x = pad_sequences(x, maxlen=self.MAX_SEQUENCE_LENGTH) + y = df["label"].tolist() + # 按照大小和顺序,生成 label(0,1,2...自然数类型) + """ + In [7]: to_categorical(np.asarray([1,1,0,1,3])) + Out[7]: + array([[0., 1., 0., 0.], + [0., 1., 0., 0.], + [1., 0., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 0., 1.]], dtype=float32) + """ + y = to_categorical(np.asarray(y)) + x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10000) + return (x_train, y_train), (x_test, y_test) + + + def train(self): + '''训练模型''' + vocab_list, word_index, embeddings_matrix = load_embeding() + save_pkl(vocab_list, self.vocab_list) + save_pkl(word_index, self.word_index) + (x_train, y_train), (x_test, y_test) = self.load_data(word_index, vocab_list) + print("---------") + print(x_train[:3], "\n", y_train[:3]) + print("\n") + print(x_test[:3], "\n", y_test[:3]) + print("---------") + self.build_model(embeddings_matrix) + self.model.fit(x_train, y_train, batch_size=60, epochs=10) + self.model.evaluate(x_test, y_test, verbose=2) + self.model.save(self.config.model_file) + + +if __name__ == '__main__': + # 测试加载外界word2vec词向量 + # vocab_list, word_index, embeddings_matrix = load_embeding() + model = EmotionModel(Config) + status = False + while 1: + text = input("text:") + if text in ["exit", "quit"]: + break + # 首次启动加载jieba词库 + if not status: + model.load_word2jieba() + status = True + res = model.predict(text) + label_dic = {0:"消极的", 1:"中性的", 2:"积极的"} + print(res, " : ", label_dic[np.argmax(res)])