mirror of
https://github.com/apachecn/ailearning.git
synced 2026-04-07 12:39:28 +08:00
@@ -271,6 +271,9 @@
|
||||
* Python 自然语言处理 第二版: <https://usyiyi.github.io/nlp-py-2e-zh>
|
||||
* 推荐一个[liuhuanyong大佬](https://github.com/liuhuanyong)整理的nlp全面知识体系: <https://liuhuanyong.github.io>
|
||||
* 开源 - 词向量库集合:
|
||||
* <https://www.cnblogs.com/Darwin2000/p/5786984.html>
|
||||
* <https://ai.tencent.com/ailab/nlp/embedding.html>
|
||||
* <https://blog.csdn.net/xiezj007/article/details/85073890>
|
||||
* <https://github.com/Embedding/Chinese-Word-Vectors>
|
||||
* <https://github.com/brightmart/nlp_chinese_corpus>
|
||||
* <https://github.com/codemayq/chinese_chatbot_corpus>
|
||||
|
||||
BIN
src/py3.x/tensorflow2.x/EmotionData.xlsx
Normal file
BIN
src/py3.x/tensorflow2.x/EmotionData.xlsx
Normal file
Binary file not shown.
BIN
src/py3.x/tensorflow2.x/EmotionData的副本.xlsx
Normal file
BIN
src/py3.x/tensorflow2.x/EmotionData的副本.xlsx
Normal file
Binary file not shown.
@@ -8,7 +8,14 @@
|
||||
class Config(object):
    """Hyper-parameters and artifact paths for the emotion classifier.

    NOTE(review): `poetry_file` / `weight_file` look like leftovers from a
    poetry-generation project and are not read by the emotion code below
    — confirm before removing.
    """

    poetry_file = 'poetry.txt'
    weight_file = 'poetry_model.h5'
    # Training corpus (Excel with `label` / `comment` columns) and outputs.
    data_file = 'EmotionData.xlsx'
    model_file = 'EmotionModel.h5'
    vocab_list = 'vocal_list.pkl'   # pickled vocabulary list
    word_index = 'word_index.pkl'   # pickled word -> matrix-index mapping
    # Predict the seventh character from the preceding six.
    max_len = 6
    batch_size = 512
    learning_rate = 0.001
    pre_num = 3                     # number of output classes
    MAX_SEQUENCE_LENGTH = 1000      # truncate each text to 1000 tokens
    EMBEDDING_DIM = 60              # word-vector dimensionality
|
||||
|
||||
190
src/py3.x/tensorflow2.x/test.ipynb
Normal file
190
src/py3.x/tensorflow2.x/test.ipynb
Normal file
@@ -0,0 +1,190 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2,
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"version": "3.6.3"
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"npconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": 3
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": "/Users/jiangzl/.virtualenvs/python3.6/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n from ._conv import register_converters as _register_converters\nUsing TensorFlow backend.\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"# 加载自定义包(添加:中间件)\n",
|
||||
"sys.path.append(\"src/py3.x/tensorflow2.x\")\n",
|
||||
"from text_Emotion import *"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"outfile = \"/opt/data/开源词向量/gensim_word2vec_60/Word60.model\"\n",
|
||||
"# 加载词向量\n",
|
||||
"Word2VecModel = loadMyWord2Vec(outfile)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": "空间的词向量(60 维): (60,) [ 2.2506642 -1.7324443 0.35593075 -3.7236977 -0.6317619 2.1253817\n -0.8911206 0.61192095 -2.5709946 5.6513844 2.3008282 -4.102604\n -0.61898416 -1.1190889 -6.060641 2.3529105 1.8131357 2.0764832\n -2.102738 -0.414962 -2.0553887 0.37966883 -2.015982 -1.4542716\n 3.191199 0.3265181 0.7307454 1.4761372 -2.2383723 0.925493\n 6.2617674 -1.3852879 0.6405419 -0.5601632 -1.084447 5.689829\n 0.46593904 -2.824275 4.2015862 -0.87934065 1.518804 -1.493514\n -1.9851282 -0.63166183 0.96814466 1.6375747 1.1566993 1.1981301\n 0.7950756 -3.0055897 1.2649575 1.2099069 1.9403213 1.3719954\n 2.6494706 1.8465079 -0.5507954 -2.3987298 -1.8990258 -4.651662 ]\n打印与空间最相近的5个词语: [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"embeddings_matrix = load_embeding()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": "--: [[ 0. 0. 0. ... 0. 0.\n 0. ]\n [ 3.6153059 2.63272738 -0.98327219 ... 0.03685202 -0.78566265\n 1.06350613]\n [ 0.21444647 2.58100891 0.08306306 ... -0.43973923 -0.2102039\n -1.37015963]\n ...\n [-1.07420349 1.90465117 2.2614491 ... -1.90614116 -0.34697708\n -2.43622112]\n [ 1.53204441 0.60434735 -0.02905927 ... -0.04591536 -0.63762575\n 0.29778937]\n [ 0.20260553 0.03990031 -0.22745971 ... -0.17701624 0.16334218\n 0.06799572]]\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('--: ', embeddings_matrix)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import os\n",
|
||||
"import keras\n",
|
||||
"import random\n",
|
||||
"import gensim\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from keras import Model\n",
|
||||
"from keras.models import load_model\n",
|
||||
"from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input\n",
|
||||
"from keras.optimizers import Adam\n",
|
||||
"# 该目录下的 config.py文件, 数据文件是: poetry.txt\n",
|
||||
"from config import Config\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": "空间的词向量(60 维): (60,) [ 2.2506642 -1.7324443 0.35593075 -3.7236977 -0.6317619 2.1253817\n -0.8911206 0.61192095 -2.5709946 5.6513844 2.3008282 -4.102604\n -0.61898416 -1.1190889 -6.060641 2.3529105 1.8131357 2.0764832\n -2.102738 -0.414962 -2.0553887 0.37966883 -2.015982 -1.4542716\n 3.191199 0.3265181 0.7307454 1.4761372 -2.2383723 0.925493\n 6.2617674 -1.3852879 0.6405419 -0.5601632 -1.084447 5.689829\n 0.46593904 -2.824275 4.2015862 -0.87934065 1.518804 -1.493514\n -1.9851282 -0.63166183 0.96814466 1.6375747 1.1566993 1.1981301\n 0.7950756 -3.0055897 1.2649575 1.2099069 1.9403213 1.3719954\n 2.6494706 1.8465079 -0.5507954 -2.3987298 -1.8990258 -4.651662 ]\n打印与空间最相近的5个词语: [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n"
|
||||
},
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'load_data' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-18-afd80ed77829>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mEmotionModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mConfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m'''训练模型'''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0membeddings_matrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_embeding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 125\u001b[0;31m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 126\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'load_data' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = EmotionModel(Config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>comment</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>0</td>\n <td>1</td>\n <td>距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...</td>\n </tr>\n <tr>\n <td>1</td>\n <td>1</td>\n <td>商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!</td>\n </tr>\n <tr>\n <td>2</td>\n <td>1</td>\n <td>早餐太差,无论去多少人,那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。</td>\n </tr>\n <tr>\n <td>3</td>\n <td>1</td>\n <td>宾馆在小街道上,不大好找,但还好北京热心同胞很多~宾馆设施跟介绍的差不多,房间很小,确实挺小...</td>\n </tr>\n <tr>\n <td>4</td>\n <td>1</td>\n <td>CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风</td>\n </tr>\n <tr>\n <td>5</td>\n <td>1</td>\n <td>总的来说,这样的酒店配这样的价格还算可以,希望他赶快装修,给我的客人留些好的印象</td>\n </tr>\n <tr>\n <td>6</td>\n <td>1</td>\n <td>价格比比较不错的酒店。这次免费升级了,感谢前台服务员。房子还好,地毯是新的,比上次的好些。早...</td>\n </tr>\n <tr>\n <td>7</td>\n <td>1</td>\n <td>不错,在同等档次酒店中应该是值得推荐的!</td>\n </tr>\n <tr>\n <td>8</td>\n <td>1</td>\n <td>入住丽晶,感觉很好。因为是新酒店,的确有淡淡的油漆味,房间内较新。房间大小合适,卫生间设备齐...</td>\n </tr>\n <tr>\n <td>9</td>\n <td>1</td>\n <td>1。酒店比较新,装潢和设施还不错,只是房间有些油漆味。2。早餐还可以,只是品种不是很多。3。...</td>\n </tr>\n </tbody>\n</table>\n</div>",
|
||||
"text/plain": " label comment\n0 1 距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...\n1 1 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!\n2 1 早餐太差,无论去多少人,那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。\n3 1 宾馆在小街道上,不大好找,但还好北京热心同胞很多~宾馆设施跟介绍的差不多,房间很小,确实挺小...\n4 1 CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风\n5 1 总的来说,这样的酒店配这样的价格还算可以,希望他赶快装修,给我的客人留些好的印象\n6 1 价格比比较不错的酒店。这次免费升级了,感谢前台服务员。房子还好,地毯是新的,比上次的好些。早...\n7 1 不错,在同等档次酒店中应该是值得推荐的!\n8 1 入住丽晶,感觉很好。因为是新酒店,的确有淡淡的油漆味,房间内较新。房间大小合适,卫生间设备齐...\n9 1 1。酒店比较新,装潢和设施还不错,只是房间有些油漆味。2。早餐还可以,只是品种不是很多。3。..."
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = pd.read_excel(\"src/py3.x/tensorflow2.x/EmotionData.xlsx\", header=0, error_bad_lines=False, encoding=\"utf_8_sig\")\n",
|
||||
"df.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
|
||||
},
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"y = df[\"label\"].tolist()\n",
|
||||
"y[:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"def func(line, ngrams=[]):\n",
|
||||
" # 加入我们的组合词,保证分词的准确性\n",
|
||||
" \n",
|
||||
" if ngrams != []:\n",
|
||||
" for word in ngrams:\n",
|
||||
" jieba.add_word(\"\".join(word.lower()))\n",
|
||||
" # # 将文本 ['1, 2, 3', '1, 2, .., n'] 分解为: [[1, 2, 3], [1, 2, .., n]]\n",
|
||||
" words = [word for word in jieba.cut(str(line).lower(), cut_all=False)]\n",
|
||||
" # print(\">>> \", train)\n",
|
||||
" return \" \".join(words)\n",
|
||||
"x = df[\"comment\"].apply(lambda line: func(line))\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
212
src/py3.x/tensorflow2.x/text_Emotion.py
Normal file
212
src/py3.x/tensorflow2.x/text_Emotion.py
Normal file
@@ -0,0 +1,212 @@
|
||||
# *-* coding:utf-8 *-*
|
||||
# 词向量:
|
||||
# https://www.cnblogs.com/Darwin2000/p/5786984.html
|
||||
# 数据集:
|
||||
# https://blog.csdn.net/alip39/article/details/95891321
|
||||
# 参考代码:
|
||||
# https://blog.csdn.net/u012052268/article/details/90238282
|
||||
import re
|
||||
import os
|
||||
import keras
|
||||
import random
|
||||
import gensim
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import jieba
|
||||
from sklearn.model_selection import train_test_split
|
||||
from keras import Model
|
||||
from keras.models import load_model
|
||||
from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input
|
||||
from keras.preprocessing.sequence import pad_sequences
|
||||
from keras.utils.np_utils import to_categorical
|
||||
from keras.optimizers import Adam
|
||||
from config import Config
|
||||
import pickle
|
||||
|
||||
|
||||
# 存储模型: 持久化
|
||||
def load_pkl(filename):
    """Deserialize and return the pickled object stored at *filename*."""
    with open(filename, 'rb') as handle:
        return pickle.load(handle)
|
||||
|
||||
|
||||
def save_pkl(model, filename):
    """Pickle *model* to *filename*, overwriting any existing file."""
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)
|
||||
|
||||
|
||||
## 训练自己的词向量,并保存。
|
||||
def trainWord2Vec(infile, outfile):
    """Train a Word2Vec model on a pre-tokenised corpus and save it.

    *infile* is one sentence per line, tokens separated by whitespace.
    NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x renamed it
    to `vector_size` — confirm the pinned gensim version.
    """
    corpus = gensim.models.word2vec.LineSentence(infile)
    w2v = gensim.models.Word2Vec(corpus, size=100, window=5, min_count=1, workers=4)
    w2v.save(outfile)
|
||||
|
||||
|
||||
def loadMyWord2Vec(outfile):
    """Load a previously saved gensim Word2Vec model from *outfile*."""
    return gensim.models.Word2Vec.load(outfile)
|
||||
|
||||
|
||||
def load_embeding():
    """Load the pre-trained word vectors and build index/matrix lookups.

    Returns:
        vocab_list: every word known to the Word2Vec model.
        word_index: word -> row index in the embedding matrix; index 0 is
            reserved for the all-zero padding row.
        embeddings_matrix: float matrix of shape
            (len(vocab_list) + 1, vector_size); row 0 stays zero for padding.

    NOTE(review): `Word2VecModel.wv.vocab` is the gensim 3.x API (4.x uses
    `wv.key_to_index`) — confirm the pinned gensim version.
    """
    # Corpus used when (re)training vectors; training is disabled here and
    # the already-trained model is loaded instead.
    infile = "./CarCommentAll_cut.csv"
    outfile = "gensim_word2vec_60/Word60.model"
    # trainWord2Vec(infile, outfile)
    Word2VecModel = loadMyWord2Vec(outfile)

    # Sanity output: one known vector and its nearest neighbours.
    print('空间的词向量(60 维):', Word2VecModel.wv['空间'].shape, Word2VecModel.wv['空间'])
    print('打印与空间最相近的5个词语:', Word2VecModel.wv.most_similar('空间', topn=5))

    # All words known to the model, in vocabulary-iteration order.
    vocab_list = [word for word in Word2VecModel.wv.vocab.keys()]

    # Row 0 of the matrix is all zeros and acts as the padding vector,
    # hence every real word maps to index i + 1.
    # (BUG FIX: the original also built a `word_vector` dict that was never
    # used or returned — dropped to avoid duplicating every vector in RAM.)
    word_index = {" ": 0}
    embeddings_matrix = np.zeros((len(vocab_list) + 1, Word2VecModel.vector_size))
    for i, word in enumerate(vocab_list):
        word_index[word] = i + 1
        embeddings_matrix[i + 1] = Word2VecModel.wv[word]

    print("加载词向量结束..")
    return vocab_list, word_index, embeddings_matrix
|
||||
|
||||
|
||||
class EmotionModel(object):
    """Bi-GRU sentiment classifier on top of pre-trained word vectors.

    On construction it loads `config.model_file` if it already exists;
    otherwise it trains a new model from `config.data_file` and saves it.
    """

    def __init__(self, config):
        self.model = None
        self.config = config
        self.pre_num = self.config.pre_num                    # number of classes
        self.data_file = self.config.data_file                # training Excel file
        self.vocab_list = self.config.vocab_list              # vocabulary pickle path
        self.word_index = self.config.word_index              # word->index pickle path
        self.EMBEDDING_DIM = self.config.EMBEDDING_DIM
        self.MAX_SEQUENCE_LENGTH = self.config.MAX_SEQUENCE_LENGTH

        # Load the persisted model when present, otherwise train from scratch.
        if os.path.exists(self.config.model_file):
            self.model = load_model(self.config.model_file)
            self.model.summary()
        else:
            self.train()

    def build_model(self, embeddings_matrix):
        """Build and compile the Bi-GRU network using *embeddings_matrix*."""
        # Embedding layer initialised from the pre-trained vectors and frozen.
        embedding_layer = Embedding(
            input_dim=len(embeddings_matrix),        # vocab size (+1 padding row)
            output_dim=self.EMBEDDING_DIM,           # word-vector length (60)
            weights=[embeddings_matrix],             # key point: pre-trained weights
            input_length=self.MAX_SEQUENCE_LENGTH,   # padded sentence length
            trainable=False,                         # keep the vectors fixed
        )
        # Without `weights=[...]` Keras would learn embeddings from scratch:
        # embedding_layer = Embedding(
        #     input_dim=len(word_index) + 1,   # +1 because there is no pre-training
        #     output_dim=EMBEDDING_DIM,
        #     input_length=MAX_SEQUENCE_LENGTH,
        # )

        print("开始训练模型.....")
        # Model input: batch_size x MAX_SEQUENCE_LENGTH integer token ids.
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        x = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
        x = Dropout(0.6)(x)
        x = Flatten()(x)
        preds = Dense(self.pre_num, activation='softmax')(x)
        self.model = Model(sequence_input, preds)
        optimizer = Adam(lr=self.config.learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        self.model.summary()

    def load_word2jieba(self):
        """Register the saved vocabulary with jieba so known words stay intact."""
        vocab_list = load_pkl(self.vocab_list)
        for word in vocab_list:
            jieba.add_word(word)

    def predict(self, line):
        """Return the class-probability vector for one raw text *line*."""
        word_index = load_pkl(self.word_index)
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
        # Unknown words map to index 0, the padding/unknown row.
        indexs = [word_index.get(word, 0) for word in words]
        x_pred = pad_sequences([indexs], maxlen=self.MAX_SEQUENCE_LENGTH)
        return self.model.predict(x_pred, verbose=0)[0]

    def load_data(self, word_index, vocab_list, test_size=0.25):
        """Tokenise the Excel corpus and split it into train/test sets.

        Returns ((x_train, y_train), (x_test, y_test)) where x is a padded
        index matrix and y is one-hot encoded.
        """
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        # Teach jieba the word2vec vocabulary so segmentation matches it.
        for word in vocab_list:
            jieba.add_word(word)

        def func(line):
            # Segment one comment and map every token to its matrix index.
            words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
            return [word_index.get(word, 0) for word in words]

        # BUG FIX: `error_bad_lines` and `encoding` are `read_csv` parameters,
        # not `read_excel` ones; recent pandas raises TypeError on them, so
        # they are dropped here.
        df = pd.read_excel(self.data_file, header=0)
        x = df["comment"].apply(lambda line: func(line)).tolist()
        x = pad_sequences(x, maxlen=self.MAX_SEQUENCE_LENGTH)
        y = df["label"].tolist()
        # One-hot encode the natural-number labels, e.g.
        # to_categorical([1, 1, 0, 1, 3]) -> a 5x4 matrix of 0/1 rows.
        y = to_categorical(np.asarray(y))
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10000)
        return (x_train, y_train), (x_test, y_test)

    def train(self):
        """Train the network on the Excel corpus and persist all artifacts."""
        vocab_list, word_index, embeddings_matrix = load_embeding()
        save_pkl(vocab_list, self.vocab_list)
        save_pkl(word_index, self.word_index)
        (x_train, y_train), (x_test, y_test) = self.load_data(word_index, vocab_list)
        print("---------")
        print(x_train[:3], "\n", y_train[:3])
        print("\n")
        print(x_test[:3], "\n", y_test[:3])
        print("---------")
        self.build_model(embeddings_matrix)
        # NOTE(review): batch_size=60 here ignores Config.batch_size (512)
        # — confirm which value is intended.
        self.model.fit(x_train, y_train, batch_size=60, epochs=10)
        self.model.evaluate(x_test, y_test, verbose=2)
        self.model.save(self.config.model_file)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # To test loading the external word2vec vectors only:
    # vocab_list, word_index, embeddings_matrix = load_embeding()
    model = EmotionModel(Config)
    # Class-index -> human-readable label (hoisted out of the loop; it is
    # loop-invariant).
    label_dic = {0: "消极的", 1: "中性的", 2: "积极的"}
    jieba_ready = False
    while True:
        text = input("text:")
        if text in ["exit", "quit"]:
            break
        # Load the jieba user dictionary once, on the first query.
        if not jieba_ready:
            model.load_word2jieba()
            jieba_ready = True
        res = model.predict(text)
        print(res, " : ", label_dic[np.argmax(res)])
|
||||
Reference in New Issue
Block a user