mirror of
https://github.com/apachecn/ailearning.git
synced 2026-04-07 12:39:28 +08:00
@@ -271,6 +271,9 @@
|
||||
* Python 自然语言处理 第二版: <https://usyiyi.github.io/nlp-py-2e-zh>
|
||||
* 推荐一个[liuhuanyong大佬](https://github.com/liuhuanyong)整理的nlp全面知识体系: <https://liuhuanyong.github.io>
|
||||
* 开源 - 词向量库集合:
|
||||
* <https://www.cnblogs.com/Darwin2000/p/5786984.html>
|
||||
* <https://ai.tencent.com/ailab/nlp/embedding.html>
|
||||
* <https://blog.csdn.net/xiezj007/article/details/85073890>
|
||||
* <https://github.com/Embedding/Chinese-Word-Vectors>
|
||||
* <https://github.com/brightmart/nlp_chinese_corpus>
|
||||
* <https://github.com/codemayq/chinese_chatbot_corpus>
|
||||
|
||||
BIN
src/py3.x/tensorflow2.x/EmotionData.xlsx
Normal file
BIN
src/py3.x/tensorflow2.x/EmotionData.xlsx
Normal file
Binary file not shown.
BIN
src/py3.x/tensorflow2.x/EmotionData的副本.xlsx
Normal file
BIN
src/py3.x/tensorflow2.x/EmotionData的副本.xlsx
Normal file
Binary file not shown.
@@ -8,7 +8,14 @@
|
||||
class Config(object):
    """Hyper-parameters and artifact paths for the emotion classifier.

    NOTE(review): `poetry_file` / `weight_file` look like leftovers from a
    poetry-generation project and are not read by the emotion code below
    — confirm before removing.
    """

    poetry_file = 'poetry.txt'
    weight_file = 'poetry_model.h5'
    # Training corpus (Excel with `label` / `comment` columns) and outputs.
    data_file = 'EmotionData.xlsx'
    model_file = 'EmotionModel.h5'
    vocab_list = 'vocal_list.pkl'   # pickled vocabulary list
    word_index = 'word_index.pkl'   # pickled word -> matrix-index mapping
    # Predict the seventh character from the preceding six.
    max_len = 6
    batch_size = 512
    learning_rate = 0.001
    pre_num = 3                     # number of output classes
    MAX_SEQUENCE_LENGTH = 1000      # truncate each text to 1000 tokens
    EMBEDDING_DIM = 60              # word-vector dimensionality
|
||||
|
||||
190
src/py3.x/tensorflow2.x/test.ipynb
Normal file
190
src/py3.x/tensorflow2.x/test.ipynb
Normal file
@@ -0,0 +1,190 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2,
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"version": "3.6.3"
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"npconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": 3
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": "/Users/jiangzl/.virtualenvs/python3.6/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n from ._conv import register_converters as _register_converters\nUsing TensorFlow backend.\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"# 加载自定义包(添加:中间件)\n",
|
||||
"sys.path.append(\"src/py3.x/tensorflow2.x\")\n",
|
||||
"from text_Emotion import *"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"outfile = \"/opt/data/开源词向量/gensim_word2vec_60/Word60.model\"\n",
|
||||
"# 加载词向量\n",
|
||||
"Word2VecModel = loadMyWord2Vec(outfile)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": "空间的词向量(60 维): (60,) [ 2.2506642 -1.7324443 0.35593075 -3.7236977 -0.6317619 2.1253817\n -0.8911206 0.61192095 -2.5709946 5.6513844 2.3008282 -4.102604\n -0.61898416 -1.1190889 -6.060641 2.3529105 1.8131357 2.0764832\n -2.102738 -0.414962 -2.0553887 0.37966883 -2.015982 -1.4542716\n 3.191199 0.3265181 0.7307454 1.4761372 -2.2383723 0.925493\n 6.2617674 -1.3852879 0.6405419 -0.5601632 -1.084447 5.689829\n 0.46593904 -2.824275 4.2015862 -0.87934065 1.518804 -1.493514\n -1.9851282 -0.63166183 0.96814466 1.6375747 1.1566993 1.1981301\n 0.7950756 -3.0055897 1.2649575 1.2099069 1.9403213 1.3719954\n 2.6494706 1.8465079 -0.5507954 -2.3987298 -1.8990258 -4.651662 ]\n打印与空间最相近的5个词语: [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"embeddings_matrix = load_embeding()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": "--: [[ 0. 0. 0. ... 0. 0.\n 0. ]\n [ 3.6153059 2.63272738 -0.98327219 ... 0.03685202 -0.78566265\n 1.06350613]\n [ 0.21444647 2.58100891 0.08306306 ... -0.43973923 -0.2102039\n -1.37015963]\n ...\n [-1.07420349 1.90465117 2.2614491 ... -1.90614116 -0.34697708\n -2.43622112]\n [ 1.53204441 0.60434735 -0.02905927 ... -0.04591536 -0.63762575\n 0.29778937]\n [ 0.20260553 0.03990031 -0.22745971 ... -0.17701624 0.16334218\n 0.06799572]]\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('--: ', embeddings_matrix)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import os\n",
|
||||
"import keras\n",
|
||||
"import random\n",
|
||||
"import gensim\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from keras import Model\n",
|
||||
"from keras.models import load_model\n",
|
||||
"from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input\n",
|
||||
"from keras.optimizers import Adam\n",
|
||||
"# 该目录下的 config.py文件, 数据文件是: poetry.txt\n",
|
||||
"from config import Config\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": "空间的词向量(60 维): (60,) [ 2.2506642 -1.7324443 0.35593075 -3.7236977 -0.6317619 2.1253817\n -0.8911206 0.61192095 -2.5709946 5.6513844 2.3008282 -4.102604\n -0.61898416 -1.1190889 -6.060641 2.3529105 1.8131357 2.0764832\n -2.102738 -0.414962 -2.0553887 0.37966883 -2.015982 -1.4542716\n 3.191199 0.3265181 0.7307454 1.4761372 -2.2383723 0.925493\n 6.2617674 -1.3852879 0.6405419 -0.5601632 -1.084447 5.689829\n 0.46593904 -2.824275 4.2015862 -0.87934065 1.518804 -1.493514\n -1.9851282 -0.63166183 0.96814466 1.6375747 1.1566993 1.1981301\n 0.7950756 -3.0055897 1.2649575 1.2099069 1.9403213 1.3719954\n 2.6494706 1.8465079 -0.5507954 -2.3987298 -1.8990258 -4.651662 ]\n打印与空间最相近的5个词语: [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n"
|
||||
},
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'load_data' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-18-afd80ed77829>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mEmotionModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mConfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m'''训练模型'''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0membeddings_matrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_embeding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 125\u001b[0;31m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 126\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'load_data' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = EmotionModel(Config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>comment</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>0</td>\n <td>1</td>\n <td>距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...</td>\n </tr>\n <tr>\n <td>1</td>\n <td>1</td>\n <td>商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!</td>\n </tr>\n <tr>\n <td>2</td>\n <td>1</td>\n <td>早餐太差,无论去多少人,那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。</td>\n </tr>\n <tr>\n <td>3</td>\n <td>1</td>\n <td>宾馆在小街道上,不大好找,但还好北京热心同胞很多~宾馆设施跟介绍的差不多,房间很小,确实挺小...</td>\n </tr>\n <tr>\n <td>4</td>\n <td>1</td>\n <td>CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风</td>\n </tr>\n <tr>\n <td>5</td>\n <td>1</td>\n <td>总的来说,这样的酒店配这样的价格还算可以,希望他赶快装修,给我的客人留些好的印象</td>\n </tr>\n <tr>\n <td>6</td>\n <td>1</td>\n <td>价格比比较不错的酒店。这次免费升级了,感谢前台服务员。房子还好,地毯是新的,比上次的好些。早...</td>\n </tr>\n <tr>\n <td>7</td>\n <td>1</td>\n <td>不错,在同等档次酒店中应该是值得推荐的!</td>\n </tr>\n <tr>\n <td>8</td>\n <td>1</td>\n <td>入住丽晶,感觉很好。因为是新酒店,的确有淡淡的油漆味,房间内较新。房间大小合适,卫生间设备齐...</td>\n </tr>\n <tr>\n <td>9</td>\n <td>1</td>\n <td>1。酒店比较新,装潢和设施还不错,只是房间有些油漆味。2。早餐还可以,只是品种不是很多。3。...</td>\n </tr>\n </tbody>\n</table>\n</div>",
|
||||
"text/plain": " label comment\n0 1 距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...\n1 1 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!\n2 1 早餐太差,无论去多少人,那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。\n3 1 宾馆在小街道上,不大好找,但还好北京热心同胞很多~宾馆设施跟介绍的差不多,房间很小,确实挺小...\n4 1 CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风\n5 1 总的来说,这样的酒店配这样的价格还算可以,希望他赶快装修,给我的客人留些好的印象\n6 1 价格比比较不错的酒店。这次免费升级了,感谢前台服务员。房子还好,地毯是新的,比上次的好些。早...\n7 1 不错,在同等档次酒店中应该是值得推荐的!\n8 1 入住丽晶,感觉很好。因为是新酒店,的确有淡淡的油漆味,房间内较新。房间大小合适,卫生间设备齐...\n9 1 1。酒店比较新,装潢和设施还不错,只是房间有些油漆味。2。早餐还可以,只是品种不是很多。3。..."
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = pd.read_excel(\"src/py3.x/tensorflow2.x/EmotionData.xlsx\", header=0, error_bad_lines=False, encoding=\"utf_8_sig\")\n",
|
||||
"df.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
|
||||
},
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"y = df[\"label\"].tolist()\n",
|
||||
"y[:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"def func(line, ngrams=[]):\n",
|
||||
" # 加入我们的组合词,保证分词的准确性\n",
|
||||
" \n",
|
||||
" if ngrams != []:\n",
|
||||
" for word in ngrams:\n",
|
||||
" jieba.add_word(\"\".join(word.lower()))\n",
|
||||
" # # 将文本 ['1, 2, 3', '1, 2, .., n'] 分解为: [[1, 2, 3], [1, 2, .., n]]\n",
|
||||
" words = [word for word in jieba.cut(str(line).lower(), cut_all=False)]\n",
|
||||
" # print(\">>> \", train)\n",
|
||||
" return \" \".join(words)\n",
|
||||
"x = df[\"comment\"].apply(lambda line: func(line))\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
212
src/py3.x/tensorflow2.x/text_Emotion.py
Normal file
212
src/py3.x/tensorflow2.x/text_Emotion.py
Normal file
@@ -0,0 +1,212 @@
|
||||
# *-* coding:utf-8 *-*
|
||||
# 词向量:
|
||||
# https://www.cnblogs.com/Darwin2000/p/5786984.html
|
||||
# 数据集:
|
||||
# https://blog.csdn.net/alip39/article/details/95891321
|
||||
# 参考代码:
|
||||
# https://blog.csdn.net/u012052268/article/details/90238282
|
||||
import re
|
||||
import os
|
||||
import keras
|
||||
import random
|
||||
import gensim
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import jieba
|
||||
from sklearn.model_selection import train_test_split
|
||||
from keras import Model
|
||||
from keras.models import load_model
|
||||
from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input
|
||||
from keras.preprocessing.sequence import pad_sequences
|
||||
from keras.utils.np_utils import to_categorical
|
||||
from keras.optimizers import Adam
|
||||
from config import Config
|
||||
import pickle
|
||||
|
||||
|
||||
# 存储模型: 持久化
|
||||
def load_pkl(filename):
    """Deserialize and return the pickled object stored at *filename*."""
    with open(filename, 'rb') as handle:
        return pickle.load(handle)
|
||||
|
||||
|
||||
def save_pkl(model, filename):
    """Pickle *model* to *filename*, overwriting any existing file."""
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)
|
||||
|
||||
|
||||
## 训练自己的词向量,并保存。
|
||||
def trainWord2Vec(infile, outfile):
    """Train a Word2Vec model on a pre-tokenised corpus and save it.

    *infile* is one sentence per line, tokens separated by whitespace.
    NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x renamed it
    to `vector_size` — confirm the pinned gensim version.
    """
    corpus = gensim.models.word2vec.LineSentence(infile)
    w2v = gensim.models.Word2Vec(corpus, size=100, window=5, min_count=1, workers=4)
    w2v.save(outfile)
|
||||
|
||||
|
||||
def loadMyWord2Vec(outfile):
    """Load a previously saved gensim Word2Vec model from *outfile*."""
    return gensim.models.Word2Vec.load(outfile)
|
||||
|
||||
|
||||
def load_embeding():
    """Load the pre-trained word vectors and build index/matrix lookups.

    Returns:
        vocab_list: every word known to the Word2Vec model.
        word_index: word -> row index in the embedding matrix; index 0 is
            reserved for the all-zero padding row.
        embeddings_matrix: float matrix of shape
            (len(vocab_list) + 1, vector_size); row 0 stays zero for padding.

    NOTE(review): `Word2VecModel.wv.vocab` is the gensim 3.x API (4.x uses
    `wv.key_to_index`) — confirm the pinned gensim version.
    """
    # Corpus used when (re)training vectors; training is disabled here and
    # the already-trained model is loaded instead.
    infile = "./CarCommentAll_cut.csv"
    outfile = "gensim_word2vec_60/Word60.model"
    # trainWord2Vec(infile, outfile)
    Word2VecModel = loadMyWord2Vec(outfile)

    # Sanity output: one known vector and its nearest neighbours.
    print('空间的词向量(60 维):', Word2VecModel.wv['空间'].shape, Word2VecModel.wv['空间'])
    print('打印与空间最相近的5个词语:', Word2VecModel.wv.most_similar('空间', topn=5))

    # All words known to the model, in vocabulary-iteration order.
    vocab_list = [word for word in Word2VecModel.wv.vocab.keys()]

    # Row 0 of the matrix is all zeros and acts as the padding vector,
    # hence every real word maps to index i + 1.
    # (BUG FIX: the original also built a `word_vector` dict that was never
    # used or returned — dropped to avoid duplicating every vector in RAM.)
    word_index = {" ": 0}
    embeddings_matrix = np.zeros((len(vocab_list) + 1, Word2VecModel.vector_size))
    for i, word in enumerate(vocab_list):
        word_index[word] = i + 1
        embeddings_matrix[i + 1] = Word2VecModel.wv[word]

    print("加载词向量结束..")
    return vocab_list, word_index, embeddings_matrix
|
||||
|
||||
|
||||
class EmotionModel(object):
    """Bi-GRU sentiment classifier on top of pre-trained word vectors.

    On construction it loads `config.model_file` if it already exists;
    otherwise it trains a new model from `config.data_file` and saves it.
    """

    def __init__(self, config):
        self.model = None
        self.config = config
        self.pre_num = self.config.pre_num                    # number of classes
        self.data_file = self.config.data_file                # training Excel file
        self.vocab_list = self.config.vocab_list              # vocabulary pickle path
        self.word_index = self.config.word_index              # word->index pickle path
        self.EMBEDDING_DIM = self.config.EMBEDDING_DIM
        self.MAX_SEQUENCE_LENGTH = self.config.MAX_SEQUENCE_LENGTH

        # Load the persisted model when present, otherwise train from scratch.
        if os.path.exists(self.config.model_file):
            self.model = load_model(self.config.model_file)
            self.model.summary()
        else:
            self.train()

    def build_model(self, embeddings_matrix):
        """Build and compile the Bi-GRU network using *embeddings_matrix*."""
        # Embedding layer initialised from the pre-trained vectors and frozen.
        embedding_layer = Embedding(
            input_dim=len(embeddings_matrix),        # vocab size (+1 padding row)
            output_dim=self.EMBEDDING_DIM,           # word-vector length (60)
            weights=[embeddings_matrix],             # key point: pre-trained weights
            input_length=self.MAX_SEQUENCE_LENGTH,   # padded sentence length
            trainable=False,                         # keep the vectors fixed
        )
        # Without `weights=[...]` Keras would learn embeddings from scratch:
        # embedding_layer = Embedding(
        #     input_dim=len(word_index) + 1,   # +1 because there is no pre-training
        #     output_dim=EMBEDDING_DIM,
        #     input_length=MAX_SEQUENCE_LENGTH,
        # )

        print("开始训练模型.....")
        # Model input: batch_size x MAX_SEQUENCE_LENGTH integer token ids.
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        x = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
        x = Dropout(0.6)(x)
        x = Flatten()(x)
        preds = Dense(self.pre_num, activation='softmax')(x)
        self.model = Model(sequence_input, preds)
        optimizer = Adam(lr=self.config.learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        self.model.summary()

    def load_word2jieba(self):
        """Register the saved vocabulary with jieba so known words stay intact."""
        vocab_list = load_pkl(self.vocab_list)
        for word in vocab_list:
            jieba.add_word(word)

    def predict(self, line):
        """Return the class-probability vector for one raw text *line*."""
        word_index = load_pkl(self.word_index)
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
        # Unknown words map to index 0, the padding/unknown row.
        indexs = [word_index.get(word, 0) for word in words]
        x_pred = pad_sequences([indexs], maxlen=self.MAX_SEQUENCE_LENGTH)
        return self.model.predict(x_pred, verbose=0)[0]

    def load_data(self, word_index, vocab_list, test_size=0.25):
        """Tokenise the Excel corpus and split it into train/test sets.

        Returns ((x_train, y_train), (x_test, y_test)) where x is a padded
        index matrix and y is one-hot encoded.
        """
        STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
        # Teach jieba the word2vec vocabulary so segmentation matches it.
        for word in vocab_list:
            jieba.add_word(word)

        def func(line):
            # Segment one comment and map every token to its matrix index.
            words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
            return [word_index.get(word, 0) for word in words]

        # BUG FIX: `error_bad_lines` and `encoding` are `read_csv` parameters,
        # not `read_excel` ones; recent pandas raises TypeError on them, so
        # they are dropped here.
        df = pd.read_excel(self.data_file, header=0)
        x = df["comment"].apply(lambda line: func(line)).tolist()
        x = pad_sequences(x, maxlen=self.MAX_SEQUENCE_LENGTH)
        y = df["label"].tolist()
        # One-hot encode the natural-number labels, e.g.
        # to_categorical([1, 1, 0, 1, 3]) -> a 5x4 matrix of 0/1 rows.
        y = to_categorical(np.asarray(y))
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10000)
        return (x_train, y_train), (x_test, y_test)

    def train(self):
        """Train the network on the Excel corpus and persist all artifacts."""
        vocab_list, word_index, embeddings_matrix = load_embeding()
        save_pkl(vocab_list, self.vocab_list)
        save_pkl(word_index, self.word_index)
        (x_train, y_train), (x_test, y_test) = self.load_data(word_index, vocab_list)
        print("---------")
        print(x_train[:3], "\n", y_train[:3])
        print("\n")
        print(x_test[:3], "\n", y_test[:3])
        print("---------")
        self.build_model(embeddings_matrix)
        # NOTE(review): batch_size=60 here ignores Config.batch_size (512)
        # — confirm which value is intended.
        self.model.fit(x_train, y_train, batch_size=60, epochs=10)
        self.model.evaluate(x_test, y_test, verbose=2)
        self.model.save(self.config.model_file)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # To test loading the external word2vec vectors only:
    # vocab_list, word_index, embeddings_matrix = load_embeding()
    model = EmotionModel(Config)
    # Class-index -> human-readable label (hoisted out of the loop; it is
    # loop-invariant).
    label_dic = {0: "消极的", 1: "中性的", 2: "积极的"}
    jieba_ready = False
    while True:
        text = input("text:")
        if text in ["exit", "quit"]:
            break
        # Load the jieba user dictionary once, on the first query.
        if not jieba_ready:
            model.load_word2jieba()
            jieba_ready = True
        res = model.predict(text)
        print(res, " : ", label_dic[np.argmax(res)])
|
||||
Reference in New Issue
Block a user