mirror of https://github.com/apachecn/ailearning.git
synced 2026-04-01 17:50:49 +08:00

Move the named-entity recognition (NER) code to a new location
.gitignore (vendored, 1 line changed)

@@ -3,7 +3,6 @@ __pycache__/
 *.py[cod]
 *$py.class
 .vscode
-zh-NER

 # C extensions
 *.so
config/__init__.py (new file, 0 lines)

config/setting.py (new file, 27 lines)
@@ -0,0 +1,27 @@
# *-* coding:utf-8 *-*
'''
@author: 片刻
@date: 20200901 22:02
'''


class TextNER(object):
    DEBUG = False
    path_root = "/home/wac/jiangzhonglian"
    if DEBUG:
        path_root = "/Users/jiangzl/work/data/深度学习/nlp/命名实体识别/data"

    path_train = '%s/train_data.data' % path_root
    path_test = '%s/test_data.data' % path_root
    path_config = '%s/config.pkl' % path_root
    path_model = '%s/model.h5' % path_root

    # Number of training epochs
    EPOCHS = 3
    # Embedding dimension
    EMBED_DIM = 128
    # Number of BiLSTM units
    BiLSTM_UNITS = 128


class Config(object):
    nlp_ner = TextNER()
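For orientation, a minimal usage sketch of this settings object (assuming the repository root is on sys.path; the printed values just follow from the defaults above):

    from config.setting import Config

    cfg = Config.nlp_ner
    print(cfg.path_train)  # /home/wac/jiangzhonglian/train_data.data
    print(cfg.EPOCHS)      # 3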
requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
numpy
pandas
sklearn
keras
tensorflow
git+https://www.github.com/keras-team/keras-contrib.git
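Note: keras-contrib is not published on PyPI, which is why it is pulled straight from GitHub; with that caveat, pip install -r requirements.txt installs everything. (The sklearn entry is the PyPI alias for scikit-learn.)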
run_example.py (new file, 9 lines)
@@ -0,0 +1,9 @@
import tutorials.keras.text_NER as ft


def main():
    ft.main()


if __name__ == "__main__":
    main()
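Because run_example.py imports tutorials.keras.text_NER as a package, it is meant to be launched from the repository root (python run_example.py) so that the tutorials and config packages resolve.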
(deleted file, 126 lines removed)

@@ -1,126 +0,0 @@
import pickle
import numpy as np
import platform
from collections import Counter

from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
from keras.preprocessing.sequence import pad_sequences

EMBED_DIM = 200
BiRNN_UNITS = 200


def load_data():
    train = _parse_data(open('zh-NER/data/train_data.data', 'rb'))
    test = _parse_data(open('zh-NER/data/test_data.data', 'rb'))

    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]

    # save initial config data
    with open('zh-NER/model/config.pkl', 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    return train, test, (vocab, chunk_tags)


def _parse_data(fh):
    # On Windows the line break is '\r\n' and a blank line is '\r\n\r\n',
    # so the corresponding separator must be used on that platform
    if platform.system() == 'Windows':
        split_text = '\r\n'
    else:
        split_text = '\n'

    string = fh.read().decode('utf-8')
    data = [[row.split() for row in sample.split(split_text)]
            for sample in string.strip().split(split_text + split_text)]
    fh.close()
    return data


def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # set to <unk> (index 1) if not in vocab

    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding

    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length


def create_model(train=True):
    if train:
        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    else:
        with open('model/config.pkl', 'rb') as inp:
            (vocab, chunk_tags) = pickle.load(inp)
    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    if train:
        return model, (train_x, train_y), (test_x, test_y)
    else:
        return model, (vocab, chunk_tags)


def train():
    EPOCHS = 10
    model, (train_x, train_y), (test_x, test_y) = create_model()
    # train model
    model.fit(train_x, train_y, batch_size=16, epochs=EPOCHS, validation_data=[test_x, test_y])
    model.save('model/crf.h5')


def test():
    model, (vocab, chunk_tags) = create_model(train=False)
    predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
    str, length = process_data(predict_text, vocab)
    model.load_weights('model/crf.h5')
    raw = model.predict(str)[0][-length:]
    result = [np.argmax(row) for row in raw]
    result_tags = [chunk_tags[i] for i in result]

    per, loc, org = '', '', ''

    for s, t in zip(predict_text, result_tags):
        if t in ('B-PER', 'I-PER'):
            per += ' ' + s if (t == 'B-PER') else s
        if t in ('B-ORG', 'I-ORG'):
            org += ' ' + s if (t == 'B-ORG') else s
        if t in ('B-LOC', 'I-LOC'):
            loc += ' ' + s if (t == 'B-LOC') else s

    print(['person:' + per, 'location:' + loc, 'organization:' + org])


if __name__ == "__main__":
    train()
tutorials/__init__.py (new file, 0 lines)

tutorials/keras/__init__.py (new file, 0 lines)

tutorials/keras/text_NER.py (new file, 143 lines)
@@ -0,0 +1,143 @@
import pickle
import numpy as np
import pandas as pd
import platform
from collections import Counter
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
"""
# padding: 'pre' (default) pads zeros at the front; 'post' pads them at the end
# truncating: if the text exceeds pad_num, 'pre' (default) drops the front; 'post' drops the end
# x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post")
# print("--- ", x_train[0][:20])

Bug note for keras_contrib's CRF used together with keras_bert:
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match
Workaround: change line 516 of crf.py from
mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
to:
mask2 = K.cast(K.concatenate([mask, K.cast(K.zeros_like(mask[:, :1]), mask.dtype)], axis=1),
"""
from keras.preprocessing.sequence import pad_sequences
from config.setting import Config


def load_data():
    train = _parse_data(Config.nlp_ner.path_train)
    test = _parse_data(Config.nlp_ner.path_test)
    print("--- init: data loaded and parsed ---")

    # Counter({'的': 8, '中': 7, '致': 7, '党': 7})
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]

    # Save the filtered vocab and the corresponding chunk_tags
    with open(Config.nlp_ner.path_config, 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)
    print("--- init: config file saved ---")

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    print("--- init: data encoded into the format needed for training ---")
    return train, test, (vocab, chunk_tags)


def _parse_data(filename):
    """
    A leading single underscore (_foo) marks a member that is not meant to be accessed directly.
    Parses the raw data file for model training.
    :param filename: path of the data file
    :return: data: the parsed result, e.g.
        [[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ['国', 'I-ORG']]]
    """
    with open(filename, 'rb') as fn:
        split_text = '\n'
        # Sentence splitting: each sentence is a block of lines, so the blank line
        # between sentences shows up as two consecutive split_text separators
        texts = fn.read().decode('utf-8').strip().split(split_text + split_text)
        # Each character sits on its own line (split_text); within a line, the
        # character and its tag are separated by a space
        data = [[row.split() for row in text.split(split_text)] for text in texts]
    return data

def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)

    # Encode each character as an integer index
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # Characters missing from the vocab map to <unk>, index 1
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        # Return a one-hot encoded array
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        # np.expand_dims: adds an axis to the array
        # https://blog.csdn.net/hong615771420/article/details/83448878
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length


def create_model(len_vocab, len_chunk_tags):
    model = Sequential()
    model.add(Embedding(len_vocab, Config.nlp_ner.EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(Bidirectional(LSTM(Config.nlp_ner.BiLSTM_UNITS // 2, return_sequences=True)))
    crf = CRF(len_chunk_tags, sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model


def train():
    (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    model = create_model(len(vocab), len(chunk_tags))
    # train model
    model.fit(train_x, train_y, batch_size=16, epochs=Config.nlp_ner.EPOCHS, validation_data=[test_x, test_y])
    model.save(Config.nlp_ner.path_model)


def test():
    with open(Config.nlp_ner.path_config, 'rb') as inp:
        (vocab, chunk_tags) = pickle.load(inp)
    model = create_model(len(vocab), len(chunk_tags))
    predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
    text_EMBED, length = process_data(predict_text, vocab)
    model.load_weights(Config.nlp_ner.path_model)
    raw = model.predict(text_EMBED)[0][-length:]
    result = [np.argmax(row) for row in raw]
    result_tags = [chunk_tags[i] for i in result]

    per, loc, org = '', '', ''

    for s, t in zip(predict_text, result_tags):
        if t in ('B-PER', 'I-PER'):
            per += ' ' + s if (t == 'B-PER') else s
        if t in ('B-ORG', 'I-ORG'):
            org += ' ' + s if (t == 'B-ORG') else s
        if t in ('B-LOC', 'I-LOC'):
            loc += ' ' + s if (t == 'B-LOC') else s

    print(['person:' + per, 'location:' + loc, 'organization:' + org])


def main():
    # print("--")
    train()

# if __name__ == "__main__":
#     train()
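Two details of text_NER.py above are easiest to see on toy data. First, _parse_data expects one character and its tag per line, with a blank line between sentences; a minimal sketch of the same splitting logic on an in-memory string (the two-sentence sample is invented for illustration):

    raw = (
        '中 B-ORG\n共 I-ORG\n'  # sentence 1: one character and one tag per line
        '\n'                    # blank line separates sentences
        '中 B-ORG\n国 I-ORG'    # sentence 2
    )
    data = [[row.split() for row in text.split('\n')]
            for text in raw.strip().split('\n\n')]
    # data == [[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ['国', 'I-ORG']]]

Second, the loop in test() rebuilds entity strings from per-character BIO tags: a B- tag opens a new entity (prefixed with a space as separator) and an I- tag extends the current one. A self-contained sketch with hand-made tags (not real model output):

    text = '周恩来访问埃塞俄比亚'
    tags = ['B-PER', 'I-PER', 'I-PER', 'O', 'O',
            'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC']

    per, loc = '', ''
    for s, t in zip(text, tags):
        if t in ('B-PER', 'I-PER'):
            per += ' ' + s if t == 'B-PER' else s
        if t in ('B-LOC', 'I-LOC'):
            loc += ' ' + s if t == 'B-LOC' else s
    print('person:' + per, 'location:' + loc)  # person: 周恩来 location: 埃塞俄比亚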