diff --git a/.gitignore b/.gitignore index 15d77707..5a190dbc 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ __pycache__/ *.py[cod] *$py.class .vscode -zh-NER # C extensions *.so diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/config/setting.py b/config/setting.py new file mode 100644 index 00000000..3d91adfd --- /dev/null +++ b/config/setting.py @@ -0,0 +1,27 @@ +# *-* coding:utf-8 *-* +''' +@author: 片刻 +@date: 20200901 22:02 +''' + +class TextNER(object): + DEBUG = False + path_root = "/home/wac/jiangzhonglian" + if DEBUG: + path_root = "/Users/jiangzl/work/data/深度学习/nlp/命名实体识别/data" + + path_train = '%s/train_data.data' % path_root + path_test = '%s/test_data.data' % path_root + path_config = '%s/config.pkl' % path_root + path_model = '%s/model.h5' % path_root + + # 迭代次数 + EPOCHS = 3 + # embedding的列数 + EMBED_DIM = 128 + # LSTM的列数 + BiLSTM_UNITS = 128 + + +class Config(object): + nlp_ner = TextNER() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..472afd61 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +numpy +pandas +sklearn +keras +tensorflow +git+https://www.github.com/keras-team/keras-contrib.git \ No newline at end of file diff --git a/run_example.py b/run_example.py new file mode 100644 index 00000000..7c0dc03b --- /dev/null +++ b/run_example.py @@ -0,0 +1,9 @@ +import tutorials.keras.text_NER as ft + + +def main(): + ft.main() + + +if __name__ == "__main__": + main() diff --git a/src/py3.x/tensorflow2.x/text_NER.py b/src/py3.x/tensorflow2.x/text_NER.py deleted file mode 100644 index 60726078..00000000 --- a/src/py3.x/tensorflow2.x/text_NER.py +++ /dev/null @@ -1,126 +0,0 @@ -import pickle -import numpy as np -import platform -from collections import Counter - -from keras.models import Sequential -from keras.layers import Embedding, Bidirectional, LSTM -from keras_contrib.layers import CRF -from keras.preprocessing.sequence import pad_sequences - -EMBED_DIM = 200 -BiRNN_UNITS = 200 - - - -def load_data(): - train = _parse_data(open('zh-NER/data/train_data.data', 'rb')) - test = _parse_data(open('zh-NER/data/test_data.data', 'rb')) - - word_counts = Counter(row[0].lower() for sample in train for row in sample) - vocab = [w for w, f in iter(word_counts.items()) if f >= 2] - chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"] - - # save initial config data - with open('zh-NER/model/config.pkl', 'wb') as outp: - pickle.dump((vocab, chunk_tags), outp) - - train = _process_data(train, vocab, chunk_tags) - test = _process_data(test, vocab, chunk_tags) - return train, test, (vocab, chunk_tags) - - -def _parse_data(fh): - # in windows the new line is '\r\n\r\n' the space is '\r\n' . so if you use windows system, - # you have to use recorsponding instructions - - if platform.system() == 'Windows': - split_text = '\r\n' - else: - split_text = '\n' - - string = fh.read().decode('utf-8') - data = [[row.split() for row in sample.split(split_text)] for - sample in - string.strip().split(split_text + split_text)] - fh.close() - return data - - -def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False): - if maxlen is None: - maxlen = max(len(s) for s in data) - word2idx = dict((w, i) for i, w in enumerate(vocab)) - x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data] # set to (index 1) if not in vocab - - y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data] - - x = pad_sequences(x, maxlen) # left padding - - y_chunk = pad_sequences(y_chunk, maxlen, value=-1) - - if onehot: - y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk] - else: - y_chunk = np.expand_dims(y_chunk, 2) - return x, y_chunk - - -def process_data(data, vocab, maxlen=100): - word2idx = dict((w, i) for i, w in enumerate(vocab)) - x = [word2idx.get(w[0].lower(), 1) for w in data] - length = len(x) - x = pad_sequences([x], maxlen) # left padding - return x, length - - -def create_model(train=True): - if train: - (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data() - else: - with open('model/config.pkl', 'rb') as inp: - (vocab, chunk_tags) = pickle.load(inp) - model = Sequential() - model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding - model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True))) - crf = CRF(len(chunk_tags), sparse_target=True) - model.add(crf) - model.summary() - model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy]) - if train: - return model, (train_x, train_y), (test_x, test_y) - else: - return model, (vocab, chunk_tags) - - -def train(): - EPOCHS = 10 - model, (train_x, train_y), (test_x, test_y) = create_model() - # train model - model.fit(train_x, train_y,batch_size=16,epochs=EPOCHS, validation_data=[test_x, test_y]) - model.save('model/crf.h5') - -def test(): - model, (vocab, chunk_tags) = create_model(train=False) - predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚' - str, length = process_data(predict_text, vocab) - model.load_weights('model/crf.h5') - raw = model.predict(str)[0][-length:] - result = [np.argmax(row) for row in raw] - result_tags = [chunk_tags[i] for i in result] - - per, loc, org = '', '', '' - - for s, t in zip(predict_text, result_tags): - if t in ('B-PER', 'I-PER'): - per += ' ' + s if (t == 'B-PER') else s - if t in ('B-ORG', 'I-ORG'): - org += ' ' + s if (t == 'B-ORG') else s - if t in ('B-LOC', 'I-LOC'): - loc += ' ' + s if (t == 'B-LOC') else s - - print(['person:' + per, 'location:' + loc, 'organzation:' + org]) - - -if __name__ == "__main__": - train() diff --git a/tutorials/__init__.py b/tutorials/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tutorials/keras/__init__.py b/tutorials/keras/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tutorials/keras/text_NER.py b/tutorials/keras/text_NER.py new file mode 100644 index 00000000..6ffa7c95 --- /dev/null +++ b/tutorials/keras/text_NER.py @@ -0,0 +1,143 @@ +import pickle +import numpy as np +import pandas as pd +import platform +from collections import Counter +from keras.models import Sequential +from keras.layers import Embedding, Bidirectional, LSTM +from keras_contrib.layers import CRF +""" +# padding: pre(默认) 向前补充0 post 向后补充0 +# truncating: 文本超过 pad_num, pre(默认) 删除前面 post 删除后面 +# x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post") +# print("--- ", x_train[0][:20]) + +使用keras_bert、keras_contrib的crf时bug记录 +TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match +解决方案, 修改crf.py 516行: +mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1), +为: +mask2 = K.cast(K.concatenate([mask, K.cast(K.zeros_like(mask[:, :1]), mask.dtype)], axis=1), +""" +from keras.preprocessing.sequence import pad_sequences +from config.setting import Config + + +def load_data(): + train = _parse_data(Config.nlp_ner.path_train) + test = _parse_data(Config.nlp_ner.path_test) + print("--- init 数据加载解析完成 ---") + + # Counter({'的': 8, '中': 7, '致': 7, '党': 7}) + word_counts = Counter(row[0].lower() for sample in train for row in sample) + vocab = [w for w, f in iter(word_counts.items()) if f >= 2] + chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"] + + # 存储保留的有效个数的 vovab 和 对应 chunk_tags + with open(Config.nlp_ner.path_config, 'wb') as outp: + pickle.dump((vocab, chunk_tags), outp) + print("--- init 配置文件保存成功 ---") + + train = _process_data(train, vocab, chunk_tags) + test = _process_data(test , vocab, chunk_tags) + print("--- init 对数据进行编码,生成训练需要的数据格式 ---") + return train, test, (vocab, chunk_tags) + + +def _parse_data(filename): + """ + 以单下划线开头(_foo)的代表不能直接访问的类属性 + 用于解析数据,用于模型训练 + :param filename: 文件地址 + :return: data: 解析数据后的结果 + [[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ['国', 'I-ORG']]] + """ + with open(filename, 'rb') as fn: + split_text = '\n' + # 主要是分句: split_text 默认每个句子都是一行,所以原来换行就需要 两个split_text + texts = fn.read().decode('utf-8').strip().split(split_text + split_text) + # 对于每个字需要 split_text, 而字的内部需要用空格分隔 + data = [[row.split() for row in text.split(split_text)] for text in texts] + return data + + +def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False): + if maxlen is None: + maxlen = max(len(s) for s in data) + + # 对每个字进行编码 + word2idx = dict((w, i) for i, w in enumerate(vocab)) + # 如果不在 vocab里面,就给 unk 值为 1 + x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data] + y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data] + + x = pad_sequences(x, maxlen) # left padding + y_chunk = pad_sequences(y_chunk, maxlen, value=-1) + + if onehot: + # 返回一个onehot 编码的多维数组 + y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk] + else: + # np.expand_dims:用于扩展数组的形状 + # https://blog.csdn.net/hong615771420/article/details/83448878 + y_chunk = np.expand_dims(y_chunk, 2) + return x, y_chunk + + +def process_data(data, vocab, maxlen=100): + word2idx = dict((w, i) for i, w in enumerate(vocab)) + x = [word2idx.get(w[0].lower(), 1) for w in data] + length = len(x) + x = pad_sequences([x], maxlen) # left padding + return x, length + + +def create_model(len_vocab, len_chunk_tags): + model = Sequential() + model.add(Embedding(len_vocab, Config.nlp_ner.EMBED_DIM, mask_zero=True)) # Random embedding + model.add(Bidirectional(LSTM(Config.nlp_ner.BiLSTM_UNITS // 2, return_sequences=True))) + crf = CRF(len_chunk_tags, sparse_target=True) + model.add(crf) + model.summary() + model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy]) + return model + + +def train(): + (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data() + model = create_model(len(vocab), len(chunk_tags)) + # train model + model.fit(train_x, train_y, batch_size=16, epochs=Config.nlp_ner.EPOCHS, validation_data=[test_x, test_y]) + model.save(Config.nlp_ner.path_model) + + +def test(): + with open(Config.nlp_ner.path_config, 'rb') as inp: + (vocab, chunk_tags) = pickle.load(inp) + model = create_model(len(vocab), len(chunk_tags)) + predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚' + text_EMBED, length = process_data(predict_text, vocab) + model.load_weights(Config.nlp_ner.path_model) + raw = model.predict(text_EMBED)[0][-length:] + result = [np.argmax(row) for row in raw] + result_tags = [chunk_tags[i] for i in result] + + per, loc, org = '', '', '' + + for s, t in zip(predict_text, result_tags): + if t in ('B-PER', 'I-PER'): + per += ' ' + s if (t == 'B-PER') else s + if t in ('B-ORG', 'I-ORG'): + org += ' ' + s if (t == 'B-ORG') else s + if t in ('B-LOC', 'I-LOC'): + loc += ' ' + s if (t == 'B-LOC') else s + + print(['person:' + per, 'location:' + loc, 'organzation:' + org]) + + +def main(): + # print("--") + train() + +# if __name__ == "__main__": +# train()