From 41cc04a76a70dbc917691198960764c37059e9c0 Mon Sep 17 00:00:00 2001
From: jiangzhonglian
Date: Sun, 15 Mar 2020 14:34:02 +0800
Subject: [PATCH] Update the tf 2.0 named entity recognition example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/py3.x/tensorflow2.x/text_NER.py              | 126 ++++++++++++++++++
 .../zh-NER-keras-master/.gitignore               |  45 +++++++
 .../zh-NER-keras-master/README.md                |  35 +++++
 .../zh-NER-keras-master/bilsm_crf_model.py       |  27 ++++
 .../zh-NER-keras-master/process_data.py          |  66 +++++++++
 .../zh-NER-keras-master/train.py                 |   7 +
 .../tensorflow2.x/zh-NER-keras-master/val.py     |  23 ++++
 7 files changed, 329 insertions(+)
 create mode 100644 src/py3.x/tensorflow2.x/text_NER.py
 create mode 100644 src/py3.x/tensorflow2.x/zh-NER-keras-master/.gitignore
 create mode 100644 src/py3.x/tensorflow2.x/zh-NER-keras-master/README.md
 create mode 100644 src/py3.x/tensorflow2.x/zh-NER-keras-master/bilsm_crf_model.py
 create mode 100644 src/py3.x/tensorflow2.x/zh-NER-keras-master/process_data.py
 create mode 100644 src/py3.x/tensorflow2.x/zh-NER-keras-master/train.py
 create mode 100644 src/py3.x/tensorflow2.x/zh-NER-keras-master/val.py

diff --git a/src/py3.x/tensorflow2.x/text_NER.py b/src/py3.x/tensorflow2.x/text_NER.py
new file mode 100644
index 00000000..60726078
--- /dev/null
+++ b/src/py3.x/tensorflow2.x/text_NER.py
@@ -0,0 +1,126 @@
+import pickle
+import numpy as np
+import platform
+from collections import Counter
+
+from keras.models import Sequential
+from keras.layers import Embedding, Bidirectional, LSTM
+from keras_contrib.layers import CRF
+from keras.preprocessing.sequence import pad_sequences
+
+EMBED_DIM = 200
+BiRNN_UNITS = 200
+
+
+def load_data():
+    train = _parse_data(open('zh-NER/data/train_data.data', 'rb'))
+    test = _parse_data(open('zh-NER/data/test_data.data', 'rb'))
+
+    word_counts = Counter(row[0].lower() for sample in train for row in sample)
+    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
+    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]
+
+    # save initial config data
+    with open('zh-NER/model/config.pkl', 'wb') as outp:
+        pickle.dump((vocab, chunk_tags), outp)
+
+    train = _process_data(train, vocab, chunk_tags)
+    test = _process_data(test, vocab, chunk_tags)
+    return train, test, (vocab, chunk_tags)
+
+
+def _parse_data(fh):
+    # On Windows the newline is '\r\n', so sentences are separated by '\r\n\r\n';
+    # on other systems it is '\n', so choose the corresponding split token.
+
+    if platform.system() == 'Windows':
+        split_text = '\r\n'
+    else:
+        split_text = '\n'
+
+    string = fh.read().decode('utf-8')
+    data = [[row.split() for row in sample.split(split_text)] for
+            sample in
+            string.strip().split(split_text + split_text)]
+    fh.close()
+    return data
+
+
+def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
+    if maxlen is None:
+        maxlen = max(len(s) for s in data)
+    word2idx = dict((w, i) for i, w in enumerate(vocab))
+    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # map characters not in vocab to index 1
+
+    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
+
+    x = pad_sequences(x, maxlen)  # left padding
+
+    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
+
+    if onehot:
+        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
+    else:
+        y_chunk = np.expand_dims(y_chunk, 2)
+    return x, y_chunk
+
+
+def process_data(data, vocab, maxlen=100):
+    word2idx = dict((w, i) for i, w in enumerate(vocab))
+    x = [word2idx.get(w[0].lower(), 1) for w in data]
+    length = len(x)
+    x = pad_sequences([x], maxlen)  # left padding
+    return x, length
+
+
+def create_model(train=True):
+    if train:
+        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
+    else:
+        with open('zh-NER/model/config.pkl', 'rb') as inp:  # written by load_data()
+            (vocab, chunk_tags) = pickle.load(inp)
+    model = Sequential()
+    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
+    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))  # forward + backward outputs are concatenated
+    crf = CRF(len(chunk_tags), sparse_target=True)
+    model.add(crf)
+    model.summary()
+    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
+    if train:
+        return model, (train_x, train_y), (test_x, test_y)
+    else:
+        return model, (vocab, chunk_tags)
+
+
+def train():
+    EPOCHS = 10
+    model, (train_x, train_y), (test_x, test_y) = create_model()
+    # train the model
+    model.fit(train_x, train_y, batch_size=16, epochs=EPOCHS, validation_data=(test_x, test_y))
+    model.save('model/crf.h5')
+
+
+def test():
+    model, (vocab, chunk_tags) = create_model(train=False)
+    predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
+    sequence, length = process_data(predict_text, vocab)
+    model.load_weights('model/crf.h5')
+    raw = model.predict(sequence)[0][-length:]
+    result = [np.argmax(row) for row in raw]
+    result_tags = [chunk_tags[i] for i in result]
+
+    per, loc, org = '', '', ''
+
+    for s, t in zip(predict_text, result_tags):
+        if t in ('B-PER', 'I-PER'):
+            per += ' ' + s if (t == 'B-PER') else s
+        if t in ('B-ORG', 'I-ORG'):
+            org += ' ' + s if (t == 'B-ORG') else s
+        if t in ('B-LOC', 'I-LOC'):
+            loc += ' ' + s if (t == 'B-LOC') else s
+
+    print(['person:' + per, 'location:' + loc, 'organization:' + org])
+
+
+if __name__ == "__main__":
+    train()
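`_parse_data` above assumes the zh-NER corpus files are plain text with one character and one BIO tag per line and a blank line between sentences. A minimal sketch of that layout and of the nested structure the function builds from it; the characters and tags below are made up for illustration, not taken from the real corpus:

```python
# Illustrative two-column, blank-line-separated corpus layout (hypothetical content).
sample = (
    "周 B-PER\n"
    "恩 I-PER\n"
    "来 I-PER\n"
    "访 O\n"
    "问 O\n"
    "\n"
    "埃 B-LOC\n"
    "塞 I-LOC\n"
)

# Splitting on the blank line, then on whitespace, gives one list of
# [character, tag] rows per sentence -- the structure _parse_data returns.
sentences = [[row.split() for row in block.split("\n")]
             for block in sample.strip().split("\n\n")]
print(sentences[0])  # [['周', 'B-PER'], ['恩', 'I-PER'], ['来', 'I-PER'], ['访', 'O'], ['问', 'O']]
```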
diff --git a/src/py3.x/tensorflow2.x/zh-NER-keras-master/.gitignore b/src/py3.x/tensorflow2.x/zh-NER-keras-master/.gitignore
new file mode 100644
index 00000000..2eedd215
--- /dev/null
+++ b/src/py3.x/tensorflow2.x/zh-NER-keras-master/.gitignore
@@ -0,0 +1,45 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+.idea
\ No newline at end of file
diff --git a/src/py3.x/tensorflow2.x/zh-NER-keras-master/README.md b/src/py3.x/tensorflow2.x/zh-NER-keras-master/README.md
new file mode 100644
index 00000000..62afcf7b
--- /dev/null
+++ b/src/py3.x/tensorflow2.x/zh-NER-keras-master/README.md
@@ -0,0 +1,35 @@
+# zh-NER-keras
+> This project is a sample of Chinese Named Entity Recognition (NER) with Keras 2.1.4.
+
+## requirements
+* keras >= 2.1.4
+* keras-contrib 2.0.8 (https://github.com/keras-team/keras-contrib)
+* h5py
+* pickle
+
+## demo
+
+```bash
+python val.py
+```
+
+input:
+```text
+中华人民共和国国务院总理周恩来在外交部长陈毅,
+副部长王东的陪同下,
+连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚
+```
+output:
+```python
+['person: 周恩来 陈毅, 王东', 'location: 埃塞俄比亚 非洲 阿尔巴尼亚', 'organization: 中华人民共和国国务院 外交部']
+```
diff --git a/src/py3.x/tensorflow2.x/zh-NER-keras-master/bilsm_crf_model.py b/src/py3.x/tensorflow2.x/zh-NER-keras-master/bilsm_crf_model.py
new file mode 100644
index 00000000..c41f374f
--- /dev/null
+++ b/src/py3.x/tensorflow2.x/zh-NER-keras-master/bilsm_crf_model.py
@@ -0,0 +1,27 @@
+from keras.models import Sequential
+from keras.layers import Embedding, Bidirectional, LSTM
+from keras_contrib.layers import CRF
+import process_data
+import pickle
+
+EMBED_DIM = 200
+BiRNN_UNITS = 200
+
+
+def create_model(train=True):
+    if train:
+        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = process_data.load_data()
+    else:
+        with open('model/config.pkl', 'rb') as inp:
+            (vocab, chunk_tags) = pickle.load(inp)
+    model = Sequential()
+    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
+    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))  # forward + backward outputs are concatenated
+    crf = CRF(len(chunk_tags), sparse_target=True)
+    model.add(crf)
+    model.summary()
+    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
+    if train:
+        return model, (train_x, train_y), (test_x, test_y)
+    else:
+        return model, (vocab, chunk_tags)
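A note on the layer sizes in `create_model` above: `Bidirectional` concatenates the forward and backward LSTM outputs, so `BiRNN_UNITS // 2` units per direction give a 200-dimensional vector per character, which the CRF layer then scores jointly over the seven BIO tags. A minimal standalone sketch of the same stack; the vocabulary size of 3000 is a made-up placeholder (the real one comes from `config.pkl`):

```python
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF

VOCAB_SIZE, N_TAGS = 3000, 7  # placeholder vocab size; 7 = the BIO chunk tags used above

model = Sequential()
# (batch, seq_len) integer indices -> (batch, seq_len, 200); index 0 is treated as padding
model.add(Embedding(VOCAB_SIZE, 200, mask_zero=True))
# 100 units per direction, concatenated -> (batch, seq_len, 200)
model.add(Bidirectional(LSTM(100, return_sequences=True)))
# per-character scores over the 7 tags, decoded jointly by the CRF -> (batch, seq_len, 7)
crf = CRF(N_TAGS, sparse_target=True)
model.add(crf)
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()
```

keras-contrib is not published on PyPI; at the time, it was typically installed from GitHub, e.g. `pip install git+https://www.github.com/keras-team/keras-contrib.git`.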
diff --git a/src/py3.x/tensorflow2.x/zh-NER-keras-master/process_data.py b/src/py3.x/tensorflow2.x/zh-NER-keras-master/process_data.py
new file mode 100644
index 00000000..3ddd66a6
--- /dev/null
+++ b/src/py3.x/tensorflow2.x/zh-NER-keras-master/process_data.py
@@ -0,0 +1,66 @@
+import numpy
+from collections import Counter
+from keras.preprocessing.sequence import pad_sequences
+import pickle
+import platform
+
+
+def load_data():
+    train = _parse_data(open('data/train_data.data', 'rb'))
+    test = _parse_data(open('data/test_data.data', 'rb'))
+
+    word_counts = Counter(row[0].lower() for sample in train for row in sample)
+    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
+    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]
+
+    # save initial config data
+    with open('model/config.pkl', 'wb') as outp:
+        pickle.dump((vocab, chunk_tags), outp)
+
+    train = _process_data(train, vocab, chunk_tags)
+    test = _process_data(test, vocab, chunk_tags)
+    return train, test, (vocab, chunk_tags)
+
+
+def _parse_data(fh):
+    # On Windows the newline is '\r\n', so sentences are separated by '\r\n\r\n';
+    # on other systems it is '\n', so choose the corresponding split token.
+
+    if platform.system() == 'Windows':
+        split_text = '\r\n'
+    else:
+        split_text = '\n'
+
+    string = fh.read().decode('utf-8')
+    data = [[row.split() for row in sample.split(split_text)] for
+            sample in
+            string.strip().split(split_text + split_text)]
+    fh.close()
+    return data
+
+
+def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
+    if maxlen is None:
+        maxlen = max(len(s) for s in data)
+    word2idx = dict((w, i) for i, w in enumerate(vocab))
+    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # map characters not in vocab to index 1
+
+    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
+
+    x = pad_sequences(x, maxlen)  # left padding
+
+    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
+
+    if onehot:
+        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
+    else:
+        y_chunk = numpy.expand_dims(y_chunk, 2)
+    return x, y_chunk
+
+
+def process_data(data, vocab, maxlen=100):
+    word2idx = dict((w, i) for i, w in enumerate(vocab))
+    x = [word2idx.get(w[0].lower(), 1) for w in data]
+    length = len(x)
+    x = pad_sequences([x], maxlen)  # left padding
+    return x, length
diff --git a/src/py3.x/tensorflow2.x/zh-NER-keras-master/train.py b/src/py3.x/tensorflow2.x/zh-NER-keras-master/train.py
new file mode 100644
index 00000000..fcb5f6ac
--- /dev/null
+++ b/src/py3.x/tensorflow2.x/zh-NER-keras-master/train.py
@@ -0,0 +1,7 @@
+import bilsm_crf_model
+
+EPOCHS = 10
+model, (train_x, train_y), (test_x, test_y) = bilsm_crf_model.create_model()
+# train the model
+model.fit(train_x, train_y, batch_size=16, epochs=EPOCHS, validation_data=(test_x, test_y))
+model.save('model/crf.h5')
diff --git a/src/py3.x/tensorflow2.x/zh-NER-keras-master/val.py b/src/py3.x/tensorflow2.x/zh-NER-keras-master/val.py
new file mode 100644
index 00000000..0cdc5055
--- /dev/null
+++ b/src/py3.x/tensorflow2.x/zh-NER-keras-master/val.py
@@ -0,0 +1,23 @@
+import bilsm_crf_model
+import process_data
+import numpy as np
+
+model, (vocab, chunk_tags) = bilsm_crf_model.create_model(train=False)
+predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
+sequence, length = process_data.process_data(predict_text, vocab)
+model.load_weights('model/crf.h5')
+raw = model.predict(sequence)[0][-length:]
+result = [np.argmax(row) for row in raw]
+result_tags = [chunk_tags[i] for i in result]
+
+per, loc, org = '', '', ''
+
+for s, t in zip(predict_text, result_tags):
+    if t in ('B-PER', 'I-PER'):
+        per += ' ' + s if (t == 'B-PER') else s
+    if t in ('B-ORG', 'I-ORG'):
+        org += ' ' + s if (t == 'B-ORG') else s
+    if t in ('B-LOC', 'I-LOC'):
+        loc += ' ' + s if (t == 'B-LOC') else s
+
+print(['person:' + per, 'location:' + loc, 'organization:' + org])
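The loops in `val.py` and in `text_NER.py`'s `test()` above join all characters of each entity type into one space-separated string. A hypothetical helper (not part of the patch) that instead groups the per-character BIO tags into `(entity, type)` pairs, which is often easier to consume downstream:

```python
# Hypothetical post-processing helper: turn per-character BIO tags into
# (entity_text, entity_type) pairs. Orphan I- tags and tag-type switches
# simply close the current entity, which is enough for a sketch.
def collect_entities(text, tags):
    entities, current, current_type = [], '', None
    for ch, tag in zip(text, tags):
        if tag.startswith('B-'):
            if current:
                entities.append((current, current_type))
            current, current_type = ch, tag[2:]
        elif tag.startswith('I-') and current_type == tag[2:]:
            current += ch
        else:
            if current:
                entities.append((current, current_type))
            current, current_type = '', None
    if current:
        entities.append((current, current_type))
    return entities

print(collect_entities('周恩来在北京', ['B-PER', 'I-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']))
# [('周恩来', 'PER'), ('北京', 'LOC')]
```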