mirror of https://github.com/apachecn/ailearning.git
synced 2026-04-01 17:50:49 +08:00

Move the named-entity recognition (NER) code to a new location
.gitignore (vendored, 1 line changed)

@@ -3,7 +3,6 @@ __pycache__/
 *.py[cod]
 *$py.class
 .vscode
-zh-NER

 # C extensions
 *.so
config/__init__.py (new file, 0 lines)

config/setting.py (new file, 27 lines)
@@ -0,0 +1,27 @@
# *-* coding:utf-8 *-*
'''
@author: 片刻
@date: 20200901 22:02
'''


class TextNER(object):
    DEBUG = False
    path_root = "/home/wac/jiangzhonglian"
    if DEBUG:
        path_root = "/Users/jiangzl/work/data/深度学习/nlp/命名实体识别/data"

    path_train = '%s/train_data.data' % path_root
    path_test = '%s/test_data.data' % path_root
    path_config = '%s/config.pkl' % path_root
    path_model = '%s/model.h5' % path_root

    # Number of training epochs
    EPOCHS = 3
    # Embedding dimension
    EMBED_DIM = 128
    # Number of BiLSTM units
    BiLSTM_UNITS = 128


class Config(object):
    nlp_ner = TextNER()
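For orientation, a minimal usage sketch of this settings object (assuming the repository root is on sys.path; the printed values just follow from the defaults above):

    from config.setting import Config

    cfg = Config.nlp_ner
    print(cfg.path_train)  # /home/wac/jiangzhonglian/train_data.data
    print(cfg.EPOCHS)      # 3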
requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
numpy
pandas
sklearn
keras
tensorflow
git+https://www.github.com/keras-team/keras-contrib.git
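Note: keras-contrib is not published on PyPI, which is why it is pulled straight from GitHub; with that caveat, pip install -r requirements.txt installs everything. (The sklearn entry is the PyPI alias for scikit-learn.)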
run_example.py (new file, 9 lines)
@@ -0,0 +1,9 @@
import tutorials.keras.text_NER as ft


def main():
    ft.main()


if __name__ == "__main__":
    main()
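Because run_example.py imports tutorials.keras.text_NER as a package, it is meant to be launched from the repository root (python run_example.py) so that the tutorials and config packages resolve.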
(deleted file, 126 lines removed)

@@ -1,126 +0,0 @@
import pickle
import numpy as np
import platform
from collections import Counter

from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
from keras.preprocessing.sequence import pad_sequences

EMBED_DIM = 200
BiRNN_UNITS = 200


def load_data():
    train = _parse_data(open('zh-NER/data/train_data.data', 'rb'))
    test = _parse_data(open('zh-NER/data/test_data.data', 'rb'))

    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]

    # save initial config data
    with open('zh-NER/model/config.pkl', 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    return train, test, (vocab, chunk_tags)


def _parse_data(fh):
    # On Windows the line break is '\r\n' and a blank line is '\r\n\r\n',
    # so the corresponding separator must be used on that platform
    if platform.system() == 'Windows':
        split_text = '\r\n'
    else:
        split_text = '\n'

    string = fh.read().decode('utf-8')
    data = [[row.split() for row in sample.split(split_text)]
            for sample in string.strip().split(split_text + split_text)]
    fh.close()
    return data


def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # set to <unk> (index 1) if not in vocab

    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding

    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length


def create_model(train=True):
    if train:
        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    else:
        with open('model/config.pkl', 'rb') as inp:
            (vocab, chunk_tags) = pickle.load(inp)
    model = Sequential()
    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    if train:
        return model, (train_x, train_y), (test_x, test_y)
    else:
        return model, (vocab, chunk_tags)


def train():
    EPOCHS = 10
    model, (train_x, train_y), (test_x, test_y) = create_model()
    # train model
    model.fit(train_x, train_y, batch_size=16, epochs=EPOCHS, validation_data=[test_x, test_y])
    model.save('model/crf.h5')


def test():
    model, (vocab, chunk_tags) = create_model(train=False)
    predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
    str, length = process_data(predict_text, vocab)
    model.load_weights('model/crf.h5')
    raw = model.predict(str)[0][-length:]
    result = [np.argmax(row) for row in raw]
    result_tags = [chunk_tags[i] for i in result]

    per, loc, org = '', '', ''

    for s, t in zip(predict_text, result_tags):
        if t in ('B-PER', 'I-PER'):
            per += ' ' + s if (t == 'B-PER') else s
        if t in ('B-ORG', 'I-ORG'):
            org += ' ' + s if (t == 'B-ORG') else s
        if t in ('B-LOC', 'I-LOC'):
            loc += ' ' + s if (t == 'B-LOC') else s

    print(['person:' + per, 'location:' + loc, 'organization:' + org])


if __name__ == "__main__":
    train()
tutorials/__init__.py (new file, 0 lines)

tutorials/keras/__init__.py (new file, 0 lines)

tutorials/keras/text_NER.py (new file, 143 lines)
@@ -0,0 +1,143 @@
import pickle
import numpy as np
import pandas as pd
import platform
from collections import Counter
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
"""
# padding: 'pre' (default) pads zeros at the front; 'post' pads them at the end
# truncating: if the text exceeds pad_num, 'pre' (default) drops the front; 'post' drops the end
# x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post")
# print("--- ", x_train[0][:20])

Bug note for keras_contrib's CRF used together with keras_bert:
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match
Workaround: change line 516 of crf.py from
mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
to:
mask2 = K.cast(K.concatenate([mask, K.cast(K.zeros_like(mask[:, :1]), mask.dtype)], axis=1),
"""
from keras.preprocessing.sequence import pad_sequences
from config.setting import Config


def load_data():
    train = _parse_data(Config.nlp_ner.path_train)
    test = _parse_data(Config.nlp_ner.path_test)
    print("--- init: data loaded and parsed ---")

    # Counter({'的': 8, '中': 7, '致': 7, '党': 7})
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]

    # Save the filtered vocab and the corresponding chunk_tags
    with open(Config.nlp_ner.path_config, 'wb') as outp:
        pickle.dump((vocab, chunk_tags), outp)
    print("--- init: config file saved ---")

    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    print("--- init: data encoded into the format needed for training ---")
    return train, test, (vocab, chunk_tags)


def _parse_data(filename):
    """
    A leading single underscore (_foo) marks a member that is not meant to be accessed directly.
    Parses the raw data file for model training.
    :param filename: path of the data file
    :return: data: the parsed result, e.g.
        [[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ['国', 'I-ORG']]]
    """
    with open(filename, 'rb') as fn:
        split_text = '\n'
        # Sentence splitting: each sentence is a block of lines, so the blank line
        # between sentences shows up as two consecutive split_text separators
        texts = fn.read().decode('utf-8').strip().split(split_text + split_text)
        # Each character sits on its own line (split_text); within a line, the
        # character and its tag are separated by a space
        data = [[row.split() for row in text.split(split_text)] for text in texts]
    return data

def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)

    # Encode each character as an integer index
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # Characters missing from the vocab map to <unk>, index 1
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        # Return a one-hot encoded array
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        # np.expand_dims: adds an axis to the array
        # https://blog.csdn.net/hong615771420/article/details/83448878
        y_chunk = np.expand_dims(y_chunk, 2)
    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length


def create_model(len_vocab, len_chunk_tags):
    model = Sequential()
    model.add(Embedding(len_vocab, Config.nlp_ner.EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(Bidirectional(LSTM(Config.nlp_ner.BiLSTM_UNITS // 2, return_sequences=True)))
    crf = CRF(len_chunk_tags, sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model


def train():
    (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
    model = create_model(len(vocab), len(chunk_tags))
    # train model
    model.fit(train_x, train_y, batch_size=16, epochs=Config.nlp_ner.EPOCHS, validation_data=[test_x, test_y])
    model.save(Config.nlp_ner.path_model)


def test():
    with open(Config.nlp_ner.path_config, 'rb') as inp:
        (vocab, chunk_tags) = pickle.load(inp)
    model = create_model(len(vocab), len(chunk_tags))
    predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
    text_EMBED, length = process_data(predict_text, vocab)
    model.load_weights(Config.nlp_ner.path_model)
    raw = model.predict(text_EMBED)[0][-length:]
    result = [np.argmax(row) for row in raw]
    result_tags = [chunk_tags[i] for i in result]

    per, loc, org = '', '', ''

    for s, t in zip(predict_text, result_tags):
        if t in ('B-PER', 'I-PER'):
            per += ' ' + s if (t == 'B-PER') else s
        if t in ('B-ORG', 'I-ORG'):
            org += ' ' + s if (t == 'B-ORG') else s
        if t in ('B-LOC', 'I-LOC'):
            loc += ' ' + s if (t == 'B-LOC') else s

    print(['person:' + per, 'location:' + loc, 'organization:' + org])


def main():
    # print("--")
    train()

# if __name__ == "__main__":
#     train()
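Two details of text_NER.py above are easiest to see on toy data. First, _parse_data expects one character and its tag per line, with a blank line between sentences; a minimal sketch of the same splitting logic on an in-memory string (the two-sentence sample is invented for illustration):

    raw = (
        '中 B-ORG\n共 I-ORG\n'  # sentence 1: one character and one tag per line
        '\n'                    # blank line separates sentences
        '中 B-ORG\n国 I-ORG'    # sentence 2
    )
    data = [[row.split() for row in text.split('\n')]
            for text in raw.strip().split('\n\n')]
    # data == [[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ['国', 'I-ORG']]]

Second, the loop in test() rebuilds entity strings from per-character BIO tags: a B- tag opens a new entity (prefixed with a space as separator) and an I- tag extends the current one. A self-contained sketch with hand-made tags (not real model output):

    text = '周恩来访问埃塞俄比亚'
    tags = ['B-PER', 'I-PER', 'I-PER', 'O', 'O',
            'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC']

    per, loc = '', ''
    for s, t in zip(text, tags):
        if t in ('B-PER', 'I-PER'):
            per += ' ' + s if t == 'B-PER' else s
        if t in ('B-LOC', 'I-LOC'):
            loc += ' ' + s if t == 'B-LOC' else s
    print('person:' + per, 'location:' + loc)  # person: 周恩来 location: 埃塞俄比亚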