修改朴素贝叶斯

2026-06-10 06:05:44 +08:00 · 2017-08-31 13:42:03 +08:00
parent aa2e0d869e
commit ee7eee8f31
2 changed files with 48 additions and 5 deletions
--- a/src/python/4.NaiveBayes/bayes.py
+++ b/src/python/4.NaiveBayes/bayes.py
@@ -200,19 +200,37 @@ def testingNB():

 # 切分文本
 def textParse(bigString):
     '''
     Desc:
         Split one large string into a list of lowercase word tokens.
     Args:
         bigString -- the raw text to tokenize
     Returns:
         A list of the tokens longer than 2 characters, each converted to
         lowercase; tokens of 2 or fewer characters (and empty strings) are dropped.
     '''
     import re
     # Split on runs of one or more non-word characters. The pattern must be
     # r'\W+', not r'\W*': a pattern that can match the empty string makes
     # re.split() on Python 3.7+ cut between every character, so every token
     # would be a single character and the length filter would drop them all.
     listOfTokens = re.split(r'\W+', bigString)
     return [tok.lower() for tok in listOfTokens if len(tok) > 2]

 def spamTest():
+    '''
+    Desc:
+        对贝叶斯垃圾邮件分类器进行自动化处理。
+    Args:
+        none
+    Returns:
+        对测试集中的每封邮件进行分类，若邮件分类错误，则错误数加 1，最后返回总的错误百分比。
+    '''
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
+        # 切分，解析数据，并归类为 1 类别
        wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(1)
+        # 切分，解析数据，并归类为 0 类别
        wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
@@ -326,6 +344,6 @@ def getTopWords(ny,sf):
        print item[0]

 if __name__ == "__main__":
-    # testingNB()
-    spamTest()
+    testingNB()
+    # spamTest()
    # laTest()