Merge pull request #130 from chenyyx/master

修改朴素贝叶斯
2026-02-11 14:26:04 +08:00 · 2017-08-31 13:38:46 +08:00
parent aa2e0d869e ee7eee8f31
commit fe4d339c24
2 changed files with 48 additions and 5 deletions
--- a/docs/4.朴素贝叶斯.md
+++ b/docs/4.朴素贝叶斯.md
@@ -438,27 +438,50 @@ def trainNB0(trainMatrix, trainCategory):
 文件解析及完整的垃圾邮件测试函数

 ```python
+# 切分文本
 def textParse(bigString):
+    '''
+    Desc:
+        接收一个大字符串并将其解析为字符串列表
+    Args:
+        bigString -- 大字符串
+    Returns:
+        去掉长度不超过 2 个字符的字符串（仅保留长度大于 2 的词），并将所有字符串转换为小写，返回字符串列表
+    '''
    import re
+    # 使用正则表达式来切分句子，其中分隔符是由非单词字符（即除字母、数字、下划线以外的字符）组成的任意字符串
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

 def spamTest():
+    '''
+    Desc:
+        对贝叶斯垃圾邮件分类器进行自动化处理。
+    Args:
+        none
+    Returns:
+        无返回值。对测试集中的每封邮件进行分类，若邮件分类错误，则错误数加 1，最后打印错误数、测试集大小和错误率。
+    '''
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
-        wordList = textParse(open('email/spam/%d.txt' % i).read())
+        # 切分，解析数据，并归类为 1 类别
+        wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(1)
-        wordList = textParse(open('email/ham/%d.txt' % i).read())
+        # 切分，解析数据，并归类为 0 类别
+        wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
+    # 创建词汇表    
    vocabList = createVocabList(docList)
    trainingSet = range(50)
    testSet = []
+    # 随机取 10 个邮件用来测试
    for i in range(10):
+        # random.uniform(x, y) 随机生成一个介于 x 和 y 之间的实数，用 int() 取整后作为 trainingSet 的下标
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
@@ -473,7 +496,9 @@ def spamTest():
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
-    print 'thr error rate is :', float(errorCount)/len(testSet)
+    print 'the errorCount is: ', errorCount
+    print 'the testSet length is :', len(testSet)
+    print 'the error rate is :', float(errorCount)/len(testSet)
 ```

 > 使用算法: 构建一个完整的程序对一组文档进行分类，将错分的文档输出到屏幕上
--- a/src/python/4.NaiveBayes/bayes.py
+++ b/src/python/4.NaiveBayes/bayes.py
@@ -200,19 +200,37 @@ def testingNB():

 # 切分文本
 def textParse(bigString):
+    '''
+    Desc:
+        接收一个大字符串并将其解析为字符串列表
+    Args:
+        bigString -- 大字符串
+    Returns:
+        去掉长度不超过 2 个字符的字符串（仅保留长度大于 2 的词），并将所有字符串转换为小写，返回字符串列表
+    '''
    import re
    # 使用正则表达式来切分句子，其中分隔符是除单词、数字外的任意字符串
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

 def spamTest():
+    '''
+    Desc:
+        对贝叶斯垃圾邮件分类器进行自动化处理。
+    Args:
+        none
+    Returns:
+        对测试集中的每封邮件进行分类，若邮件分类错误，则错误数加 1，最后返回总的错误百分比。
+    '''
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
+        # 切分，解析数据，并归类为 1 类别
        wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(1)
+        # 切分，解析数据，并归类为 0 类别
        wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
@@ -326,6 +344,6 @@ def getTopWords(ny,sf):
        print item[0]

 if __name__ == "__main__":
-    # testingNB()
-    spamTest()
+    testingNB()
+    # spamTest()
    # laTest()