From ee7eee8f3190c7b5e0ed2bc7f0fbe7e3898b9457 Mon Sep 17 00:00:00 2001 From: chenyyx Date: Thu, 31 Aug 2017 13:42:03 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=9C=B4=E7=B4=A0=E8=B4=9D?= =?UTF-8?q?=E5=8F=B6=E6=96=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/4.朴素贝叶斯.md | 31 ++++++++++++++++++++++++++++--- src/python/4.NaiveBayes/bayes.py | 22 ++++++++++++++++++++-- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/docs/4.朴素贝叶斯.md b/docs/4.朴素贝叶斯.md index 89d15fa0..324e9759 100644 --- a/docs/4.朴素贝叶斯.md +++ b/docs/4.朴素贝叶斯.md @@ -438,27 +438,50 @@ def trainNB0(trainMatrix, trainCategory): 文件解析及完整的垃圾邮件测试函数 ```python +# 切分文本 def textParse(bigString): + ''' + Desc: + 接收一个大字符串并将其解析为字符串列表 + Args: + bigString -- 大字符串 + Returns: + 去掉少于 2 个字符的字符串,并将所有字符串转换为小写,返回字符串列表 + ''' import re + # 使用正则表达式来切分句子,其中分隔符是除单词、数字外的任意字符串 listOfTokens = re.split(r'\W*', bigString) return [tok.lower() for tok in listOfTokens if len(tok) > 2] def spamTest(): + ''' + Desc: + 对贝叶斯垃圾邮件分类器进行自动化处理。 + Args: + none + Returns: + 对测试集中的每封邮件进行分类,若邮件分类错误,则错误数加 1,最后返回总的错误百分比。 + ''' docList = [] classList = [] fullText = [] for i in range(1, 26): - wordList = textParse(open('email/spam/%d.txt' % i).read()) + # 切分,解析数据,并归类为 1 类别 + wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read()) docList.append(wordList) classList.append(1) - wordList = textParse(open('email/ham/%d.txt' % i).read()) + # 切分,解析数据,并归类为 0 类别 + wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) + # 创建词汇表 vocabList = createVocabList(docList) trainingSet = range(50) testSet = [] + # 随机取 10 个邮件用来测试 for i in range(10): + # random.uniform(x, y) 随机生成一个范围为 x - y 的实数 randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del(trainingSet[randIndex]) @@ -473,7 +496,9 @@ def spamTest(): wordVector = setOfWords2Vec(vocabList, docList[docIndex]) if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 - print 'thr error rate is :', float(errorCount)/len(testSet) + print 'the errorCount is: ', errorCount + print 'the testSet length is :', len(testSet) + print 'the error rate is :', float(errorCount)/len(testSet) ``` > 使用算法: 构建一个完整的程序对一组文档进行分类,将错分的文档输出到屏幕上 diff --git a/src/python/4.NaiveBayes/bayes.py b/src/python/4.NaiveBayes/bayes.py index e79a89c0..5b9aca54 100755 --- a/src/python/4.NaiveBayes/bayes.py +++ b/src/python/4.NaiveBayes/bayes.py @@ -200,19 +200,37 @@ def testingNB(): # 切分文本 def textParse(bigString): + ''' + Desc: + 接收一个大字符串并将其解析为字符串列表 + Args: + bigString -- 大字符串 + Returns: + 去掉少于 2 个字符的字符串,并将所有字符串转换为小写,返回字符串列表 + ''' import re # 使用正则表达式来切分句子,其中分隔符是除单词、数字外的任意字符串 listOfTokens = re.split(r'\W*', bigString) return [tok.lower() for tok in listOfTokens if len(tok) > 2] def spamTest(): + ''' + Desc: + 对贝叶斯垃圾邮件分类器进行自动化处理。 + Args: + none + Returns: + 对测试集中的每封邮件进行分类,若邮件分类错误,则错误数加 1,最后返回总的错误百分比。 + ''' docList = [] classList = [] fullText = [] for i in range(1, 26): + # 切分,解析数据,并归类为 1 类别 wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read()) docList.append(wordList) classList.append(1) + # 切分,解析数据,并归类为 0 类别 wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) @@ -326,6 +344,6 @@ def getTopWords(ny,sf): print item[0] if __name__ == "__main__": - # testingNB() - spamTest() + testingNB() + # spamTest() # laTest()