Merge pull request #130 from chenyyx/master

修改朴素贝叶斯
This commit is contained in:
Joy yx
2017-08-31 13:38:46 +08:00
committed by GitHub
2 changed files with 48 additions and 5 deletions

View File

@@ -438,27 +438,50 @@ def trainNB0(trainMatrix, trainCategory):
文件解析及完整的垃圾邮件测试函数
```python
# 切分文本
def textParse(bigString):
    '''
    Desc:
        Split one large string into a list of lowercase word tokens.
    Args:
        bigString -- the raw text to tokenize
    Returns:
        A list holding every token longer than 2 characters, converted to
        lowercase, in the order the tokens appear in bigString.
    '''
    import re
    # Split on runs of one or more non-word characters. The pattern has to be
    # r'\W+' rather than r'\W*': a pattern that can match the empty string
    # makes re.split() on Python 3.7+ also split between every pair of
    # adjacent characters, so every token would be a single character and the
    # length filter below would throw all of them away.
    listOfTokens = re.split(r'\W+', bigString)
    # Discard empty strings and very short tokens (1-2 characters).
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
'''
Desc:
对贝叶斯垃圾邮件分类器进行自动化处理。
Args:
none
Returns:
对测试集中的每封邮件进行分类,若邮件分类错误,则错误数加 1最后返回总的错误百分比。
'''
docList = []
classList = []
fullText = []
for i in range(1, 26):
wordList = textParse(open('email/spam/%d.txt' % i).read())
# 切分,解析数据,并归类为 1 类别
wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read())
docList.append(wordList)
classList.append(1)
wordList = textParse(open('email/ham/%d.txt' % i).read())
# 切分,解析数据,并归类为 0 类别
wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
# 创建词汇表
vocabList = createVocabList(docList)
trainingSet = range(50)
testSet = []
# 随机取 10 个邮件用来测试
for i in range(10):
# random.uniform(x, y) 随机生成一个范围为 x - y 的实数
randIndex = int(random.uniform(0, len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
@@ -473,7 +496,9 @@ def spamTest():
wordVector = setOfWords2Vec(vocabList, docList[docIndex])
if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
errorCount += 1
print 'thr error rate is :', float(errorCount)/len(testSet)
print 'the errorCount is: ', errorCount
print 'the testSet length is :', len(testSet)
print 'the error rate is :', float(errorCount)/len(testSet)
```
> 使用算法: 构建一个完整的程序对一组文档进行分类,将错分的文档输出到屏幕上

View File

@@ -200,19 +200,37 @@ def testingNB():
# 切分文本
def textParse(bigString):
    '''
    Desc:
        Tokenize a large string into a list of lowercase words.
    Args:
        bigString -- the raw text to tokenize
    Returns:
        A list of the tokens whose length is greater than 2, each converted
        to lowercase, kept in their original order.
    '''
    import re
    # Use r'\W+' (one or more non-word characters) as the separator. The
    # earlier r'\W*' can match the empty string, and on Python 3.7+
    # re.split() then splits at every position, producing only
    # single-character tokens that the length filter removes, so the
    # function would always return an empty list.
    listOfTokens = re.split(r'\W+', bigString)
    # Keep only tokens with more than 2 characters; this also removes the
    # empty strings re.split() yields at the edges of the input.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
'''
Desc:
对贝叶斯垃圾邮件分类器进行自动化处理。
Args:
none
Returns:
对测试集中的每封邮件进行分类,若邮件分类错误,则错误数加 1最后返回总的错误百分比。
'''
docList = []
classList = []
fullText = []
for i in range(1, 26):
# 切分,解析数据,并归类为 1 类别
wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read())
docList.append(wordList)
classList.append(1)
# 切分,解析数据,并归类为 0 类别
wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
@@ -326,6 +344,6 @@ def getTopWords(ny,sf):
print item[0]
if __name__ == "__main__":
# testingNB()
spamTest()
testingNB()
# spamTest()
# laTest()