mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-11 14:26:04 +08:00
@@ -438,27 +438,50 @@ def trainNB0(trainMatrix, trainCategory):
|
||||
文件解析及完整的垃圾邮件测试函数
|
||||
|
||||
```python
|
||||
# 切分文本
|
||||
def textParse(bigString):
    '''
    Desc:
        Split one large string into a list of lower-cased word tokens.
    Args:
        bigString -- the raw text to tokenize
    Returns:
        A list of the tokens that are longer than 2 characters, each
        converted to lower case. Tokens of length 0, 1 or 2 are dropped.
    '''
    import re
    # Split on runs of one or more characters that are not letters, digits
    # or underscores. The pattern must be r'\W+' and not r'\W*': a pattern
    # that can match the empty string makes re.split() on Python 3.7+ cut
    # between every character, so every token would be a single character
    # and the length filter below would discard all of them.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
|
||||
|
||||
def spamTest():
|
||||
'''
|
||||
Desc:
|
||||
对贝叶斯垃圾邮件分类器进行自动化处理。
|
||||
Args:
|
||||
none
|
||||
Returns:
|
||||
对测试集中的每封邮件进行分类,若邮件分类错误,则错误数加 1,最后返回总的错误百分比。
|
||||
'''
|
||||
docList = []
|
||||
classList = []
|
||||
fullText = []
|
||||
for i in range(1, 26):
|
||||
wordList = textParse(open('email/spam/%d.txt' % i).read())
|
||||
# 切分,解析数据,并归类为 1 类别
|
||||
wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read())
|
||||
docList.append(wordList)
|
||||
classList.append(1)
|
||||
wordList = textParse(open('email/ham/%d.txt' % i).read())
|
||||
# 切分,解析数据,并归类为 0 类别
|
||||
wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read())
|
||||
docList.append(wordList)
|
||||
fullText.extend(wordList)
|
||||
classList.append(0)
|
||||
# 创建词汇表
|
||||
vocabList = createVocabList(docList)
|
||||
trainingSet = range(50)
|
||||
testSet = []
|
||||
# 随机取 10 个邮件用来测试
|
||||
for i in range(10):
|
||||
# random.uniform(x, y) 随机生成一个范围为 x - y 的实数
|
||||
randIndex = int(random.uniform(0, len(trainingSet)))
|
||||
testSet.append(trainingSet[randIndex])
|
||||
del(trainingSet[randIndex])
|
||||
@@ -473,7 +496,9 @@ def spamTest():
|
||||
wordVector = setOfWords2Vec(vocabList, docList[docIndex])
|
||||
if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
|
||||
errorCount += 1
|
||||
print 'thr error rate is :', float(errorCount)/len(testSet)
|
||||
print 'the errorCount is: ', errorCount
|
||||
print 'the testSet length is :', len(testSet)
|
||||
print 'the error rate is :', float(errorCount)/len(testSet)
|
||||
```
|
||||
|
||||
> 使用算法: 构建一个完整的程序对一组文档进行分类,将错分的文档输出到屏幕上
|
||||
|
||||
@@ -200,19 +200,37 @@ def testingNB():
|
||||
|
||||
# 切分文本
|
||||
def textParse(bigString):
    '''
    Desc:
        Split one large string into a list of lower-cased word tokens.
    Args:
        bigString -- the raw text to tokenize
    Returns:
        A list of the tokens that are longer than 2 characters, each
        converted to lower case. Tokens of length 0, 1 or 2 are dropped.
    '''
    import re
    # Split on runs of one or more characters that are not letters, digits
    # or underscores. The pattern must be r'\W+' and not r'\W*': a pattern
    # that can match the empty string makes re.split() on Python 3.7+ cut
    # between every character, so every token would be a single character
    # and the length filter below would discard all of them.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
|
||||
|
||||
def spamTest():
|
||||
'''
|
||||
Desc:
|
||||
对贝叶斯垃圾邮件分类器进行自动化处理。
|
||||
Args:
|
||||
none
|
||||
Returns:
|
||||
对测试集中的每封邮件进行分类,若邮件分类错误,则错误数加 1,最后返回总的错误百分比。
|
||||
'''
|
||||
docList = []
|
||||
classList = []
|
||||
fullText = []
|
||||
for i in range(1, 26):
|
||||
# 切分,解析数据,并归类为 1 类别
|
||||
wordList = textParse(open('input/4.NaiveBayes/email/spam/%d.txt' % i).read())
|
||||
docList.append(wordList)
|
||||
classList.append(1)
|
||||
# 切分,解析数据,并归类为 0 类别
|
||||
wordList = textParse(open('input/4.NaiveBayes/email/ham/%d.txt' % i).read())
|
||||
docList.append(wordList)
|
||||
fullText.extend(wordList)
|
||||
@@ -326,6 +344,6 @@ def getTopWords(ny,sf):
|
||||
print item[0]
|
||||
|
||||
if __name__ == "__main__":
|
||||
# testingNB()
|
||||
spamTest()
|
||||
testingNB()
|
||||
# spamTest()
|
||||
# laTest()
|
||||
|
||||
Reference in New Issue
Block a user