mirror of
https://github.com/apachecn/ailearning.git
synced 2026-05-08 14:52:28 +08:00
添加训练函数原版
This commit is contained in:
@@ -220,6 +220,44 @@ def setOfWords2Vec(vocabList, inputSet):
|
||||
朴素贝叶斯分类器训练函数
|
||||
|
||||
```python
|
||||
def _trainNB0(trainMatrix, trainCategory):
|
||||
"""
|
||||
训练数据原版
|
||||
:param trainMatrix: 文件单词矩阵 [[1,0,1,1,1....],[],[]...]
|
||||
:param trainCategory: 文件对应的类别[0,1,1,0....],列表长度等于单词矩阵数,其中的1代表对应的文件是侮辱性文件,0代表不是侮辱性矩阵
|
||||
:return:
|
||||
"""
|
||||
# 文件数
|
||||
numTrainDocs = len(trainMatrix)
|
||||
# 单词数
|
||||
numWords = len(trainMatrix[0])
|
||||
# 侮辱性文件的出现概率,即trainCategory中所有的1的个数,
|
||||
# 代表的就是多少个侮辱性文件,与文件的总数相除就得到了侮辱性文件的出现概率
|
||||
pAbusive = sum(trainCategory) / float(numTrainDocs)
|
||||
# 构造单词出现次数列表
|
||||
p0Num = zeros(numWords) # [0,0,0,.....]
|
||||
p1Num = zeros(numWords) # [0,0,0,.....]
|
||||
|
||||
# 整个数据集单词出现总数
|
||||
p0Denom = 0.0
|
||||
p1Denom = 0.0
|
||||
for i in range(numTrainDocs):
|
||||
# 遍历所有的文件,如果是侮辱性文件,就计算此侮辱性文件中出现的侮辱性单词的个数
|
||||
if trainCategory[i] == 1:
|
||||
p1Num += trainMatrix[i] #[0,1,1,....]->[0,1,1,...]
|
||||
p1Denom += sum(trainMatrix[i])
|
||||
else:
|
||||
# 如果不是侮辱性文件,则计算非侮辱性文件中出现的侮辱性单词的个数
|
||||
p0Num += trainMatrix[i]
|
||||
p0Denom += sum(trainMatrix[i])
|
||||
# 类别1,即侮辱性文档的[P(F1|C1),P(F2|C1),P(F3|C1),P(F4|C1),P(F5|C1)....]列表
|
||||
# 即 在1类别下,每个单词出现次数的占比
|
||||
p1Vect = p1Num / p1Denom# [1,2,3,5]/90->[1/90,...]
|
||||
# 类别0,即正常文档的[P(F1|C0),P(F2|C0),P(F3|C0),P(F4|C0),P(F5|C0)....]列表
|
||||
# 即 在0类别下,每个单词出现次数的占比
|
||||
p0Vect = p0Num / p0Denom
|
||||
return p0Vect, p1Vect, pAbusive
|
||||
|
||||
def trainNB0(trainMatrix, trainCategory):
|
||||
"""
|
||||
训练数据优化版本
|
||||
|
||||
Reference in New Issue
Block a user