#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Naive Bayes text classification on a tiny hard-coded corpus.

The module builds a vocabulary from tokenised posts, converts each post to a
word vector, estimates per-class word probabilities and classifies new posts
as abusive (1) or not abusive (0).

The code runs unchanged on Python 2 and Python 3: every ``print`` call passes
exactly one pre-formatted string, which both versions render identically.
"""
# NOTE: the star import is kept for backward compatibility with callers that
# rely on the re-exported names.  It also means that every ``sum(...)`` call
# below resolves to ``numpy.sum``, not to the builtin.
from numpy import *


def loadDataSet():
    """
    Create the demo data set.

    :return: ``(postingList, classVec)`` where ``postingList`` is a list of
             tokenised posts (lists of words) and ``classVec`` holds the label
             of each post (1 is abusive, 0 not)
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec


def createVocabList(dataSet):
    """
    Collect every distinct word that occurs in the data set.

    :param dataSet: iterable of documents, each an iterable of words
    :return: list of the unique words; the order is whatever the underlying
             set yields and must not be relied upon
    """
    vocabSet = set()
    for document in dataSet:
        # In-place union avoids building a fresh set per document.
        vocabSet.update(document)
    return list(vocabSet)


def _vocabIndex(vocabList):
    """Map each vocabulary word to the position of its first occurrence."""
    positions = {}
    for position, word in enumerate(vocabList):
        # setdefault keeps the first position, matching list.index().
        positions.setdefault(word, position)
    return positions


def setOfWords2Vec(vocabList, inputSet):
    """
    Build a set-of-words vector: 1 where a vocabulary word occurs in the
    document, 0 elsewhere.  Words missing from the vocabulary are reported
    on stdout and otherwise ignored.

    :param vocabList: list of vocabulary words
    :param inputSet: words of the document to convert
    :return: list of 0/1 flags with the same length as ``vocabList``
    """
    # One dict lookup per word instead of two linear scans of vocabList.
    positions = _vocabIndex(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in positions:
            returnVec[positions[word]] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec


def _countWordsPerClass(trainMatrix, trainCategory, initCount, initDenom):
    """Accumulate per-word counts and total word counts for class 0 and 1.

    Rows labelled 1 go to class 1; rows with any other label go to class 0.
    ``initCount`` and ``initDenom`` are the starting values of the per-word
    counters and of the total counters respectively.

    Returns ``(p0Num, p0Denom, p1Num, p1Denom)``.
    """
    numWords = len(trainMatrix[0])
    p0Num = zeros(numWords) + initCount
    p1Num = zeros(numWords) + initCount
    p0Denom = initDenom
    p1Denom = initDenom
    for i, wordVec in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += wordVec
            p1Denom += sum(wordVec)
        else:
            p0Num += wordVec
            p0Denom += sum(wordVec)
    return p0Num, p0Denom, p1Num, p1Denom


def _trainNB0(trainMatrix, trainCategory):
    """
    Train the classifier, original (unsmoothed) version.

    Counters start at zero, so a word never seen in a class gets probability
    0, and a class that contributes no words leads to a division by zero
    inside numpy.

    :param trainMatrix: matrix of document word vectors, one row per document
    :param trainCategory: class label of each document (1 is abusive)
    :return: ``(p0Vect, p1Vect, pAbusive)`` -- the per-word probabilities
             P(Fi|C0) and P(Fi|C1), and the fraction of documents labelled 1
    """
    # Probability that a document is abusive.
    pAbusive = sum(trainCategory) / float(len(trainMatrix))
    p0Num, p0Denom, p1Num, p1Denom = _countWordsPerClass(
        trainMatrix, trainCategory, 0.0, 0.0)
    # Class 1 (abusive): [P(F1|C1), P(F2|C1), ...]
    p1Vect = p1Num / p1Denom
    # Class 0 (normal): [P(F1|C0), P(F2|C0), ...]
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive


def trainNB0(trainMatrix, trainCategory):
    """
    Train the classifier, improved version.

    Per-word counters start at 1 and the totals at 2.0 so that no word has
    zero probability, and the probabilities are returned as logarithms so
    that they can be summed instead of multiplied.

    :param trainMatrix: matrix of document word vectors, one row per document
    :param trainCategory: class label of each document (1 is abusive)
    :return: ``(p0Vect, p1Vect, pAbusive)`` -- the per-word log-probabilities
             log(P(Fi|C0)) and log(P(Fi|C1)), and the fraction of documents
             labelled 1
    """
    # Probability that a document is abusive.
    pAbusive = sum(trainCategory) / float(len(trainMatrix))
    # The initial denominator 2.0 is a tunable smoothing constant.
    p0Num, p0Denom, p1Num, p1Denom = _countWordsPerClass(
        trainMatrix, trainCategory, 1.0, 2.0)
    # Class 1 (abusive): [log(P(F1|C1)), log(P(F2|C1)), ...]
    p1Vect = log(p1Num / p1Denom)
    # Class 0 (normal): [log(P(F1|C0)), log(P(F2|C0)), ...]
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    Classify a word vector with the trained model.

    :param vec2Classify: word vector of the document to classify
    :param p0Vec: class 0 (normal documents):
                  [log(P(F1|C0)), log(P(F2|C0)), ...]
    :param p1Vec: class 1 (abusive documents):
                  [log(P(F1|C1)), log(P(F2|C1)), ...]
    :param pClass1: probability of class 1, i.e. of an abusive document
    :return: 1 if the class-1 score is strictly greater, otherwise 0
    """
    # Score: log(P(F1|C)) + log(P(F2|C)) + ... + log(P(Fn|C)) + log(P(C))
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def bagOfWords2VecMN(vocabList, inputSet):
    """
    Build a bag-of-words vector: each entry counts how often the matching
    vocabulary word occurs in the document.  Words missing from the
    vocabulary are silently ignored.

    :param vocabList: list of vocabulary words
    :param inputSet: words of the document to convert
    :return: list of occurrence counts with the same length as ``vocabList``
    """
    positions = _vocabIndex(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in positions:
            returnVec[positions[word]] += 1
    return returnVec


def testingNB():
    """
    Exercise the Naive Bayes classifier on the demo data set and print the
    predicted class of two sample posts.
    """
    # 1. Load the data set.
    listOPosts, listClasses = loadDataSet()
    # 2. Build the vocabulary.
    myVocabList = createVocabList(listOPosts)
    # 3. Convert every post into a word-presence vector.
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    # 4. Train.
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # 5. Classify the sample posts.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        label = classifyNB(thisDoc, p0V, p1V, pAb)
        # A single joined string prints the same on Python 2 and Python 3.
        print(' '.join([str(testEntry), 'classified as: ', str(label)]))


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    testingNB()