diff --git a/testData/DT_data.txt b/input/03.DecisionTree/data.txt similarity index 100% rename from testData/DT_data.txt rename to input/03.DecisionTree/data.txt diff --git a/testData/AB_horseColicTest2.txt b/input/07.AdaBoost/horseColicTest2.txt similarity index 100% rename from testData/AB_horseColicTest2.txt rename to input/07.AdaBoost/horseColicTest2.txt diff --git a/testData/AB_horseColicTraining2.txt b/input/07.AdaBoost/horseColicTraining2.txt similarity index 100% rename from testData/AB_horseColicTraining2.txt rename to input/07.AdaBoost/horseColicTraining2.txt diff --git a/testData/RT_bikeSpeedVsIq_test.txt b/input/09.RegTrees/bikeSpeedVsIq_test.txt similarity index 100% rename from testData/RT_bikeSpeedVsIq_test.txt rename to input/09.RegTrees/bikeSpeedVsIq_test.txt diff --git a/testData/RT_bikeSpeedVsIq_train.txt b/input/09.RegTrees/bikeSpeedVsIq_train.txt similarity index 100% rename from testData/RT_bikeSpeedVsIq_train.txt rename to input/09.RegTrees/bikeSpeedVsIq_train.txt diff --git a/testData/RT_data1.txt b/input/09.RegTrees/data1.txt similarity index 100% rename from testData/RT_data1.txt rename to input/09.RegTrees/data1.txt diff --git a/testData/RT_data2.txt b/input/09.RegTrees/data2.txt similarity index 100% rename from testData/RT_data2.txt rename to input/09.RegTrees/data2.txt diff --git a/testData/RT_data3.txt b/input/09.RegTrees/data3.txt similarity index 100% rename from testData/RT_data3.txt rename to input/09.RegTrees/data3.txt diff --git a/testData/RT_data3test.txt b/input/09.RegTrees/data3test.txt similarity index 100% rename from testData/RT_data3test.txt rename to input/09.RegTrees/data3test.txt diff --git a/testData/RT_data4.txt b/input/09.RegTrees/data4.txt similarity index 100% rename from testData/RT_data4.txt rename to input/09.RegTrees/data4.txt diff --git a/testData/RT_sine.txt b/input/09.RegTrees/sine.txt similarity index 100% rename from testData/RT_sine.txt rename to input/09.RegTrees/sine.txt diff --git a/output/03.DecisionTree/tree.pdf b/output/03.DecisionTree/tree.pdf index d3ad800b..2bbd0fb4 100644 Binary files a/output/03.DecisionTree/tree.pdf and b/output/03.DecisionTree/tree.pdf differ diff --git a/src/python/03.DecisionTree/DTSklearn.py b/src/python/03.DecisionTree/DTSklearn.py index 883ad6cf..f1c7d143 100644 --- a/src/python/03.DecisionTree/DTSklearn.py +++ b/src/python/03.DecisionTree/DTSklearn.py @@ -12,7 +12,7 @@ def createDataSet(): ''' 数据读入 ''' data = [] labels = [] - with open("testData/DT_data.txt") as ifile: + with open("input/03.DecisionTree/data.txt") as ifile: for line in ifile: # 特征: 身高 体重 label: 胖瘦 tokens = line.strip().split(' ') diff --git a/src/python/07.AdaBoost/adaboost.py b/src/python/07.AdaBoost/adaboost.py index 676cdff8..bc8cd7db 100644 --- a/src/python/07.AdaBoost/adaboost.py +++ b/src/python/07.AdaBoost/adaboost.py @@ -258,47 +258,47 @@ def plotROC(predStrengths, classLabels): if __name__ == "__main__": - # 我们要将5个点进行分类 - dataArr, labelArr = loadSimpData() - print 'dataArr', dataArr, 'labelArr', labelArr + # # 我们要将5个点进行分类 + # dataArr, labelArr = loadSimpData() + # print 'dataArr', dataArr, 'labelArr', labelArr - # D表示最初值,对1进行均分为5份,平均每一个初始的概率都为0.2 - # D的目的是为了计算错误概率: weightedError = D.T*errArr - D = mat(ones((5, 1))/5) - print 'D=', D.T + # # D表示最初值,对1进行均分为5份,平均每一个初始的概率都为0.2 + # # D的目的是为了计算错误概率: weightedError = D.T*errArr + # D = mat(ones((5, 1))/5) + # print 'D=', D.T - # bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D) - # print 'bestStump=', bestStump - # print 'minError=', minError - # print 'bestClasEst=', bestClasEst.T + # # bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D) + # # print 'bestStump=', bestStump + # # print 'minError=', minError + # # print 'bestClasEst=', bestClasEst.T - # 分类器:weakClassArr - # 历史累计的分类结果集 - weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9) - print '\nweakClassArr=', weakClassArr, '\naggClassEst=', aggClassEst.T + # # 分类器:weakClassArr + # # 历史累计的分类结果集 + # weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9) + # print '\nweakClassArr=', weakClassArr, '\naggClassEst=', aggClassEst.T - """ - 发现: - 分类的权重值:最大的值,为alpha的加和,最小值为-最大值 - 特征的权重值:如果一个值误判的几率越小,那么D的特征权重越少 - """ + # """ + # 发现: + # 分类的权重值:最大的值,为alpha的加和,最小值为-最大值 + # 特征的权重值:如果一个值误判的几率越小,那么D的特征权重越少 + # """ - # 测试数据的分类结果, 观测:aggClassEst分类的最终权重 - print adaClassify([0, 0], weakClassArr).T - print adaClassify([[5, 5], [0, 0]], weakClassArr).T + # # 测试数据的分类结果, 观测:aggClassEst分类的最终权重 + # print adaClassify([0, 0], weakClassArr).T + # print adaClassify([[5, 5], [0, 0]], weakClassArr).T - # # 马疝病数据集 - # # 训练集合 - # dataArr, labelArr = loadDataSet("testData/AB_horseColicTraining2.txt") - # weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40) - # print weakClassArr, '\n-----\n', aggClassEst.T - # # 计算ROC下面的AUC的面积大小 - # plotROC(aggClassEst.T, labelArr) - # # 测试集合 - # dataArrTest, labelArrTest = loadDataSet("testData/AB_horseColicTest2.txt") - # m = shape(dataArrTest)[0] - # predicting10 = adaClassify(dataArrTest, weakClassArr) - # errArr = mat(ones((m, 1))) - # # 测试:计算总样本数,错误样本数,错误率 - # print m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m + # 马疝病数据集 + # 训练集合 + dataArr, labelArr = loadDataSet("input/07.AdaBoost/horseColicTraining2.txt") + weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40) + print weakClassArr, '\n-----\n', aggClassEst.T + # 计算ROC下面的AUC的面积大小 + plotROC(aggClassEst.T, labelArr) + # 测试集合 + dataArrTest, labelArrTest = loadDataSet("input/07.AdaBoost/horseColicTest2.txt") + m = shape(dataArrTest)[0] + predicting10 = adaClassify(dataArrTest, weakClassArr) + errArr = mat(ones((m, 1))) + # 测试:计算总样本数,错误样本数,错误率 + print m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m diff --git a/src/python/09.RegTrees/regTrees.py b/src/python/09.RegTrees/regTrees.py index b621c1cb..fec88808 100644 --- a/src/python/09.RegTrees/regTrees.py +++ b/src/python/09.RegTrees/regTrees.py @@ -290,8 +290,8 @@ if __name__ == "__main__": # print mat0, '\n-----------\n', mat1 # # 回归树 - # myDat = loadDataSet('testData/RT_data1.txt') - # # myDat = loadDataSet('testData/RT_data2.txt') + # myDat = loadDataSet('input/09.RegTrees/data1.txt') + # # myDat = loadDataSet('input/09.RegTrees/data2.txt') # # print 'myDat=', myDat # myMat = mat(myDat) # # print 'myMat=', myMat @@ -299,13 +299,13 @@ if __name__ == "__main__": # print myTree # # 1. 预剪枝就是:提起设置最大误差数和最少元素数 - # myDat = loadDataSet('testData/RT_data3.txt') + # myDat = loadDataSet('input/09.RegTrees/data3.txt') # myMat = mat(myDat) # myTree = createTree(myMat, ops=(0, 1)) # print myTree # # 2. 后剪枝就是:通过测试数据,对预测模型进行合并判断 - # myDatTest = loadDataSet('testData/RT_data3test.txt') + # myDatTest = loadDataSet('input/09.RegTrees/data3test.txt') # myMat2Test = mat(myDatTest) # myFinalTree = prune(myTree, myMat2Test) # print '\n\n\n-------------------' @@ -313,14 +313,14 @@ if __name__ == "__main__": # # -------- # # 模型树求解 - # myDat = loadDataSet('testData/RT_data4.txt') + # myDat = loadDataSet('input/09.RegTrees/data4.txt') # myMat = mat(myDat) # myTree = createTree(myMat, modelLeaf, modelErr) # print myTree # 回归树 VS 模型树 VS 线性回归 - trainMat = mat(loadDataSet('testData/RT_bikeSpeedVsIq_train.txt')) - testMat = mat(loadDataSet('testData/RT_bikeSpeedVsIq_test.txt')) + trainMat = mat(loadDataSet('input/09.RegTrees/bikeSpeedVsIq_train.txt')) + testMat = mat(loadDataSet('input/09.RegTrees/bikeSpeedVsIq_test.txt')) # 回归树 myTree1 = createTree(trainMat, ops=(1, 20)) print myTree1