From 4df7935b5de0ff9609c24c085887a2a89c10f5db Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Fri, 7 Apr 2017 16:01:04 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E6=96=87=E4=BB=B6=E8=B7=AF?= =?UTF-8?q?=E5=BE=84=E5=88=B0input=E7=9B=AE=E5=BD=95=E4=B8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../03.DecisionTree/data.txt | 0 .../07.AdaBoost/horseColicTest2.txt | 0 .../07.AdaBoost/horseColicTraining2.txt | 0 .../09.RegTrees/bikeSpeedVsIq_test.txt | 0 .../09.RegTrees/bikeSpeedVsIq_train.txt | 0 .../09.RegTrees/data1.txt | 0 .../09.RegTrees/data2.txt | 0 .../09.RegTrees/data3.txt | 0 .../09.RegTrees/data3test.txt | 0 .../09.RegTrees/data4.txt | 0 .../RT_sine.txt => input/09.RegTrees/sine.txt | 0 output/03.DecisionTree/tree.pdf | Bin 16596 -> 16550 bytes src/python/03.DecisionTree/DTSklearn.py | 2 +- src/python/07.AdaBoost/adaboost.py | 74 +++++++++--------- src/python/09.RegTrees/regTrees.py | 14 ++-- 15 files changed, 45 insertions(+), 45 deletions(-) rename testData/DT_data.txt => input/03.DecisionTree/data.txt (100%) rename testData/AB_horseColicTest2.txt => input/07.AdaBoost/horseColicTest2.txt (100%) rename testData/AB_horseColicTraining2.txt => input/07.AdaBoost/horseColicTraining2.txt (100%) rename testData/RT_bikeSpeedVsIq_test.txt => input/09.RegTrees/bikeSpeedVsIq_test.txt (100%) rename testData/RT_bikeSpeedVsIq_train.txt => input/09.RegTrees/bikeSpeedVsIq_train.txt (100%) rename testData/RT_data1.txt => input/09.RegTrees/data1.txt (100%) rename testData/RT_data2.txt => input/09.RegTrees/data2.txt (100%) rename testData/RT_data3.txt => input/09.RegTrees/data3.txt (100%) rename testData/RT_data3test.txt => input/09.RegTrees/data3test.txt (100%) rename testData/RT_data4.txt => input/09.RegTrees/data4.txt (100%) rename testData/RT_sine.txt => input/09.RegTrees/sine.txt (100%) diff --git a/testData/DT_data.txt b/input/03.DecisionTree/data.txt similarity index 100% rename from testData/DT_data.txt rename to input/03.DecisionTree/data.txt diff --git a/testData/AB_horseColicTest2.txt b/input/07.AdaBoost/horseColicTest2.txt similarity index 100% rename from testData/AB_horseColicTest2.txt rename to input/07.AdaBoost/horseColicTest2.txt diff --git a/testData/AB_horseColicTraining2.txt b/input/07.AdaBoost/horseColicTraining2.txt similarity index 100% rename from testData/AB_horseColicTraining2.txt rename to input/07.AdaBoost/horseColicTraining2.txt diff --git a/testData/RT_bikeSpeedVsIq_test.txt b/input/09.RegTrees/bikeSpeedVsIq_test.txt similarity index 100% rename from testData/RT_bikeSpeedVsIq_test.txt rename to input/09.RegTrees/bikeSpeedVsIq_test.txt diff --git a/testData/RT_bikeSpeedVsIq_train.txt b/input/09.RegTrees/bikeSpeedVsIq_train.txt similarity index 100% rename from testData/RT_bikeSpeedVsIq_train.txt rename to input/09.RegTrees/bikeSpeedVsIq_train.txt diff --git a/testData/RT_data1.txt b/input/09.RegTrees/data1.txt similarity index 100% rename from testData/RT_data1.txt rename to input/09.RegTrees/data1.txt diff --git a/testData/RT_data2.txt b/input/09.RegTrees/data2.txt similarity index 100% rename from testData/RT_data2.txt rename to input/09.RegTrees/data2.txt diff --git a/testData/RT_data3.txt b/input/09.RegTrees/data3.txt similarity index 100% rename from testData/RT_data3.txt rename to input/09.RegTrees/data3.txt diff --git a/testData/RT_data3test.txt b/input/09.RegTrees/data3test.txt similarity index 100% rename from testData/RT_data3test.txt rename to input/09.RegTrees/data3test.txt diff --git a/testData/RT_data4.txt b/input/09.RegTrees/data4.txt similarity index 100% rename from testData/RT_data4.txt rename to input/09.RegTrees/data4.txt diff --git a/testData/RT_sine.txt b/input/09.RegTrees/sine.txt similarity index 100% rename from testData/RT_sine.txt rename to input/09.RegTrees/sine.txt diff --git a/output/03.DecisionTree/tree.pdf b/output/03.DecisionTree/tree.pdf index d3ad800b572c89857e475b484617f0794f780ab5..2bbd0fb47948a209d2113eb8aa43dfb546f81e21 100644 GIT binary patch delta 1961 zcmaivdpr{g8^<^Ixwpu*3{ey|yVA&6E@3X2m}}U~dYd#Rmvb^x=@lLKTgBNsw}W*k z6)9}VT;kvjFC*rXq|{srIdr_|^ZxOE^nO0?^VjqFJ>Spse4il}u#yGL@?>@p-J$}} zs?Sz^Kb(;XOnv&&gZ<EY8qS<4*qEop(2dGi~3jc2?Fp3z-XMlgIg6 z?z&gYh=Odg(Sf^Ow5LQtLh0pYdd4cIH7TLV((I~9?xr(2`vhpnpZV-G0BQ2zL3djf zTZJ^xRvGely3TLTpR8Gn?uK68M~cgjsy{aGP(6y#P7l@}I)>{lh%vpcR>g2uch3_Z zcZWFNOI|JKYK*WsJX$4btd7O_d7dqEztq!jEVNB3+U=t^XtZ`%Q~HVq&BWkZ@DMql zVC=@1mu1|t)**t=GriwOgQyl!NLa4S{SRfgw??Xt8MVLVCNelOx=tW7DUI3+5q{sf zXqUBQqp_P2QHWpNQXc6opcCU9;WMp(A)Llf7|!)k@?+Pe-Xp4iV0n`3 z1#$_JU8HvD&m(!<2$s|v#lYs98)VlWm%ss&4Y`mE3CXC3H#d4N1adyNn#-mtn2eN~ z66-L}t{XE$R3ars>8D8XYr&_Jmtakqw?*6-<*wE#4J$6Bnge9V0=8O}Ba{Bv5NEj1 zPS&~jyAJ7Vk<*uah8@1;D-PNeGK-ylb-n%-=pOtxTHmsPnze4r`oaqZ2NYarBRs7$ z;?7$RS+{8i{J5DKHcr@*S5-wP-CP-@f{CX4q)BX6o9Kgb44Hc zkuZNP&hxP*7IaCk!Qx0W@uxJHF= z*wXtAJa-n5K(xA!RvwM~R#ht4@BnsRVN^SBn6H$gG6~LX!F7ZlhT0@IZ{pQ6HLBx3 zoAf)6+3t$+0p8|Y_sBz}VQ zTbgIR1~|<631Awy4E}|(jJO57!69jJ!azipmXya&*S231=-vb28xn_Jv^r6Abxi3; z-}au64f?(MxeR#uT(qCfQ)~Llw~p{pqn4DO31s(c&_6HA67NmfQoZ^fDkU6~BV{zzt}lm@AkD+9hI@V8dJdNxmNk9@p+}&O2bZh)|1d;+jAvr~(8w z#_!Ur*Im_V}8}?`XJy79kMkDW?6Rxm5|bi@1YHta);sV!gur#Jy4Mc4N~znEE1?U>W@^nLsslmeHLmug+)4S=D|&PZ&{>2WFAbKO6^4k zF!6V5A{Hem3HP2`7BA&LD0%+!YlTb4OKEhx5=@rhItAsQcl@Q5z<=W?)-+g$5I(tD zGN_LD7OynWPNJhCiA+B^2~w2y^Cl8p8lHd)fd6T|)eKFvt*c`3k>|~d9y2;PkFFDZep`6* zh6$8wY}U%nwyBlEK%uZ)J6pCX6b*s?yC4@K@_#i@7!rfrn<0=$RIdCDZ)HOAH$Stmn??APAC2wAcn z5@HHzqU@nU)?Sj^z32Vo9^P}_-(S!9JfHLYp6^pa-HV18n}bP z?6&Li+36d;_c9`8DC2qUBFxWMOw$WYu6j{cb60onsqH*t$+7pt4L|T|*F}KdzuM8J z-6mo;ncKQ7k-fmJWM<>>T*i=Oq1oE#msKzL?w;Io^ugCR-)1)Q6b_1|!)lAc*o8nG zeO`x74eC;xZ%dfh5qYikK5=6rSy%V{WMf;D7j%PRZ#-#Fi5zun;^-@~u*3Xk2(>hc zTl(%p}r@!;bUL|NXt!9ICZlRjd?NQcWQf9R##xbk=XXAJIFX5e?p;Y%0cx* z7GS!(cS_QflFE!4YL_m&1BL3OC-@it>gT^<9YX}sbMm>W|KHm2Q z#^+wC2-6NrI-l081+e*QFHG?ATuHi9n(@Z*dWuYAjdXM|pTNzK$sKHU!oW(c6V*a& zI<3`{W5c5C4-sgok=npOv+qKa+qGf&^gj#uK>~#^$Iw%gUgcp-4$-8P z?l@8f%Zum?h6P5qWzeEuE*tvBt4!^P=NN;WyDLMah;U8Pbs)7Lilj+=%YFh_EDSFz z2B%WS^GASG_&L07OQ; zk0C@bNMo9=^Qu(wp!aN)XL&?S?F>$B3i&9^EY1 z89JSgJ|UGheyajD5fLTo;mGyqw#%zHt#}5Age?yYPa@9MwV49KM0`KO!}cKsI0-tF*1nhkmeGqTqul5 z2#^ip&tU5)o9{Me#*|%*YI~zCv)Up)-|T2MsJm);nBvfQrZt5>P%ewYh{{Hdd^#7{ zwZ`O5b+kIq7LKzZDf{=VCSl2*+$TC4d2EW;HeLf@$0u`SPwy zKFaB~u*k7Ek7G)(SR%sp@j5{a$aw_MxiVzHHn6f{@)d^;V( zBc%n)yd-6?w=6>W9&8QQNq%~>?fE>%$6rncr=a2ykzFuYN?XMyrHHxb0Hq(qi%DRV z6`CG^&CrUSDXMN`(2`47G!M+Q--rvCR8s{b2A^%WXZA`9xdd=|ueVf~`41gZNh2(~JL!w;W&}0-IM#j6N@hCirL?*-BT|L~8 mt{4JP=l?1YCkTN63=VM(3JD7$djR1W7#atJLXE9VfPVoKXkfSi diff --git a/src/python/03.DecisionTree/DTSklearn.py b/src/python/03.DecisionTree/DTSklearn.py index 883ad6cf..f1c7d143 100644 --- a/src/python/03.DecisionTree/DTSklearn.py +++ b/src/python/03.DecisionTree/DTSklearn.py @@ -12,7 +12,7 @@ def createDataSet(): ''' 数据读入 ''' data = [] labels = [] - with open("testData/DT_data.txt") as ifile: + with open("input/03.DecisionTree/data.txt") as ifile: for line in ifile: # 特征: 身高 体重 label: 胖瘦 tokens = line.strip().split(' ') diff --git a/src/python/07.AdaBoost/adaboost.py b/src/python/07.AdaBoost/adaboost.py index 676cdff8..bc8cd7db 100644 --- a/src/python/07.AdaBoost/adaboost.py +++ b/src/python/07.AdaBoost/adaboost.py @@ -258,47 +258,47 @@ def plotROC(predStrengths, classLabels): if __name__ == "__main__": - # 我们要将5个点进行分类 - dataArr, labelArr = loadSimpData() - print 'dataArr', dataArr, 'labelArr', labelArr + # # 我们要将5个点进行分类 + # dataArr, labelArr = loadSimpData() + # print 'dataArr', dataArr, 'labelArr', labelArr - # D表示最初值,对1进行均分为5份,平均每一个初始的概率都为0.2 - # D的目的是为了计算错误概率: weightedError = D.T*errArr - D = mat(ones((5, 1))/5) - print 'D=', D.T + # # D表示最初值,对1进行均分为5份,平均每一个初始的概率都为0.2 + # # D的目的是为了计算错误概率: weightedError = D.T*errArr + # D = mat(ones((5, 1))/5) + # print 'D=', D.T - # bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D) - # print 'bestStump=', bestStump - # print 'minError=', minError - # print 'bestClasEst=', bestClasEst.T + # # bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D) + # # print 'bestStump=', bestStump + # # print 'minError=', minError + # # print 'bestClasEst=', bestClasEst.T - # 分类器:weakClassArr - # 历史累计的分类结果集 - weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9) - print '\nweakClassArr=', weakClassArr, '\naggClassEst=', aggClassEst.T + # # 分类器:weakClassArr + # # 历史累计的分类结果集 + # weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9) + # print '\nweakClassArr=', weakClassArr, '\naggClassEst=', aggClassEst.T - """ - 发现: - 分类的权重值:最大的值,为alpha的加和,最小值为-最大值 - 特征的权重值:如果一个值误判的几率越小,那么D的特征权重越少 - """ + # """ + # 发现: + # 分类的权重值:最大的值,为alpha的加和,最小值为-最大值 + # 特征的权重值:如果一个值误判的几率越小,那么D的特征权重越少 + # """ - # 测试数据的分类结果, 观测:aggClassEst分类的最终权重 - print adaClassify([0, 0], weakClassArr).T - print adaClassify([[5, 5], [0, 0]], weakClassArr).T + # # 测试数据的分类结果, 观测:aggClassEst分类的最终权重 + # print adaClassify([0, 0], weakClassArr).T + # print adaClassify([[5, 5], [0, 0]], weakClassArr).T - # # 马疝病数据集 - # # 训练集合 - # dataArr, labelArr = loadDataSet("testData/AB_horseColicTraining2.txt") - # weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40) - # print weakClassArr, '\n-----\n', aggClassEst.T - # # 计算ROC下面的AUC的面积大小 - # plotROC(aggClassEst.T, labelArr) - # # 测试集合 - # dataArrTest, labelArrTest = loadDataSet("testData/AB_horseColicTest2.txt") - # m = shape(dataArrTest)[0] - # predicting10 = adaClassify(dataArrTest, weakClassArr) - # errArr = mat(ones((m, 1))) - # # 测试:计算总样本数,错误样本数,错误率 - # print m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m + # 马疝病数据集 + # 训练集合 + dataArr, labelArr = loadDataSet("input/07.AdaBoost/horseColicTraining2.txt") + weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40) + print weakClassArr, '\n-----\n', aggClassEst.T + # 计算ROC下面的AUC的面积大小 + plotROC(aggClassEst.T, labelArr) + # 测试集合 + dataArrTest, labelArrTest = loadDataSet("input/07.AdaBoost/horseColicTest2.txt") + m = shape(dataArrTest)[0] + predicting10 = adaClassify(dataArrTest, weakClassArr) + errArr = mat(ones((m, 1))) + # 测试:计算总样本数,错误样本数,错误率 + print m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m diff --git a/src/python/09.RegTrees/regTrees.py b/src/python/09.RegTrees/regTrees.py index b621c1cb..fec88808 100644 --- a/src/python/09.RegTrees/regTrees.py +++ b/src/python/09.RegTrees/regTrees.py @@ -290,8 +290,8 @@ if __name__ == "__main__": # print mat0, '\n-----------\n', mat1 # # 回归树 - # myDat = loadDataSet('testData/RT_data1.txt') - # # myDat = loadDataSet('testData/RT_data2.txt') + # myDat = loadDataSet('input/09.RegTrees/data1.txt') + # # myDat = loadDataSet('input/09.RegTrees/data2.txt') # # print 'myDat=', myDat # myMat = mat(myDat) # # print 'myMat=', myMat @@ -299,13 +299,13 @@ if __name__ == "__main__": # print myTree # # 1. 预剪枝就是:提起设置最大误差数和最少元素数 - # myDat = loadDataSet('testData/RT_data3.txt') + # myDat = loadDataSet('input/09.RegTrees/data3.txt') # myMat = mat(myDat) # myTree = createTree(myMat, ops=(0, 1)) # print myTree # # 2. 后剪枝就是:通过测试数据,对预测模型进行合并判断 - # myDatTest = loadDataSet('testData/RT_data3test.txt') + # myDatTest = loadDataSet('input/09.RegTrees/data3test.txt') # myMat2Test = mat(myDatTest) # myFinalTree = prune(myTree, myMat2Test) # print '\n\n\n-------------------' @@ -313,14 +313,14 @@ if __name__ == "__main__": # # -------- # # 模型树求解 - # myDat = loadDataSet('testData/RT_data4.txt') + # myDat = loadDataSet('input/09.RegTrees/data4.txt') # myMat = mat(myDat) # myTree = createTree(myMat, modelLeaf, modelErr) # print myTree # 回归树 VS 模型树 VS 线性回归 - trainMat = mat(loadDataSet('testData/RT_bikeSpeedVsIq_train.txt')) - testMat = mat(loadDataSet('testData/RT_bikeSpeedVsIq_test.txt')) + trainMat = mat(loadDataSet('input/09.RegTrees/bikeSpeedVsIq_train.txt')) + testMat = mat(loadDataSet('input/09.RegTrees/bikeSpeedVsIq_test.txt')) # 回归树 myTree1 = createTree(trainMat, ops=(1, 20)) print myTree1