mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-13 15:26:28 +08:00
更新文件路径到input目录下
This commit is contained in:
@@ -12,7 +12,7 @@ def createDataSet():
|
||||
''' 数据读入 '''
|
||||
data = []
|
||||
labels = []
|
||||
with open("testData/DT_data.txt") as ifile:
|
||||
with open("input/03.DecisionTree/data.txt") as ifile:
|
||||
for line in ifile:
|
||||
# 特征: 身高 体重 label: 胖瘦
|
||||
tokens = line.strip().split(' ')
|
||||
|
||||
@@ -258,47 +258,47 @@ def plotROC(predStrengths, classLabels):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 我们要将5个点进行分类
|
||||
dataArr, labelArr = loadSimpData()
|
||||
print 'dataArr', dataArr, 'labelArr', labelArr
|
||||
# # 我们要将5个点进行分类
|
||||
# dataArr, labelArr = loadSimpData()
|
||||
# print 'dataArr', dataArr, 'labelArr', labelArr
|
||||
|
||||
# D表示最初值,对1进行均分为5份,平均每一个初始的概率都为0.2
|
||||
# D的目的是为了计算错误概率: weightedError = D.T*errArr
|
||||
D = mat(ones((5, 1))/5)
|
||||
print 'D=', D.T
|
||||
# # D表示最初值,对1进行均分为5份,平均每一个初始的概率都为0.2
|
||||
# # D的目的是为了计算错误概率: weightedError = D.T*errArr
|
||||
# D = mat(ones((5, 1))/5)
|
||||
# print 'D=', D.T
|
||||
|
||||
# bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
|
||||
# print 'bestStump=', bestStump
|
||||
# print 'minError=', minError
|
||||
# print 'bestClasEst=', bestClasEst.T
|
||||
# # bestStump, minError, bestClasEst = buildStump(dataArr, labelArr, D)
|
||||
# # print 'bestStump=', bestStump
|
||||
# # print 'minError=', minError
|
||||
# # print 'bestClasEst=', bestClasEst.T
|
||||
|
||||
|
||||
# 分类器:weakClassArr
|
||||
# 历史累计的分类结果集
|
||||
weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9)
|
||||
print '\nweakClassArr=', weakClassArr, '\naggClassEst=', aggClassEst.T
|
||||
# # 分类器:weakClassArr
|
||||
# # 历史累计的分类结果集
|
||||
# weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 9)
|
||||
# print '\nweakClassArr=', weakClassArr, '\naggClassEst=', aggClassEst.T
|
||||
|
||||
"""
|
||||
发现:
|
||||
分类的权重值:最大的值,为alpha的加和,最小值为-最大值
|
||||
特征的权重值:如果一个值误判的几率越小,那么D的特征权重越少
|
||||
"""
|
||||
# """
|
||||
# 发现:
|
||||
# 分类的权重值:最大的值,为alpha的加和,最小值为-最大值
|
||||
# 特征的权重值:如果一个值误判的几率越小,那么D的特征权重越少
|
||||
# """
|
||||
|
||||
# 测试数据的分类结果, 观测:aggClassEst分类的最终权重
|
||||
print adaClassify([0, 0], weakClassArr).T
|
||||
print adaClassify([[5, 5], [0, 0]], weakClassArr).T
|
||||
# # 测试数据的分类结果, 观测:aggClassEst分类的最终权重
|
||||
# print adaClassify([0, 0], weakClassArr).T
|
||||
# print adaClassify([[5, 5], [0, 0]], weakClassArr).T
|
||||
|
||||
# # 马疝病数据集
|
||||
# # 训练集合
|
||||
# dataArr, labelArr = loadDataSet("testData/AB_horseColicTraining2.txt")
|
||||
# weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40)
|
||||
# print weakClassArr, '\n-----\n', aggClassEst.T
|
||||
# # 计算ROC下面的AUC的面积大小
|
||||
# plotROC(aggClassEst.T, labelArr)
|
||||
# # 测试集合
|
||||
# dataArrTest, labelArrTest = loadDataSet("testData/AB_horseColicTest2.txt")
|
||||
# m = shape(dataArrTest)[0]
|
||||
# predicting10 = adaClassify(dataArrTest, weakClassArr)
|
||||
# errArr = mat(ones((m, 1)))
|
||||
# # 测试:计算总样本数,错误样本数,错误率
|
||||
# print m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m
|
||||
# 马疝病数据集
|
||||
# 训练集合
|
||||
dataArr, labelArr = loadDataSet("input/07.AdaBoost/horseColicTraining2.txt")
|
||||
weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 40)
|
||||
print weakClassArr, '\n-----\n', aggClassEst.T
|
||||
# 计算ROC下面的AUC的面积大小
|
||||
plotROC(aggClassEst.T, labelArr)
|
||||
# 测试集合
|
||||
dataArrTest, labelArrTest = loadDataSet("input/07.AdaBoost/horseColicTest2.txt")
|
||||
m = shape(dataArrTest)[0]
|
||||
predicting10 = adaClassify(dataArrTest, weakClassArr)
|
||||
errArr = mat(ones((m, 1)))
|
||||
# 测试:计算总样本数,错误样本数,错误率
|
||||
print m, errArr[predicting10 != mat(labelArrTest).T].sum(), errArr[predicting10 != mat(labelArrTest).T].sum()/m
|
||||
|
||||
@@ -290,8 +290,8 @@ if __name__ == "__main__":
|
||||
# print mat0, '\n-----------\n', mat1
|
||||
|
||||
# # 回归树
|
||||
# myDat = loadDataSet('testData/RT_data1.txt')
|
||||
# # myDat = loadDataSet('testData/RT_data2.txt')
|
||||
# myDat = loadDataSet('input/09.RegTrees/data1.txt')
|
||||
# # myDat = loadDataSet('input/09.RegTrees/data2.txt')
|
||||
# # print 'myDat=', myDat
|
||||
# myMat = mat(myDat)
|
||||
# # print 'myMat=', myMat
|
||||
@@ -299,13 +299,13 @@ if __name__ == "__main__":
|
||||
# print myTree
|
||||
|
||||
# # 1. 预剪枝就是:提前设置最大误差数和最少元素数
|
||||
# myDat = loadDataSet('testData/RT_data3.txt')
|
||||
# myDat = loadDataSet('input/09.RegTrees/data3.txt')
|
||||
# myMat = mat(myDat)
|
||||
# myTree = createTree(myMat, ops=(0, 1))
|
||||
# print myTree
|
||||
|
||||
# # 2. 后剪枝就是:通过测试数据,对预测模型进行合并判断
|
||||
# myDatTest = loadDataSet('testData/RT_data3test.txt')
|
||||
# myDatTest = loadDataSet('input/09.RegTrees/data3test.txt')
|
||||
# myMat2Test = mat(myDatTest)
|
||||
# myFinalTree = prune(myTree, myMat2Test)
|
||||
# print '\n\n\n-------------------'
|
||||
@@ -313,14 +313,14 @@ if __name__ == "__main__":
|
||||
|
||||
# # --------
|
||||
# # 模型树求解
|
||||
# myDat = loadDataSet('testData/RT_data4.txt')
|
||||
# myDat = loadDataSet('input/09.RegTrees/data4.txt')
|
||||
# myMat = mat(myDat)
|
||||
# myTree = createTree(myMat, modelLeaf, modelErr)
|
||||
# print myTree
|
||||
|
||||
# 回归树 VS 模型树 VS 线性回归
|
||||
trainMat = mat(loadDataSet('testData/RT_bikeSpeedVsIq_train.txt'))
|
||||
testMat = mat(loadDataSet('testData/RT_bikeSpeedVsIq_test.txt'))
|
||||
trainMat = mat(loadDataSet('input/09.RegTrees/bikeSpeedVsIq_train.txt'))
|
||||
testMat = mat(loadDataSet('input/09.RegTrees/bikeSpeedVsIq_test.txt'))
|
||||
# 回归树
|
||||
myTree1 = createTree(trainMat, ops=(1, 20))
|
||||
print myTree1
|
||||
|
||||
Reference in New Issue
Block a user