From f035b5948b793f0741749bbc5194d0ba0cf98b3e Mon Sep 17 00:00:00 2001
From: chenyyx
Date: Thu, 13 Jul 2017 15:09:17 +0800
Subject: [PATCH] Add a sklearn demo for regression and a sklearn demo for
 tree regression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../regression.py                                  |   2 +-
 .../sklearn-regression-demo.py                     | 191 ++++++++++++++++++
 src/python/9.RegTrees/regTrees.py                  |  50 +++--
 .../9.RegTrees/sklearn-regressTree-demo.py         |  58 ++++++
 4 files changed, 281 insertions(+), 20 deletions(-)
 rename src/python/{8.Predictive numerical data regression => 8.PredictiveNumericalDataRegression}/regression.py (99%)
 create mode 100644 src/python/8.PredictiveNumericalDataRegression/sklearn-regression-demo.py
 create mode 100644 src/python/9.RegTrees/sklearn-regressTree-demo.py

diff --git a/src/python/8.Predictive numerical data regression/regression.py b/src/python/8.PredictiveNumericalDataRegression/regression.py
similarity index 99%
rename from src/python/8.Predictive numerical data regression/regression.py
rename to src/python/8.PredictiveNumericalDataRegression/regression.py
index 59304c5a..16f2bd00 100644
--- a/src/python/8.Predictive numerical data regression/regression.py
+++ b/src/python/8.PredictiveNumericalDataRegression/regression.py
@@ -66,7 +66,7 @@ def standRegres(xArr,yArr):
     # the formula from the book: solve for the optimal w
     ws = xTx.I * (xMat.T*yMat)
     return ws
-
+
 # locally weighted linear regression
 def lwlr(testPoint,xArr,yArr,k=1.0):
     '''
diff --git a/src/python/8.PredictiveNumericalDataRegression/sklearn-regression-demo.py b/src/python/8.PredictiveNumericalDataRegression/sklearn-regression-demo.py
new file mode 100644
index 00000000..ffde9b74
--- /dev/null
+++ b/src/python/8.PredictiveNumericalDataRegression/sklearn-regression-demo.py
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+# coding:utf8
+
+'''
+Created on Jan 8, 2011
+Updated on 2017-05-18
+@author: Peter Harrington/小瑶
+Machine Learning in Action, update repo: https://github.com/apachecn/MachineLearning
+'''
+
+
+# Isotonic regression
+print(__doc__)
+
+# Author: Nelle Varoquaux
+#         Alexandre Gramfort
+# License: BSD
+
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.collections import LineCollection
+
+from sklearn.linear_model import LinearRegression
+from sklearn.isotonic import IsotonicRegression
+from sklearn.utils import check_random_state
+
+# generate noisy data with an increasing trend
+n = 100
+x = np.arange(n)
+rs = check_random_state(0)
+y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
+
+ir = IsotonicRegression()
+
+y_ = ir.fit_transform(x, y)
+
+lr = LinearRegression()
+lr.fit(x[:, np.newaxis], y)  # x must be 2-D for LinearRegression
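+
+# Editor's sketch (added, not part of the original demo): the isotonic fit is
+# nondecreasing by construction, and the fitted model can also score unseen
+# inputs through predict(), which interpolates between the training points.
+assert np.all(np.diff(y_) >= 0)
+print("isotonic predictions at x=2.5 and x=70:", ir.predict([2.5, 70.0]))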
+
+# draw a vertical line from each observation to its isotonic fit
+segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
+lc = LineCollection(segments, zorder=0)
+lc.set_array(np.ones(len(y)))
+lc.set_linewidths(0.5 * np.ones(n))
+
+fig = plt.figure()
+plt.plot(x, y, 'r.', markersize=12)
+plt.plot(x, y_, 'g.-', markersize=12)
+plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
+plt.gca().add_collection(lc)
+plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
+plt.title('Isotonic regression')
+plt.show()
+
+# Kernel ridge regression
+
+# 2.1 Comparison of kernel ridge regression and SVR
+
+# Authors: Jan Hendrik Metzen
+# License: BSD 3 clause
+
+'''
+from __future__ import division
+import time
+
+import numpy as np
+
+from sklearn.svm import SVR
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import learning_curve
+from sklearn.kernel_ridge import KernelRidge
+import matplotlib.pyplot as plt
+
+rng = np.random.RandomState(0)
+
+# generate sample data
+X = 5 * rng.rand(10000, 1)
+y = np.sin(X).ravel()
+
+# add noise to the targets
+y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
+
+X_plot = np.linspace(0, 5, 100000)[:, None]
+
+# fit the regression models
+train_size = 100
+svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,
+                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
+                               "gamma": np.logspace(-2, 2, 5)})
+
+kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
+                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
+                              "gamma": np.logspace(-2, 2, 5)})
+
+t0 = time.time()
+svr.fit(X[:train_size], y[:train_size])
+svr_fit = time.time() - t0
+print("SVR complexity and bandwidth selected and model fitted in %.3f s"
+      % svr_fit)
+
+t0 = time.time()
+kr.fit(X[:train_size], y[:train_size])
+kr_fit = time.time() - t0
+print("KRR complexity and bandwidth selected and model fitted in %.3f s"
+      % kr_fit)
+
+sv_ratio = svr.best_estimator_.support_.shape[0] / train_size
+print("Support vector ratio: %.3f" % sv_ratio)
+
+t0 = time.time()
+y_svr = svr.predict(X_plot)
+svr_predict = time.time() - t0
+print("SVR prediction for %d inputs in %.3f s"
+      % (X_plot.shape[0], svr_predict))
+
+t0 = time.time()
+y_kr = kr.predict(X_plot)
+kr_predict = time.time() - t0
+print("KRR prediction for %d inputs in %.3f s"
+      % (X_plot.shape[0], kr_predict))
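+
+# Editor's note (added): KRR has a closed-form fit. Its dual coefficients c
+# solve the single linear system (K + alpha*I) c = y, while SVR solves a
+# quadratic program at fit time. SVR's model is sparse (support vectors only),
+# which is why its prediction over the large X_plot grid tends to be faster.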
+
+# look at the results
+sv_ind = svr.best_estimator_.support_
+plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors',
+            zorder=2)
+plt.scatter(X[:100], y[:100], c='k', label='data', zorder=1)
+plt.plot(X_plot, y_svr, c='r',
+         label='SVR (fit: %.3fs, predict: %.3fs)' % (svr_fit, svr_predict))
+plt.plot(X_plot, y_kr, c='g',
+         label='KRR (fit: %.3fs, predict: %.3fs)' % (kr_fit, kr_predict))
+plt.xlabel('data')
+plt.ylabel('target')
+plt.title('SVR versus Kernel Ridge')
+plt.legend()
+
+# visualize the training and prediction time
+plt.figure()
+
+# generate sample data
+X = 5 * rng.rand(10000, 1)
+y = np.sin(X).ravel()
+y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
+sizes = np.logspace(1, 4, 7, dtype=int)
+for name, estimator in {"KRR": KernelRidge(kernel='rbf', alpha=0.1,
+                                           gamma=10),
+                        "SVR": SVR(kernel='rbf', C=1e1, gamma=10)}.items():
+    train_time = []
+    test_time = []
+    for train_test_size in sizes:
+        t0 = time.time()
+        estimator.fit(X[:train_test_size], y[:train_test_size])
+        train_time.append(time.time() - t0)
+
+        t0 = time.time()
+        estimator.predict(X_plot[:1000])
+        test_time.append(time.time() - t0)
+
+    plt.plot(sizes, train_time, 'o-', color="r" if name == "SVR" else "g",
+             label="%s (train)" % name)
+    plt.plot(sizes, test_time, 'o--', color="r" if name == "SVR" else "g",
+             label="%s (test)" % name)
+
+plt.xscale("log")
+plt.yscale("log")
+plt.xlabel("Train size")
+plt.ylabel("Time (seconds)")
+plt.title('Execution Time')
+plt.legend(loc="best")
+
+# visualize the learning curves
+plt.figure()
+
+svr = SVR(kernel='rbf', C=1e1, gamma=0.1)
+kr = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1)
+train_sizes, train_scores_svr, test_scores_svr = \
+    learning_curve(svr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
+                   scoring="neg_mean_squared_error", cv=10)
+train_sizes_abs, train_scores_kr, test_scores_kr = \
+    learning_curve(kr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
+                   scoring="neg_mean_squared_error", cv=10)
+
+plt.plot(train_sizes, -test_scores_svr.mean(1), 'o-', color="r",
+         label="SVR")
+plt.plot(train_sizes, -test_scores_kr.mean(1), 'o-', color="g",
+         label="KRR")
+plt.xlabel("Train size")
+plt.ylabel("Mean Squared Error")
+plt.title('Learning curves')
+plt.legend(loc="best")
+
+plt.show()
+'''
diff --git a/src/python/9.RegTrees/regTrees.py b/src/python/9.RegTrees/regTrees.py
index 41cd60dc..44f56945 100644
--- a/src/python/9.RegTrees/regTrees.py
+++ b/src/python/9.RegTrees/regTrees.py
@@ -4,7 +4,7 @@ Created on Feb 4, 2011
 Update on 2017-05-18
 Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
-@author: Peter Harrington/片刻
+@author: Peter Harrington/片刻/小瑶
 Machine Learning in Action, update repo: https://github.com/apachecn/MachineLearning
 '''
 print(__doc__)
@@ -15,7 +15,7 @@ from numpy import *
 # general function to parse tab-delimited floats
 def loadDataSet(fileName):
     """loadDataSet(parse each line and convert its fields to float)
-
+    Desc: reads a tab-delimited file and stores every line as a list of floats
     Args:
         fileName: the name of the data file
     Returns:
@@ -30,6 +30,7 @@ def loadDataSet(fileName):
         curLine = line.strip().split('\t')
         # convert all elements to float
         # map all elements to float()
+        # for the details of what map() does, see https://my.oschina.net/zyzzy/blog/115096
        fltLine = map(float, curLine)
         dataMat.append(fltLine)
     return dataMat
@@ -37,14 +38,14 @@ def loadDataSet(fileName):
 
 def binSplitDataSet(dataSet, feature, value):
     """binSplitDataSet(binary-split dataSet on the value of the given feature column)
-
+    Description: given a feature and a value for it, split the data set into two subsets by array filtering and return both.
     Args:
         dataMat: the data set
-        feature: the feature column
+        feature: the feature column to split on
         value: the value the feature column is compared against
     Returns:
-        mat0: the subset that is less, on the left
-        mat1: the subset that is greater, on the right
+        mat0: the subset less than or equal to value, on the left
+        mat1: the subset greater than value, on the right
     Raises:
     """
     # # test cases
@@ -61,11 +62,13 @@ def binSplitDataSet(dataSet, feature, value):
 
 # return the mean of the values in each leaf node
 # returns the value used for each leaf
+# My reading: regLeaf is the leaf-building function; it just takes the mean, i.e. the cluster center stands in for the data in that leaf
 def regLeaf(dataSet):
     return mean(dataSet[:, -1])
 
 
 # total variance = variance * number of samples
+# My reading: this scores the spread of a group, so the tree's splits put nearby values into the same group
 def regErr(dataSet):
     # shape(dataSet)[0] is the number of rows
     return var(dataSet[:, -1]) * shape(dataSet)[0]
@@ -80,18 +83,23 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
         dataSet: the raw data set that was loaded
         leafType: the function that builds leaf nodes
         errType: the error function (total variance)
-        ops [tolerated error reduction, minimum number of samples in a split]
+        ops: [tolerated error reduction, minimum number of samples in a split].
     Returns:
         bestIndex: the index of the best feature
         bestValue: the best value to split on
     Raises:
     """
+
+    # ops=(1, 4) matters a great deal: it sets the thresholds at which the tree stops splitting, known as prepruning; in effect it controls when this function stops.
+    # The point is to keep the tree from overfitting: stop splitting when the error reduction is smaller than tolS, or when a split would leave a subset smaller than tolN.
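+    # Editor's note (added): with the default ops=(1, 4), a split must cut the
+    # total variance by more than 1.0 and leave at least 4 samples on each side.
+    # Something like ops=(0, 1) effectively disables prepruning and grows a far
+    # larger tree, while a very large tolS collapses the tree toward one leaf.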
+
+    # minimum error reduction: if a split reduces the error by less than this, do not keep splitting
     tolS = ops[0]
+    # minimum split size: if a subset would be smaller than this, do not keep splitting
     tolN = ops[1]
     # if the result set (the last column) holds a single value, return and exit
     # .T transposes the data set
     # .tolist()[0] converts it to a list and takes row 0
-    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
+    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:  # if the set has only one value, there is nothing left to split.
         # exit cond 1
         return None, leafType(dataSet)
     # get the numbers of rows and columns
@@ -102,7 +110,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
     # inf is positive infinity
     bestS, bestIndex, bestValue = inf, 0, 0
     # loop over the feature values in every column
-    for featIndex in range(n-1):
+    for featIndex in range(n-1):  # for every feature
         # [0] takes [all rows] of this column; without [0] it is an array[[all rows]]
         for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
             # group on this column, then binary-split on each member's value
@@ -112,6 +120,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
                 continue
             newS = errType(mat0) + errType(mat1)
             # if the binary split's error is acceptable, record the split point and the minimum error
+            # if the post-split error is below bestS, we have found a new bestS
             if newS < bestS:
                 bestIndex = featIndex
                 bestValue = splitVal
@@ -122,15 +131,17 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
         return None, leafType(dataSet)
     mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
     # check whether the split as a whole meets expectations
-    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
+    # if either subset's size is below tolN
+    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  # if even the best split leaves a subset that is too small, do not split; make a leaf node
         return None, leafType(dataSet)
     return bestIndex, bestValue
 
 
 # assume dataSet is NumPy Mat so we can array filtering
+# i.e. assume dataSet is a NumPy mat, so that we can filter it with array operations
 def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
     """createTree(build the regression tree)
-
+    Description: recursive function. For a regression tree each leaf model is a constant; for a model tree it is a linear equation.
     Args:
         dataSet: the raw data set that was loaded
         leafType: the function that builds leaf nodes
@@ -143,6 +154,7 @@ def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
     # choose the best split
     feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
     # if the splitting hit a stop condition return val
+    # i.e. if splitting reached a stop condition, return val
     if feat is None:
         return val
     retTree = {}
@@ -150,7 +162,7 @@ def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
     retTree['spVal'] = val
     # greater goes on the right, smaller on the left: split into 2 data sets
     lSet, rSet = binSplitDataSet(dataSet, feat, val)
-    # call recursively
+    # call recursively, continuing to grow the tree in the left and right subtrees
     retTree['left'] = createTree(lSet, leafType, errType, ops)
     retTree['right'] = createTree(rSet, leafType, errType, ops)
     return retTree
@@ -318,14 +330,14 @@ if __name__ == "__main__":
     # myTree = createTree(myMat, modelLeaf, modelErr)
     # print myTree
 
-    # regression tree VS model tree VS linear regression
+    # # regression tree VS model tree VS linear regression
     trainMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_train.txt'))
     testMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_test.txt'))
-    # regression tree
-    myTree1 = createTree(trainMat, ops=(1, 20))
-    print myTree1
-    yHat1 = createForeCast(myTree1, testMat[:, 0])
-    print "regression tree:", corrcoef(yHat1, testMat[:, 1],rowvar=0)[0, 1]
+    # # regression tree
+    # myTree1 = createTree(trainMat, ops=(1, 20))
+    # print myTree1
+    # yHat1 = createForeCast(myTree1, testMat[:, 0])
+    # print "regression tree:", corrcoef(yHat1, testMat[:, 1], rowvar=0)[0, 1]
 
     # model tree
     myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
diff --git a/src/python/9.RegTrees/sklearn-regressTree-demo.py b/src/python/9.RegTrees/sklearn-regressTree-demo.py
new file mode 100644
index 00000000..93722e8c
--- /dev/null
+++ b/src/python/9.RegTrees/sklearn-regressTree-demo.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python
+# coding:utf8
+
+"""
+Created on 2017-07-13
+Updated on 2017-07-13
+RegressionTree: tree regression
+@author: 小瑶
+Machine Learning in Action, update repo: https://github.com/apachecn/MachineLearning
+"""
+
+print(__doc__)
+
+# import the required models and libraries
+import numpy as np
+from sklearn.tree import DecisionTreeRegressor
+import matplotlib.pyplot as plt
+
+# create a random data set
+# see https://docs.scipy.org/doc/numpy-1.6.0/reference/generated/numpy.random.mtrand.RandomState.html
+rng = np.random.RandomState(1)
+# rand() draws uniform random values of the given shape: rng.rand(80, 1) is an 80-row, 1-column matrix
+# sort() orders the samples along axis 0
+X = np.sort(5 * rng.rand(80, 1), axis=0)
+y = np.sin(X).ravel()
+# add noise to every 5th target
+y[::5] += 3 * (0.5 - rng.rand(16))
+
+# fit the regression model
+# regr_1 = DecisionTreeRegressor(max_depth=2)
+# keeping max_depth=5 and adding the min_samples_leaf=6 parameter improves the fit further
+regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=6)
+# regr_3 = DecisionTreeRegressor(max_depth=4)
+# regr_1.fit(X, y)
+regr_2.fit(X, y)
+# regr_3.fit(X, y)
+
+# predict
+X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
+# y_1 = regr_1.predict(X_test)
+y_2 = regr_2.predict(X_test)
+# y_3 = regr_3.predict(X_test)
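+
+# Editor's sketch (added, not in the original demo): the fit can also be scored
+# numerically, e.g. mean squared error against the noise-free signal sin(x):
+from sklearn.metrics import mean_squared_error
+print("MSE vs. sin(x): %.4f" % mean_squared_error(np.sin(X_test).ravel(), y_2))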
+
+# plot the results
+plt.figure()
+plt.scatter(X, y, c="darkorange", label="data")
+# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
+plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5, min_samples_leaf=6", linewidth=2)
+# plt.plot(X_test, y_3, color="red", label="max_depth=4", linewidth=2)
+plt.xlabel("data")
+plt.ylabel("target")
+plt.title("Decision Tree Regression")
+plt.legend()
+plt.show()
\ No newline at end of file
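
--
Editor's usage note: both new demos are plain scripts. Assuming numpy,
scikit-learn, and matplotlib are installed, they can be run directly:

    python src/python/8.PredictiveNumericalDataRegression/sklearn-regression-demo.py
    python src/python/9.RegTrees/sklearn-regressTree-demo.py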