Mirror of https://github.com/apachecn/ailearning.git
Merge pull request #101 from chenyyx/master
Add sklearn demos for linear regression and for tree regression
@@ -66,7 +66,7 @@ def standRegres(xArr,yArr):
    # Formula from the book: the normal equation, ws = (X^T X)^{-1} (X^T y)
    ws = xTx.I * (xMat.T*yMat)
    return ws


# Locally weighted linear regression (LWLR)
def lwlr(testPoint, xArr, yArr, k=1.0):
    '''
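The hunk cuts off inside lwlr's docstring. For reference, LWLR solves a kernel-weighted
normal equation, ws = (X^T W X)^{-1} (X^T W y), where W is a diagonal matrix of Gaussian
weights centered on the query point. A minimal sketch in the book's style (an
illustration only; names and the singularity guard follow the conventions used
elsewhere in this file):

    from numpy import mat, eye, shape, exp, linalg

    def lwlr_sketch(testPoint, xArr, yArr, k=1.0):
        xMat = mat(xArr); yMat = mat(yArr).T
        m = shape(xMat)[0]
        weights = mat(eye(m))                     # diagonal kernel-weight matrix W
        for j in range(m):
            diffMat = testPoint - xMat[j, :]      # distance to the query point
            weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k ** 2))
        xTx = xMat.T * (weights * xMat)
        if linalg.det(xTx) == 0.0:                # guard against a singular matrix
            return None
        ws = xTx.I * (xMat.T * (weights * yMat))  # weighted normal equation
        return testPoint * ws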
@@ -0,0 +1,191 @@
#!/usr/bin/python
# coding:utf8

'''
Created on Jan 8, 2011
Update on 2017-05-18
@author: Peter Harrington/小瑶
"Machine Learning in Action" updates: https://github.com/apachecn/MachineLearning
'''


# Isotonic Regression: an order-preserving (monotonic) fit
print(__doc__)

# Author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
#         Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

n = 100
x = np.arange(n)
rs = check_random_state(0)
y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))

ir = IsotonicRegression()

y_ = ir.fit_transform(x, y)

lr = LinearRegression()
lr.fit(x[:, np.newaxis], y)  # LinearRegression expects a 2-D x

# Draw a vertical segment from each raw point to its isotonic fit
segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y)))
lc.set_linewidths(0.5 * np.ones(n))

fig = plt.figure()
plt.plot(x, y, 'r.', markersize=12)
plt.plot(x, y_, 'g.-', markersize=12)
plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
plt.gca().add_collection(lc)
plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
plt.title('Isotonic regression')
plt.show()
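A brief usage note (a sketch, not part of the original demo): once fitted, an
IsotonicRegression object can also evaluate the monotonic fit at new points via
predict; its out_of_bounds parameter ('nan', 'clip', or 'raise') controls what
happens outside the training range.

    # Assumes the x, y arrays defined above.
    ir_clip = IsotonicRegression(out_of_bounds='clip')
    y_new = ir_clip.fit(x, y).predict([0.5, 10.5, 150.0])  # 150.0 is clipped to the range of x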

# Kernel ridge regression

# 2.1 Comparison of kernel ridge regression and SVR

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause

# NOTE: the block below is wrapped in a triple-quoted string, so it ships
# commented out; remove the surrounding quotes to run it.
'''
from __future__ import division
import time

import numpy as np

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.kernel_ridge import KernelRidge
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)

# Generate sample data
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()

# Add noise to the targets
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

X_plot = np.linspace(0, 5, 100000)[:, None]

# Fit the regression models
train_size = 100
svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "gamma": np.logspace(-2, 2, 5)})

kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                              "gamma": np.logspace(-2, 2, 5)})

t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
      % svr_fit)

t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
print("KRR complexity and bandwidth selected and model fitted in %.3f s"
      % kr_fit)

sv_ratio = svr.best_estimator_.support_.shape[0] / train_size
print("Support vector ratio: %.3f" % sv_ratio)

t0 = time.time()
y_svr = svr.predict(X_plot)
svr_predict = time.time() - t0
print("SVR prediction for %d inputs in %.3f s"
      % (X_plot.shape[0], svr_predict))

t0 = time.time()
y_kr = kr.predict(X_plot)
kr_predict = time.time() - t0
print("KRR prediction for %d inputs in %.3f s"
      % (X_plot.shape[0], kr_predict))

# Look at the results
sv_ind = svr.best_estimator_.support_
plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors',
            zorder=2)
plt.scatter(X[:100], y[:100], c='k', label='data', zorder=1)
# plt.hold('on') stood here; hold() was deprecated and then removed in
# matplotlib 3.0, and is unnecessary since overplotting is the default.
plt.plot(X_plot, y_svr, c='r',
         label='SVR (fit: %.3fs, predict: %.3fs)' % (svr_fit, svr_predict))
plt.plot(X_plot, y_kr, c='g',
         label='KRR (fit: %.3fs, predict: %.3fs)' % (kr_fit, kr_predict))
plt.xlabel('data')
plt.ylabel('target')
plt.title('SVR versus Kernel Ridge')
plt.legend()

# Visualize the training and prediction time
plt.figure()

# Generate sample data
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
sizes = np.logspace(1, 4, 7, dtype=int)  # was dtype=np.int, which was removed in NumPy 1.24
for name, estimator in {"KRR": KernelRidge(kernel='rbf', alpha=0.1,
                                           gamma=10),
                        "SVR": SVR(kernel='rbf', C=1e1, gamma=10)}.items():
    train_time = []
    test_time = []
    for train_test_size in sizes:
        t0 = time.time()
        estimator.fit(X[:train_test_size], y[:train_test_size])
        train_time.append(time.time() - t0)

        t0 = time.time()
        estimator.predict(X_plot[:1000])
        test_time.append(time.time() - t0)

    plt.plot(sizes, train_time, 'o-', color="r" if name == "SVR" else "g",
             label="%s (train)" % name)
    plt.plot(sizes, test_time, 'o--', color="r" if name == "SVR" else "g",
             label="%s (test)" % name)

plt.xscale("log")
plt.yscale("log")
plt.xlabel("Train size")
plt.ylabel("Time (seconds)")
plt.title('Execution Time')
plt.legend(loc="best")

# Visualize the learning curves
plt.figure()

svr = SVR(kernel='rbf', C=1e1, gamma=0.1)
kr = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1)
train_sizes, train_scores_svr, test_scores_svr = \
    learning_curve(svr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
                   scoring="neg_mean_squared_error", cv=10)
train_sizes_abs, train_scores_kr, test_scores_kr = \
    learning_curve(kr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
                   scoring="neg_mean_squared_error", cv=10)

plt.plot(train_sizes, -test_scores_svr.mean(1), 'o-', color="r",
         label="SVR")
plt.plot(train_sizes, -test_scores_kr.mean(1), 'o-', color="g",
         label="KRR")
plt.xlabel("Train size")
plt.ylabel("Mean Squared Error")
plt.title('Learning curves')
plt.legend(loc="best")

plt.show()
'''
@@ -4,7 +4,7 @@
Created on Feb 4, 2011
Update on 2017-05-18
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
@author: Peter Harrington/片刻
@author: Peter Harrington/片刻/小瑶
"Machine Learning in Action" updates: https://github.com/apachecn/MachineLearning
'''
print(__doc__)
@@ -15,7 +15,7 @@ from numpy import *
# general function to parse tab-delimited floats
def loadDataSet(fileName):
    """loadDataSet(parse each line and convert the fields to float)

    Desc: reads a file whose separator is the tab key and stores each line's
        contents as a list of floats
    Args:
        fileName  the name of the file
    Returns:
@@ -30,6 +30,7 @@ def loadDataSet(fileName):
        curLine = line.strip().split('\t')
        # Convert every element to float
        # map all elements to float()
        # For the details of map(), see https://my.oschina.net/zyzzy/blog/115096
        fltLine = map(float, curLine)  # NOTE: a lazy iterator under Python 3, see below
        dataMat.append(fltLine)
    return dataMat
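Under Python 3, map() returns a lazy iterator, so dataMat above would collect map
objects rather than lists of floats. A minimal Python 3-safe variant of the parsing
loop, sketched from the fragments shown (same tab-delimited file format assumed):

    def loadDataSet(fileName):
        """Parse a tab-delimited file into a list of rows of floats."""
        dataMat = []
        with open(fileName) as fr:
            for line in fr:
                curLine = line.strip().split('\t')
                fltLine = list(map(float, curLine))  # materialize the floats
                dataMat.append(fltLine)
        return dataMat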
@@ -37,14 +38,14 @@ def loadDataSet(fileName):

def binSplitDataSet(dataSet, feature, value):
    """binSplitDataSet(binary-split the data set on the value of column `feature`)

    Description: given a feature and a value for it, split the data set into two
        subsets by array filtering and return both.
    Args:
        dataMat  the data set
        feature  the feature column
        feature  the feature column to split on
        value    the value to compare against in that column
    Returns:
        mat0     the subset with smaller values, on the left
        mat1     the subset with larger values, on the right
        mat0     the subset with values <= value, on the left
        mat1     the subset with values > value, on the right
    Raises:
    """
    # # test case
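The hunk shows only the docstring; for reference, the book implements the split with
NumPy array filtering. A minimal sketch matching the docstring's contract (illustrative
only, assuming dataSet is a NumPy matrix as elsewhere in this file):

    from numpy import nonzero

    def binSplitDataSet_sketch(dataSet, feature, value):
        # rows whose `feature` column is <= value go left, the rest go right
        mat0 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
        mat1 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
        return mat0, mat1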
@@ -61,11 +62,13 @@ def binSplitDataSet(dataSet, feature, value):

# Return the mean of each leaf node
# returns the value used for each leaf
# My understanding: regLeaf generates leaf nodes by taking the mean, i.e. the
# cluster center stands in for the data assigned to that leaf
def regLeaf(dataSet):
    return mean(dataSet[:, -1])


# Total variance = variance * number of samples, i.e. the sum of squared
# deviations sum((y_i - mean)^2)
# My understanding: this measures the spread of the group, so the tree's splits
# gather nearby values into the same branch
def regErr(dataSet):
    # shape(dataSet)[0] is the number of rows
    return var(dataSet[:, -1]) * shape(dataSet)[0]
@@ -80,18 +83,23 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
        dataSet   the raw data set
        leafType  the function that builds leaf nodes
        errType   the error-computation function (total variance)
        ops       [minimum allowed error reduction, minimum samples per split]
    Returns:
        bestIndex  column index of the best feature to split on
        bestValue  the best value to split at
    Raises:
    """

    # ops=(1, 4) matters a great deal, because it sets the thresholds at which the
    # tree stops splitting (so-called prepruning); in effect it controls when the
    # function terminates. It guards against overfitting the tree: when the error
    # reduction is smaller than tolS, or a post-split subset is smaller than tolN,
    # we stop splitting. (A usage sketch after createTree below shows the effect.)
    # Minimum error reduction: if a split reduces the error by less, don't split
    tolS = ops[0]
    # Minimum subset size: if a split yields a subset smaller than this, don't split
    tolN = ops[1]
    # If the result set (the last column) holds a single value, return and exit
    # .T transposes the data set
    # .tolist()[0] converts it to a list and takes row 0
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:  # a set of size 1: no need to split
        # exit cond 1
        return None, leafType(dataSet)
    # Compute the row and column counts
@@ -102,7 +110,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    # inf means positive infinity
    bestS, bestIndex, bestValue = inf, 0, 0
    # Loop over the feature values in every column
    for featIndex in range(n-1):  # for each feature
        # [0] selects [all rows] of this column; without it we get an array[[all rows]]
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            # group on this column, then binary-split the members on their values
@@ -112,6 +120,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
                continue
            newS = errType(mat0) + errType(mat1)
            # If this binary split's error is acceptable, record the split point
            # and the minimum error
            # If the post-split error is smaller than bestS, we found a new bestS
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
@@ -122,15 +131,17 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # Check whether the resulting subsets meet expectations
    # If either subset's size is below tolN, i.e. even the best split leaves a
    # subset that is too small, don't split; produce a leaf node instead
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue


# assume dataSet is NumPy Mat so we can array filtering
# Assume dataSet is a NumPy matrix, so that we can filter it as an array
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """createTree(build the regression tree)

    Description: recursive function. If we build a regression tree, the leaf model
        is a constant; if a model tree, the leaf model is a linear equation.
    Args:
        dataSet   the raw data set
        leafType  the function that builds leaf nodes
@@ -143,6 +154,7 @@ def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    # choose the best split
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # if the splitting hit a stop condition return val
    # If the split hits a stop condition, return val as a leaf
    if feat is None:
        return val
    retTree = {}
@@ -150,7 +162,7 @@ def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    retTree['spVal'] = val
    # larger values go right, smaller go left: two subsets
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    # recurse into the left and right subsets to keep growing the tree
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
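A minimal usage sketch for the prepruning thresholds discussed above (illustrative;
the data file path is hypothetical, while loadDataSet and createTree are the
functions in this file):

    myMat = mat(loadDataSet('input/9.RegTrees/data.txt'))  # hypothetical path
    # ops=(0, 1): accept any error reduction, down to single-sample leaves,
    # giving a maximally overfit tree; larger tolS/tolN values stop splitting
    # earlier and yield a smaller tree.
    big_tree   = createTree(myMat, ops=(0, 1))
    small_tree = createTree(myMat, ops=(10000, 4))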
@@ -318,14 +330,14 @@ if __name__ == "__main__":
    # myTree = createTree(myMat, modelLeaf, modelErr)
    # print myTree

    # Regression tree VS model tree VS linear regression
    # # Regression tree VS model tree VS linear regression
    trainMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_train.txt'))
    testMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_test.txt'))
    # Regression tree
    myTree1 = createTree(trainMat, ops=(1, 20))
    print(myTree1)
    yHat1 = createForeCast(myTree1, testMat[:, 0])
    print("Regression tree:", corrcoef(yHat1, testMat[:, 1], rowvar=0)[0, 1])
    # # Regression tree
    # myTree1 = createTree(trainMat, ops=(1, 20))
    # print(myTree1)
    # yHat1 = createForeCast(myTree1, testMat[:, 0])
    # print("Regression tree:", corrcoef(yHat1, testMat[:, 1], rowvar=0)[0, 1])

    # Model tree
    myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
src/python/9.RegTrees/sklearn-regressTree-demo.py (new file, 58 lines)
@@ -0,0 +1,58 @@
#!/usr/bin/python
# coding:utf8

"""
Created on 2017-07-13
Updated on 2017-07-13
RegressionTree: tree regression
@author: 小瑶
"Machine Learning in Action" updates: https://github.com/apachecn/MachineLearning
"""

print(__doc__)

# Import the required models and libraries
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# Create a random data set
# See https://docs.scipy.org/doc/numpy-1.6.0/reference/generated/numpy.random.mtrand.RandomState.html
rng = np.random.RandomState(1)
# print 'lalalalala===', rng
# rand() draws random values of the given shape; rng.rand(80, 1) is 80 rows by 1 column
# sort() orders the samples along axis 0
X = np.sort(5 * rng.rand(80, 1), axis=0)
# print 'X=', X
y = np.sin(X).ravel()
# print 'y=', y
y[::5] += 3 * (0.5 - rng.rand(16))
# print 'yyy=', y

# Fit the regression model
# regr_1 = DecisionTreeRegressor(max_depth=2)
# Keeping max_depth=5 and adding the parameter min_samples_leaf=6 improves the
# result further; one estimator carries both parameters, since a second separate
# regr_2 assignment would silently discard max_depth=5.
regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=6)
# regr_3 = DecisionTreeRegressor(max_depth=4)
# regr_1.fit(X, y)
regr_2.fit(X, y)
# regr_3.fit(X, y)

# Predict
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
# y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
# y_3 = regr_3.predict(X_test)

# Plot the results
plt.figure()
plt.scatter(X, y, c="darkorange", label="data")
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen",
         label="max_depth=5, min_samples_leaf=6", linewidth=2)
# plt.plot(X_test, y_3, color="red", label="max_depth=3", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()