Merge pull request #101 from chenyyx/master

Add an sklearn demo for regression and an sklearn demo for tree regression
This commit is contained in:
片刻
2017-07-13 17:00:37 +08:00
committed by GitHub
4 changed files with 281 additions and 20 deletions

View File

@@ -66,7 +66,7 @@ def standRegres(xArr,yArr):
# solve for the optimal w using the normal equation from the book
ws = xTx.I * (xMat.T*yMat)
return ws
# Locally Weighted Linear Regression (LWLR)
def lwlr(testPoint,xArr,yArr,k=1.0):
'''
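From here lwlr implements the book's locally weighted normal equation, ws = (X^T W X)^-1 X^T W y, with a Gaussian kernel of bandwidth k. A minimal standalone sketch with illustrative names (plain NumPy arrays rather than this file's mat objects):

import numpy as np

def lwlr_sketch(test_point, x_arr, y_arr, k=1.0):
    # one LWLR prediction: weight every training point by a Gaussian
    # kernel centered on test_point, then solve the weighted normal equation
    X = np.asarray(x_arr, dtype=float)
    y = np.asarray(y_arr, dtype=float).reshape(-1)
    W = np.eye(X.shape[0])
    for j in range(X.shape[0]):
        diff = test_point - X[j]                         # offset to sample j
        W[j, j] = np.exp(-np.dot(diff, diff) / (2.0 * k ** 2))
    xTwx = np.dot(X.T, np.dot(W, X))
    if np.linalg.det(xTwx) == 0.0:                       # singular, cannot solve
        return None
    ws = np.linalg.solve(xTwx, np.dot(X.T, np.dot(W, y)))
    return np.dot(test_point, ws)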

View File

@@ -0,0 +1,191 @@
#!/usr/bin/python
# coding:utf8
'''
Created on Jan 8, 2011
Updated on 2017-05-18
@author: Peter Harrington/小瑶
Machine Learning in Action (《机器学习实战》) updates: https://github.com/apachecn/MachineLearning
'''
# Isotonic Regression
print(__doc__)
# Author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state
n = 100
x = np.arange(n)
rs = check_random_state(0)
y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
ir = IsotonicRegression()
y_ = ir.fit_transform(x, y)
lr = LinearRegression()
lr.fit(x[:, np.newaxis], y)  # LinearRegression needs x to be 2-D
segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y)))
lc.set_linewidths(0.5 * np.ones(n))
fig = plt.figure()
plt.plot(x, y, 'r.', markersize=12)
plt.plot(x, y_, 'g.-', markersize=12)
plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
plt.gca().add_collection(lc)
plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
plt.title('Isotonic regression')
plt.show()
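Two properties of the fit above are worth checking (a quick sketch using the variables already defined): the isotonic output y_ is non-decreasing by construction, and new inputs are interpolated between the fitted knots.

# the isotonic constraint: the fitted values never decrease
assert np.all(np.diff(y_) >= 0)
# predictions for unseen x are interpolated from the fit
print(ir.predict([0.5, 1.5, 2.5]))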
# Kernel ridge regression (KRR)
# 2.1 Comparison of kernel ridge regression and SVR
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause
'''
from __future__ import division
import time
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.kernel_ridge import KernelRidge
import matplotlib.pyplot as plt
rng = np.random.RandomState(0)
# generate sample data
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()
# add noise to the targets
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
X_plot = np.linspace(0, 5, 100000)[:, None]
# fit the regression models
train_size = 100
svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,
param_grid={"C": [1e0, 1e1, 1e2, 1e3],
"gamma": np.logspace(-2, 2, 5)})
kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
"gamma": np.logspace(-2, 2, 5)})
t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
% svr_fit)
t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
print("KRR complexity and bandwidth selected and model fitted in %.3f s"
% kr_fit)
sv_ratio = svr.best_estimator_.support_.shape[0] / train_size
print("Support vector ratio: %.3f" % sv_ratio)
t0 = time.time()
y_svr = svr.predict(X_plot)
svr_predict = time.time() - t0
print("SVR prediction for %d inputs in %.3f s"
% (X_plot.shape[0], svr_predict))
t0 = time.time()
y_kr = kr.predict(X_plot)
kr_predict = time.time() - t0
print("KRR prediction for %d inputs in %.3f s"
% (X_plot.shape[0], kr_predict))
# look at the results
sv_ind = svr.best_estimator_.support_
plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors',
zorder=2)
plt.scatter(X[:100], y[:100], c='k', label='data', zorder=1)
# (plt.hold was removed in Matplotlib 3; successive plot calls already draw on the same axes)
plt.plot(X_plot, y_svr, c='r',
label='SVR (fit: %.3fs, predict: %.3fs)' % (svr_fit, svr_predict))
plt.plot(X_plot, y_kr, c='g',
label='KRR (fit: %.3fs, predict: %.3fs)' % (kr_fit, kr_predict))
plt.xlabel('data')
plt.ylabel('target')
plt.title('SVR versus Kernel Ridge')
plt.legend()
# visualize training and prediction time
plt.figure()
# generate sample data
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
sizes = np.logspace(1, 4, 7, dtype=int)  # use the builtin int: np.int was removed from NumPy
for name, estimator in {"KRR": KernelRidge(kernel='rbf', alpha=0.1,
gamma=10),
"SVR": SVR(kernel='rbf', C=1e1, gamma=10)}.items():
train_time = []
test_time = []
for train_test_size in sizes:
t0 = time.time()
estimator.fit(X[:train_test_size], y[:train_test_size])
train_time.append(time.time() - t0)
t0 = time.time()
estimator.predict(X_plot[:1000])
test_time.append(time.time() - t0)
plt.plot(sizes, train_time, 'o-', color="r" if name == "SVR" else "g",
label="%s (train)" % name)
plt.plot(sizes, test_time, 'o--', color="r" if name == "SVR" else "g",
label="%s (test)" % name)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Train size")
plt.ylabel("Time (seconds)")
plt.title('Execution Time')
plt.legend(loc="best")
# visualize the learning curves
plt.figure()
svr = SVR(kernel='rbf', C=1e1, gamma=0.1)
kr = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1)
train_sizes, train_scores_svr, test_scores_svr = \
learning_curve(svr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
scoring="neg_mean_squared_error", cv=10)
train_sizes_abs, train_scores_kr, test_scores_kr = \
learning_curve(kr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
scoring="neg_mean_squared_error", cv=10)
plt.plot(train_sizes, -test_scores_svr.mean(1), 'o-', color="r",
label="SVR")
plt.plot(train_sizes, -test_scores_kr.mean(1), 'o-', color="g",
label="KRR")
plt.xlabel("Train size")
plt.ylabel("Mean Squared Error")
plt.title('Learning curves')
plt.legend(loc="best")
plt.show()
'''
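The timing gap the (commented-out) script above measures comes from how each model fits: kernel ridge has a closed-form solve in kernel space (fast fit, but prediction sums over all training points), while SVR solves a QP (slower fit) yet yields a sparse model that predicts faster. A minimal sketch of the closed-form KRR solve, illustrative only and not sklearn's internals:

import numpy as np

def krr_fit_predict(X_train, y_train, X_test, alpha=0.1, gamma=10.0):
    # kernel ridge in closed form: dual_coef = (K + alpha*I)^-1 y
    def rbf(A, B):
        sq_dists = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
        return np.exp(-gamma * sq_dists)
    K = rbf(X_train, X_train)
    dual_coef = np.linalg.solve(K + alpha * np.eye(len(X_train)), y_train)
    # prediction is a kernel expansion over *all* training points,
    # which is why KRR predicts more slowly than a sparse SVR
    return np.dot(rbf(X_test, X_train), dual_coef)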

View File

@@ -4,7 +4,7 @@
Created on Feb 4, 2011
Updated on 2017-05-18
Tree-Based Regression Methods Source Code for Machine Learning in Action Ch. 9
@author: Peter Harrington/片刻
@author: Peter Harrington/片刻/小瑶
Machine Learning in Action (《机器学习实战》) updates: https://github.com/apachecn/MachineLearning
'''
print(__doc__)
@@ -15,7 +15,7 @@ from numpy import *
# general function to parse tab-delimited floats
def loadDataSet(fileName):
"""loadDataSet(parse each line and convert the values to float)
Desc: reads a tab-delimited file and stores each line's contents as a list of floats
Args:
fileName  the file name
Returns:
@@ -30,6 +30,7 @@ def loadDataSet(fileName):
curLine = line.strip().split('\t')
# map all elements to float()
# for details on how map() works, see https://my.oschina.net/zyzzy/blog/115096
fltLine = map(float, curLine)
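# note: under Python 3, map() returns an iterator, so dataMat would collect
# map objects here; in that case use fltLine = list(map(float, curLine))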
dataMat.append(fltLine)
return dataMat
@@ -37,14 +38,14 @@ def loadDataSet(fileName):
def binSplitDataSet(dataSet, feature, value):
"""binSplitDataSet(binary-split the dataset on column feature at value)
Description: given a feature and a feature value, split the dataset into two subsets by array filtering and return both.
Args:
dataSet  the dataset
feature  the feature column
feature  the feature column to split on
value  the value the feature column is compared against
Returns:
mat0  the smaller-valued subset, on the left
mat1  the larger-valued subset, on the right
mat0  the subset with values <= value, on the left
mat1  the subset with values > value, on the right
Raises:
"""
# # test case
@@ -61,11 +62,13 @@ def binSplitDataSet(dataSet, feature, value):
# returns the value used for each leaf: the mean of the leaf's target values
# My take: regLeaf is the leaf-generating function; it takes the mean, i.e. a cluster-center value stands in for the group
def regLeaf(dataSet):
return mean(dataSet[:, -1])
# total variance = variance * number of samples
# My take: minimizing this variance means the tree's splits gather nearby data into the same group
def regErr(dataSet):
# shape(dataSet)[0] is the number of rows
return var(dataSet[:, -1]) * shape(dataSet)[0]
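To see what regErr measures: variance times sample count is exactly the total squared deviation from the mean, so minimizing it picks splits with low within-group spread. A quick illustrative check:

import numpy as np
y = np.array([1.0, 2.0, 4.0, 7.0])
total_var = np.var(y) * len(y)         # regErr-style total variance
sse = np.sum((y - y.mean()) ** 2)      # sum of squared deviations
assert np.isclose(total_var, sse)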
@@ -80,18 +83,23 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
dataSet  the raw loaded dataset
leafType  the function that builds leaf nodes
errType  the error function (computes total variance)
ops  [tolerated error decrease, minimum samples per split]
Returns:
bestIndex  the index of the feature to split on
bestValue  the best value to split at
Raises:
"""
# ops=(1,4) matters a great deal, because it sets the thresholds at which splitting stops. This is known as prepruning, and in effect it controls when the function terminates.
# The point is to keep the decision tree from overfitting: stop splitting once the error reduction falls below tolS or a resulting subset is smaller than tolN.
# minimum error reduction: if a split lowers the error by less than this, do not split further
tolS = ops[0]
# minimum split size: if a subset would be smaller than this, do not split further
tolN = ops[1]
# if the result set (the last column) holds only one distinct value, return
# .T transposes the dataset
# .tolist()[0] converts it to a list and takes row 0
if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
if len(set(dataSet[:, -1].T.tolist()[0])) == 1:  # if the set has size 1, no further splitting is needed
# exit cond 1
return None, leafType(dataSet)
# get the row and column counts
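The effect of ops is easiest to see by pushing it to extremes (an illustrative sketch assuming this file's loadDataSet and createTree; the data path is hypothetical):

myMat = mat(loadDataSet('input/9.RegTrees/data1.txt'))  # hypothetical file
createTree(myMat, ops=(0, 1))       # tolS=0, tolN=1: splits on every point, a huge tree
createTree(myMat, ops=(10000, 4))   # enormous tolS: stops immediately, a tiny tree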
@@ -102,7 +110,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
# inf is positive infinity
bestS, bestIndex, bestValue = inf, 0, 0
# loop over the feature values in each column
for featIndex in range(n-1):
for featIndex in range(n-1):  # for each feature
# [0] means [all rows] of this column; without [0] it would be an array [[all rows]]
for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
# group on this column, then binary-split the members on each value
@@ -112,6 +120,7 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
continue
newS = errType(mat0) + errType(mat1)
# if this binary split's error is within the acceptable range, record the split point and the minimum error
# if the post-split error is smaller than bestS, a new bestS has been found
if newS < bestS:
bestIndex = featIndex
bestValue = splitVal
@@ -122,15 +131,17 @@ def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
return None, leafType(dataSet)
mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
# check whether the resulting subsets meet expectations
if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
# if either subset's size falls below tolN
if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  # if even the best split leaves a subset too small, do not split; produce a leaf node
return None, leafType(dataSet)
return bestIndex, bestValue
# assume dataSet is a NumPy Mat so we can use array filtering
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
"""createTree(获取回归树)
Description: a recursive function. If it builds a regression tree, each leaf model is a constant; if a model tree, each leaf model is a linear equation.
Args:
dataSet  the raw loaded dataset
leafType  the function that builds leaf nodes
@@ -143,6 +154,7 @@ def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
# choose the best split
feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
# if the splitting hit a stop condition, return val
if feat is None:
return val
retTree = {}
@@ -150,7 +162,7 @@ def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
retTree['spVal'] = val
# greater values go to the right, smaller to the left, giving 2 datasets
lSet, rSet = binSplitDataSet(dataSet, feat, val)
# recursive call
# recursive call: keep recursing to grow the tree in the left and right subtrees
retTree['left'] = createTree(lSet, leafType, errType, ops)
retTree['right'] = createTree(rSet, leafType, errType, ops)
return retTree
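For orientation, the tree createTree returns is a nested dict: leaves are plain values and internal nodes recurse. The shape below follows the book; the numbers are purely illustrative.

retTree = {'spInd': 0, 'spVal': 0.48,
           'left': 1.018,                        # leaf: the mean of its targets
           'right': {'spInd': 0, 'spVal': 0.31,  # internal node: another subtree
                     'left': -0.044, 'right': 0.52}}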
@@ -318,14 +330,14 @@ if __name__ == "__main__":
# myTree = createTree(myMat, modelLeaf, modelErr)
# print myTree
# regression tree VS model tree VS linear regression
# # regression tree VS model tree VS linear regression
trainMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_train.txt'))
testMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_test.txt'))
# regression tree
myTree1 = createTree(trainMat, ops=(1, 20))
print myTree1
yHat1 = createForeCast(myTree1, testMat[:, 0])
print "Regression tree:", corrcoef(yHat1, testMat[:, 1], rowvar=0)[0, 1]
# # regression tree
# myTree1 = createTree(trainMat, ops=(1, 20))
# print myTree1
# yHat1 = createForeCast(myTree1, testMat[:, 0])
# print "Regression tree:", corrcoef(yHat1, testMat[:, 1], rowvar=0)[0, 1]
# model tree
myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
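Presumably the comparison continues as in the book, scoring each model by the correlation between its forecasts and the test targets. A hedged sketch assuming this file's createForeCast and modelTreeEval (both part of the Machine Learning in Action Ch. 9 code):

yHat2 = createForeCast(myTree2, testMat[:, 0], modelTreeEval)
print "Model tree:", corrcoef(yHat2, testMat[:, 1], rowvar=0)[0, 1]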

View File

@@ -0,0 +1,58 @@
#!/usr/bin/python
# coding:utf8
"""
Created on 2017-07-13
Updated on 2017-07-13
RegressionTree (tree regression)
@author: 小瑶
Machine Learning in Action (《机器学习实战》) updates: https://github.com/apachecn/MachineLearning
"""
print(__doc__)
# import the required models and libraries
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
# create a random dataset
# see https://docs.scipy.org/doc/numpy-1.6.0/reference/generated/numpy.random.mtrand.RandomState.html
rng = np.random.RandomState(1)
# rand() returns random values in the given shape; rng.rand(80, 1) yields a matrix of 80 rows and 1 column
# sort() orders the samples along axis 0
X = np.sort(5 * rng.rand(80, 1), axis=0)
# print 'X=', X
y = np.sin(X).ravel()
# print 'y=', y
y[::5] += 3 * (0.5 - rng.rand(16))
# print 'yyy=', y
# fit the regression model
# regr_1 = DecisionTreeRegressor(max_depth=2)
# keep max_depth=5 and add min_samples_leaf=6: the result improves further
# (both settings go into one constructor; a second assignment would overwrite the first model)
regr_2 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=6)
# regr_3 = DecisionTreeRegressor(max_depth=4)
# regr_1.fit(X, y)
regr_2.fit(X, y)
# regr_3.fit(X, y)
# predict
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
# y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
# y_3 = regr_3.predict(X_test)
# plot the results
plt.figure()
plt.scatter(X, y, c="darkorange", label="data")
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5, min_samples_leaf=6", linewidth=2)
# plt.plot(X_test, y_3, color="red", label="max_depth=3", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
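To put a number on the plot, the tree can be scored against the noiseless target it is trying to recover. A minimal sketch using the variables above; mean_squared_error is sklearn's standard regression metric:

from sklearn.metrics import mean_squared_error
# compare predictions on the test grid against the true, noise-free sine curve
mse = mean_squared_error(np.sin(X_test).ravel(), y_2)
print("MSE vs. noiseless sin(x): %.4f" % mse)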