Update the AdaBoost sklearn test case

jiangzhonglian
2017-08-15 18:31:34 +08:00
parent 357b922758
commit af6229cead
3 changed files with 159 additions and 66 deletions

@@ -1,6 +1,5 @@
 #!/usr/bin/python
 # coding:utf8
 """
 Created on 2017-07-10
 Updated on 2017-07-10
@@ -8,80 +7,55 @@ Updated on 2017-07-10
 "Machine Learning in Action" updates: https://github.com/apachecn/MachineLearning
 sklearn AdaBoost documentation (translated): http://cwiki.apachecn.org/pages/viewpage.action?pageId=10813457
 """
 import matplotlib.pyplot as plt
+# importing necessary libraries
 import numpy as np
-from sklearn.datasets import make_gaussian_quantiles
-from sklearn.ensemble import AdaBoostClassifier
-from sklearn.tree import DecisionTreeClassifier
+from sklearn import metrics
+from sklearn.ensemble import AdaBoostRegressor
+from sklearn.tree import DecisionTreeRegressor
 print(__doc__)
-# Construct dataset
-X1, y1 = make_gaussian_quantiles(cov=2.,
-                                 n_samples=200, n_features=2,
-                                 n_classes=2, random_state=1)
-X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
-                                 n_samples=300, n_features=2,
-                                 n_classes=2, random_state=1)
-X = np.concatenate((X1, X2))
-y = np.concatenate((y1, - y2 + 1))
-# Create and fit an AdaBoosted decision tree
-bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
-                         algorithm="SAMME",
-                         n_estimators=200)
+# Create the dataset
+rng = np.random.RandomState(1)
+X = np.linspace(0, 6, 100)[:, np.newaxis]
+y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
+# dataArr, labelArr = loadDataSet("input/7.AdaBoost/horseColicTraining2.txt")
-bdt.fit(X, y)
-plot_colors = "br"
-plot_step = 0.02
-class_names = "AB"
+# Fit regression model
+regr_1 = DecisionTreeRegressor(max_depth=4)
+regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=rng)
-plt.figure(figsize=(10, 5))
+regr_1.fit(X, y)
+regr_2.fit(X, y)
-# Plot the decision boundaries
-plt.subplot(121)
-x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
-y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
-                     np.arange(y_min, y_max, plot_step))
+# Predict
+y_1 = regr_1.predict(X)
+y_2 = regr_2.predict(X)
-Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
-Z = Z.reshape(xx.shape)
-cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
-plt.axis("tight")
-# Plot the training points
-for i, n, c in zip(range(2), class_names, plot_colors):
-    idx = np.where(y == i)
-    plt.scatter(X[idx, 0], X[idx, 1],
-                c=c, cmap=plt.cm.Paired,
-                label="Class %s" % n)
-plt.xlim(x_min, x_max)
-plt.ylim(y_min, y_max)
-plt.legend(loc='upper right')
-plt.xlabel('x')
-plt.ylabel('y')
-plt.title('Decision Boundary')
-# Plot the two-class decision scores
-twoclass_output = bdt.decision_function(X)
-plot_range = (twoclass_output.min(), twoclass_output.max())
-plt.subplot(122)
-for i, n, c in zip(range(2), class_names, plot_colors):
-    plt.hist(twoclass_output[y == i],
-             bins=10,
-             range=plot_range,
-             facecolor=c,
-             label='Class %s' % n,
-             alpha=.5)
-x1, x2, y1, y2 = plt.axis()
-plt.axis((x1, x2, y1, y2 * 1.2))
-plt.legend(loc='upper right')
-plt.ylabel('Samples')
-plt.xlabel('Score')
-plt.title('Decision Scores')
-plt.tight_layout()
-plt.subplots_adjust(wspace=0.35)
+# Plot the results
+plt.figure()
+plt.scatter(X, y, c="k", label="training samples")
+plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
+plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
+plt.xlabel("data")
+plt.ylabel("target")
+plt.title("Boosted Decision Tree Regression")
+plt.legend()
 plt.show()
+print 'y---', type(y[0]), len(y), y[:4]
+print 'y_1---', type(y_1[0]), len(y_1), y_1[:4]
+print 'y_2---', type(y_2[0]), len(y_2), y_2[:4]
+# For binary classification only
+y_true = np.array([0, 0, 1, 1])
+y_scores = np.array([0.1, 0.4, 0.35, 0.8])
+print 'y_scores---', type(y_scores[0]), len(y_scores), y_scores
+print metrics.roc_auc_score(y_true, y_scores)
+# print "-" * 100
+# print metrics.roc_auc_score(y[:1], y_2[:1])
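
The updated script compares the single depth-4 tree against the 300-round AdaBoost ensemble only visually, via the plot. For reference, a minimal sketch (my addition, not part of the commit) that makes the same comparison numerically with sklearn's mean_squared_error, rebuilding the same dataset and models as the script above:

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# Same noisy sine dataset as the updated script
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

# Same two models: one shallow tree vs. 300 boosted shallow trees
regr_1 = DecisionTreeRegressor(max_depth=4).fit(X, y)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300, random_state=rng).fit(X, y)

# Training-set MSE; the boosted ensemble should track the curve more closely
print("tree MSE:    %.4f" % mean_squared_error(y, regr_1.predict(X)))
print("boosted MSE: %.4f" % mean_squared_error(y, regr_2.predict(X)))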
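
The trailing roc_auc_score call works only because y_true there is binary; the commented-out call on the regression targets is left disabled, and would in fact fail, since ROC AUC is undefined for continuous ground-truth values. A short sketch (again my addition, not from the commit) of what the metric computes on the script's hand-made example:

import numpy as np
from sklearn import metrics

y_true = np.array([0, 0, 1, 1])             # binary labels, as required
y_scores = np.array([0.1, 0.4, 0.35, 0.8])  # classifier scores

# AUC = fraction of (positive, negative) pairs ranked correctly.
# Positives score 0.35 and 0.8; negatives score 0.1 and 0.4:
# (0.35 > 0.1) ok, (0.35 > 0.4) wrong, (0.8 > 0.1) ok, (0.8 > 0.4) ok -> 3/4
print(metrics.roc_auc_score(y_true, y_scores))  # 0.75

# Passing the continuous sine targets as y_true raises
# ValueError: continuous format is not supported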