Merge branch 'master' of https://github.com/apachecn/MachineLearning
@@ -19,36 +19,38 @@ import matplotlib.pyplot as plt
# Create a random dataset
# See https://docs.scipy.org/doc/numpy-1.6.0/reference/generated/numpy.random.mtrand.RandomState.html
rng = np.random.RandomState(1)
print 'lalalalala===', rng
# print 'lalalalala===', rng
# rand() returns random values in the given shape; rng.rand(80, 1) is a matrix with 80 rows and 1 column
# sort()
X = np.sort(5 * rng.rand(80, 1), axis=0)
print 'X=', X
# print 'X=', X
y = np.sin(X).ravel()
print 'y=', y
# print 'y=', y
y[::5] += 3 * (0.5 - rng.rand(16))
print 'yyy=', y
# print 'yyy=', y

# Fit the regression models
regr_1 = DecisionTreeRegressor(max_depth=2)
# regr_1 = DecisionTreeRegressor(max_depth=2)
# Keeping max_depth=5 unchanged, adding the min_samples_leaf=6 parameter improves the result further
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_3 = DecisionTreeRegressor(max_depth=3)
regr_1.fit(X, y)
regr_2 = DecisionTreeRegressor(min_samples_leaf=6)
# regr_3 = DecisionTreeRegressor(max_depth=4)
# regr_1.fit(X, y)
regr_2.fit(X, y)
regr_3.fit(X, y)
# regr_3.fit(X, y)

# Predict
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_1 = regr_1.predict(X_test)
# y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
y_3 = regr_3.predict(X_test)
# y_3 = regr_3.predict(X_test)

# Plot the results
plt.figure()
plt.scatter(X, y, c="darkorange", label="data")
plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
# plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.plot(X_test, y_3, color="red", label="max_depth=3", linewidth=2)
# plt.plot(X_test, y_3, color="red", label="max_depth=3", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
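The comment above claims that adding min_samples_leaf=6 to a depth-5 tree improves the result; a minimal sketch (the setup mirrors the data generation above but is not part of the commit) that quantifies the smoothing effect against the noise-free sin(x) curve:

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))               # noise on every 5th sample

X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_clean = np.sin(X_test).ravel()                 # noise-free reference

for params in ({'max_depth': 5}, {'max_depth': 5, 'min_samples_leaf': 6}):
    tree = DecisionTreeRegressor(random_state=0, **params).fit(X, y)
    mse = mean_squared_error(y_clean, tree.predict(X_test))
    print('%s -> MSE vs clean sin(x): %.4f' % (params, mse))

Larger leaves average out the injected noise, so the second configuration should score a lower error against the clean curve.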
@@ -8,73 +8,38 @@ NaiveBayes: Naive Bayes
@author: 小瑶
Update address for 《机器学习实战》 (Machine Learning in Action): https://github.com/apachecn/MachineLearning
"""


# GaussianNB (Gaussian Naive Bayes)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
print(__doc__)


# Create 40 separated points
np.random.seed(0)
# X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# Y = [0] * 20 + [1] * 20


def loadDataSet(fileName):
    """
    Parse the file line by line to get each line's class label and the full data matrix
    Args:
        fileName  file name
    Returns:
        dataMat   data matrix
        labelMat  class labels
    """
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat


X, Y = loadDataSet('input/6.SVM/testSet.txt')
X = np.mat(X)

print("X=", X)
print("Y=", Y)

# Fit an SVM model
clf = svm.SVC(kernel='linear')
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X, Y)
print clf.predict([[-0.8, -1]])
clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
print clf_pf.predict([[-0.8, -1]])
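partial_fit trains incrementally, which is why it needs the full set of classes up front (the np.unique(Y) argument). A minimal sketch (hypothetical batches, not from the commit) of feeding the same data in two chunks:

import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])

clf = GaussianNB()
classes = np.unique(Y)                    # must be declared on the first call
clf.partial_fit(X[:3], Y[:3], classes)    # first batch
clf.partial_fit(X[3:], Y[3:])             # later batches omit `classes`
print(clf.predict([[-0.8, -1]]))          # -> [1], same as the full fit above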
# Get the separating hyperplane
# NOTE: this block assumes `clf` is the svm.SVC fitted above; the GaussianNB example in between rebinds `clf`, and GaussianNB has no coef_
w = clf.coef_[0]
# Slope
a = -w[0] / w[1]
# Sample 50 evenly spaced points from -5 to 5; the default is num=50
# xx = np.linspace(-5, 5)  # , num=50)
xx = np.linspace(-2, 10)  # , num=50)
# Equation of the line in 2D
yy = a * xx - (clf.intercept_[0]) / w[1]
print("yy=", yy)
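The slope and intercept come from solving the hyperplane equation w0*x0 + w1*x1 + b = 0 for x1, which gives x1 = -(w0/w1)*x0 - b/w1. A quick check, assuming the svm.SVC fit is the one in scope (see the note above), so clf, w, xx and yy are defined:

import numpy as np
# every (x0, x1) pair on the plotted line satisfies w . x + b = 0
residual = w[0] * xx + w[1] * yy + clf.intercept_[0]
print(np.abs(residual).max())   # effectively 0, up to float rounding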
# MultinomialNB (multinomial Naive Bayes)
'''
import numpy as np
X = np.random.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
print clf.predict(X[2:3])
'''
# Plot the parallels to the separating hyperplane that pass through the support vectors
print("support_vectors_=", clf.support_vectors_)
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])


# Plot the line, the points, and the nearest vectors to the plane
plt.plot(xx, yy, 'k-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')

plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80, facecolors='none')
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

plt.axis('tight')
plt.show()
# BernoulliNB (Bernoulli Naive Bayes)
'''
import numpy as np
X = np.random.randint(2, size=(6, 100))
Y = np.array([1, 2, 3, 4, 4, 5])
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X, Y)
print clf.predict(X[2:3])
'''
@@ -19,7 +19,7 @@ def loadDataSet(file_name):
    fr = open(file_name)
    for line in fr.readlines():
        lineArr = line.strip().split()
        # Set X0 to 1.0
        # For convenience, we set X0 to 1.0, i.e. prepend a 1.0 to every row as X0, so the intercept is absorbed into the weight vector
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat,labelMat
@@ -57,6 +57,7 @@ def gradAscent(dataMatIn, classLabels):
        # print 'weights====', weights
        # n*3 * 3*1 = n*1
        h = sigmoid(dataMatrix*weights)  # matrix multiplication
        # print 'hhhhhhh====', h
        # labelMat holds the actual values
        error = (labelMat - h)  # vector subtraction
        # 0.001 * (3*m)*(m*1) gives the error along each column, yielding the offsets for the coefficients of x1, x2, ..., xn
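This hunk shows the heart of batch gradient ascent: predictions h = sigmoid(X * w), the residual error = y - h, and (just outside the hunk) a weight update of the form weights += alpha * X.T * error. A self-contained sketch of that loop (NumPy; alpha and the cycle count are illustrative, not taken from the commit):

import numpy as np

def grad_ascent_sketch(data, labels, alpha=0.001, max_cycles=500):
    """Batch gradient ascent for logistic regression."""
    X = np.mat(data)                      # m x n feature matrix
    y = np.mat(labels).transpose()        # m x 1 actual values
    m, n = np.shape(X)
    weights = np.ones((n, 1))             # n x 1 coefficient vector
    for _ in range(max_cycles):
        h = 1.0 / (1 + np.exp(-X * weights))     # m x 1, sigmoid(X * w)
        error = y - h                            # m x 1 residuals
        weights = weights + alpha * X.T * error  # climb the log-likelihood gradient
    return weights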
@@ -110,6 +111,17 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150):

# Visualization
def plotBestFit(dataArr, labelMat, weights):
    '''
    Desc:
        Plot the data we obtained
    Args:
        dataArr: the features of the sample data
        labelMat: the class labels of the sample data, i.e. the target variable
        weights: the regression coefficients
    Returns:
        None
    '''

    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
@@ -146,8 +158,8 @@ def main():
    # Because the array is not copied n times, multiplication on an array is plain (element-wise) multiplication
    dataArr = array(dataMat)
    # print dataArr
    # weights = gradAscent(dataArr, labelMat)
    weights = stocGradAscent0(dataArr, labelMat)
    weights = gradAscent(dataArr, labelMat)
    # weights = stocGradAscent0(dataArr, labelMat)
    # weights = stocGradAscent1(dataArr, labelMat)
    # print '*'*30, weights
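main() swaps between the batch version (gradAscent) and the stochastic variants. For reference, a sketch of what a stochastic update typically looks like in this style of code: the weights move after every single sample, avoiding the repeated full-matrix products of the batch loop (illustrative, assuming the bare numpy names and sigmoid() defined in this file):

def stoc_grad_ascent_sketch(dataMatrix, classLabels, alpha=0.01):
    """Stochastic gradient ascent: update weights after each sample."""
    m, n = shape(dataMatrix)
    weights = ones(n)
    for i in range(m):
        # h and error are scalars here, not vectors
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights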
281
src/python/5.Logistic/sklearn_logisticRegression_demo.py
Normal file
@@ -0,0 +1,281 @@
#!/usr/bin/python
# coding: utf8

'''
Created on Oct 27, 2010
Update on 2017-05-18
Logistic Regression Working Module
@author: 小瑶
Update address for 《机器学习实战》 (Machine Learning in Action): https://github.com/apachecn/MachineLearning
scikit-learn example: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
'''

# L1 Penalty and Sparsity in Logistic Regression
'''
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

digits = datasets.load_digits()

X, y = digits.data, digits.target
X = StandardScaler().fit_transform(X)

# Classify the digits as small (0-4) vs. large (5-9)
y = (y > 4).astype(np.int)


# Set the regularization parameter
for i, C in enumerate((100, 1, 0.01)):
    # Turn down the tolerance for short training time
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01)
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01)
    clf_l1_LR.fit(X, y)
    clf_l2_LR.fit(X, y)

    coef_l1_LR = clf_l1_LR.coef_.ravel()
    coef_l2_LR = clf_l2_LR.coef_.ravel()

    # coef_l1_LR contains zeros due to the
    # L1 sparsity inducing norm

    sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100
    sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100

    print("C=%.2f" % C)
    print("Sparsity with L1 penalty: %.2f%%" % sparsity_l1_LR)
    print("score with L1 penalty: %.4f" % clf_l1_LR.score(X, y))
    print("Sparsity with L2 penalty: %.2f%%" % sparsity_l2_LR)
    print("score with L2 penalty: %.4f" % clf_l2_LR.score(X, y))

    l1_plot = plt.subplot(3, 2, 2 * i + 1)
    l2_plot = plt.subplot(3, 2, 2 * (i + 1))
    if i == 0:
        l1_plot.set_title("L1 penalty")
        l2_plot.set_title("L2 penalty")

    l1_plot.imshow(np.abs(coef_l1_LR.reshape(8, 8)), interpolation='nearest',
                   cmap='binary', vmax=1, vmin=0)
    l2_plot.imshow(np.abs(coef_l2_LR.reshape(8, 8)), interpolation='nearest',
                   cmap='binary', vmax=1, vmin=0)
    plt.text(-8, 3, "C = %.2f" % C)

    l1_plot.set_xticks(())
    l1_plot.set_yticks(())
    l2_plot.set_xticks(())
    l2_plot.set_yticks(())

plt.show()
'''

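In scikit-learn, C is the inverse of the regularization strength, so the smallest C above (0.01) regularizes hardest and should zero out the most L1 coefficients. The sparsity figure is simply the percentage of exactly-zero entries:

import numpy as np
coef = np.array([0.0, 1.2, 0.0, -0.3])     # toy coefficient vector
print(np.mean(coef == 0) * 100)            # -> 50.0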
# Regularization path with L1-penalized logistic regression
'''
print(__doc__)

from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import datasets
from sklearn.svm import l1_min_c

iris = datasets.load_iris()
X = iris.data
y = iris.target

X = X[y != 2]
y = y[y != 2]

X -= np.mean(X, 0)

# l1_min_c gives the smallest C with at least one nonzero coefficient; sweep three decades upward from there
cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)


print("Computing regularization path ...")
start = datetime.now()
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    coefs_.append(clf.coef_.ravel().copy())
print("This took ", datetime.now() - start)

coefs_ = np.array(coefs_)
plt.plot(np.log10(cs), coefs_)
ymin, ymax = plt.ylim()
plt.xlabel('log(C)')
plt.ylabel('Coefficients')
plt.title('Logistic Regression Path')
plt.axis('tight')
plt.show()
'''

# Plot multinomial and One-vs-Rest Logistic Regression
'''
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

# Make a 3-class dataset for classification
centers = [[-5, 0], [0, 1.5], [5, -1]]
X, y = make_blobs(n_samples=1000, centers=centers, random_state=40)
transformation = [[0.4, 0.2], [-0.4, 1.2]]
X = np.dot(X, transformation)

for multi_class in ('multinomial', 'ovr'):
    clf = LogisticRegression(solver='sag', max_iter=100, random_state=42,
                             multi_class=multi_class).fit(X, y)

    # Print the training score
    print("training score : %.3f (%s)" % (clf.score(X, y), multi_class))

    # Create a mesh to plot in
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Plot the decision boundary. For that, we assign a color to each point in the mesh [x_min, x_max]x[y_min, y_max].
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.title("Decision surface of LogisticRegression (%s)" % multi_class)
    plt.axis('tight')

    # Plot the training points as well
    colors = "bry"
    for i, color in zip(clf.classes_, colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired)

    # Plot the three one-vs-rest classifiers
    xmin, xmax = plt.xlim()
    ymin, ymax = plt.ylim()
    coef = clf.coef_
    intercept = clf.intercept_

    def plot_hyperplane(c, color):
        def line(x0):
            return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
        plt.plot([xmin, xmax], [line(xmin), line(xmax)],
                 ls="--", color=color)

    for i, color in zip(clf.classes_, colors):
        plot_hyperplane(i, color)

    plt.show()
'''

# Logistic Regression 3-class Classifier

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets

# Import some data to play with
iris = datasets.load_iris()
# We only take the first two features of the samples
X = iris.data[:, :2]
Y = iris.target

h = .02  # step size in the mesh

logreg = linear_model.LogisticRegression(C=1e5)

# Create an instance of the Logistic Regression classifier and fit the data
logreg.fit(X, Y)

# Plot the decision boundary. For that, we assign a color to each point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot the training points in the color plot as well
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()
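The mesh, predict, reshape pattern above appears in both demos and generalizes to any classifier fitted on two features. A small reusable sketch (the names are illustrative, not from the commit):

import numpy as np

def decision_surface(clf, X, h=0.02):
    """Return xx, yy, Z for plt.pcolormesh/contourf from a fitted 2-feature clf."""
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    return xx, yy, Z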
# Logistic function
# This resembles the Sigmoid function from our earlier logistic regression discussion: a smoothed step function

'''
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model

# This is our test set; it is just a straight line with some Gaussian noise.
xmin, xmax = -5, 5
n_samples = 100
np.random.seed(0)
X = np.random.normal(size=n_samples)
y = (X > 0).astype(np.float)
X[X > 0] *= 4
X += .3 * np.random.normal(size=n_samples)

X = X[:, np.newaxis]
# Run the classifier
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(X, y)

# And plot the result
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.scatter(X.ravel(), y, color='black', zorder=20)
X_test = np.linspace(-5, 10, 300)


def model(x):
    return 1 / (1 + np.exp(-x))
loss = model(X_test * clf.coef_ + clf.intercept_).ravel()
plt.plot(X_test, loss, color='red', linewidth=3)

ols = linear_model.LinearRegression()
ols.fit(X, y)
plt.plot(X_test, ols.coef_ * X_test + ols.intercept_, linewidth=1)
plt.axhline(.5, color='.5')

plt.ylabel('y')
plt.xlabel('X')
plt.xticks(range(-5, 10))
plt.yticks([0, 0.5, 1])
plt.ylim(-.25, 1.25)
plt.xlim(-4, 10)
plt.legend(('Logistic Regression Model', 'Linear Regression Model'),
           loc="lower right", fontsize='small')
plt.show()
'''

@@ -4,7 +4,7 @@
'''
Created on Jan 8, 2011
Update on 2017-05-18
@author: Peter Harrington/ApacheCN-小瑶
@author: Peter Harrington/小瑶
Update address for 《机器学习实战》 (Machine Learning in Action): https://github.com/apachecn/MachineLearning
'''
@@ -12,89 +12,214 @@ Update on 2017-05-18
from numpy import *
import matplotlib.pylab as plt

def loadDataSet(fileName):  # parse floats from a tab-delimited file
def loadDataSet(fileName):
    """ Load the data
    Parse floats from a tab-delimited file
    Returns:
        dataMat   the dataset corresponding to the features
        labelMat  the classification label for each feature vector, i.e. the target variable
        dataMat : the dataset corresponding to the features
        labelMat : the classification label for each feature vector, i.e. the target variable

    """
    numFeat = len(open(fileName).readline().split('\t')) - 1  # read one line of the input data; the last field is the actual value
    dataMat = []; labelMat = []
    # Get the total number of sample features, not counting the final target variable
    numFeat = len(open(fileName).readline().split('\t')) - 1
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():  # read each line
    for line in fr.readlines():
        # Read each line
        lineArr = []
        curLine = line.strip().split('\t')  # strip the whitespace around the tab-separated fields of the line
        for i in range(numFeat):  # from 0 to 2, not including 2
            lineArr.append(float(curLine[i]))  # add the value to the lineArr list; each line of test data forms a row vector
        dataMat.append(lineArr)  # store the input part of the test data in the dataMat matrix
        labelMat.append(float(curLine[-1]))  # store the last value of each line, the actual target variable, in the labelMat matrix
        # Strip the whitespace around the tab-separated fields of the line
        curLine = line.strip().split('\t')
        # i runs from 0 to 2, not including 2
        for i in range(numFeat):
            # Add the value to the lineArr list; each line of test data forms a row vector
            lineArr.append(float(curLine[i]))
        # Store the input part of the test data in the dataMat list
        dataMat.append(lineArr)
        # Store the last value of each line, i.e. the class (target variable), in the labelMat list
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat
def standRegres(xArr,yArr):  # linear regression
    xMat = mat(xArr); yMat = mat(yArr).T  # mat() converts xArr and yArr to matrices
    xTx = xMat.T*xMat  # matrix multiplication requires the left matrix's column count to equal the right matrix's row count
    if linalg.det(xTx) == 0.0:  # xTx must be inverted, so first check that it is invertible, i.e. its determinant is nonzero
        print ("This matrix is singular, cannot do inverse")
def standRegres(xArr,yArr):
    '''
    Description:
        Linear regression
    Args:
        xArr : the input sample data, holding each sample's features
        yArr : the class label for each input sample, i.e. each sample's target variable
    Returns:
        ws: the regression coefficients
    '''

    # mat() converts xArr and yArr to matrices; mat().T transposes the matrix
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # Matrix multiplication requires the left matrix's column count to equal the right matrix's row count
    xTx = xMat.T*xMat
    # xTx must be inverted, so first check that it is invertible, i.e. its determinant is nonzero
    # linalg.det() computes the determinant; if it is 0 the matrix is not invertible and the rest of the computation cannot proceed
    if linalg.det(xTx) == 0.0:
        print "This matrix is singular, cannot do inverse"
        return
    # Least squares
    # http://www.apache.wiki/pages/viewpage.action?pageId=5505133
    ws = xTx.I * (xMat.T*yMat)  # the book's formula for the optimal w
    # The book's formula for the optimal w
    ws = xTx.I * (xMat.T*yMat)
    return ws
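standRegres implements the normal equation ws = (X^T X)^(-1) X^T y. A quick sketch (toy data, plain NumPy, not part of the commit) confirming the closed form agrees with the library's least-squares solver:

import numpy as np

X = np.array([[1.0, 0.1], [1.0, 0.5], [1.0, 0.9], [1.0, 1.3]])  # first column is X0 = 1.0
y = np.array([3.0, 3.2, 3.9, 4.1])

ws_normal = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(y))   # the book's closed form
ws_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)        # library least squares
print(np.allclose(ws_normal, ws_lstsq))                 # -> True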
def lwlr(testPoint,xArr,yArr,k=1.0):  # locally weighted linear regression
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]  # number of rows in xMat
    weights = mat(eye((m)))  # eye() returns a 2-D array with ones on the diagonal and zeros elsewhere; create the weight matrix
    for j in range(m):  # the next two lines fill in the weight matrix
        diffMat = testPoint - xMat[j,:]  # iterate over the dataset, computing each sample point's weight
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))  # k controls the decay rate
# Locally weighted linear regression
def lwlr(testPoint,xArr,yArr,k=1.0):
    '''
    Description:
        Locally weighted linear regression: assign a weight to each point near the point to be predicted, then run ordinary regression on that subset based on minimum mean squared error.
    Args:
        testPoint: the sample point
        xArr: the feature data of the samples, i.e. feature
        yArr: the class label of each sample, i.e. the target variable
        k: a parameter of the kernel that assigns the weight matrix, related to how fast the weights decay
    Returns:
        testPoint * ws: the prediction, the data point multiplied by the weighted coefficients
    Notes:
        This uses the weight formula w^(i) = exp(-|x^(i) - x|^2 / (2 k^2)).
        Intuition: x is the prediction point and x^(i) a sample point; the closer a sample point lies to the prediction point, the larger its weight, and the farther away, the smaller its weight.
        As for choosing prediction points, this code uses the sample points themselves. k is the bandwidth parameter: it controls how wide or narrow the bell-shaped weighting function is, much like the standard deviation of a Gaussian.
        Algorithm outline: take the i-th of the m sample points as the prediction point, loop over all m sample points (including the i-th), compute each one's distance to the prediction point,
        and from that the weight each sample contributes; w is thus a vector of m elements (written as a diagonal matrix).
    '''
    # mat() converts an array to a matrix; mat().T converts and then transposes
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # Number of rows in xMat
    m = shape(xMat)[0]
    # eye() returns a 2-D array with ones on the diagonal and zeros elsewhere; create the weight matrix `weights`, initializing a weight for every sample point
    weights = mat(eye((m)))
    for j in range(m):
        # testPoint has the form of a row vector
        # Compute the distance between testPoint and the input sample point, then compute below the weight each sample contributes
        diffMat = testPoint - xMat[j,:]
        # k controls the decay rate
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
    # Compute xTx by matrix multiplication, where `weights` is the weight matrix of the sample points
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))  # compute an estimate of the regression coefficients
    # Compute an estimate of the regression coefficients
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
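The kernel w_j = exp(-|x_j - x|^2 / (2 k^2)) falls off quickly with distance, and k sets how quickly. A tiny numeric sketch (illustrative distances, not from the commit):

import numpy as np

dists = np.array([0.0, 0.5, 1.0, 2.0])   # |x_j - x| for four sample points
for k in (0.1, 0.5, 1.0):
    w = np.exp(-dists**2 / (2.0 * k**2))
    print('k=%.1f ->' % k, np.round(w, 4))
# small k: only the nearest point keeps any weight; large k: weights flatten toward plain least squares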
def lwlrTest(testArr,xArr,yArr,k=1.0):  # run lwlr on every data point
def lwlrTest(testArr,xArr,yArr,k=1.0):
    '''
    Description:
        Test locally weighted linear regression by calling the lwlr() function for every point in the dataset
    Args:
        testArr: all sample points used for testing
        xArr: the feature data of the samples, i.e. feature
        yArr: the class label of each sample, i.e. the target variable
        k: controls how fast the kernel decays
    Returns:
        yHat: the estimates for the prediction points
    '''
    # Total number of sample points
    m = shape(testArr)[0]
    # Build a 1 x m matrix of all zeros
    yHat = zeros(m)
    # Loop over all data points and apply lwlr to each one
    for i in range(m):
        yHat[i] = lwlr(testArr[i],xArr,yArr,k)
    # Return the estimates
    return yHat
def lwlrTestPlot(xArr,yArr,k=1.0):  # sort X first; otherwise identical to lwlrTest, which makes plotting easier
    yHat = zeros(shape(yArr))
def lwlrTestPlot(xArr,yArr,k=1.0):
    '''
    Description:
        Sort X first; otherwise identical to lwlrTest. This makes plotting easier
    Args:
        xArr: the feature data of the samples, i.e. feature
        yArr: the class label of each sample, i.e. the target variable (the actual values)
        k: the parameter controlling how fast the kernel decays; here a constant value of 1
    Return:
        yHat: the estimates for the sample points
        xCopy: a copy of xArr
    '''
    # Create a zero vector with as many entries as there are target values
    yHat = zeros(shape(yArr))
    # Convert xArr to matrix form
    xCopy = mat(xArr)
    # Sort
    xCopy.sort(0)
    # Loop, running locally weighted linear regression for every sample point to get the final target-variable estimates
    for i in range(shape(xArr)[0]):
        yHat[i] = lwlr(xCopy[i],xArr,yArr,k)
    return yHat,xCopy
def rssError(yArr,yHatArr):  # yArr and yHatArr both need to be arrays
def rssError(yArr,yHatArr):
    '''
    Desc:
        Compute the size of the prediction error
    Args:
        yArr: the actual target variable
        yHatArr: the predicted estimates
    Returns:
        The sum of the squared differences between the actual and estimated values
    '''
    return ((yArr-yHatArr)**2).sum()
def ridgeRegres(xMat,yMat,lam=0.2):  # ridge regression
def ridgeRegres(xMat,yMat,lam=0.2):
    '''
    Desc:
        This function solves ridge regression for a given lambda.
        If there are more features than sample points, the plain linear regression and locally weighted regression above can no longer be used, because computing (xTx)^(-1) fails.
        With more features than sample points (n > m), the input data matrix x is not full rank, and non-full-rank matrices cause problems when inverted.
        To solve this, we introduce ridge regression, the first shrinkage method we cover.
    Args:
        xMat: the feature data of the samples, i.e. feature
        yMat: the class label of each sample, i.e. the target variable (the actual values)
        lam: the λ value introduced to make the matrix non-singular
    Returns:
        The regression coefficients computed by the ridge regression formula
    '''

    xTx = xMat.T*xMat
    denom = xTx + eye(shape(xMat)[1])*lam  # compute the regression coefficients with the book's formula
    if linalg.det(denom) == 0.0:  # check whether the determinant is zero, i.e. whether the matrix is invertible
    # Ridge regression adds λI to the matrix xTx so that the matrix becomes non-singular, making xTx + λI invertible
    denom = xTx + eye(shape(xMat)[1])*lam
    # Check whether the determinant is zero, i.e. whether the matrix is invertible: zero means not invertible, nonzero means invertible
    if linalg.det(denom) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T*yMat)
    return ws
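When features outnumber samples, xTx is rank-deficient and its determinant is zero; adding λI is exactly what restores invertibility. A minimal sketch (toy shapes, plain NumPy, not from the commit):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(3, 5)                # 3 samples, 5 features: X.T.dot(X) is 5x5 but rank <= 3
xTx = X.T.dot(X)
print(np.linalg.det(xTx))                      # effectively 0: singular
lam = 0.2
print(np.linalg.det(xTx + lam * np.eye(5)))    # clearly nonzero: the ridge term fixes it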
def ridgeTest(xArr,yArr):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)  # compute the mean of Y
    yMat = yMat - yMean  # subtract the mean from all of Y
    # Standardize x
    xMeans = mean(xMat,0)  # compute the mean of X
    xVar = var(xMat,0)  # then compute the variance of X
    '''
    Desc:
        The ridgeTest() function tests the results over a set of λ values
    Args:
        xArr: the feature data of the samples, i.e. feature
        yArr: the class labels of the samples, i.e. the actual data
    Returns:
        wMat: all the regression coefficients, collected into one matrix and returned
    '''

    xMat = mat(xArr)
    yMat=mat(yArr).T
    # Compute the mean of Y
    yMean = mean(yMat,0)
    # Subtract the mean from all of Y
    yMat = yMat - yMean
    # Standardize x: compute the mean of xMat
    xMeans = mean(xMat,0)
    # Then compute the variance of X
    xVar = var(xMat,0)
    # Subtract each feature's mean and divide by its variance
    xMat = (xMat - xMeans)/xVar
    # Call the ridgeRegres() function under 30 different lambdas.
    numTestPts = 30
    wMat = zeros((numTestPts,shape(xMat)[1]))  # create a 30 x m matrix of all zeros
    # Create a 30 x m matrix of all zeros
    wMat = zeros((numTestPts,shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat,yMat,exp(i-10))  # exp returns e^x
        # exp() returns e^x
        ws = ridgeRegres(xMat,yMat,exp(i-10))
        wMat[i,:]=ws.T
    return wMat
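The 30 lambdas swept above are exp(i - 10) for i in 0..29, i.e. a log-spaced grid from roughly 4.5e-05 up to roughly 1.8e+08:

import numpy as np
lams = np.exp(np.arange(30) - 10)
print(lams[[0, 10, 29]])   # -> approx [4.54e-05, 1.0, 1.78e+08]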