mirror of
https://github.com/apachecn/ailearning.git
synced 2026-05-08 23:12:06 +08:00
1
.gitignore
vendored
1
.gitignore
vendored
@@ -88,4 +88,3 @@ ENV/
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
.vscode
|
||||
.idea
|
||||
@@ -1,11 +1,36 @@
|
||||
|
||||
# 1) 逻辑回归基础
|
||||
# 5) 逻辑回归基础
|
||||
|
||||
* 逻辑回归(Logistic Regression)
|
||||
* 1.1 分类问题
|
||||
* 1.2 假说表示
|
||||
* 1.3 判定边界
|
||||
* 1.4 代价函数
|
||||
* 1.5 简化的成本函数和梯度下降
|
||||
* 1.6 高级优化
|
||||
* 1.7 多类分类:一个对所有
|
||||
* 5.1 分类问题
|
||||
* 在分类问题中,尝试预测的是结果是否属于某一个类(例如正确或错误)。
|
||||
* 分类问题的例子有:
|
||||
* 判断一封电子邮件是否是垃圾邮件;
|
||||
* 判断一次金融交易是否是欺诈等等。
|
||||
* 从二元的分类问题开始讨论:
|
||||
将因变量(dependent variable)可能属于的两个类分别称为负向类(negative class)和正向类(positive class),则因变量
|
||||
y属于{0,1}
|
||||
注:其中 0 表示负向类,1 表示正向类。
|
||||
* 5.2 假说表示
|
||||
|
||||
* 5.3 判定边界
|
||||
* 在逻辑回归中,我们预测:
|
||||
当 hθ 大于等于 0.5 时,预测 y=1
|
||||
当 hθ 小于 0.5 时,预测 y=0
|
||||
* 根据上面绘制出的 S 形函数图像,我们知道当
|
||||
z=0时 ,g(z)=0.5
|
||||
z>0时 ,g(z)>0.5
|
||||
z<0时 ,g(z)<0.5
|
||||
又 z=θ^T·x(即 θ 的转置与特征向量 x 的乘积),即:
|
||||
z大于等于0时,预测:y=1
|
||||
z小于0时,预测:y=0
|
||||
* 现在假设我们有一个模型:Hθ(x)=g(θ0+θ1*x1+θ2*x2)
|
||||
并且参数θ是向量[-3 1 1]。则当-3+x1+x2大于等于0,即x1+x2大于等于3时,模型将预测y=1。
|
||||
我们可以绘制直线x1+x2=3,这条线便是我们模型的分界线,将预测为1的区域和预测为0的区域分隔开。
|
||||
* 假使我们的数据呈现这样的分布情况,怎样的模型才能适合呢?
|
||||
因为需要用曲线才能分隔 y=0 的区域和 y=1 的区域,我们需要二次方特征: 假设模型为 Hθ(x)=g(θ0+θ1*x1+θ2*x2+θ3*(x1^2)+θ4*(x2^2)),且其参数θ
|
||||
是[-1 0 0 1 1],则我们得到的判定边界恰好是圆心在原点且半径为 1 的圆形。可以用非常复杂的模型来适应非常复杂形状的判定边界。
|
||||
* 5.4 代价函数
|
||||
* 5.5 简化的成本函数和梯度下降
|
||||
* 5.6 高级优化
|
||||
* 5.7 多类分类:一个对所有
|
||||
|
||||
@@ -6,6 +6,12 @@ from sklearn import tree
|
||||
from sklearn.metrics import precision_recall_curve
|
||||
from sklearn.metrics import classification_report
|
||||
from sklearn.cross_validation import train_test_split
|
||||
"""
|
||||
需要安装依赖模块:
|
||||
pip install scikit_learn-0.18-cp27-cp27m-win_amd64.whl
|
||||
非常完整的网址:
|
||||
http://www.lfd.uci.edu/~gohlke/pythonlibs/#numpy
|
||||
"""
|
||||
|
||||
|
||||
def createDataSet():
|
||||
|
||||
103
src/python/05.Logistic/core/logRegression01.py
Normal file
103
src/python/05.Logistic/core/logRegression01.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
from numpy import *
|
||||
import matplotlib.pyplot as plt
|
||||
import time
|
||||
'''
|
||||
1、需要安装模块:pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
|
||||
由于直接安装会出现问题,所以建议下载whl包进行安装,下载网址:
|
||||
https://pypi.python.org/pypi/matplotlib/1.5.0
|
||||
|
||||
2、可以看见画出的图像
|
||||
'''
|
||||
|
||||
"""
|
||||
@version:
|
||||
@author: yangjf
|
||||
@license: ApacheCN
|
||||
@contact: highfei2011@126.com
|
||||
@site: https://github.com/apachecn/MachineLearning
|
||||
@software: PyCharm
|
||||
@file: logRegression01.py
|
||||
@time: 2017/3/3 22:03
|
||||
@test result: ok
|
||||
"""
|
||||
|
||||
# Sigmoid (logistic) function.
def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^(-inX)).

    ``inX`` may be a scalar or a numpy array/matrix; ``exp`` is applied
    element-wise, so the result has the same shape as the input.
    """
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
|
||||
|
||||
def trainLogRegres(train_x, train_y, opts):
    """Fit logistic-regression weights with one of three gradient schemes.

    Args:
        train_x: numpy matrix, one row per sample and one column per
            feature (callers in this file prepend a constant 1.0 column).
        train_y: numpy column matrix of labels, one row per sample.
        opts: dict with the keys
            'alpha'        - step size (ignored by 'smoothStocGradDescent',
                             which computes its own decaying step),
            'maxIter'      - number of passes over the data,
            'optimizeType' - 'gradDescent', 'stocGradDescent' or
                             'smoothStocGradDescent'.

    Returns:
        The learned weights as a (numFeatures x 1) column.

    Raises:
        NameError: if 'optimizeType' is not one of the supported values
            (checked inside the loop, so only when maxIter > 0).
    """
    # Measure how long training takes.
    startTime = time.time()

    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    optimizeType = opts['optimizeType']
    weights = ones((numFeatures, 1))

    for k in range(maxIter):
        if optimizeType == 'gradDescent':
            # Batch update: every sample contributes to each step.
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif optimizeType == 'stocGradDescent':
            # Stochastic update: one step per sample, in file order.
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif optimizeType == 'smoothStocGradDescent':
            # Visit the samples in random order, each exactly once per pass,
            # to damp the periodic fluctuation of plain stochastic descent.
            # list(...) is required so entries can be deleted on Python 3.
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                # Step size decays with the pass and the position inside
                # it; the constant keeps it from reaching zero.
                alpha = 4.0 / (1.0 + k + i) + 0.01
                pick = int(random.uniform(0, len(dataIndex)))
                # Map the position in the shrinking pool back to the real
                # row number; using `pick` directly as the row (as before)
                # made the deletion below meaningless.
                sampleIndex = dataIndex[pick]
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                # Remove the sample so it is not drawn again in this pass.
                del dataIndex[pick]
        else:
            raise NameError('Not support optimize method type!')

    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
    return weights
|
||||
|
||||
|
||||
# Evaluate a trained logistic regression model on a given test set.
def testLogRegres(weights, test_x, test_y):
    """Return the fraction of test samples classified correctly.

    Args:
        weights: weight column as returned by trainLogRegres.
        test_x: numpy matrix, one row per sample.
        test_y: numpy column matrix of labels; any non-zero label counts
            as the positive class (it is compared via ``bool``).

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the test set has no rows
        (previously this case raised ZeroDivisionError).
    """
    numSamples, numFeatures = shape(test_x)
    if numSamples == 0:
        return 0.0

    matchCount = 0
    for i in range(numSamples):
        # A sample is predicted positive when its probability exceeds 0.5.
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    return float(matchCount) / numSamples
|
||||
|
||||
|
||||
# Plot a trained logistic regression model; only 2-D feature data is supported.
def showLogRegres(weights, train_x, train_y):
    """Scatter-plot the samples and draw the learned decision boundary.

    Args:
        weights: three coefficients (constant term, x1, x2) as a numpy
            matrix or array of any shape that flattens to length 3.
        train_x: numpy matrix with exactly three columns; columns 1 and 2
            are plotted as the X1 and X2 axes.
        train_y: numpy column matrix of labels; 0 is drawn red, 1 blue,
            any other label is not drawn.

    Returns:
        1 when train_x does not have three columns (nothing is plotted);
        otherwise None after the figure has been shown.
    """
    numSamples, numFeatures = shape(train_x)
    if numFeatures != 3:
        print("抱歉! 我不能绘制,因为你的数据的维度不是2!")
        return 1

    # Draw every sample, coloured by its class.
    for i in range(numSamples):
        label = int(train_y[i, 0])
        if label == 0:
            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
        elif label == 1:
            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')

    # Flatten to plain floats so the arithmetic below works for both
    # matrix and ndarray weights and never calls float() on an array.
    w0, w1, w2 = asarray(weights, dtype=float).ravel()
    # Use the array methods rather than min()/max(): whether those names
    # are the builtins or numpy's depends on the numpy version behind
    # `from numpy import *`.
    min_x = float(train_x[:, 1].min())
    max_x = float(train_x[:, 1].max())

    if w2 != 0:
        # Boundary is w0 + w1*x1 + w2*x2 = 0, solved for x2 at both ends.
        y_min_x = (-w0 - w1 * min_x) / w2
        y_max_x = (-w0 - w1 * max_x) / w2
        plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    elif w1 != 0:
        # x2 has no influence: the boundary is the vertical line
        # x1 = -w0 / w1, drawn across the observed x2 range.
        boundary_x = -w0 / w1
        min_y = float(train_x[:, 2].min())
        max_y = float(train_x[:, 2].max())
        plt.plot([boundary_x, boundary_x], [min_y, max_y], '-g')
    # If both w1 and w2 are zero there is no boundary line to draw.

    plt.xlabel('X1')
    plt.ylabel('X2')
    # Display the figure.
    plt.show()
|
||||
54
src/python/05.Logistic/core/test_logRegression.py
Normal file
54
src/python/05.Logistic/core/test_logRegression.py
Normal file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
import os
|
||||
import sys
|
||||
sys.path.append("C:\Python27")
|
||||
from numpy import *
|
||||
|
||||
from logRegression01 import *
|
||||
"""
|
||||
@version:
|
||||
@author: yangjf
|
||||
@license: ApacheCN
|
||||
@contact: highfei2011@126.com
|
||||
@site: https://github.com/apachecn/MachineLearning
|
||||
@software: PyCharm
|
||||
@file: test_logRegression.py
|
||||
@time: 2017/3/3 22:09
|
||||
@test result: ok
|
||||
"""
|
||||
|
||||
def loadData():
    """Load testData/Logistic_testdata.txt as feature and label matrices.

    The data file is located by walking upwards from the current working
    directory until a directory containing
    ``testData/Logistic_testdata.txt`` is found, so the script can be
    started from any folder inside the project on any operating system.

    Each non-blank line of the file holds three whitespace-separated
    numbers: two features and a label.

    Returns:
        (train_x, train_y): train_x is a matrix with rows
        ``[1.0, feature1, feature2]`` (the leading 1.0 is the constant
        term); train_y is a column matrix of the labels.

    Raises:
        IOError: if the data file cannot be found in the working
            directory or any of its parents.
    """
    relative_path = os.path.join("testData", "Logistic_testdata.txt")

    # Search the working directory and then each parent in turn.
    project_dir = os.getcwd()
    while not os.path.isfile(os.path.join(project_dir, relative_path)):
        parent_dir = os.path.dirname(project_dir)
        if parent_dir == project_dir:
            # Reached the filesystem root without finding the file.
            raise IOError("Cannot find %s in %s or any parent directory"
                          % (relative_path, os.getcwd()))
        project_dir = parent_dir
    print(project_dir)

    train_x = []
    train_y = []
    # The with-block guarantees the file is closed even if parsing fails.
    with open(os.path.join(project_dir, relative_path)) as fileIn:
        for line in fileIn:
            lineArr = line.strip().split()
            if not lineArr:
                # Tolerate blank lines such as a trailing newline.
                continue
            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
            train_y.append(float(lineArr[2]))
    # matrix() is used instead of the mat alias, which NumPy 2.0 removed.
    return matrix(train_x), matrix(train_y).transpose()
|
||||
|
||||
|
||||
## Step 1: load the data
print("step 1: load data...")
train_x, train_y = loadData()
# NOTE: the model is evaluated on the same samples it is trained on, so the
# accuracy printed below is a training accuracy, not a held-out estimate.
test_x = train_x
test_y = train_y

## Step 2: train the model
print("step 2: training...")
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
optimalWeights = trainLogRegres(train_x, train_y, opts)

## Step 3: test
print("step 3: testing...")
accuracy = testLogRegres(optimalWeights, test_x, test_y)

## Step 4: show the result
print("step 4: show the result...")
print('The classify accuracy is: %.3f%%' % (accuracy * 100))
showLogRegres(optimalWeights, train_x, train_y)
|
||||
@@ -117,7 +117,7 @@ def plotBestFit(dataArr, labelMat, weights):
|
||||
def main():
|
||||
project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
|
||||
# 1.收集并准备数据
|
||||
dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
|
||||
dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
|
||||
|
||||
# print dataMat, '---\n', labelMat
|
||||
# 2.训练模型, f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值
|
||||
|
||||
@@ -73,7 +73,7 @@ def apriori(dataSet, minSupport = 0.5):
|
||||
def main():
|
||||
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
|
||||
# 1.收集并准备数据
|
||||
# dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
|
||||
# dataMat, labelMat = loadDataSet("%s/resources/Logistic_testdata.txt" % project_dir)
|
||||
|
||||
|
||||
# 1. 加载数据
|
||||
|
||||
100
testData/Logistic_testdata.txt
Normal file
100
testData/Logistic_testdata.txt
Normal file
@@ -0,0 +1,100 @@
|
||||
-0.017612 14.053064 0
|
||||
-1.395634 4.662541 1
|
||||
-0.752157 6.538620 0
|
||||
-1.322371 7.152853 0
|
||||
0.423363 11.054677 0
|
||||
0.406704 7.067335 1
|
||||
0.667394 12.741452 0
|
||||
-2.460150 6.866805 1
|
||||
0.569411 9.548755 0
|
||||
-0.026632 10.427743 0
|
||||
0.850433 6.920334 1
|
||||
1.347183 13.175500 0
|
||||
1.176813 3.167020 1
|
||||
-1.781871 9.097953 0
|
||||
-0.566606 5.749003 1
|
||||
0.931635 1.589505 1
|
||||
-0.024205 6.151823 1
|
||||
-0.036453 2.690988 1
|
||||
-0.196949 0.444165 1
|
||||
1.014459 5.754399 1
|
||||
1.985298 3.230619 1
|
||||
-1.693453 -0.557540 1
|
||||
-0.576525 11.778922 0
|
||||
-0.346811 -1.678730 1
|
||||
-2.124484 2.672471 1
|
||||
1.217916 9.597015 0
|
||||
-0.733928 9.098687 0
|
||||
-3.642001 -1.618087 1
|
||||
0.315985 3.523953 1
|
||||
1.416614 9.619232 0
|
||||
-0.386323 3.989286 1
|
||||
0.556921 8.294984 1
|
||||
1.224863 11.587360 0
|
||||
-1.347803 -2.406051 1
|
||||
1.196604 4.951851 1
|
||||
0.275221 9.543647 0
|
||||
0.470575 9.332488 0
|
||||
-1.889567 9.542662 0
|
||||
-1.527893 12.150579 0
|
||||
-1.185247 11.309318 0
|
||||
-0.445678 3.297303 1
|
||||
1.042222 6.105155 1
|
||||
-0.618787 10.320986 0
|
||||
1.152083 0.548467 1
|
||||
0.828534 2.676045 1
|
||||
-1.237728 10.549033 0
|
||||
-0.683565 -2.166125 1
|
||||
0.229456 5.921938 1
|
||||
-0.959885 11.555336 0
|
||||
0.492911 10.993324 0
|
||||
0.184992 8.721488 0
|
||||
-0.355715 10.325976 0
|
||||
-0.397822 8.058397 0
|
||||
0.824839 13.730343 0
|
||||
1.507278 5.027866 1
|
||||
0.099671 6.835839 1
|
||||
-0.344008 10.717485 0
|
||||
1.785928 7.718645 1
|
||||
-0.918801 11.560217 0
|
||||
-0.364009 4.747300 1
|
||||
-0.841722 4.119083 1
|
||||
0.490426 1.960539 1
|
||||
-0.007194 9.075792 0
|
||||
0.356107 12.447863 0
|
||||
0.342578 12.281162 0
|
||||
-0.810823 -1.466018 1
|
||||
2.530777 6.476801 1
|
||||
1.296683 11.607559 0
|
||||
0.475487 12.040035 0
|
||||
-0.783277 11.009725 0
|
||||
0.074798 11.023650 0
|
||||
-1.337472 0.468339 1
|
||||
-0.102781 13.763651 0
|
||||
-0.147324 2.874846 1
|
||||
0.518389 9.887035 0
|
||||
1.015399 7.571882 0
|
||||
-1.658086 -0.027255 1
|
||||
1.319944 2.171228 1
|
||||
2.056216 5.019981 1
|
||||
-0.851633 4.375691 1
|
||||
-1.510047 6.061992 0
|
||||
-1.076637 -3.181888 1
|
||||
1.821096 10.283990 0
|
||||
3.010150 8.401766 1
|
||||
-1.099458 1.688274 1
|
||||
-0.834872 -1.733869 1
|
||||
-0.846637 3.849075 1
|
||||
1.400102 12.628781 0
|
||||
1.752842 5.468166 1
|
||||
0.078557 0.059736 1
|
||||
0.089392 -0.715300 1
|
||||
1.825662 12.693808 0
|
||||
0.197445 9.744638 0
|
||||
0.126117 0.922311 1
|
||||
-0.679797 1.220530 1
|
||||
0.677983 2.556666 1
|
||||
0.761349 10.693862 0
|
||||
-2.168791 0.143632 1
|
||||
1.388610 9.341997 0
|
||||
0.317029 14.739025 0
|
||||
Reference in New Issue
Block a user