Merge pull request #26 from highfei2011/master

修改目录结构和代码
This commit is contained in:
ApacheCN
2017-03-09 15:07:45 +08:00
committed by GitHub
8 changed files with 298 additions and 11 deletions

1
.gitignore vendored
View File

@@ -88,4 +88,3 @@ ENV/
# Rope project settings
.ropeproject
.vscode
.idea

View File

@@ -1,11 +1,36 @@
# 1) 逻辑回归基础
# 5) 逻辑回归基础
* 逻辑回归(Logistic Regression)
* 1.1 分类问题
* 1.2 假说表示
* 1.3 判定边界
* 1.4 代价函数
* 1.5 简化的成本函数和梯度下降
* 1.6 高级优化
* 1.7 多类分类:一个对所有
* 5.1 分类问题
* 在分类问题中,尝试预测的是结果是否属于某一个类(例如正确或错误)。
* 分类问题的例子有:
* 判断一封电子邮件是否是垃圾邮件;
* 判断一次金融交易是否是欺诈等等。
* 从二元的分类问题开始讨论:
将因变量(dependent variable)可能属于的两个类分别称为负向类(negative class)和正向类(positive class),则因变量
y ∈ {0, 1}
注:其中 0 表示负向类,1 表示正向类。
* 5.2 假说表示
* 5.3 判定边界
* 在逻辑回归中,我们预测:
当 hθ 大于等于 0.5 时,预测 y=1
当 hθ 小于 0.5 时,预测 y=0
* 根据上面绘制出的 S 形函数图像,我们知道当
z=0时 g(z)=0.5
z>0时 g(z)>0.5
z<0时 g(z)<0.5
又 z=θ^T·x(即 θ 的转置与 x 的乘积),所以:
z大于等于0时预测y=1
z小于0时预测y=0
* 现在假设我们有一个模型Hθ(x)=g(θ0+θ1*x1+θ2*x2)
并且参数θ是向量[-3 1 1]。则当-3+x1+x2大于等于0即x1+x2大于等于3时模型将预测y=1。
我们可以绘制直线x1+x2=3这条线便是我们模型的分界线将预测为1的区域和预测为0的区域分隔开。
* 假使我们的数据呈现这样的分布情况,怎样的模型才能适合呢?
因为需要用曲线才能分隔 y=0 的区域和 y=1 的区域,我们需要二次方特征: 假设模型是 Hθ(x)=g(θ0+θ1*x1+θ2*x2+θ3*(x1^2)+θ4*(x2^2)),并且参数 θ
是[-1 0 0 1 1],则我们得到的判定边界恰好是圆心在原点且半径为 1 的圆形。可以用非常复杂的模型来适应非常复杂形状的判定边界。
* 5.4 代价函数
* 5.5 简化的成本函数和梯度下降
* 5.6 高级优化
* 5.7 多类分类:一个对所有

View File

@@ -6,6 +6,12 @@ from sklearn import tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
"""
需要安装依赖模块:
pip install scikit_learn-0.18-cp27-cp27m-win_amd64.whl
非常完整的网址:
http://www.lfd.uci.edu/~gohlke/pythonlibs/#numpy
"""
def createDataSet():

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python
# encoding: utf-8
from numpy import *
import matplotlib.pyplot as plt
import time
'''
1、需要安装模块pip install matplotlib-1.5.0-cp27-none-win_amd64.whl
由于直接安装会出现问题所以建议下载whl包进行安装下载网址
https://pypi.python.org/pypi/matplotlib/1.5.0
2、可以看见画出的图像
'''
"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: logRegression01.py
@time: 2017/3/3 22:03
@test result: ok
"""
# Sigmoid function
def sigmoid(inX):
    """Evaluate the logistic function 1 / (1 + e^(-inX)).

    The value is built from numpy's ``exp`` and ordinary arithmetic on
    ``inX``, so whatever ``exp`` accepts (a number, an array, a matrix) is
    accepted here as well.
    """
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
def trainLogRegres(train_x, train_y, opts):
    """Fit logistic-regression weights with one of three update schemes.

    Args:
        train_x: 2-D sample matrix, one row per sample. ``*`` is used as a
            matrix product, so a numpy ``matrix`` is expected.
        train_y: 2-D label column; only column 0 is read per sample.
        opts: dict with the keys
            'alpha'        - step size (ignored by 'smoothStocGradDescent',
                             which computes its own decaying step),
            'maxIter'      - number of passes over the data,
            'optimizeType' - 'gradDescent', 'stocGradDescent' or
                             'smoothStocGradDescent'.

    Returns:
        The weight column (numFeatures x 1), initialised to all ones.

    Raises:
        NameError: if opts['optimizeType'] is not one of the three names.
    """
    # Measure the training time
    startTime = time.time()
    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    optimizeType = opts['optimizeType']
    # Reject an unknown optimizer before doing any work.
    if optimizeType not in ('gradDescent', 'stocGradDescent',
                            'smoothStocGradDescent'):
        raise NameError('Not support optimize method type!')
    weights = ones((numFeatures, 1))
    for k in range(maxIter):
        if optimizeType == 'gradDescent':
            # Batch update: one step per pass, using every sample at once.
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif optimizeType == 'stocGradDescent':
            # Stochastic update: one step per sample, in file order.
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        else:
            # 'smoothStocGradDescent': visit the samples in random order,
            # each exactly once per pass, with a step that shrinks as the
            # pass number k and the in-pass counter i grow.
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01
                randIndex = int(random.uniform(0, len(dataIndex)))
                # randIndex is a position inside dataIndex; the row to use
                # is the sample number stored at that position.
                sampleIndex = dataIndex[randIndex]
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                # Drop the sample so it is not drawn again in this pass.
                del dataIndex[randIndex]
    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
    return weights
# Evaluate a trained logistic-regression model on a labelled test set
def testLogRegres(weights, test_x, test_y):
    """Return the fraction of test samples that the model labels correctly.

    Args:
        weights: weight column, as returned by trainLogRegres.
        test_x: 2-D sample matrix, one row per sample (``*`` is used as a
            matrix product with ``weights``).
        test_y: 2-D label column; column 0 is read and any non-zero value
            counts as the positive class.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when there are no samples.
    """
    numSamples = shape(test_x)[0]
    if numSamples == 0:
        # No samples: avoid dividing by zero below.
        return 0.0
    matchCount = 0
    for i in range(numSamples):
        # Positive class only when the sigmoid output is strictly above 0.5.
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    return float(matchCount) / numSamples
# Plot the trained logistic-regression model (2-D data only)
def showLogRegres(weights, train_x, train_y):
    """Scatter-plot the samples and draw the fitted decision line.

    Args:
        weights: the three model weights (bias, x1 weight, x2 weight); any
            array-like holding them in that order.
        train_x: 2-D sample matrix with exactly 3 columns; columns 1 and 2
            are plotted as the X1 and X2 coordinates.
        train_y: 2-D label column; rows whose column-0 value truncates to 0
            are drawn red, 1 blue, anything else is not drawn.

    Returns:
        1 if train_x does not have 3 columns (nothing is plotted),
        otherwise None after the figure has been shown.
    """
    numSamples, numFeatures = shape(train_x)
    if numFeatures != 3:
        print("抱歉! 我不能绘制因为你的数据的维度不是2")
        return 1
    # Draw every sample, coloured by its label
    for i in range(numSamples):
        label = int(train_y[i, 0])
        if label == 0:
            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
        elif label == 1:
            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
    # Decision line: w0 + w1*x1 + w2*x2 = 0, evaluated at both ends of the
    # observed x1 range. The .min()/.max() methods are used so the result
    # does not depend on which min/max a star import leaves in scope.
    min_x = train_x[:, 1].min()
    max_x = train_x[:, 1].max()
    flatWeights = asarray(weights, dtype=float).ravel()
    w0, w1, w2 = flatWeights[0], flatWeights[1], flatWeights[2]
    # NOTE: a vertical boundary (w2 == 0) cannot be expressed by this formula.
    y_min_x = (-w0 - w1 * min_x) / w2
    y_max_x = (-w0 - w1 * max_x) / w2
    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    plt.xlabel('X1')
    plt.ylabel('X2')
    # Show the figure
    plt.show()

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python
# encoding: utf-8
import os
import sys
sys.path.append("C:\Python27")
from numpy import *
from logRegression01 import *
"""
@version:
@author: yangjf
@license: ApacheCN
@contact: highfei2011@126.com
@site: https://github.com/apachecn/MachineLearning
@software: PyCharm
@file: test_logRegression.py
@time: 2017/3/3 22:09
@test result: ok
"""
def _findProjectDir(cwd, marker="MachineLearning"):
    """Cut cwd right after the first marker that ends a path component.

    An occurrence counts when it is followed by '\\' or '/' or ends the
    string. If there is none, cwd is returned unchanged.
    """
    start = cwd.find(marker)
    while start != -1:
        end = start + len(marker)
        if end == len(cwd) or cwd[end] in "\\/":
            return cwd[:end]
        start = cwd.find(marker, start + 1)
    return cwd


def loadData():
    """Load testData/Logistic_testdata.txt from the project directory.

    The project directory is the current working directory truncated after
    its 'MachineLearning' component (or the working directory itself when
    no such component exists). Each non-blank line of the file must hold
    three whitespace-separated numbers: two features and a label.

    Returns:
        (train_x, train_y): train_x is a matrix with rows [1.0, f1, f2]
        (the leading 1.0 is the bias term); train_y is the label column.
    """
    train_x = []
    train_y = []
    # os.getcwdu only exists on Python 2; fall back to os.getcwd elsewhere.
    getcwd = getattr(os, "getcwdu", os.getcwd)
    project_dir = _findProjectDir(getcwd())
    print(project_dir)
    with open("%s/testData/Logistic_testdata.txt" % project_dir) as fileIn:
        for line in fileIn:
            lineArr = line.strip().split()
            if not lineArr:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
            train_y.append(float(lineArr[2]))
    return asmatrix(train_x), asmatrix(train_y).transpose()
## Step 1: load the data
print("step 1: load data...")
train_x, train_y = loadData()
# NOTE: the test set is the training set itself, so the accuracy printed
# below is a training accuracy, not a held-out estimate.
test_x = train_x
test_y = train_y
## Step 2: train the model
print("step 2: training...")
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
optimalWeights = trainLogRegres(train_x, train_y, opts)
## Step 3: test the model
print("step 3: testing...")
accuracy = testLogRegres(optimalWeights, test_x, test_y)
## Step 4: show the result
print("step 4: show the result...")
print('The classify accuracy is: %.3f%%' % (accuracy * 100))
showLogRegres(optimalWeights, train_x, train_y)

View File

@@ -117,7 +117,7 @@ def plotBestFit(dataArr, labelMat, weights):
def main():
project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# 1.收集并准备数据
dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir)
# print dataMat, '---\n', labelMat
# 2.训练模型, f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值

View File

@@ -73,7 +73,7 @@ def apriori(dataSet, minSupport = 0.5):
def main():
# project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# 1.收集并准备数据
# dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir)
# dataMat, labelMat = loadDataSet("%s/resources/Logistic_testdata.txt" % project_dir)
# 1. 加载数据

View File

@@ -0,0 +1,100 @@
-0.017612 14.053064 0
-1.395634 4.662541 1
-0.752157 6.538620 0
-1.322371 7.152853 0
0.423363 11.054677 0
0.406704 7.067335 1
0.667394 12.741452 0
-2.460150 6.866805 1
0.569411 9.548755 0
-0.026632 10.427743 0
0.850433 6.920334 1
1.347183 13.175500 0
1.176813 3.167020 1
-1.781871 9.097953 0
-0.566606 5.749003 1
0.931635 1.589505 1
-0.024205 6.151823 1
-0.036453 2.690988 1
-0.196949 0.444165 1
1.014459 5.754399 1
1.985298 3.230619 1
-1.693453 -0.557540 1
-0.576525 11.778922 0
-0.346811 -1.678730 1
-2.124484 2.672471 1
1.217916 9.597015 0
-0.733928 9.098687 0
-3.642001 -1.618087 1
0.315985 3.523953 1
1.416614 9.619232 0
-0.386323 3.989286 1
0.556921 8.294984 1
1.224863 11.587360 0
-1.347803 -2.406051 1
1.196604 4.951851 1
0.275221 9.543647 0
0.470575 9.332488 0
-1.889567 9.542662 0
-1.527893 12.150579 0
-1.185247 11.309318 0
-0.445678 3.297303 1
1.042222 6.105155 1
-0.618787 10.320986 0
1.152083 0.548467 1
0.828534 2.676045 1
-1.237728 10.549033 0
-0.683565 -2.166125 1
0.229456 5.921938 1
-0.959885 11.555336 0
0.492911 10.993324 0
0.184992 8.721488 0
-0.355715 10.325976 0
-0.397822 8.058397 0
0.824839 13.730343 0
1.507278 5.027866 1
0.099671 6.835839 1
-0.344008 10.717485 0
1.785928 7.718645 1
-0.918801 11.560217 0
-0.364009 4.747300 1
-0.841722 4.119083 1
0.490426 1.960539 1
-0.007194 9.075792 0
0.356107 12.447863 0
0.342578 12.281162 0
-0.810823 -1.466018 1
2.530777 6.476801 1
1.296683 11.607559 0
0.475487 12.040035 0
-0.783277 11.009725 0
0.074798 11.023650 0
-1.337472 0.468339 1
-0.102781 13.763651 0
-0.147324 2.874846 1
0.518389 9.887035 0
1.015399 7.571882 0
-1.658086 -0.027255 1
1.319944 2.171228 1
2.056216 5.019981 1
-0.851633 4.375691 1
-1.510047 6.061992 0
-1.076637 -3.181888 1
1.821096 10.283990 0
3.010150 8.401766 1
-1.099458 1.688274 1
-0.834872 -1.733869 1
-0.846637 3.849075 1
1.400102 12.628781 0
1.752842 5.468166 1
0.078557 0.059736 1
0.089392 -0.715300 1
1.825662 12.693808 0
0.197445 9.744638 0
0.126117 0.922311 1
-0.679797 1.220530 1
0.677983 2.556666 1
0.761349 10.693862 0
-2.168791 0.143632 1
1.388610 9.341997 0
0.317029 14.739025 0