diff --git a/.gitignore b/.gitignore index 3729f6a5..6564ff4a 100644 --- a/.gitignore +++ b/.gitignore @@ -88,4 +88,3 @@ ENV/ # Rope project settings .ropeproject .vscode -.idea \ No newline at end of file diff --git a/docs/5.Logistic回归.md b/docs/5.Logistic回归.md index a6c6a445..a3da78b3 100644 --- a/docs/5.Logistic回归.md +++ b/docs/5.Logistic回归.md @@ -1,11 +1,36 @@ -# 1) 逻辑回归基础 +# 5) 逻辑回归基础 * 逻辑回归(Logistic Regression) - * 1.1 分类问题 - * 1.2 假说表示 - * 1.3 判定边界 - * 1.4 代价函数 - * 1.5 简化的成本函数和梯度下降 - * 1.6 高级优化 - * 1.7 多类分类:一个对所有 \ No newline at end of file + * 5.1 分类问题 + * 在分类问题中,尝试预测的是结果是否属于某一个类(例如正确或错误)。 + * 分类问题的例子有: + * 判断一封电子邮件是否是垃圾邮件; + * 判断一次金融交易是否是欺诈等等。 + * 从二元的分类问题开始讨论: + 将因变量(dependent variable)可能属于的两个类分别称为负向类(negative class)和正向类(positive class),则因变量 + y属于{0,1} + 注:其中 0 表示负向类,1 表示正向类。 + * 5.2 假说表示 + + * 5.3 判定边界 + * 在逻辑回归中,我们预测: + 当 hθ 大于等于 0.5 时,预测 y=1 + 当 hθ 小于 0.5 时,预测 y=0 + * 根据上面绘制出的 S 形函数图像,我们知道当 + z=0时 ,g(z)=0.5 + z>0时 ,g(z)>0.5 + z<0时 ,g(z)<0.5 + 又z=θ的转置与X的积(即 z=θ^T·X),因此: + z大于等于0时,预测:y=1 + z小于0时,预测:y=0 + * 现在假设我们有一个模型:Hθ(x)=g(θ0+θ1*x1+θ2*x2) + 并且参数θ是向量[-3 1 1]。则当-3+x1+x2大于等于0,即x1+x2大于等于3时,模型将预测y=1。 + 我们可以绘制直线x1+x2=3,这条线便是我们模型的分界线,将预测为1的区域和预测为0的区域分隔开。 + * 假使我们的数据呈现这样的分布情况,怎样的模型才能适合呢? 
+ 因为需要用曲线才能分隔 y=0 的区域和 y=1 的区域,我们需要二次方特征: 假设模型是Hθ(x)=g(θ0+θ1*x1+θ2*x2+θ3*(x1^2)+θ4*(x2^2)) + 参数是[-1 0 0 1 1],则我们得到的判定边界恰好是圆心在原点且半径为 1 的圆形。可以用非常复杂的模型来适应非常复杂形状的判定边界。 + * 5.4 代价函数 + * 5.5 简化的成本函数和梯度下降 + * 5.6 高级优化 + * 5.7 多类分类:一个对所有 diff --git a/src/python/03.DecisionTree/DTSklearn.py b/src/python/03.DecisionTree/DTSklearn.py index a4890961..f63d2635 100644 --- a/src/python/03.DecisionTree/DTSklearn.py +++ b/src/python/03.DecisionTree/DTSklearn.py @@ -6,6 +6,12 @@ from sklearn import tree from sklearn.metrics import precision_recall_curve from sklearn.metrics import classification_report from sklearn.cross_validation import train_test_split +""" +需要安装依赖模块: +pip install scikit_learn-0.18-cp27-cp27m-win_amd64.whl +非常完整的网址: +http://www.lfd.uci.edu/~gohlke/pythonlibs/#numpy +""" def createDataSet(): diff --git a/src/python/05.Logistic/core/logRegression01.py b/src/python/05.Logistic/core/logRegression01.py new file mode 100644 index 00000000..caa026be --- /dev/null +++ b/src/python/05.Logistic/core/logRegression01.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python
+# encoding: utf-8 +from numpy import * +import matplotlib.pyplot as plt +import time +''' +1、需要安装模块:pip install matplotlib-1.5.0-cp27-none-win_amd64.whl +由于直接安装会出现问题,所以建议下载whl包进行安装,下载网址: +https://pypi.python.org/pypi/matplotlib/1.5.0 + +2、可以看见画出的图像 +''' + +""" +@version: +@author: yangjf +@license: ApacheCN +@contact: highfei2011@126.com +@site: https://github.com/apachecn/MachineLearning +@software: PyCharm +@file: logRegression01.py +@time: 2017/3/3 22:03 +@test result: ok +""" + +# sigmoid函数 +def sigmoid(inX): + return 1.0 / (1 + exp(-inX)) + +def trainLogRegres(train_x, train_y, opts): + # 计算训练时间 + startTime = time.time() + + numSamples, numFeatures = shape(train_x) + alpha = opts['alpha']; maxIter = opts['maxIter'] + weights = ones((numFeatures, 1)) + + # 通过梯度下降算法优化 + for k in range(maxIter): + if opts['optimizeType'] == 'gradDescent': # 梯度下降算法 + output = sigmoid(train_x * weights) + error = train_y - 
output + weights = weights + alpha * train_x.transpose() * error + elif opts['optimizeType'] == 'stocGradDescent': # 随机梯度下降 + for i in range(numSamples): + output = sigmoid(train_x[i, :] * weights) + error = train_y[i, 0] - output + weights = weights + alpha * train_x[i, :].transpose() * error + elif opts['optimizeType'] == 'smoothStocGradDescent': # 光滑随机梯度下降 + # 随机选择样本以优化以减少周期波动 + dataIndex = range(numSamples) + for i in range(numSamples): + alpha = 4.0 / (1.0 + k + i) + 0.01 + randIndex = int(random.uniform(0, len(dataIndex))) + output = sigmoid(train_x[randIndex, :] * weights) + error = train_y[randIndex, 0] - output + weights = weights + alpha * train_x[randIndex, :].transpose() * error + del(dataIndex[randIndex]) # 在一次交互期间,删除优化的样品 + else: + raise NameError('Not support optimize method type!') + + + print 'Congratulations, training complete! Took %fs!' % (time.time() - startTime) + return weights + + +#测试给定测试集的训练Logistic回归模型 +def testLogRegres(weights, test_x, test_y): + numSamples, numFeatures = shape(test_x) + matchCount = 0 + for i in xrange(numSamples): + predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5 + if predict == bool(test_y[i, 0]): + matchCount += 1 + accuracy = float(matchCount) / numSamples + return accuracy + + +# 显示你的训练逻辑回归模型只有2-D数据可用 +def showLogRegres(weights, train_x, train_y): + # 注意:train_x和train_y是垫数据类型 + numSamples, numFeatures = shape(train_x) + if numFeatures != 3: + print "抱歉! 我不能绘制,因为你的数据的维度不是2!" 
+ return 1 + + # 画出所有抽样数据 + for i in xrange(numSamples): + if int(train_y[i, 0]) == 0: + plt.plot(train_x[i, 1], train_x[i, 2], 'or') + elif int(train_y[i, 0]) == 1: + plt.plot(train_x[i, 1], train_x[i, 2], 'ob') + + # 画图操作 + min_x = min(train_x[:, 1])[0, 0] + max_x = max(train_x[:, 1])[0, 0] + weights = weights.getA() # 将mat转换为数组 + y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2] + y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2] + plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g') + plt.xlabel('X1'); plt.ylabel('X2') + #显示图像 + plt.show() \ No newline at end of file diff --git a/src/python/05.Logistic/core/test_logRegression.py b/src/python/05.Logistic/core/test_logRegression.py new file mode 100644 index 00000000..c7d5d50d --- /dev/null +++ b/src/python/05.Logistic/core/test_logRegression.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# encoding: utf-8 +import os +import sys +sys.path.append("C:\Python27") +from numpy import * + +from logRegression01 import * +""" +@version: +@author: yangjf +@license: ApacheCN +@contact: highfei2011@126.com +@site: https://github.com/apachecn/MachineLearning +@software: PyCharm +@file: test_logRegression.py +@time: 2017/3/3 22:09 +@test result: ok +""" + +def loadData(): + train_x = [] + train_y = [] + # 获取当前文件所在路径 + project_dir = os.getcwdu() + # 截取字符串至项目名:Test\ + project_dir = project_dir[:project_dir.find("MachineLearning\\") + 15] + print project_dir + fileIn = open("%s/testData/Logistic_testdata.txt" % project_dir) + for line in fileIn.readlines(): + lineArr = line.strip().split() + train_x.append([1.0, float(lineArr[0]), float(lineArr[1])]) + train_y.append(float(lineArr[2])) + return mat(train_x), mat(train_y).transpose() + + +##第一步: 加载数据 +print "step 1: load data..." +train_x, train_y = loadData() +test_x = train_x; test_y = train_y + +##第二步: 训练数据... +print "step 2: training..." 
+opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'} +optimalWeights = trainLogRegres(train_x, train_y, opts) + +##第三步: 测试 +print "step 3: testing..." +accuracy = testLogRegres(optimalWeights, test_x, test_y) + +##第四步: 显示结果 +print "step 4: show the result..." +print 'The classify accuracy is: %.3f%%' % (accuracy * 100) +showLogRegres(optimalWeights, train_x, train_y) \ No newline at end of file diff --git a/src/python/Logistic.py b/src/python/Logistic.py index 82ca465c..c59e022d 100644 --- a/src/python/Logistic.py +++ b/src/python/Logistic.py @@ -117,7 +117,7 @@ def plotBestFit(dataArr, labelMat, weights): def main(): project_dir = os.path.dirname(os.path.dirname(os.getcwd())) # 1.收集并准备数据 - dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir) + dataMat, labelMat = loadDataSet("%s/testData/Logistic_testdata.txt" % project_dir) # print dataMat, '---\n', labelMat # 2.训练模型, f(x)=a1*x1+b2*x2+..+nn*xn中 (a1,b2, .., nn).T的矩阵值 diff --git a/src/python/apriori.py b/src/python/apriori.py index ee6af908..98112685 100644 --- a/src/python/apriori.py +++ b/src/python/apriori.py @@ -73,7 +73,7 @@ def apriori(dataSet, minSupport = 0.5): def main(): # project_dir = os.path.dirname(os.path.dirname(os.getcwd())) # 1.收集并准备数据 - # dataMat, labelMat = loadDataSet("%s/resources/testSet.txt" % project_dir) + # dataMat, labelMat = loadDataSet("%s/resources/Logistic_testdata.txt" % project_dir) # 1. 
加载数据 diff --git a/testData/Logistic_testdata.txt b/testData/Logistic_testdata.txt new file mode 100644 index 00000000..2356ac54 --- /dev/null +++ b/testData/Logistic_testdata.txt @@ -0,0 +1,100 @@ +-0.017612 14.053064 0 +-1.395634 4.662541 1 +-0.752157 6.538620 0 +-1.322371 7.152853 0 +0.423363 11.054677 0 +0.406704 7.067335 1 +0.667394 12.741452 0 +-2.460150 6.866805 1 +0.569411 9.548755 0 +-0.026632 10.427743 0 +0.850433 6.920334 1 +1.347183 13.175500 0 +1.176813 3.167020 1 +-1.781871 9.097953 0 +-0.566606 5.749003 1 +0.931635 1.589505 1 +-0.024205 6.151823 1 +-0.036453 2.690988 1 +-0.196949 0.444165 1 +1.014459 5.754399 1 +1.985298 3.230619 1 +-1.693453 -0.557540 1 +-0.576525 11.778922 0 +-0.346811 -1.678730 1 +-2.124484 2.672471 1 +1.217916 9.597015 0 +-0.733928 9.098687 0 +-3.642001 -1.618087 1 +0.315985 3.523953 1 +1.416614 9.619232 0 +-0.386323 3.989286 1 +0.556921 8.294984 1 +1.224863 11.587360 0 +-1.347803 -2.406051 1 +1.196604 4.951851 1 +0.275221 9.543647 0 +0.470575 9.332488 0 +-1.889567 9.542662 0 +-1.527893 12.150579 0 +-1.185247 11.309318 0 +-0.445678 3.297303 1 +1.042222 6.105155 1 +-0.618787 10.320986 0 +1.152083 0.548467 1 +0.828534 2.676045 1 +-1.237728 10.549033 0 +-0.683565 -2.166125 1 +0.229456 5.921938 1 +-0.959885 11.555336 0 +0.492911 10.993324 0 +0.184992 8.721488 0 +-0.355715 10.325976 0 +-0.397822 8.058397 0 +0.824839 13.730343 0 +1.507278 5.027866 1 +0.099671 6.835839 1 +-0.344008 10.717485 0 +1.785928 7.718645 1 +-0.918801 11.560217 0 +-0.364009 4.747300 1 +-0.841722 4.119083 1 +0.490426 1.960539 1 +-0.007194 9.075792 0 +0.356107 12.447863 0 +0.342578 12.281162 0 +-0.810823 -1.466018 1 +2.530777 6.476801 1 +1.296683 11.607559 0 +0.475487 12.040035 0 +-0.783277 11.009725 0 +0.074798 11.023650 0 +-1.337472 0.468339 1 +-0.102781 13.763651 0 +-0.147324 2.874846 1 +0.518389 9.887035 0 +1.015399 7.571882 0 +-1.658086 -0.027255 1 +1.319944 2.171228 1 +2.056216 5.019981 1 +-0.851633 4.375691 1 +-1.510047 6.061992 0 +-1.076637 -3.181888 1 
+1.821096 10.283990 0 +3.010150 8.401766 1 +-1.099458 1.688274 1 +-0.834872 -1.733869 1 +-0.846637 3.849075 1 +1.400102 12.628781 0 +1.752842 5.468166 1 +0.078557 0.059736 1 +0.089392 -0.715300 1 +1.825662 12.693808 0 +0.197445 9.744638 0 +0.126117 0.922311 1 +-0.679797 1.220530 1 +0.677983 2.556666 1 +0.761349 10.693862 0 +-2.168791 0.143632 1 +1.388610 9.341997 0 +0.317029 14.739025 0 \ No newline at end of file