From 36127ba24b92da7cbb2edfa47129da0ca0197144 Mon Sep 17 00:00:00 2001 From: jiangzhonglian Date: Fri, 15 Sep 2017 17:03:58 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2015=E7=AB=A0=20=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E6=96=B0=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/15.BigData_MapReduce/mrSVM.py | 24 +++++++------- .../15.BigData_MapReduce/proximalSVM.py | 31 ++++++++++--------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/src/python/15.BigData_MapReduce/mrSVM.py b/src/python/15.BigData_MapReduce/mrSVM.py index 6493795d..f7a7c302 100644 --- a/src/python/15.BigData_MapReduce/mrSVM.py +++ b/src/python/15.BigData_MapReduce/mrSVM.py @@ -2,9 +2,11 @@ # coding:utf8 ''' Created on 2017-04-07 +Update on 2017-06-20 MapReduce version of Pegasos SVM Using mrjob to automate job flow -@author: Peter/ApacheCN-xy +@author: Peter/ApacheCN-xy/片刻 +《机器学习实战》更新地址:https://github.com/apachecn/MachineLearning ''' from mrjob.job import MRJob @@ -17,14 +19,14 @@ class MRsvm(MRJob): def __init__(self, *args, **kwargs): super(MRsvm, self).__init__(*args, **kwargs) - self.data = pickle.load(open('input/15.BigData_MapReduce/svmDat27')) + self.data = pickle.load(open('/opt/git/MachineLearning/input/15.BigData_MapReduce/svmDat27')) self.w = 0 self.eta = 0.69 self.dataList = [] self.k = self.options.batchsize self.numMappers = 1 self.t = 1 # iteration number - + def configure_options(self): super(MRsvm, self).configure_options() self.add_passthrough_option( @@ -42,20 +44,20 @@ class MRsvm(MRJob): self.w = inVals[1] elif inVals[0] == 'x': self.dataList.append(inVals[1]) # 累积数据点计算 - elif inVals[0] == 't': + elif inVals[0] == 't': # 迭代次数 self.t = inVals[1] else: - self.eta = inVals # 这用于 debug, eta未在map中使用 + self.eta = inVals # 这用于 debug, eta未在map中使用 def map_fin(self): - labels = self.data[:,-1] - X = self.data[:, 0:-1] # 将数据重新形成 X 和 Y - if self.w == 0: + labels = self.data[:, -1] + X = self.data[:, :-1] # 将数据重新形成 X 和 Y + if self.w == 0: self.w = [0.001] * shape(X)[1] # 在第一次迭代时,初始化 w for index in self.dataList: - p = mat(self.w)*X[index, :].T # calc p=w*dataSet[key].T + p = mat(self.w)*X[index, :].T # calc p=w*dataSet[key].T if labels[index]*p < 1.0: - yield (1, ['u', index]) # 确保一切数据包含相同的key + yield (1, ['u', index]) # 确保一切数据包含相同的key yield (1, ['w', self.w]) # 它们将在同一个 reducer yield (1, ['t', self.t]) @@ -66,7 +68,7 @@ class MRsvm(MRJob): elif valArr[0] == 'w': self.w = valArr[1] elif valArr[0] == 't': - self.t = valArr[1] + self.t = valArr[1] labels = self.data[:, -1] X = self.data[:, 0:-1] diff --git a/src/python/15.BigData_MapReduce/proximalSVM.py b/src/python/15.BigData_MapReduce/proximalSVM.py index 8fb01ee6..eed2e423 100644 --- a/src/python/15.BigData_MapReduce/proximalSVM.py +++ b/src/python/15.BigData_MapReduce/proximalSVM.py @@ -1,7 +1,10 @@ +#!/usr/bin/python +# coding:utf8 ''' -Created on Feb 25, 2011 - -@author: Peter +Created on 2011-02-25 +Update on 2017-06-20 +@author: Peter/ApacheCN-xy/片刻 +《机器学习实战》更新地址:https://github.com/apachecn/MachineLearning ''' import numpy @@ -9,28 +12,28 @@ def map(key, value): # input key= class for one training example, e.g. "-1.0" classes = [float(item) for item in key.split(",")] # e.g. [-1.0] D = numpy.diag(classes) - + # input value = feature vector for one training example, e.g. "3.0, 7.0, 2.0" featurematrix = [float(item) for item in value.split(",")] A = numpy.matrix(featurematrix) - + # create matrix E and vector e - e = numpy.matrix(numpy.ones(len(A)).reshape(len(A),1)) - E = numpy.matrix(numpy.append(A,-e,axis=1)) - + e = numpy.matrix(numpy.ones(len(A)).reshape(len(A), 1)) + E = numpy.matrix(numpy.append(A, -e, axis=1)) + # create a tuple with the values to be used by reducer # and encode it with base64 to avoid potential trouble with '\t' and '\n' used # as default separators in Hadoop Streaming - producedvalue = base64.b64encode(pickle.dumps( (E.T*E, E.T*D*e) ) - + producedvalue = base64.b64encode(pickle.dumps( (E.T*E, E.T*D*e)) + # note: a single constant key "producedkey" sends to only one reducer # somewhat "atypical" due to low degree of parallism on reducer side print "producedkey\t%s" % (producedvalue) - + def reduce(key, values, mu=0.1): sumETE = None sumETDe = None - + # key isn't used, so ignoring it with _ (underscore). for _, value in values: # unpickle values @@ -39,13 +42,13 @@ def reduce(key, values, mu=0.1): # create the I/mu with correct dimensions sumETE = numpy.matrix(numpy.eye(ETE.shape[1])/mu) sumETE += ETE - + if sumETDe == None: # create sumETDe with correct dimensions sumETDe = ETDe else: sumETDe += ETDe - + # note: omega = result[:-1] and gamma = result[-1] # but printing entire vector as output result = sumETE.I*sumETDe