From 36127ba24b92da7cbb2edfa47129da0ca0197144 Mon Sep 17 00:00:00 2001
From: jiangzhonglian <jiang-s@163.com>
Date: Fri, 15 Sep 2017 17:03:58 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2015=E7=AB=A0=20=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81=E6=96=B0=E6=A0=BC=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/python/15.BigData_MapReduce/mrSVM.py      | 24 +++++++-------
 .../15.BigData_MapReduce/proximalSVM.py       | 31 ++++++++++---------
 2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/src/python/15.BigData_MapReduce/mrSVM.py b/src/python/15.BigData_MapReduce/mrSVM.py
index 6493795d..f7a7c302 100644
--- a/src/python/15.BigData_MapReduce/mrSVM.py
+++ b/src/python/15.BigData_MapReduce/mrSVM.py
@@ -2,9 +2,11 @@
 # coding:utf8
 '''
 Created on 2017-04-07
+Update  on 2017-06-20
 MapReduce version of Pegasos SVM
 Using mrjob to automate job flow
-@author: Peter/ApacheCN-xy
+@author: Peter/ApacheCN-xy/片刻
+Machine Learning in Action, updated source: https://github.com/apachecn/MachineLearning
 '''
 from mrjob.job import MRJob
 
@@ -17,14 +19,14 @@ class MRsvm(MRJob):
 
     def __init__(self, *args, **kwargs):
         super(MRsvm, self).__init__(*args, **kwargs)
-        self.data = pickle.load(open('input/15.BigData_MapReduce/svmDat27'))
+        self.data = pickle.load(open('/opt/git/MachineLearning/input/15.BigData_MapReduce/svmDat27'))  # NOTE(review): hard-coded absolute path is machine-specific; confirm before merging
         self.w = 0
         self.eta = 0.69
         self.dataList = []
         self.k = self.options.batchsize
         self.numMappers = 1
         self.t = 1  # iteration number
-                                      
+
     def configure_options(self):
         super(MRsvm, self).configure_options()
         self.add_passthrough_option(
@@ -42,20 +44,20 @@ class MRsvm(MRJob):
             self.w = inVals[1]
         elif inVals[0] == 'x':
             self.dataList.append(inVals[1])   # 累积数据点计算
-        elif inVals[0] == 't':
+        elif inVals[0] == 't':                # iteration count
             self.t = inVals[1]
         else:
-            self.eta = inVals  # 这用于 debug， eta未在map中使用
+            self.eta = inVals                 # used for debugging; eta is not used in map
 
     def map_fin(self):
-        labels = self.data[:,-1]
-        X = self.data[:, 0:-1]   # 将数据重新形成 X 和 Y
-        if self.w == 0: 
+        labels = self.data[:, -1]
+        X = self.data[:, :-1]                # split the data into X and Y
+        if self.w == 0:
             self.w = [0.001] * shape(X)[1]   # 在第一次迭代时，初始化 w
         for index in self.dataList:
-            p = mat(self.w)*X[index, :].T    # calc p=w*dataSet[key].T 
+            p = mat(self.w)*X[index, :].T    # calc p=w*dataSet[key].T
             if labels[index]*p < 1.0:
-                yield (1, ['u', index])      # 确保一切数据包含相同的key                           
+                yield (1, ['u', index])      # make sure all data shares the same key
         yield (1, ['w', self.w])             # 它们将在同一个 reducer
         yield (1, ['t', self.t])
 
@@ -66,7 +68,7 @@ class MRsvm(MRJob):
             elif valArr[0] == 'w':
                 self.w = valArr[1]
             elif valArr[0] == 't':
-                self.t = valArr[1] 
+                self.t = valArr[1]
 
         labels = self.data[:, -1]
         X = self.data[:, 0:-1]
diff --git a/src/python/15.BigData_MapReduce/proximalSVM.py b/src/python/15.BigData_MapReduce/proximalSVM.py
index 8fb01ee6..eed2e423 100644
--- a/src/python/15.BigData_MapReduce/proximalSVM.py
+++ b/src/python/15.BigData_MapReduce/proximalSVM.py
@@ -1,7 +1,10 @@
+#!/usr/bin/python
+# coding:utf8
 '''
-Created on Feb 25, 2011
-
-@author: Peter
+Created on 2011-02-25
+Update  on 2017-06-20
+@author: Peter/ApacheCN-xy/片刻
+Machine Learning in Action, updated source: https://github.com/apachecn/MachineLearning
 '''
 import numpy
 
@@ -9,28 +12,28 @@ def map(key, value):
    # input key= class for one training example, e.g. "-1.0"
    classes = [float(item) for item in key.split(",")]   # e.g. [-1.0]
    D = numpy.diag(classes)
- 
+
    # input value = feature vector for one training example, e.g. "3.0, 7.0, 2.0"
    featurematrix = [float(item) for item in value.split(",")]
    A = numpy.matrix(featurematrix)
- 
+
    # create matrix E and vector e
-   e = numpy.matrix(numpy.ones(len(A)).reshape(len(A),1))
-   E = numpy.matrix(numpy.append(A,-e,axis=1)) 
- 
+   e = numpy.matrix(numpy.ones(len(A)).reshape(len(A), 1))
+   E = numpy.matrix(numpy.append(A, -e, axis=1))
+
    # create a tuple with the values to be used by reducer
    # and encode it with base64 to avoid potential trouble with '\t' and '\n' used
    # as default separators in Hadoop Streaming
-   producedvalue = base64.b64encode(pickle.dumps( (E.T*E, E.T*D*e) )    
- 
+   producedvalue = base64.b64encode(pickle.dumps((E.T*E, E.T*D*e)))
+
    # note: a single constant key "producedkey" sends to only one reducer
    # somewhat "atypical" due to low degree of parallism on reducer side
    print "producedkey\t%s" % (producedvalue)
-   
+
 def reduce(key, values, mu=0.1):
   sumETE = None
   sumETDe = None
- 
+
   # key isn't used, so ignoring it with _ (underscore).
   for _, value in values:
     # unpickle values
@@ -39,13 +42,13 @@ def reduce(key, values, mu=0.1):
       # create the I/mu with correct dimensions
       sumETE = numpy.matrix(numpy.eye(ETE.shape[1])/mu)
     sumETE += ETE
- 
+
     if sumETDe == None:
       # create sumETDe with correct dimensions
       sumETDe = ETDe
     else:
       sumETDe += ETDe
- 
+
     # note: omega = result[:-1] and gamma = result[-1]
     # but printing entire vector as output
     result = sumETE.I*sumETDe