Update chapter 15 code to the new format
@@ -2,9 +2,11 @@
 # coding:utf8
 '''
 Created on 2017-04-07
 Update on 2017-06-20
 MapReduce version of Pegasos SVM
 Using mrjob to automate job flow
-@author: Peter/ApacheCN-xy
+@author: Peter/ApacheCN-xy/片刻
+《机器学习实战》 (Machine Learning in Action) updates: https://github.com/apachecn/MachineLearning
 '''
 from mrjob.job import MRJob
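The docstring above notes that mrjob automates the job flow. As orientation only (not part of this commit), a job defined this way is normally launched from the command line; assuming the file is saved as mrSVM.py and fed a small kick-start file of initial 'w'/'x' records, a local run would look roughly like:

    python mrSVM.py -r local < kickStart.txt > out.txt

Here -r local selects mrjob's built-in local runner; the input and output file names are illustrative.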
@@ -17,14 +19,14 @@ class MRsvm(MRJob):
 
     def __init__(self, *args, **kwargs):
         super(MRsvm, self).__init__(*args, **kwargs)
-        self.data = pickle.load(open('input/15.BigData_MapReduce/svmDat27'))
+        self.data = pickle.load(open('/opt/git/MachineLearning/input/15.BigData_MapReduce/svmDat27'))
         self.w = 0
         self.eta = 0.69
         self.dataList = []
         self.k = self.options.batchsize
         self.numMappers = 1
         self.t = 1  # iteration number
 
     def configure_options(self):
         super(MRsvm, self).configure_options()
         self.add_passthrough_option(
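The hunk cuts off right at self.add_passthrough_option(, so the option definitions themselves are not visible here. For orientation only, with this older mrjob API (configure_options / add_passthrough_option) the batchsize option that __init__ reads via self.options.batchsize would typically be declared along these lines; the flag names, defaults and help strings below are assumptions, not taken from the diff:

    def configure_options(self):
        super(MRsvm, self).configure_options()
        # hypothetical declarations; only --batchsize is actually referenced above
        self.add_passthrough_option(
            '--iterations', dest='iterations', default=2, type='int',
            help='T: number of iterations to run')
        self.add_passthrough_option(
            '--batchsize', dest='batchsize', default=100, type='int',
            help='k: number of data points per mini-batch')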
@@ -42,20 +44,20 @@ class MRsvm(MRJob):
             self.w = inVals[1]
         elif inVals[0] == 'x':
             self.dataList.append(inVals[1])  # accumulate data points for the calculation
-        elif inVals[0] == 't':
+        elif inVals[0] == 't':  # iteration count
             self.t = inVals[1]
         else:
             self.eta = inVals  # this is for debugging; eta is not used in map
 
     def map_fin(self):
-        labels = self.data[:,-1]
-        X = self.data[:, 0:-1]  # reshape the data into X and Y
-        if self.w == 0:
+        labels = self.data[:, -1]
+        X = self.data[:, :-1]  # reshape the data into X and Y
+        if self.w == 0:
             self.w = [0.001] * shape(X)[1]  # initialize w on the first iteration
         for index in self.dataList:
             p = mat(self.w)*X[index, :].T  # calc p=w*dataSet[key].T
             if labels[index]*p < 1.0:
                 yield (1, ['u', index])  # make sure every record carries the same key
         yield (1, ['w', self.w])  # so they all reach the same reducer
         yield (1, ['t', self.t])
 
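The test labels[index]*p < 1.0 in map_fin is the hinge-loss margin check from Pegasos: with p = w·x_i, an example is emitted as a ['u', index] record only when y_i (w·x_i) < 1, i.e. when it violates the margin; only these violators contribute a subgradient term y_i * x_i to the next weight update.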
@@ -66,7 +68,7 @@ class MRsvm(MRJob):
             elif valArr[0] == 'w':
                 self.w = valArr[1]
             elif valArr[0] == 't':
                 self.t = valArr[1]
 
         labels = self.data[:, -1]
         X = self.data[:, 0:-1]
 
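The reducer hunk above ends before the weight update itself. For orientation, a Pegasos mini-batch step over the collected 'u' indices typically looks like the sketch below; this is not the file's actual reducer body, and lam (the regularization constant) and violators are placeholder names, not identifiers from this diff:

    import numpy

    def pegasos_batch_step(w, X, labels, violators, t, k, lam=2e-4):
        """One Pegasos mini-batch update; X is (m, n), labels is (m,), w is (n,)."""
        w_delta = numpy.zeros(len(w))
        for i in violators:                  # indices received as ['u', index] records
            w_delta += labels[i] * X[i, :]   # subgradient contribution y_i * x_i
        eta = 1.0 / (lam * t)                # Pegasos step size
        return (1.0 - 1.0 / t) * numpy.asarray(w, dtype=float) + (eta / k) * w_delta

With eta = 1/(lam*t), the (1 - 1/t) shrinkage factor is exactly 1 - eta*lam, matching the standard Pegasos update.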
@@ -1,7 +1,10 @@
 #!/usr/bin/python
+# coding:utf8
 '''
-Created on Feb 25, 2011
-
-@author: Peter
+Created on 2011-02-25
+Update on 2017-06-20
+@author: Peter/ApacheCN-xy/片刻
+《机器学习实战》 (Machine Learning in Action) updates: https://github.com/apachecn/MachineLearning
 '''
 import numpy
@@ -9,28 +12,28 @@ def map(key, value):
     # input key = class for one training example, e.g. "-1.0"
     classes = [float(item) for item in key.split(",")]  # e.g. [-1.0]
     D = numpy.diag(classes)
 
     # input value = feature vector for one training example, e.g. "3.0, 7.0, 2.0"
     featurematrix = [float(item) for item in value.split(",")]
     A = numpy.matrix(featurematrix)
 
     # create matrix E and vector e
-    e = numpy.matrix(numpy.ones(len(A)).reshape(len(A),1))
-    E = numpy.matrix(numpy.append(A,-e,axis=1))
+    e = numpy.matrix(numpy.ones(len(A)).reshape(len(A), 1))
+    E = numpy.matrix(numpy.append(A, -e, axis=1))
 
     # create a tuple with the values to be used by reducer
     # and encode it with base64 to avoid potential trouble with '\t' and '\n' used
     # as default separators in Hadoop Streaming
-    producedvalue = base64.b64encode(pickle.dumps( (E.T*E, E.T*D*e) ))
+    producedvalue = base64.b64encode(pickle.dumps((E.T*E, E.T*D*e)))
 
     # note: a single constant key "producedkey" sends to only one reducer
     # somewhat "atypical" due to low degree of parallelism on reducer side
     print "producedkey\t%s" % (producedvalue)
 
 
 def reduce(key, values, mu=0.1):
     sumETE = None
     sumETDe = None
 
     # key isn't used, so ignoring it with _ (underscore).
     for _, value in values:
         # unpickle values
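Since map emits each tuple as a base64-encoded pickle, the loop above has to reverse that encoding before accumulating anything. A minimal sketch of the step hinted at by the "# unpickle values" comment (value, ETE and ETDe are names used by the surrounding diff; the exact line is an assumption, since it falls outside the hunks shown):

    ETE, ETDe = pickle.loads(base64.b64decode(value))

Both pickle and base64 would need to be imported at the top of the script alongside numpy.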
@@ -39,13 +42,13 @@ def reduce(key, values, mu=0.1):
             # create the I/mu with correct dimensions
             sumETE = numpy.matrix(numpy.eye(ETE.shape[1])/mu)
         sumETE += ETE
 
         if sumETDe == None:
             # create sumETDe with correct dimensions
             sumETDe = ETDe
         else:
             sumETDe += ETDe
 
     # note: omega = result[:-1] and gamma = result[-1]
     # but printing entire vector as output
     result = sumETE.I*sumETDe
 
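Putting the reducer together: sumETE accumulates I/mu plus every mapper's E.T*E, sumETDe accumulates every mapper's E.T*D*e, and result = sumETE.I*sumETDe solves the resulting proximal-SVM normal equations in closed form,

    [omega; gamma] = (I/mu + sum_j E_j^T E_j)^(-1) * (sum_j E_j^T D_j e_j)

where omega = result[:-1] is the weight vector and gamma = result[-1] is the offset, exactly as the comment above notes.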