mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-08 21:04:33 +08:00
更新15.mapreduce的使用注解
This commit is contained in:
@@ -1,3 +1,6 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
'''
|
||||
Created on 2017-04-07
|
||||
|
||||
@@ -5,26 +8,27 @@ Created on 2017-04-07
|
||||
'''
|
||||
from mrjob.job import MRJob
|
||||
|
||||
|
||||
class MRmean(MRJob):
|
||||
def __init__(self, *args, **kwargs): # 对数据初始化
|
||||
super(MRmean, self).__init__(*args, **kwargs)
|
||||
self.inCount = 0
|
||||
self.inSum = 0
|
||||
self.inSqSum = 0
|
||||
|
||||
|
||||
def map(self, key, val): # 需要 2 个参数,求数据的和与平方和
|
||||
if False: yield
|
||||
inVal = float(val)
|
||||
self.inCount += 1
|
||||
self.inSum += inVal
|
||||
self.inSqSum += inVal*inVal
|
||||
|
||||
|
||||
def map_final(self): # 计算数据的平均值,平方的均值,并返回
|
||||
mn = self.inSum/self.inCount
|
||||
mnSq = self.inSqSum/self.inCount
|
||||
yield (1, [self.inCount, mn, mnSq])
|
||||
|
||||
def reduce(self, key, packedValues): #
|
||||
def reduce(self, key, packedValues):
|
||||
cumVal=0.0; cumSumSq=0.0; cumN=0.0
|
||||
for valArr in packedValues: # 从输入流中获取值
|
||||
nj = float(valArr[0])
|
||||
@@ -34,10 +38,10 @@ class MRmean(MRJob):
|
||||
mean = cumVal/cumN
|
||||
var = (cumSumSq - 2*mean*cumVal + cumN*mean*mean)/cumN
|
||||
yield (mean, var) # 发出平均值和方差
|
||||
|
||||
|
||||
def steps(self):
|
||||
return ([self.mr(mapper=self.map, mapper_final=self.map_final,\
|
||||
reducer=self.reduce,)])
|
||||
return ([self.mr(mapper=self.map, mapper_final=self.map_final, reducer=self.reduce,)])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
MRmean.run()
|
||||
MRmean.run()
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
'''
|
||||
Created on 2017-04-06
|
||||
Machine Learning in Action Chapter 18
|
||||
@@ -30,4 +32,4 @@ sqInput = power(input,2) # 将矩阵的数据分别求 平方,即 2次方
|
||||
|
||||
# 输出 数据的个数,n个数据的均值,n个数据平方之后的均值
|
||||
print ("%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput))) #计算均值
|
||||
print ("report: still alive", file=sys.stderr)
|
||||
print >> sys.stderr, "report: still alive"
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
#!/usr/bin/python
|
||||
# coding:utf8
|
||||
|
||||
'''
|
||||
Created on 2017-04-06
|
||||
Machine Learning in Action Chapter 18
|
||||
@@ -7,9 +10,9 @@ Map Reduce Job for Hadoop Streaming
|
||||
|
||||
|
||||
'''
|
||||
mapper 接受原始的输入并产生中间值传递给 reducer。
|
||||
很多的mapper是并行执行的,所以需要将这些mapper的输出合并成一个值。
|
||||
即:将中间的 key/value 对进行组合。
|
||||
mapper 接受原始的输入并产生中间值传递给 reducer。
|
||||
很多的mapper是并行执行的,所以需要将这些mapper的输出合并成一个值。
|
||||
即:将中间的 key/value 对进行组合。
|
||||
'''
|
||||
import sys
|
||||
from numpy import mat, mean, power
|
||||
@@ -40,4 +43,4 @@ meanSq = cumSumSq/cumN
|
||||
|
||||
#输出 数据总量,均值,平方的均值(方差)
|
||||
print ("%d\t%f\t%f" % (cumN, mean, meanSq))
|
||||
print ("report: still alive", file=sys.stderr)
|
||||
print >> sys.stderr, "report: still alive"
|
||||
|
||||
Reference in New Issue
Block a user