更新15.mapreduce的使用注解

This commit is contained in:
jiangzhonglian
2017-04-08 18:06:58 +08:00
parent 87836f0f7b
commit 6bdfd5a420
3 changed files with 21 additions and 12 deletions

View File

@@ -1,3 +1,6 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-04-07
@@ -5,26 +8,27 @@ Created on 2017-04-07
'''
from mrjob.job import MRJob
class MRmean(MRJob):
def __init__(self, *args, **kwargs): # 对数据初始化
super(MRmean, self).__init__(*args, **kwargs)
self.inCount = 0
self.inSum = 0
self.inSqSum = 0
def map(self, key, val): # 需要 2 个参数,求数据的和与平方和
if False: yield
inVal = float(val)
self.inCount += 1
self.inSum += inVal
self.inSqSum += inVal*inVal
def map_final(self): # 计算数据的平均值,平方的均值,并返回
mn = self.inSum/self.inCount
mnSq = self.inSqSum/self.inCount
yield (1, [self.inCount, mn, mnSq])
def reduce(self, key, packedValues): #
def reduce(self, key, packedValues):
cumVal=0.0; cumSumSq=0.0; cumN=0.0
for valArr in packedValues: # 从输入流中获取值
nj = float(valArr[0])
@@ -34,10 +38,10 @@ class MRmean(MRJob):
mean = cumVal/cumN
var = (cumSumSq - 2*mean*cumVal + cumN*mean*mean)/cumN
yield (mean, var) # 发出平均值和方差
def steps(self):
return ([self.mr(mapper=self.map, mapper_final=self.map_final,\
reducer=self.reduce,)])
return ([self.mr(mapper=self.map, mapper_final=self.map_final, reducer=self.reduce,)])
if __name__ == '__main__':
MRmean.run()
MRmean.run()

View File

@@ -1,3 +1,5 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-04-06
Machine Learning in Action Chapter 18
@@ -30,4 +32,4 @@ sqInput = power(input,2) # 将矩阵的数据分别求 平方,即 2次方
# 输出 数据的个数n个数据的均值n个数据平方之后的均值
print ("%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput))) #计算均值
print ("report: still alive", file=sys.stderr)
print >> sys.stderr, "report: still alive"

View File

@@ -1,3 +1,6 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-04-06
Machine Learning in Action Chapter 18
@@ -7,9 +10,9 @@ Map Reduce Job for Hadoop Streaming
'''
mapper 接受原始的输入并产生中间值传递给 reducer。
很多的mapper是并行执行的所以需要将这些mapper的输出合并成一个值。
即:将中间的 key/value 对进行组合。
mapper 接受原始的输入并产生中间值传递给 reducer。
很多的mapper是并行执行的所以需要将这些mapper的输出合并成一个值。
即:将中间的 key/value 对进行组合。
'''
import sys
from numpy import mat, mean, power
@@ -40,4 +43,4 @@ meanSq = cumSumSq/cumN
#输出 数据总量,均值,平方的均值(方差)
print ("%d\t%f\t%f" % (cumN, mean, meanSq))
print ("report: still alive", file=sys.stderr)
print >> sys.stderr, "report: still alive"