'''
Created on 2017-04-06
Machine Learning in Action Chapter 18
Map Reduce Job for Hadoop Streaming
@author: Peter Harrington/ApacheCn-xy

Mapper for the distributed mean job.

Reads one number per line from stdin and writes a single tab-separated
record to stdout: the number of values, their mean, and the mean of their
squares.  A reducer can merge these partial records, weighted by count,
into the global mean and mean of squares.
'''
import sys
from math import fsum


def read_input(file):
    '''Yield every line of *file* with trailing whitespace removed.'''
    for line in file:
        yield line.rstrip()


def summarize(lines):
    '''Return ``(count, mean, mean_of_squares)`` for the numbers in *lines*.

    Args:
        lines: iterable of strings, each holding one float literal.
            Blank lines are skipped so a trailing newline or an empty
            split does not crash the job.
    Returns:
        A tuple ``(count, mean, mean_of_squares)``.  For input without any
        values the result is ``(0, 0.0, 0.0)`` rather than NaN, because a
        NaN record would turn the reducer's weighted sums into NaN.
    Raises:
        ValueError: if a non-blank line is not a valid float literal.
    '''
    values = [float(line) for line in lines if line.strip()]
    count = len(values)
    if count == 0:
        return 0, 0.0, 0.0
    # fsum keeps the partial sums exactly rounded, so no numpy is needed
    # on the worker nodes.
    mean_value = fsum(values) / count
    mean_sq = fsum(value * value for value in values) / count
    return count, mean_value, mean_sq


def format_record(count, mean_value, mean_sq):
    '''Format one output record: count, mean, mean of squares (tab-separated).'''
    return "%d\t%f\t%f" % (count, mean_value, mean_sq)


def main():
    '''Summarize stdin and emit the record on stdout plus a note on stderr.'''
    count, mean_value, mean_sq = summarize(read_input(sys.stdin))
    print(format_record(count, mean_value, mean_sq))
    # Written to stderr so it never mixes with the record on stdout.
    print("report: still alive", file=sys.stderr)


if __name__ == "__main__":
    main()
'''
Created on 2017-04-06
Machine Learning in Action Chapter 18
Map Reduce Job for Hadoop Streaming
@author: Peter Harrington/ApacheCn-xy

Reducer for the distributed mean job.

Many mappers run in parallel and each emits one partial record
``count<TAB>mean<TAB>mean_of_squares``.  This reducer reads those records
from stdin and merges them, weighted by count, into a single record of the
same format.
'''
import sys


def read_input(file):
    '''Yield every line of *file* with trailing whitespace removed.'''
    for line in file:
        yield line.rstrip()


def combine(records):
    '''Merge partial mapper records into global statistics.

    Args:
        records: iterable of strings ``"count\\tmean\\tmean_of_squares"``.
            Blank lines are ignored.
    Returns:
        A tuple ``(total_count, mean, mean_of_squares)`` of floats; all
        zeros when no values were seen, instead of dividing by zero.
    Raises:
        ValueError: if a field is not a valid float literal.
        IndexError: if a non-blank record has fewer than three fields.
    '''
    total_count = 0.0
    weighted_sum = 0.0
    weighted_sq_sum = 0.0
    for record in records:
        if not record.strip():
            continue
        fields = record.split('\t')
        count = float(fields[0])
        partial_mean = float(fields[1])
        partial_mean_sq = float(fields[2])
        if count == 0:
            # An empty split carries no data; its mean may be NaN, and
            # 0 * NaN would poison the running sums.
            continue
        total_count += count
        # mean * count restores each mapper's partial sum.
        weighted_sum += count * partial_mean
        weighted_sq_sum += count * partial_mean_sq
    if total_count == 0:
        return 0.0, 0.0, 0.0
    return (total_count,
            weighted_sum / total_count,
            weighted_sq_sum / total_count)


def format_record(count, mean_value, mean_sq):
    '''Format one output record: count, mean, mean of squares (tab-separated).'''
    return "%d\t%f\t%f" % (count, mean_value, mean_sq)


def main():
    '''Merge the records on stdin and emit the global record on stdout.'''
    total_count, mean_value, mean_sq = combine(read_input(sys.stdin))
    # The third field is the mean of the squares, not the variance; the
    # variance is mean_sq - mean_value ** 2.
    print(format_record(total_count, mean_value, mean_sq))
    # Written to stderr so it never mixes with the record on stdout.
    print("report: still alive", file=sys.stderr)


if __name__ == "__main__":
    main()