Merge pull request #69 from chenyyx/master

添加15章的一些代码和数据
2026-06-15 23:06:32 +08:00 · 2017-04-06 23:53:46 +08:00
parent 304e0e5ed1 fa118c498c
commit 735d03d3a2
3 changed files with 176 additions and 0 deletions
--- a/src/python/15.BigData_MapReduce/mrMeanMapper.py
+++ b/src/python/15.BigData_MapReduce/mrMeanMapper.py
@@ -0,0 +1,33 @@
+'''
+Created on 2017-04-06
+Machine Learning in Action Chapter 15
+Map Reduce Job for Hadoop Streaming 
+@author: Peter Harrington/ApacheCn-xy
+'''
+
+
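+'''
+	This mapper reads all input line by line into a list of floats, takes its length and builds a NumPy matrix.
+	It then squares every value and emits the mean and the mean of the squares, which are used later to compute the global mean and variance.
+
+	Args:
+		file: the input data (one number per line, read from standard input)
+	Return:
+		one line on standard output: count, mean and mean of squares, tab-separated
+'''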
+import sys
+from numpy import mat, mean, power
+
+def read_input(file):
+    """Lazily yield each line of ``file`` with trailing whitespace (newline included) stripped."""
+    yield from (row.rstrip() for row in file)
+        
+# Parse stdin into floats; blank lines are skipped because float('') raises ValueError.
+samples = [float(line) for line in read_input(sys.stdin) if line]
+numInputs = len(samples)                # number of samples handled by this mapper
+if numInputs:                           # empty input: emit nothing, since "0 nan nan" would poison the reducer's sums
+    sampleMat = mat(samples)            # 1 x n NumPy matrix of the samples
+    sqInput = power(sampleMat, 2)       # element-wise squares
+    # Emit: number of samples <TAB> mean of the samples <TAB> mean of the squared samples
+    print("%d\t%f\t%f" % (numInputs, mean(sampleMat), mean(sqInput)))
+print("report: still alive", file=sys.stderr)
--- a/src/python/15.BigData_MapReduce/mrMeanReducer.py
+++ b/src/python/15.BigData_MapReduce/mrMeanReducer.py
@@ -0,0 +1,43 @@
+'''
+Created on 2017-04-06
+Machine Learning in Action Chapter 15
+Map Reduce Job for Hadoop Streaming 
+@author: Peter Harrington/ApacheCn-xy
+'''
+
+
+'''
+	mapper 接受原始的输入并产生中间值传递给 reducer。
+	很多的mapper是并行执行的，所以需要将这些mapper的输出合并成一个值。
+	即：将中间的 key/value 对进行组合。
+'''
+import sys
+from numpy import mat, mean, power
+
+def read_input(file):
+    """Generate the lines of ``file`` one at a time, minus trailing whitespace such as the newline."""
+    yield from (text.rstrip() for text in file)
+       
+# Each mapper line is "count<TAB>mean<TAB>mean of squares"; blank lines are skipped.
+mapperOut = [line.split('\t') for line in read_input(sys.stdin) if line]
+# Diagnostics go to stderr: under Hadoop Streaming everything written to stdout is job output.
+print(mapperOut, file=sys.stderr)
+
+# Running totals over all mappers: sample count, sum of values, sum of squared values.
+cumVal = 0.0
+cumSumSq = 0.0
+cumN = 0.0
+for instance in mapperOut:
+    nj = float(instance[0])
+    if nj <= 0:                              # an empty mapper may report "0 nan nan"; 0 * nan would poison the sums
+        continue
+    cumN += nj
+    cumVal += nj * float(instance[1])        # count * mean == sum of that mapper's values
+    cumSumSq += nj * float(instance[2])      # count * mean of squares == sum of that mapper's squares
+
+# Emit "count<TAB>mean<TAB>mean of squares" (variance = meanSq - meanVal ** 2); emit nothing without samples.
+if cumN:
+    meanVal = cumVal / cumN                  # not named "mean", which would shadow the imported numpy.mean
+    meanSq = cumSumSq / cumN
+    print("%d\t%f\t%f" % (cumN, meanVal, meanSq))
+print("report: still alive", file=sys.stderr)