mirror of
https://github.com/apachecn/ailearning.git
synced 2026-02-11 22:35:35 +08:00
添加15章的一些代码
This commit is contained in:
100
src/python/15.BigData_MapReduce/MR_inputFile.txt
Normal file
100
src/python/15.BigData_MapReduce/MR_inputFile.txt
Normal file
@@ -0,0 +1,100 @@
|
||||
0.970413
|
||||
0.901817
|
||||
0.828698
|
||||
0.197744
|
||||
0.466887
|
||||
0.962147
|
||||
0.187294
|
||||
0.388509
|
||||
0.243889
|
||||
0.115732
|
||||
0.616292
|
||||
0.713436
|
||||
0.761446
|
||||
0.944123
|
||||
0.200903
|
||||
0.547166
|
||||
0.800028
|
||||
0.848790
|
||||
0.001641
|
||||
0.058010
|
||||
0.859900
|
||||
0.009178
|
||||
0.736598
|
||||
0.683586
|
||||
0.142515
|
||||
0.212120
|
||||
0.752769
|
||||
0.546184
|
||||
0.652227
|
||||
0.583803
|
||||
0.812863
|
||||
0.036862
|
||||
0.075076
|
||||
0.257536
|
||||
0.431278
|
||||
0.600214
|
||||
0.985564
|
||||
0.055846
|
||||
0.905295
|
||||
0.336262
|
||||
0.198738
|
||||
0.845815
|
||||
0.527989
|
||||
0.448650
|
||||
0.235313
|
||||
0.599749
|
||||
0.443923
|
||||
0.968723
|
||||
0.911076
|
||||
0.279338
|
||||
0.569492
|
||||
0.635985
|
||||
0.267532
|
||||
0.975018
|
||||
0.463698
|
||||
0.842340
|
||||
0.065590
|
||||
0.233049
|
||||
0.810390
|
||||
0.448260
|
||||
0.431967
|
||||
0.549648
|
||||
0.703612
|
||||
0.187974
|
||||
0.231709
|
||||
0.784160
|
||||
0.072283
|
||||
0.921053
|
||||
0.735468
|
||||
0.715923
|
||||
0.150431
|
||||
0.661089
|
||||
0.734955
|
||||
0.633709
|
||||
0.216102
|
||||
0.498474
|
||||
0.195620
|
||||
0.339548
|
||||
0.245314
|
||||
0.819848
|
||||
0.521242
|
||||
0.549276
|
||||
0.200906
|
||||
0.202525
|
||||
0.922876
|
||||
0.025404
|
||||
0.604032
|
||||
0.752204
|
||||
0.158860
|
||||
0.651622
|
||||
0.592898
|
||||
0.500392
|
||||
0.410614
|
||||
0.968388
|
||||
0.265918
|
||||
0.565707
|
||||
0.413670
|
||||
0.080507
|
||||
0.929978
|
||||
0.609755
|
||||
33
src/python/15.BigData_MapReduce/mrMeanMapper.py
Normal file
33
src/python/15.BigData_MapReduce/mrMeanMapper.py
Normal file
@@ -0,0 +1,33 @@
|
||||
'''
|
||||
Created on 2017-04-06
|
||||
Machine Learning in Action Chapter 15
|
||||
Map Reduce Job for Hadoop Streaming
|
||||
@author: Peter Harrington/ApacheCn-xy
|
||||
'''
|
||||
|
||||
|
||||
'''
|
||||
这个mapper文件按行读取所有的输入并创建一组对应的浮点数,然后得到数组的长度并创建NumPy矩阵。
|
||||
再对所有的值进行平方,最后将均值和平方后的均值发送出去。这些值将用来计算全局的均值和方差。
|
||||
|
||||
Args:
|
||||
file 输入数据
|
||||
Return:
|
||||
|
||||
'''
|
||||
import sys
|
||||
from numpy import mat, mean, power
|
||||
|
||||
def read_input(file):
    """Lazily yield each line of ``file`` with trailing whitespace removed.

    Args:
        file: any iterable of text lines (for example ``sys.stdin``).

    Yields:
        One string per input line, stripped on the right. This is a
        generator, not a list: nothing is read until it is iterated.
    """
    yield from (raw.rstrip() for raw in file)
|
||||
|
||||
def compute_partial_stats(lines):
    """Summarize this mapper's share of the data.

    Args:
        lines: iterable of strings, each holding one number. Lines that are
            empty or whitespace-only are skipped.

    Returns:
        A ``(count, mean, mean_of_squares)`` tuple. When there are no
        numbers at all the result is ``(0, 0.0, 0.0)`` rather than NaN, so
        that a consumer weighting partial results by ``count`` is not
        poisoned by an empty split.

    Raises:
        ValueError: if a non-blank line cannot be parsed as a float.
    """
    values = [float(text) for text in lines if text.strip()]
    numInputs = len(values)
    if numInputs == 0:
        # mean() of an empty sequence is NaN (with a RuntimeWarning).
        return 0, 0.0, 0.0
    # mean() and power() accept a plain list, so no matrix wrapper is needed.
    return numInputs, float(mean(values)), float(mean(power(values, 2)))


def main():
    """Read numbers from stdin and emit ``count<TAB>mean<TAB>mean of squares``."""
    numInputs, avg, avgSq = compute_partial_stats(read_input(sys.stdin))
    # Data goes to stdout; the status message goes to stderr so the two
    # streams never mix.
    print("%d\t%f\t%f" % (numInputs, avg, avgSq))
    print("report: still alive", file=sys.stderr)


if __name__ == "__main__":
    main()
|
||||
43
src/python/15.BigData_MapReduce/mrMeanReducer.py
Normal file
43
src/python/15.BigData_MapReduce/mrMeanReducer.py
Normal file
@@ -0,0 +1,43 @@
|
||||
'''
|
||||
Created on 2017-04-06
|
||||
Machine Learning in Action Chapter 15
|
||||
Map Reduce Job for Hadoop Streaming
|
||||
@author: Peter Harrington/ApacheCn-xy
|
||||
'''
|
||||
|
||||
|
||||
'''
|
||||
mapper 接受原始的输入并产生中间值传递给 reducer。
|
||||
很多的mapper是并行执行的,所以需要将这些mapper的输出合并成一个值。
|
||||
即:将中间的 key/value 对进行组合。
|
||||
'''
|
||||
import sys
|
||||
from numpy import mat, mean, power
|
||||
|
||||
def read_input(file):
    """Lazily yield each line of ``file`` with trailing whitespace removed.

    Args:
        file: any iterable of text lines (for example ``sys.stdin``).

    Yields:
        One string per input line, stripped on the right. This is a
        generator, not a list: nothing is read until it is iterated.
    """
    yield from (raw.rstrip() for raw in file)
|
||||
|
||||
def combine_partial_stats(records):
    """Merge per-mapper statistics into global statistics.

    Args:
        records: iterable of field sequences. Each record holds, in order,
            a sample count, the mean of those samples and the mean of their
            squares (strings or numbers accepted by ``float``).

    Returns:
        A ``(total_count, mean, mean_of_squares)`` tuple of floats. Each
        partial mean is weighted by its count. With no samples at all the
        result is ``(0.0, 0.0, 0.0)`` instead of a division-by-zero error.

    Raises:
        ValueError: if a field cannot be parsed as a float.
        IndexError: if a record has fewer than three fields.
    """
    cumN = 0.0
    cumVal = 0.0
    cumSumSq = 0.0
    for instance in records:
        nj = float(instance[0])
        cumN += nj
        # count * mean recovers that mapper's raw sum (same for squares).
        cumVal += nj * float(instance[1])
        cumSumSq += nj * float(instance[2])

    if cumN == 0:
        return 0.0, 0.0, 0.0
    return cumN, cumVal / cumN, cumSumSq / cumN


def main():
    """Read mapper output from stdin and emit the combined statistics."""
    # Split every non-blank input line into its tab-separated fields.
    mapperOut = [line.split('\t')
                 for line in read_input(sys.stdin) if line.strip()]
    # Diagnostic dump goes to stderr: stdout must carry only the result line.
    print(mapperOut, file=sys.stderr)

    cumN, meanVal, meanSq = combine_partial_stats(mapperOut)

    # Output: total count, mean, mean of the squared values.
    # The last field is E[x^2], not the variance; variance = E[x^2] - mean^2.
    print("%d\t%f\t%f" % (cumN, meanVal, meanSq))
    print("report: still alive", file=sys.stderr)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user