From 0d1f7770fcdbad3c5d73e1b99f61443e8188666e Mon Sep 17 00:00:00 2001 From: Yao Fu Date: Sat, 12 Mar 2022 01:05:08 +0000 Subject: [PATCH] Fix figures (#76) * Fix a citation error * Fix figures #70 --- chapter_recommender_system/index.md | 1 - .../ch10-abstract-recommendation-systems.svg | 648 ++++++------- img/ch10/ch10-recommendation-models.svg | 470 +++++---- img/ch10/ch10-recommendation-systems.svg | 902 ++++++++++-------- 4 files changed, 1081 insertions(+), 940 deletions(-) diff --git a/chapter_recommender_system/index.md b/chapter_recommender_system/index.md index 47b0ade..0f19142 100644 --- a/chapter_recommender_system/index.md +++ b/chapter_recommender_system/index.md @@ -31,7 +31,6 @@ 正如上文提到的,嵌入表占据了推荐模型绝大部分存储而其更新具有显著的稀疏性,因此推荐系统通常采用上一章介绍的参数服务器架构来存储模型。具体来讲,所有参数被分布存储在一组参数服务器上,而训练服务器一方面从数据存储模块拉取训练数据,另一方面根据训练数据从参数服务器上拉取对应的嵌入项和所有稠密神经网络参数。训练服务器本地更新之后将本地梯度或新的参数发送回参数服务器以更新全局参数。全局参数更新可以选择全同步,半同步,或异步更新。类似的,推理服务器在接到一批用户的推荐请求后,从参数服务器拉取相应的嵌入项和稠密神经网络参数来响应用户的请求。为了提升训练(推理)的吞吐,可以在训练(推理)服务器上缓存一部分参数。 为了避免训练服务器和参数服务器之间的通信限制训练吞吐率,一些公司也在探索单机多GPU训练超大规模推荐系统。然而正如前文提到的,即使是单个推荐模型的参数量(~100GB)也超出了目前最新的GPU显存。有鉴于此,脸书公司的定制训练平台 - -- ZionEX :cite:`zionex`利用计算设备之间的高速链接将多台设备的存储共享起来可以单机训练TB级推荐模型。然而对于更大规模的模型或中小型企业、实验室,参数服务器架构依然是性价比最高的解决方案。 为了提升在发生故障的情况下的可用性,在线服务中的深度学习推荐模型通常都采用多副本分布式部署。同一个模型的多个副本通常会被部署在至少两个不同的地理区域内的多个数据中心中,如图 :numref:`ch10-recommendation-systems`,以应对大面积停电或者网络中断而导致整个地区的所有副本都不可用。除了容错方面的考虑,部署多个副本还有其他几点优势。首先,将模型部署在靠近用户的云服务器上可以提升响应速度。其次,部署多份副本也可以拓展模型推理服务的吞吐率。 diff --git a/img/ch10/ch10-abstract-recommendation-systems.svg b/img/ch10/ch10-abstract-recommendation-systems.svg index 837b789..93d6cff 100644 --- a/img/ch10/ch10-abstract-recommendation-systems.svg +++ b/img/ch10/ch10-abstract-recommendation-systems.svg @@ -2,11 +2,11 @@ + inkscape:current-layer="g1236" /> + id="defs2"> + id="clipPath25"> + + + + id="path903" /> + + + + transform="translate(-7.4951336,-56.575409)"> + id="g15" + inkscape:label="ch11-recommendation-models(1)" + 
transform="matrix(0.35277777,0,0,-0.35277777,44.08155,178.42815)"> + id="g17" /> - + id="g29"> - + id="g1236" + inkscape:label="ch11-abstract-recommendation-systems(1)" + transform="translate(-314.29928,7.6998617)"> + + + + + + + + + 模型 + + + 训练 + + + + + + + 数据 + + + 收集 + + + 推荐系统 + + + + 推荐服务的 + + + 全球 + + + 用户 + + + + + + + + 数据处理 + + + + + + + 数据 + + + 存储 + + + + + + + 模型 + + + 存储 + + + + + + + 推理 + + + 服务 + + + + + + + + - - - - - - Model - - - training - - - - - - - Data - - - collection - - - Recommendation System - - - - - - - Global Devices with Recommendation Service - - - - - - - - Data - - - processing - - - - - - - Data - - - storag - - - e - - - - - - - Model - - - storage - - - - - - - Inference - - - server - - - - - - - diff --git a/img/ch10/ch10-recommendation-models.svg b/img/ch10/ch10-recommendation-models.svg index 7583a7d..7bd4044 100644 --- a/img/ch10/ch10-recommendation-models.svg +++ b/img/ch10/ch10-recommendation-models.svg @@ -2,9 +2,9 @@ + inkscape:current-layer="g15" /> + id="clipPath25"> + d="M 0,0.1636355 H 536.9455 V 351.00004 H 0 Z" + id="path23" /> + transform="translate(27.764827,-33.462145)"> + id="g15" + inkscape:label="ch11-recommendation-models(1)" + transform="matrix(0.35277777,0,0,-0.35277777,-27.90913,154.3432)"> + id="g17" /> + id="g29"> + id="path31" /> + id="g33" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,70.29094,297.6909)"> + id="path35" /> + id="g37" + transform="matrix(0.2181818,0,0,0.2181818,383.0586,-67.03634)"> History of Interactions / Candidate Items + id="tspan39">交互历史 + + + / + + + 候选项 + id="path55" /> + id="path57" /> + id="g59" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,0.9090492,257.5454)"> + id="path61" /> + + + 1000M x 100 + transform="matrix(0.2181818,0,0,0.2181818,53.13408,-184.4182)"> 1000M x 100 - - - User Embedding + id="tspan71">用户嵌入表 + id="path75" /> + id="g77" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,202.2909,258.2)"> + id="path79" /> + + + 10M x 100 + 
transform="matrix(0.2181818,0,0,0.2181818,248.3355,-129.2182)"> 10M x 100 - - - Item Embedding + id="tspan89">内容嵌入表 + id="path93" /> + id="g95" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,202.2909,205.1818)"> + id="path97" /> + id="g99" + transform="matrix(0.2181818,0,0,0.2181818,224.3355,-172.8545)"> User Context + id="tspan101">用户上下文嵌入表 + + + + + transform="matrix(0.2181818,0,0,0.2181818,224.3355,-228.2727)"> Embedding + id="tspan113">内容上下文嵌入表 + transform="matrix(7.874016e-5,0,0,-7.874016e-5,8.327238,95.87277)"> + - Item Context - - - Embedding + id="g125" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,39.09094,88.0182)"> + + + + + + transform="matrix(7.874016e-5,0,0,-7.874016e-5,110.8728,27.36363)"> - - - - - - - - - - - - + transform="matrix(0.2181818,0,0,0.2181818,408.9256,-336.4909)"> Recommendations / Rank + id="tspan145">推荐结果 + + + 嵌入表 + + + 100s GB + + + + transform="matrix(0.2181818,0,0,0.2181818,466.1449,-186.8182)"> Embedding + id="tspan169">10s TB + transform="matrix(0.2181818,0,0,0.2181818,396.9702,-275.1818)"> 100s GB + id="tspan175">深度神经网络 + transform="matrix(0.2181818,0,0,0.2181818,375.9702,-296.7818)"> + sodipodi:role="line" + id="tspan181">10s GB + transform="matrix(0.2181818,0,0,0.2181818,440.9702,-296.7818)"> 10s TB + id="tspan187">– + transform="matrix(0.2181818,0,0,0.2181818,455.9702,-296.7818)"> Deep Neural Networks (DNNs) + id="tspan193">100s GB + transform="matrix(0.2181818,0,0,0.2181818,109.7371,-25.36362)"> 10s GB - - - - - - 100s GB - - - Deep Learning Recommendation Model (DLRM) + id="tspan199">深度学习推荐模型 diff --git a/img/ch10/ch10-recommendation-systems.svg b/img/ch10/ch10-recommendation-systems.svg index 3813057..0affaf3 100644 --- a/img/ch10/ch10-recommendation-systems.svg +++ b/img/ch10/ch10-recommendation-systems.svg @@ -2,11 +2,11 @@ + inkscape:current-layer="g31" /> + id="defs2"> + id="clipPath41"> + d="M 0,0.1090901 H 538.9091 V 329.99999 H 0 Z" + id="path39" /> + transform="translate(61.779345,-53.24402)"> + id="g31" + 
inkscape:label="ch11-recommendation-systems(1)" + transform="matrix(0.35277777,0,0,-0.35277777,-72.3985,168.34707)"> + id="g33" /> + id="g45" + transform="translate(29.128792,-2.9690309)"> + id="path47" /> + id="g49" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,1.47275,327.6546)"> + id="path51" /> + id="path53" /> + id="g55" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,220.0909,328.7454)"> + id="path57" /> + id="path59" /> + id="g61" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,394.4182,328.7454)"> + id="path63" /> + id="path65" /> + id="g67" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,31.36362,292.9636)"> + id="path69" /> + id="g71" + transform="matrix(0.2181818,0,0,0.2181818,52.0736,-54.87271)"> Parameter + id="tspan73">参数服务器 + id="g77" + transform="matrix(0.2181818,0,0,0.2181818,79.0736,-76.47271)"> Server Cluster + id="tspan79">集群 + id="g83" + transform="matrix(0.2181818,0,0,0.2181818,45.3236,-98.07271)"> (100s Servers) - - - - + id="tspan85">(100s + id="g89" + transform="matrix(0.2181818,0,0,0.2181818,89.3236,-98.07271)"> Training + id="tspan91">服务器 + id="g95" + transform="matrix(0.2181818,0,0,0.2181818,143.3236,-98.07271)"> Worker - - - Cluster - - - - - - - Parameter - - - Server - - - Replicas - - - (100s Servers) - - - - - - - Parameter - - - Server - - - Replicas - - - (100s Servers) - - - - - - - - - Inference - - - Cluster - - - - - - - Inference - - - Cluster - - - - - - Data Centre (DC) - - - 1 - - - DC 2 - - - DC 3 + id="tspan97">) + d="m 32.45456,137.7635 c 0,6.1856 5.01448,11.2001 11.20016,11.2001 H 153.0362 c 6.1857,0 11.2002,-5.0145 11.2002,-11.2001 V 92.96379 c 0,-6.18568 -5.0145,-11.20017 -11.2002,-11.20017 H 43.65472 c -6.18568,0 -11.20016,5.01449 -11.20016,11.20017 z" + style="fill:#e2f0d9;fill-opacity:1;fill-rule:evenodd;stroke:none" + id="path101" /> + id="g103" + transform="matrix(7.874016e-5,0,0,-7.874016e-5,32.45456,148.9636)"> + id="path105" /> + id="g107" + 
transform="matrix(0.2181818,0,0,0.2181818,53.21227,-209.5636)"> Global Devices with Recommendation Service + id="tspan109">训练服务器 + + + 集群 + d="m 226.2,224.818 c 0,6.1857 5.0145,11.2002 11.2002,11.2002 h 109.1632 c 6.1857,0 11.2002,-5.0145 11.2002,-11.2002 v -44.7996 c 0,-6.1857 -5.0145,-11.2002 -11.2002,-11.2002 H 237.4002 c -6.1857,0 -11.2002,5.0145 -11.2002,11.2002 z" + style="fill:#deebf7;fill-opacity:1;fill-rule:evenodd;stroke:none" + id="path119" /> + + + + + 参数服务 + + + + + + 副本 + + + ( + + + 100s + + + 服务器 + + + ) + + d="m 400.309,224.818 c 0,6.1857 5.0145,11.2002 11.2002,11.2002 h 109.3816 c 6.1856,0 11.2001,-5.0145 11.2001,-11.2002 v -44.7996 c 0,-6.1857 -5.0145,-11.2002 -11.2001,-11.2002 H 411.5092 c -6.1857,0 -11.2002,5.0145 -11.2002,11.2002 z" + style="fill:#deebf7;fill-opacity:1;fill-rule:evenodd;stroke:none" + id="path167" /> + + + + + 参数服务 + + + + + + 副本 + + + ( + + + 100s + + + 服务器 + + + ) + + id="path215" /> + + + + + + + 推理服务器 + + + 集群 + + + + + + + 推理服务器 + + + 集群 + + + + + + 数据 + + + 中心 + + + 1 + + + 数据 + + + 中心 + + + 2 + + + 数据 + + + 中心 + + + 3 + + + + + + + 推荐服务的全球用户 + + + +