From f974d3e841297bc4d6d9ecf083e62f5ca93a2db4 Mon Sep 17 00:00:00 2001 From: Yeqi Huang Date: Mon, 9 Mar 2026 17:32:24 +0000 Subject: [PATCH] Split book into English and Chinese builds with framework update (#487) * docs: split the book into English and Chinese builds * feat: update english version framework * fix: fix ci --- .github/workflows/main.yml | 83 +- .github/workflows/update_docs.yml | 67 +- README.md | 167 ++- README_EN.md | 128 ++ build_html.sh | 22 +- build_html_zh.sh | 28 + .../distributed_node_rl.md | 43 - .../classic_machine_learning.md | 1 + .../gradient_descent.md | 1 + .../index.md | 1 + .../neural_network.md | 1 + .../accelerator_architecture.md | 1 + .../accelerator_introduction.md | 1 + .../accelerator_practise.md | 1 + .../accelerator_programming.md | 1 + en_chapters/chapter_accelerator/index.md | 1 + en_chapters/chapter_accelerator/summary.md | 1 + .../compute_schedule_and_execute.md | 1 + .../graph_optimizer.md | 1 + .../chapter_backend_and_runtime/index.md | 1 + .../kernel_selecter.md | 1 + .../memory_allocator.md | 1 + .../op_compiler.md | 1 + .../chapter_backend_and_runtime/overview.md | 1 + .../chapter_backend_and_runtime/summary.md | 1 + .../background_and_functionality.md | 1 + .../components_of_computational_graph.md | 1 + .../generation_of_computational_graph.md | 1 + .../chapter_computational_graph/index.md | 1 + .../schedule_of_computational_graph.md | 1 + .../chapter_computational_graph/summary.md | 1 + .../chapter_data_processing/data_order.md | 1 + .../chapter_data_processing/extension.md | 1 + en_chapters/chapter_data_processing/index.md | 1 + .../chapter_data_processing/performance.md | 1 + .../chapter_data_processing/program_model.md | 1 + .../chapter_data_processing/requirements.md | 1 + .../chapter_data_processing/summary.md | 1 + .../chapter_distributed_training/cluster.md | 1 + .../collective.md | 1 + .../chapter_distributed_training/index.md | 1 + .../chapter_distributed_training/methods.md | 1 + 
.../chapter_distributed_training/overview.md | 1 + .../parameter_servers.md | 1 + .../chapter_distributed_training/summary.md | 1 + .../chapter_explainable_AI/explainable_ai.md | 1 + en_chapters/chapter_explainable_AI/index.md | 1 + .../horizontal_fl.md | 1 + .../chapter_federated_learning/index.md | 1 + .../chapter_federated_learning/outlook.md | 1 + .../chapter_federated_learning/overview.md | 1 + .../privacy_encryption_algorithm.md | 1 + .../chapter_federated_learning/summary.md | 1 + .../chapter_federated_learning/vertical_fl.md | 1 + en_chapters/chapter_frontend_and_ir/ad.md | 1 + .../ai_compiler_design_principle.md | 1 + .../common_frontend_optimization_pass.md | 1 + en_chapters/chapter_frontend_and_ir/index.md | 1 + .../intermediate_representation.md | 1 + .../overview_of_frontend.md | 1 + .../chapter_frontend_and_ir/summary.md | 1 + .../type_system_and_static_analysis.md | 1 + .../chapter_introduction/applications.md | 1 + .../chapter_introduction/architecture.md | 1 + en_chapters/chapter_introduction/design.md | 1 + en_chapters/chapter_introduction/ecosystem.md | 1 + en_chapters/chapter_introduction/index.md | 1 + en_chapters/chapter_introduction/readers.md | 1 + en_chapters/chapter_model_deployment/index.md | 1 + .../model_compression.md | 1 + .../model_converter_and_optimizer.md | 1 + .../model_deployment_introduction.md | 1 + .../model_inference.md | 1 + .../model_security.md | 1 + .../chapter_model_deployment/summary.md | 1 + en_chapters/chapter_preface/index.md | 1 + en_chapters/chapter_preface_advanced/index.md | 1 + .../chapter_preface_extension/index.md | 1 + .../c_python_interaction.md | 1 + .../development_history.md | 1 + .../chapter_programming_interface/index.md | 1 + .../ml_programming_paradigm.md | 1 + .../ml_workflow.md | 1 + .../neural_network_layer.md | 1 + .../chapter_programming_interface/summary.md | 1 + .../chapter_recommender_system/case_study.md | 1 + .../chapter_recommender_system/index.md | 1 + .../model_update.md | 1 + 
.../multi_stage_recommender_system.md | 1 + .../chapter_recommender_system/summary.md | 1 + .../system_architecture.md | 1 + .../chapter_reinforcement_learning/index.md | 1 + .../chapter_reinforcement_learning/marl.md | 1 + .../marl_sys.md | 1 + .../rl_introduction.md | 1 + .../single_node_rl.md | 1 + .../chapter_reinforcement_learning/summary.md | 1 + en_chapters/chapter_rl_sys/control.md | 1 + en_chapters/chapter_rl_sys/control_code_ex.md | 1 + en_chapters/chapter_rl_sys/index.md | 1 + en_chapters/chapter_rl_sys/perception.md | 1 + .../chapter_rl_sys/perception_code_ex.md | 1 + en_chapters/chapter_rl_sys/planning.md | 1 + .../chapter_rl_sys/planning_code_ex.md | 1 + en_chapters/chapter_rl_sys/rl_sys_intro.md | 1 + en_chapters/chapter_rl_sys/robot_learning.md | 1 + en_chapters/chapter_rl_sys/robot_safety.md | 1 + en_chapters/chapter_rl_sys/ros.md | 1 + en_chapters/chapter_rl_sys/ros_code_ex.md | 1 + en_chapters/chapter_rl_sys/summary.md | 1 + config.ini => en_chapters/config.ini | 5 +- en_chapters/img | 1 + en_chapters/index.md | 41 + en_chapters/mlsys.bib | 1308 +++++++++++++++++ en_chapters/references | 1 + en_chapters/static | 1 + requirements.txt | 7 +- .../classic_machine_learning.md | 0 .../gradient_descent.md | 0 .../index.md | 0 .../neural_network.md | 0 .../accelerator_architecture.md | 0 .../accelerator_introduction.md | 0 .../accelerator_practise.md | 0 .../accelerator_programming.md | 0 .../chapter_accelerator}/index.md | 0 .../chapter_accelerator}/summary.md | 0 .../compute_schedule_and_execute.md | 462 +++--- .../graph_optimizer.md | 112 +- .../chapter_backend_and_runtime}/index.md | 60 +- .../kernel_selecter.md | 168 +-- .../memory_allocator.md | 136 +- .../op_compiler.md | 0 .../chapter_backend_and_runtime}/overview.md | 58 +- .../chapter_backend_and_runtime}/summary.md | 54 +- .../background_and_functionality.md | 0 .../components_of_computational_graph.md | 0 .../generation_of_computational_graph.md | 0 .../chapter_computational_graph}/index.md | 
0 .../schedule_of_computational_graph.md | 0 .../chapter_computational_graph}/summary.md | 0 .../chapter_data_processing}/data_order.md | 0 .../chapter_data_processing}/extension.md | 0 .../chapter_data_processing}/index.md | 0 .../chapter_data_processing}/performance.md | 0 .../chapter_data_processing}/program_model.md | 0 .../chapter_data_processing}/requirements.md | 0 .../chapter_data_processing}/summary.md | 0 .../chapter_distributed_training}/cluster.md | 0 .../collective.md | 0 .../chapter_distributed_training}/index.md | 0 .../chapter_distributed_training}/methods.md | 0 .../chapter_distributed_training}/overview.md | 0 .../parameter_servers.md | 0 .../chapter_distributed_training}/summary.md | 0 .../chapter_explainable_AI}/explainable_ai.md | 0 .../chapter_explainable_AI}/index.md | 0 .../horizontal_fl.md | 114 +- .../chapter_federated_learning}/index.md | 0 .../chapter_federated_learning}/outlook.md | 70 +- .../chapter_federated_learning}/overview.md | 0 .../privacy_encryption_algorithm.md | 280 ++-- .../chapter_federated_learning}/summary.md | 6 +- .../vertical_fl.md | 122 +- .../chapter_frontend_and_ir}/ad.md | 0 .../ai_compiler_design_principle.md | 0 .../common_frontend_optimization_pass.md | 0 .../chapter_frontend_and_ir}/index.md | 0 .../intermediate_representation.md | 0 .../overview_of_frontend.md | 0 .../chapter_frontend_and_ir}/summary.md | 0 .../type_system_and_static_analysis.md | 0 .../chapter_introduction}/applications.md | 0 .../chapter_introduction}/architecture.md | 0 .../chapter_introduction}/design.md | 0 .../chapter_introduction}/ecosystem.md | 0 .../chapter_introduction}/index.md | 0 .../chapter_introduction}/readers.md | 0 .../chapter_model_deployment}/index.md | 0 .../model_compression.md | 0 .../model_converter_and_optimizer.md | 0 .../model_deployment_introduction.md | 0 .../model_inference.md | 0 .../model_security.md | 0 .../chapter_model_deployment}/summary.md | 0 .../chapter_preface}/index.md | 0 
.../chapter_preface_advanced}/index.md | 0 .../chapter_preface_extension}/index.md | 0 .../c_python_interaction.md | 0 .../development_history.md | 0 .../chapter_programming_interface}/index.md | 0 .../ml_programming_paradigm.md | 0 .../ml_workflow.md | 0 .../neural_network_layer.md | 0 .../chapter_programming_interface}/summary.md | 0 .../chapter_recommender_system}/case_study.md | 0 .../chapter_recommender_system}/index.md | 0 .../model_update.md | 0 .../multi_stage_recommender_system.md | 0 .../chapter_recommender_system}/summary.md | 0 .../system_architecture.md | 0 .../chapter_reinforcement_learning}/index.md | 0 .../chapter_reinforcement_learning}/marl.md | 80 +- .../marl_sys.md | 80 +- .../rl_introduction.md | 54 +- .../single_node_rl.md | 40 +- .../summary.md | 12 +- .../chapter_rl_sys}/control.md | 0 .../chapter_rl_sys}/control_code_ex.md | 0 .../chapter_rl_sys}/index.md | 0 .../chapter_rl_sys}/perception.md | 0 .../chapter_rl_sys}/perception_code_ex.md | 0 .../chapter_rl_sys}/planning.md | 0 .../chapter_rl_sys}/planning_code_ex.md | 0 .../chapter_rl_sys}/rl_sys_intro.md | 0 .../chapter_rl_sys}/robot_learning.md | 0 .../chapter_rl_sys}/robot_safety.md | 0 .../chapter_rl_sys}/ros.md | 0 .../chapter_rl_sys}/ros_code_ex.md | 0 .../chapter_rl_sys}/summary.md | 0 zh_chapters/config.ini | 79 + zh_chapters/img | 1 + index.md => zh_chapters/index.md | 0 zh_chapters/mlsys.bib | 1307 ++++++++++++++++ zh_chapters/references | 1 + zh_chapters/static | 1 + 226 files changed, 4206 insertions(+), 1096 deletions(-) create mode 100644 README_EN.md create mode 100755 build_html_zh.sh delete mode 100644 chapter_reinforcement_learning/distributed_node_rl.md create mode 100644 en_chapters/appendix_machine_learning_introduction/classic_machine_learning.md create mode 100644 en_chapters/appendix_machine_learning_introduction/gradient_descent.md create mode 100644 en_chapters/appendix_machine_learning_introduction/index.md create mode 100644 
en_chapters/appendix_machine_learning_introduction/neural_network.md create mode 100644 en_chapters/chapter_accelerator/accelerator_architecture.md create mode 100644 en_chapters/chapter_accelerator/accelerator_introduction.md create mode 100644 en_chapters/chapter_accelerator/accelerator_practise.md create mode 100644 en_chapters/chapter_accelerator/accelerator_programming.md create mode 100644 en_chapters/chapter_accelerator/index.md create mode 100644 en_chapters/chapter_accelerator/summary.md create mode 100644 en_chapters/chapter_backend_and_runtime/compute_schedule_and_execute.md create mode 100644 en_chapters/chapter_backend_and_runtime/graph_optimizer.md create mode 100644 en_chapters/chapter_backend_and_runtime/index.md create mode 100644 en_chapters/chapter_backend_and_runtime/kernel_selecter.md create mode 100644 en_chapters/chapter_backend_and_runtime/memory_allocator.md create mode 100644 en_chapters/chapter_backend_and_runtime/op_compiler.md create mode 100644 en_chapters/chapter_backend_and_runtime/overview.md create mode 100644 en_chapters/chapter_backend_and_runtime/summary.md create mode 100644 en_chapters/chapter_computational_graph/background_and_functionality.md create mode 100644 en_chapters/chapter_computational_graph/components_of_computational_graph.md create mode 100644 en_chapters/chapter_computational_graph/generation_of_computational_graph.md create mode 100644 en_chapters/chapter_computational_graph/index.md create mode 100644 en_chapters/chapter_computational_graph/schedule_of_computational_graph.md create mode 100644 en_chapters/chapter_computational_graph/summary.md create mode 100644 en_chapters/chapter_data_processing/data_order.md create mode 100644 en_chapters/chapter_data_processing/extension.md create mode 100644 en_chapters/chapter_data_processing/index.md create mode 100644 en_chapters/chapter_data_processing/performance.md create mode 100644 en_chapters/chapter_data_processing/program_model.md create mode 100644 
en_chapters/chapter_data_processing/requirements.md create mode 100644 en_chapters/chapter_data_processing/summary.md create mode 100644 en_chapters/chapter_distributed_training/cluster.md create mode 100644 en_chapters/chapter_distributed_training/collective.md create mode 100644 en_chapters/chapter_distributed_training/index.md create mode 100644 en_chapters/chapter_distributed_training/methods.md create mode 100644 en_chapters/chapter_distributed_training/overview.md create mode 100644 en_chapters/chapter_distributed_training/parameter_servers.md create mode 100644 en_chapters/chapter_distributed_training/summary.md create mode 100644 en_chapters/chapter_explainable_AI/explainable_ai.md create mode 100644 en_chapters/chapter_explainable_AI/index.md create mode 100644 en_chapters/chapter_federated_learning/horizontal_fl.md create mode 100644 en_chapters/chapter_federated_learning/index.md create mode 100644 en_chapters/chapter_federated_learning/outlook.md create mode 100644 en_chapters/chapter_federated_learning/overview.md create mode 100644 en_chapters/chapter_federated_learning/privacy_encryption_algorithm.md create mode 100644 en_chapters/chapter_federated_learning/summary.md create mode 100644 en_chapters/chapter_federated_learning/vertical_fl.md create mode 100644 en_chapters/chapter_frontend_and_ir/ad.md create mode 100644 en_chapters/chapter_frontend_and_ir/ai_compiler_design_principle.md create mode 100644 en_chapters/chapter_frontend_and_ir/common_frontend_optimization_pass.md create mode 100644 en_chapters/chapter_frontend_and_ir/index.md create mode 100644 en_chapters/chapter_frontend_and_ir/intermediate_representation.md create mode 100644 en_chapters/chapter_frontend_and_ir/overview_of_frontend.md create mode 100644 en_chapters/chapter_frontend_and_ir/summary.md create mode 100644 en_chapters/chapter_frontend_and_ir/type_system_and_static_analysis.md create mode 100644 en_chapters/chapter_introduction/applications.md create mode 100644 
en_chapters/chapter_introduction/architecture.md create mode 100644 en_chapters/chapter_introduction/design.md create mode 100644 en_chapters/chapter_introduction/ecosystem.md create mode 100644 en_chapters/chapter_introduction/index.md create mode 100644 en_chapters/chapter_introduction/readers.md create mode 100644 en_chapters/chapter_model_deployment/index.md create mode 100644 en_chapters/chapter_model_deployment/model_compression.md create mode 100644 en_chapters/chapter_model_deployment/model_converter_and_optimizer.md create mode 100644 en_chapters/chapter_model_deployment/model_deployment_introduction.md create mode 100644 en_chapters/chapter_model_deployment/model_inference.md create mode 100644 en_chapters/chapter_model_deployment/model_security.md create mode 100644 en_chapters/chapter_model_deployment/summary.md create mode 100644 en_chapters/chapter_preface/index.md create mode 100644 en_chapters/chapter_preface_advanced/index.md create mode 100644 en_chapters/chapter_preface_extension/index.md create mode 100644 en_chapters/chapter_programming_interface/c_python_interaction.md create mode 100644 en_chapters/chapter_programming_interface/development_history.md create mode 100644 en_chapters/chapter_programming_interface/index.md create mode 100644 en_chapters/chapter_programming_interface/ml_programming_paradigm.md create mode 100644 en_chapters/chapter_programming_interface/ml_workflow.md create mode 100644 en_chapters/chapter_programming_interface/neural_network_layer.md create mode 100644 en_chapters/chapter_programming_interface/summary.md create mode 100644 en_chapters/chapter_recommender_system/case_study.md create mode 100644 en_chapters/chapter_recommender_system/index.md create mode 100644 en_chapters/chapter_recommender_system/model_update.md create mode 100644 en_chapters/chapter_recommender_system/multi_stage_recommender_system.md create mode 100644 en_chapters/chapter_recommender_system/summary.md create mode 100644 
en_chapters/chapter_recommender_system/system_architecture.md create mode 100644 en_chapters/chapter_reinforcement_learning/index.md create mode 100644 en_chapters/chapter_reinforcement_learning/marl.md create mode 100644 en_chapters/chapter_reinforcement_learning/marl_sys.md create mode 100644 en_chapters/chapter_reinforcement_learning/rl_introduction.md create mode 100644 en_chapters/chapter_reinforcement_learning/single_node_rl.md create mode 100644 en_chapters/chapter_reinforcement_learning/summary.md create mode 100644 en_chapters/chapter_rl_sys/control.md create mode 100644 en_chapters/chapter_rl_sys/control_code_ex.md create mode 100644 en_chapters/chapter_rl_sys/index.md create mode 100644 en_chapters/chapter_rl_sys/perception.md create mode 100644 en_chapters/chapter_rl_sys/perception_code_ex.md create mode 100644 en_chapters/chapter_rl_sys/planning.md create mode 100644 en_chapters/chapter_rl_sys/planning_code_ex.md create mode 100644 en_chapters/chapter_rl_sys/rl_sys_intro.md create mode 100644 en_chapters/chapter_rl_sys/robot_learning.md create mode 100644 en_chapters/chapter_rl_sys/robot_safety.md create mode 100644 en_chapters/chapter_rl_sys/ros.md create mode 100644 en_chapters/chapter_rl_sys/ros_code_ex.md create mode 100644 en_chapters/chapter_rl_sys/summary.md rename config.ini => en_chapters/config.ini (80%) create mode 120000 en_chapters/img create mode 100644 en_chapters/index.md create mode 100644 en_chapters/mlsys.bib create mode 120000 en_chapters/references create mode 120000 en_chapters/static rename {appendix_machine_learning_introduction => zh_chapters/appendix_machine_learning_introduction}/classic_machine_learning.md (100%) rename {appendix_machine_learning_introduction => zh_chapters/appendix_machine_learning_introduction}/gradient_descent.md (100%) rename {appendix_machine_learning_introduction => zh_chapters/appendix_machine_learning_introduction}/index.md (100%) rename {appendix_machine_learning_introduction => 
zh_chapters/appendix_machine_learning_introduction}/neural_network.md (100%) rename {chapter_accelerator => zh_chapters/chapter_accelerator}/accelerator_architecture.md (100%) rename {chapter_accelerator => zh_chapters/chapter_accelerator}/accelerator_introduction.md (100%) rename {chapter_accelerator => zh_chapters/chapter_accelerator}/accelerator_practise.md (100%) rename {chapter_accelerator => zh_chapters/chapter_accelerator}/accelerator_programming.md (100%) rename {chapter_accelerator => zh_chapters/chapter_accelerator}/index.md (100%) rename {chapter_accelerator => zh_chapters/chapter_accelerator}/summary.md (100%) rename {chapter_backend_and_runtime => zh_chapters/chapter_backend_and_runtime}/compute_schedule_and_execute.md (98%) rename {chapter_backend_and_runtime => zh_chapters/chapter_backend_and_runtime}/graph_optimizer.md (98%) rename {chapter_backend_and_runtime => zh_chapters/chapter_backend_and_runtime}/index.md (97%) rename {chapter_backend_and_runtime => zh_chapters/chapter_backend_and_runtime}/kernel_selecter.md (98%) rename {chapter_backend_and_runtime => zh_chapters/chapter_backend_and_runtime}/memory_allocator.md (99%) rename {chapter_backend_and_runtime => zh_chapters/chapter_backend_and_runtime}/op_compiler.md (100%) rename {chapter_backend_and_runtime => zh_chapters/chapter_backend_and_runtime}/overview.md (99%) rename {chapter_backend_and_runtime => zh_chapters/chapter_backend_and_runtime}/summary.md (99%) rename {chapter_computational_graph => zh_chapters/chapter_computational_graph}/background_and_functionality.md (100%) rename {chapter_computational_graph => zh_chapters/chapter_computational_graph}/components_of_computational_graph.md (100%) rename {chapter_computational_graph => zh_chapters/chapter_computational_graph}/generation_of_computational_graph.md (100%) rename {chapter_computational_graph => zh_chapters/chapter_computational_graph}/index.md (100%) rename {chapter_computational_graph => 
zh_chapters/chapter_computational_graph}/schedule_of_computational_graph.md (100%) rename {chapter_computational_graph => zh_chapters/chapter_computational_graph}/summary.md (100%) rename {chapter_data_processing => zh_chapters/chapter_data_processing}/data_order.md (100%) rename {chapter_data_processing => zh_chapters/chapter_data_processing}/extension.md (100%) rename {chapter_data_processing => zh_chapters/chapter_data_processing}/index.md (100%) rename {chapter_data_processing => zh_chapters/chapter_data_processing}/performance.md (100%) rename {chapter_data_processing => zh_chapters/chapter_data_processing}/program_model.md (100%) rename {chapter_data_processing => zh_chapters/chapter_data_processing}/requirements.md (100%) rename {chapter_data_processing => zh_chapters/chapter_data_processing}/summary.md (100%) rename {chapter_distributed_training => zh_chapters/chapter_distributed_training}/cluster.md (100%) rename {chapter_distributed_training => zh_chapters/chapter_distributed_training}/collective.md (100%) rename {chapter_distributed_training => zh_chapters/chapter_distributed_training}/index.md (100%) rename {chapter_distributed_training => zh_chapters/chapter_distributed_training}/methods.md (100%) rename {chapter_distributed_training => zh_chapters/chapter_distributed_training}/overview.md (100%) rename {chapter_distributed_training => zh_chapters/chapter_distributed_training}/parameter_servers.md (100%) rename {chapter_distributed_training => zh_chapters/chapter_distributed_training}/summary.md (100%) rename {chapter_explainable_AI => zh_chapters/chapter_explainable_AI}/explainable_ai.md (100%) rename {chapter_explainable_AI => zh_chapters/chapter_explainable_AI}/index.md (100%) rename {chapter_federated_learning => zh_chapters/chapter_federated_learning}/horizontal_fl.md (98%) rename {chapter_federated_learning => zh_chapters/chapter_federated_learning}/index.md (100%) rename {chapter_federated_learning => 
zh_chapters/chapter_federated_learning}/outlook.md (99%) rename {chapter_federated_learning => zh_chapters/chapter_federated_learning}/overview.md (100%) rename {chapter_federated_learning => zh_chapters/chapter_federated_learning}/privacy_encryption_algorithm.md (97%) rename {chapter_federated_learning => zh_chapters/chapter_federated_learning}/summary.md (99%) rename {chapter_federated_learning => zh_chapters/chapter_federated_learning}/vertical_fl.md (99%) rename {chapter_frontend_and_ir => zh_chapters/chapter_frontend_and_ir}/ad.md (100%) rename {chapter_frontend_and_ir => zh_chapters/chapter_frontend_and_ir}/ai_compiler_design_principle.md (100%) rename {chapter_frontend_and_ir => zh_chapters/chapter_frontend_and_ir}/common_frontend_optimization_pass.md (100%) rename {chapter_frontend_and_ir => zh_chapters/chapter_frontend_and_ir}/index.md (100%) rename {chapter_frontend_and_ir => zh_chapters/chapter_frontend_and_ir}/intermediate_representation.md (100%) rename {chapter_frontend_and_ir => zh_chapters/chapter_frontend_and_ir}/overview_of_frontend.md (100%) rename {chapter_frontend_and_ir => zh_chapters/chapter_frontend_and_ir}/summary.md (100%) rename {chapter_frontend_and_ir => zh_chapters/chapter_frontend_and_ir}/type_system_and_static_analysis.md (100%) rename {chapter_introduction => zh_chapters/chapter_introduction}/applications.md (100%) rename {chapter_introduction => zh_chapters/chapter_introduction}/architecture.md (100%) rename {chapter_introduction => zh_chapters/chapter_introduction}/design.md (100%) rename {chapter_introduction => zh_chapters/chapter_introduction}/ecosystem.md (100%) rename {chapter_introduction => zh_chapters/chapter_introduction}/index.md (100%) rename {chapter_introduction => zh_chapters/chapter_introduction}/readers.md (100%) rename {chapter_model_deployment => zh_chapters/chapter_model_deployment}/index.md (100%) rename {chapter_model_deployment => zh_chapters/chapter_model_deployment}/model_compression.md (100%) rename 
{chapter_model_deployment => zh_chapters/chapter_model_deployment}/model_converter_and_optimizer.md (100%) rename {chapter_model_deployment => zh_chapters/chapter_model_deployment}/model_deployment_introduction.md (100%) rename {chapter_model_deployment => zh_chapters/chapter_model_deployment}/model_inference.md (100%) rename {chapter_model_deployment => zh_chapters/chapter_model_deployment}/model_security.md (100%) rename {chapter_model_deployment => zh_chapters/chapter_model_deployment}/summary.md (100%) rename {chapter_preface => zh_chapters/chapter_preface}/index.md (100%) rename {chapter_preface_advanced => zh_chapters/chapter_preface_advanced}/index.md (100%) rename {chapter_preface_extension => zh_chapters/chapter_preface_extension}/index.md (100%) rename {chapter_programming_interface => zh_chapters/chapter_programming_interface}/c_python_interaction.md (100%) rename {chapter_programming_interface => zh_chapters/chapter_programming_interface}/development_history.md (100%) rename {chapter_programming_interface => zh_chapters/chapter_programming_interface}/index.md (100%) rename {chapter_programming_interface => zh_chapters/chapter_programming_interface}/ml_programming_paradigm.md (100%) rename {chapter_programming_interface => zh_chapters/chapter_programming_interface}/ml_workflow.md (100%) rename {chapter_programming_interface => zh_chapters/chapter_programming_interface}/neural_network_layer.md (100%) rename {chapter_programming_interface => zh_chapters/chapter_programming_interface}/summary.md (100%) rename {chapter_recommender_system => zh_chapters/chapter_recommender_system}/case_study.md (100%) rename {chapter_recommender_system => zh_chapters/chapter_recommender_system}/index.md (100%) rename {chapter_recommender_system => zh_chapters/chapter_recommender_system}/model_update.md (100%) rename {chapter_recommender_system => zh_chapters/chapter_recommender_system}/multi_stage_recommender_system.md (100%) rename {chapter_recommender_system => 
zh_chapters/chapter_recommender_system}/summary.md (100%) rename {chapter_recommender_system => zh_chapters/chapter_recommender_system}/system_architecture.md (100%) rename {chapter_reinforcement_learning => zh_chapters/chapter_reinforcement_learning}/index.md (100%) rename {chapter_reinforcement_learning => zh_chapters/chapter_reinforcement_learning}/marl.md (99%) rename {chapter_reinforcement_learning => zh_chapters/chapter_reinforcement_learning}/marl_sys.md (99%) rename {chapter_reinforcement_learning => zh_chapters/chapter_reinforcement_learning}/rl_introduction.md (99%) rename {chapter_reinforcement_learning => zh_chapters/chapter_reinforcement_learning}/single_node_rl.md (99%) rename {chapter_reinforcement_learning => zh_chapters/chapter_reinforcement_learning}/summary.md (99%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/control.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/control_code_ex.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/index.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/perception.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/perception_code_ex.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/planning.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/planning_code_ex.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/rl_sys_intro.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/robot_learning.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/robot_safety.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/ros.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/ros_code_ex.md (100%) rename {chapter_rl_sys => zh_chapters/chapter_rl_sys}/summary.md (100%) create mode 100644 zh_chapters/config.ini create mode 120000 zh_chapters/img rename index.md => zh_chapters/index.md (100%) create mode 100644 zh_chapters/mlsys.bib create mode 120000 zh_chapters/references create mode 
120000 zh_chapters/static diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e395dda..8b79f0f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,27 +1,78 @@ name: CI on: -- push -- pull_request -- workflow_dispatch # Allows you to run this workflow manually from the Actions tab + push: + pull_request: + workflow_dispatch: jobs: - build: - runs-on: ubuntu-20.04 + build-en: + name: Build (English) + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 - - uses: s-weigand/setup-conda@v1 - - name: Set up Python 3.8 - uses: actions/setup-python@v3 - with: - python-version: '3.8' - - run: conda config --append channels conda-forge - - run: python3 -m pip install -r requirements.txt - - run: conda install -y pandoc==2.17 + - uses: actions/checkout@v4 - - run: | + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + + - name: Install pandoc + run: | + wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb + sudo dpkg -i pandoc-2.19.2-1-amd64.deb + + - name: Install d2lbook + run: | git clone https://github.com/openmlsys/d2l-book.git cd d2l-book + # Fix Python 3.10+ incompatibility: bibtex<2.0.0 depends on oset which + # uses collections.MutableSet removed in Python 3.10. + sed -i "s/'sphinxcontrib-bibtex<2.0.0'/'sphinxcontrib-bibtex>=2.5.0'/" setup.py python3 -m pip install . 
- - run: d2lbook build html + + - name: Install Python dependencies + run: python3 -m pip install -r requirements.txt + + - name: Build English HTML + run: bash build_html.sh + + build-zh: + name: Build (Chinese) + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + + - name: Install pandoc + run: | + wget -q https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb + sudo dpkg -i pandoc-2.19.2-1-amd64.deb + + - name: Install d2lbook + run: | + git clone https://github.com/openmlsys/d2l-book.git + cd d2l-book + sed -i "s/'sphinxcontrib-bibtex<2.0.0'/'sphinxcontrib-bibtex>=2.5.0'/" setup.py + python3 -m pip install . + + - name: Install Python dependencies + run: python3 -m pip install -r requirements.txt + + - name: Build Chinese HTML + run: bash build_html_zh.sh + + build: + name: build + needs: [build-en, build-zh] + runs-on: ubuntu-22.04 + steps: + - run: echo "All builds passed" diff --git a/.github/workflows/update_docs.yml b/.github/workflows/update_docs.yml index ed97b02..bf957f8 100644 --- a/.github/workflows/update_docs.yml +++ b/.github/workflows/update_docs.yml @@ -1,5 +1,4 @@ - -name: CI +name: Deploy Docs on: pull_request: @@ -7,30 +6,56 @@ on: - closed jobs: - if_merged: + deploy: if: github.event.pull_request.merged == true - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: s-weigand/setup-conda@v1 - - run: conda config --append channels conda-forge - - run: python3 -m pip install -r requirements.txt - - run: conda install -y pandoc==2.17 - - run: pip install sphinx-mathjax-offline + runs-on: ubuntu-22.04 - - run: | + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + + - name: Install pandoc + run: | + wget -q 
https://github.com/jgm/pandoc/releases/download/2.19.2/pandoc-2.19.2-1-amd64.deb + sudo dpkg -i pandoc-2.19.2-1-amd64.deb + + - name: Install d2lbook + run: | git clone https://github.com/openmlsys/d2l-book.git cd d2l-book + # Fix Python 3.10+ incompatibility: bibtex<2.0.0 depends on oset which + # uses collections.MutableSet removed in Python 3.10. + sed -i "s/'sphinxcontrib-bibtex<2.0.0'/'sphinxcontrib-bibtex>=2.5.0'/" setup.py python3 -m pip install . - - run: sh build_html.sh - - run: cd .. - - run: git clone https://github.com/openmlsys/openmlsys.github.io.git - - run: cp -r openmlsys-zh/_build/html/* openmlsys.github.io/docs/ - - run: | + + - name: Install Python dependencies + run: python3 -m pip install -r requirements.txt sphinx-mathjax-offline + + - name: Build English HTML + run: bash build_html.sh + + - name: Build Chinese HTML + run: bash build_html_zh.sh + + - name: Deploy to GitHub Pages + run: | + git clone https://github.com/openmlsys/openmlsys.github.io.git + + # English → root (default language) + cp -r openmlsys-zh/en_chapters/_build/html/* openmlsys.github.io/docs/ + + # Chinese → /cn/ subdirectory + mkdir -p openmlsys.github.io/docs/cn + cp -r openmlsys-zh/zh_chapters/_build/html/* openmlsys.github.io/docs/cn/ + cd openmlsys.github.io + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" git add . - git commit -m 'update docs' + git commit -m "deploy: update docs (en+zh) from openmlsys-zh@${{ github.sha }}" git push - - - diff --git a/README.md b/README.md index 713c625..7237d98 100644 --- a/README.md +++ b/README.md @@ -1,79 +1,132 @@ +

+ OpenMLSys Logo +

+ +

+ + CI + + + Book Online + + + License + + + GitHub Stars + +

+ +

+ 中文 | English +

+ +--- + # 机器学习系统:设计和实现 -本开源项目试图给读者讲解现代机器学习系统的设计原理和实现经验。 +本开源项目讲解现代机器学习系统的设计原理和实现经验,涵盖从编程接口、计算图、编译器到分布式训练的完整技术栈。 -🔥 **书籍网页版:** [机器学习系统:设计和实现](https://openmlsys.github.io/) +**在线阅读:** [openmlsys.github.io](https://openmlsys.github.io/) -🔥 **书籍PDF:** 将在勘误后,2022年中发布。 +## 目录 -## 发布 - -- 27/06/2022: OpenMLSys社区发布通俗易懂的高性能AI算子开发教程,助力学生和工程师60分钟理解算子性能优化的关键知识点。相应的[技术博客](https://zhuanlan.zhihu.com/p/531498210)和[复现代码](https://github.com/openmlsys/openmlsys-cuda)都已免费公开。感谢@[Jie Ren](https://github.com/JieRen98) 和 @[Wenteng Liang](https://github.com/Went-Liang) 的贡献!🔥 -- 17/03/2022: 本书处于勘误阶段。如发现文字和图片错误,可创建Issue并@[章节编辑](info/editors.md)。我们非常欢迎社区提交PR直接勘误。 +- [适用读者](#适用读者) +- [内容介绍](#内容介绍) +- [构建指南](#构建指南) +- [贡献指南](#贡献指南) +- [社区](#社区) +- [许可证](#许可证) ## 适用读者 -本书的常见读者包括: - -- **学生:** - 随着大量机器学习课程在大学中的普及,学生已经开始掌握大量机器学习的基础理论和神经网络的实现。然而,需要训练出可以实际应用的机器学习模型,需要对现代机器学习系统有充分的认识。 - -- **科研人员:** - 研发新型的机器学习模型不仅仅需要会使用基础的机器学习系统接口。同时,新型的模型需要给系统提供新的自定义算子(Custom - Operators),又或者是会利用高级的分布式执行算子来实现大模型的开发。这一系列需求都需要对底层系统具有充分认识。 - -- **开发人员:** - 大量的数据和AI驱动的公司都部署了机器学习基础设施。这一设施的核心就是机器学习系统。因此了解机器学习系统有助于开发人员对于系统性能调优,以定位问题,并且根据业务需求对机器学习系统进行深度定制。 +- **学生**:掌握机器学习基础理论后,希望深入了解现代机器学习系统设计与实现的同学。 +- **科研人员**:需要开发自定义算子(Custom Operators)或利用分布式执行实现大模型的研究者。 +- **开发人员**:负责机器学习基础设施建设,需要对系统性能调优和深度定制的工程师。 ## 内容介绍 -现代机器学习框架具有复杂的内部架构和繁多的外部相关组件。在本书中,我们将对其细致拆分,深入解读: +本书分为基础篇、进阶篇和扩展篇三个部分: -基础: +### 基础篇 -- **编程接口:** 为了支持海量应用,机器学习框架的编程接口设计具有大量的设计哲学,在易用性和性能之间取得平衡。本书将讲述编程接口的演进,机器学习工作流,定义深度学习模型,以及用C/C++进行框架开发。 +| 章节 | 内容 | +|------|------| +| [编程接口](chapter_programming_interface/) | 框架接口设计哲学、机器学习工作流、深度学习模型定义、C/C++ 框架开发 | +| [计算图](chapter_computational_graph/) | 计算图基本构成、生成方法、调度策略、自动微分 | -- **计算图:** 机器学习框架需要支持自动微分,硬件加速器,多编程前端等。实现这些支持的核心技术是:计算图(Computational Graph)。本书将讲述计算图的基本构成,生成方法和调度策略。 +### 进阶篇 -性能进阶: +| 章节 | 内容 | +|------|------| +| [编译器前端和中间表示](chapter_frontend_and_ir/) | 类型推导、中间表示(IR)、自动微分、常见优化 Pass | +| [编译器后端和运行时](chapter_backend_and_runtime/) | 计算图优化、算子选择、内存分配、计算调度与执行 | +| [硬件加速器](chapter_accelerator/) | GPU/Ascend 
架构原理、高性能编程接口(CUDA/CANN) | +| [数据处理框架](chapter_data_processing/) | 易用性、高效性、保序性、分布式数据处理 | +| [模型部署](chapter_model_deployment/) | 模型转换、模型压缩、模型推理、安全保护 | +| [分布式训练](chapter_distributed_training/) | 数据并行、模型并行、流水线并行、集合通讯、参数服务器 | -- **编译器前端:** - 机器学习框架需要利用编译器前端技术对计算图进行功能拓展和性能优化。本书将讲述常见的前端技术,包括类型推导,中间表示(Intermediate Representation),自动微分等。 +### 扩展篇 -- **编译器后端和运行时:** - 机器学习框架的一个核心目标是:如何充分利用异构硬件。这其中会涉及编译器后端技术,以及将计算图算子(Operator)调度到硬件上的运行时(Runtime)。本书将讲述计算图优化,算子选择,内存分配和计算调度与执行。 - -- **硬件加速器:** - 机器学习框架的基本运行单元是算子,而算子的实现必须充分利用硬件加速器(GPU和Ascend)的特性。本书将会讲述硬件加速器的基本构成原理和常见的高性能编程接口。 - -- **数据处理框架:** - 机器学习框架会集成高性能框架来进行数据预处理。本书将会讲述这一类数据处理框架在设计中需要达到的多个目标:易用性,高效性,保序性,分布式等。 - -- **模型部署:** - 在模型完成训练后,用户需要将模型部署到终端设备(如云服务器,移动终端和无人车)。这其中涉及到的模型转换,模型压缩,模型推理和安全保护等知识也会在本书中讨论。 - -- **分布式训练:** - 机器学习模型的训练需要消耗大量资源。越来越多的机器学习框架因此原生支持分布式训练。在本书中我们将会讨论常见的分布式训练方法(包括数据并行,模型并行和流水线并行),以及实现这些方法的系统架构(包括集合通讯和参数服务器)。 - -功能拓展: - -- **深度学习推荐系统:** 推荐系统是目前机器学习应用最成功的领域之一。本书将会概括推荐系统的运作原理,详细描述大规模工业场景下的推荐系统架构设计。 - -- **联邦学习系统:** 随着数据保护法规和隐私保护的崛起,联邦学习正成为日益重要的研究领域。本书将会介绍联邦学习的常用方法以及相关系统实现。 - -- **强化学习系统:** 强化学习是走向通用人工智能的关键技术。本书将会介绍目前常见的强化学习系统(包括单智能体和多智能体等)。 - -- **可解释性AI系统:** 随着机器学习在安全攸关(Safety-critical)领域的应用,机器学习系统越来越需要对决策给出充分解释。本书将会讨论可解释AI系统的常用方法和落地实践经验。 - -- **机器人系统:** 机器人(无人车,无人机,家用机器人等)作为机器学习技术重要的应用领域,在最近数年得到了广泛应用。在实践中,机器人系统在实时性,安全性,鲁棒性等方面都有极高要求,这要求开发者具有算法和系统的双重思维,从而解决实际问题。本书中我们将结合最新研究成果和机器人系统实践经验讲解该类系统的设计原则和实现细节。 - - -我们在持续拓展拓展本书的内容,如元学习系统,自动并行,深度学习集群调度,绿色AI系统,图学习系统等。我们也非常欢迎社区对于新内容提出建议,贡献章节。 +| 章节 | 内容 | +|------|------| +| [深度学习推荐系统](chapter_recommender_system/) | 推荐系统原理、大规模工业场景架构设计 | +| [联邦学习系统](chapter_federated_learning/) | 联邦学习方法、隐私保护、系统实现 | +| [强化学习系统](chapter_reinforcement_learning/) | 单智能体/多智能体强化学习系统 | +| [可解释性 AI 系统](chapter_explainable_AI/) | 可解释 AI 方法与落地实践 | +| [机器人学习系统](chapter_rl_sys/) | 机器人感知、规划、控制与系统安全 | ## 构建指南 -请参考[构建指南](info/info.md)来了解如何构建本书的网页版本和PDF版本。 +### 环境依赖 -## 写作指南 +- Python >= 3.10 +- pandoc >= 2.19 
-
-我们欢迎大家来一起贡献和更新本书的内容。常见的贡献方式是提交PR来更新和添加Markdown文件。写作的风格和图片要求请参考[风格指南](info/style.md)。同时,机器学习领域涉及到大量的中英文翻译,相关的翻译要求请参考[术语指南](info/terminology.md)。
+### 安装步骤
+
+```bash
+# 克隆仓库
+git clone https://github.com/openmlsys/openmlsys-zh.git
+cd openmlsys-zh
+
+# 安装 d2lbook
+git clone https://github.com/openmlsys/d2l-book.git
+cd d2l-book && pip install . && cd ..
+
+# 安装 Python 依赖
+pip install -r requirements.txt
+```
+
+### 编译 HTML
+
+```bash
+sh build_html.sh          # 构建英文版;中文版请运行 sh build_html_zh.sh
+# 生成结果分别位于 en_chapters/_build/html/ 和 zh_chapters/_build/html/
+```
+
+更多细节请参考 [构建指南](info/info.md)。
+
+## 贡献指南
+
+我们欢迎任何形式的贡献,包括:
+
+- **勘误**:发现文字或图片错误,请创建 Issue 并 @ [章节编辑](info/editors.md),或直接提交 PR。
+- **内容更新**:提交 PR 更新或添加 Markdown 文件。
+- **新章节**:欢迎社区对元学习系统、自动并行、集群调度、绿色 AI、图学习等主题贡献章节。
+
+提交前请阅读:
+- [写作风格指南](info/style.md)
+- [中英文术语对照](info/terminology.md)
+
+## 社区
+

+ 微信群二维码 +
+ 扫码加入微信交流群 +

+ +## 许可证 + +本项目采用 [知识共享 署名-非商业性使用-相同方式共享 4.0 国际许可协议](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.zh) 授权。 diff --git a/README_EN.md b/README_EN.md new file mode 100644 index 0000000..e0cbde4 --- /dev/null +++ b/README_EN.md @@ -0,0 +1,128 @@ +

+ OpenMLSys Logo +

+ +

+ + CI + + + Book Online + + + License + + + GitHub Stars + +

+ +

+ 中文 | English +

+ +--- + +# Machine Learning Systems: Design and Implementation + +An open-source book explaining the design principles and implementation experience of modern machine learning systems, covering the complete technology stack from programming interfaces and computational graphs to compilers and distributed training. + +**Read Online:** [openmlsys.github.io](https://openmlsys.github.io/) + +## Table of Contents + +- [Target Audience](#target-audience) +- [Content Overview](#content-overview) +- [Build Guide](#build-guide) +- [Contributing](#contributing) +- [Community](#community) +- [License](#license) + +## Target Audience + +- **Students**: Those who have mastered machine learning fundamentals and want to deeply understand the design and implementation of modern ML systems. +- **Researchers**: Those who need to develop custom operators or leverage distributed execution for large model development. +- **Engineers**: Those responsible for building ML infrastructure and need to tune system performance or customize ML systems for business needs. + +## Content Overview + +The book is organized into three parts: Fundamentals, Advanced Topics, and Extensions. 
+ +### Part I: Fundamentals + +| Chapter | Content | +|---------|---------| +| [Programming Interface](chapter_programming_interface/) | Framework API design, ML workflows, deep learning model definition, C/C++ framework development | +| [Computational Graph](chapter_computational_graph/) | Graph components, generation methods, scheduling strategies, automatic differentiation | + +### Part II: Advanced Topics + +| Chapter | Content | +|---------|---------| +| [Compiler Frontend & IR](chapter_frontend_and_ir/) | Type inference, intermediate representation (IR), automatic differentiation, common optimization passes | +| [Compiler Backend & Runtime](chapter_backend_and_runtime/) | Graph optimization, operator selection, memory allocation, compute scheduling and execution | +| [Hardware Accelerators](chapter_accelerator/) | GPU/Ascend architecture, high-performance programming interfaces (CUDA/CANN) | +| [Data Processing](chapter_data_processing/) | Usability, efficiency, order preservation, distributed data processing | +| [Model Deployment](chapter_model_deployment/) | Model conversion, compression, inference, and security | +| [Distributed Training](chapter_distributed_training/) | Data parallelism, model parallelism, pipeline parallelism, collective communication, parameter servers | + +### Part III: Extensions + +| Chapter | Content | +|---------|---------| +| [Recommender Systems](chapter_recommender_system/) | Recommendation principles, large-scale industrial architecture | +| [Federated Learning](chapter_federated_learning/) | Federated learning methods, privacy protection, system implementation | +| [Reinforcement Learning Systems](chapter_reinforcement_learning/) | Single-agent and multi-agent RL systems | +| [Explainable AI Systems](chapter_explainable_AI/) | XAI methods and production practices | +| [Robot Learning Systems](chapter_rl_sys/) | Robot perception, planning, control, and system safety | + +## Build Guide + +### Prerequisites + +- Python >= 3.10 
+- pandoc >= 2.19 + +### Installation + +```bash +# Clone the repository +git clone https://github.com/openmlsys/openmlsys-zh.git +cd openmlsys-zh + +# Install d2lbook +git clone https://github.com/openmlsys/d2l-book.git +cd d2l-book && pip install . && cd .. + +# Install Python dependencies +pip install -r requirements.txt +``` + +### Build HTML + +```bash +sh build_html.sh +# Output is in _build/html/ +``` + +For more details, see the [Build Guide](info/info.md). + +## Contributing + +We welcome all forms of contributions, including: + +- **Errata**: If you find text or figure errors, please open an Issue and @ the [chapter editors](info/editors.md), or submit a PR directly. +- **Content updates**: Submit PRs to update or add Markdown files. +- **New chapters**: We welcome community contributions on topics such as meta-learning systems, automatic parallelism, cluster scheduling, green AI, and graph learning. + +Before contributing, please read: +- [Writing Style Guide](info/style.md) +- [Terminology Guide](info/terminology.md) + +## Community + +Join our WeChat group by scanning the QR code in [info/mlsys_group.png](info/mlsys_group.png). + +## License + +This project is licensed under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/). diff --git a/build_html.sh b/build_html.sh index 20590a4..b052657 100644 --- a/build_html.sh +++ b/build_html.sh @@ -1,10 +1,28 @@ #!/bin/bash +# Build the English (en) version of the book from en_chapters/. +# Output: en_chapters/_build/html/ +# +# Resources (img/, references/, static/, mlsys.bib) live at the repo root and +# are symlinked into en_chapters/ so d2lbook can find them at relative paths. set -e +ROOT="$(cd "$(dirname "$0")" && pwd)" + +# ── Create resource symlinks ────────────────────────────────────────────────── +for target in img references static mlsys.bib; do + link="$ROOT/en_chapters/$target" + if [ ! 
-e "$link" ]; then + ln -sf "$ROOT/$target" "$link" + fi +done + +# ── Build ───────────────────────────────────────────────────────────────────── +cd "$ROOT/en_chapters" + rm -rf _build/rst _build/html d2lbook build rst cp static/frontpage.html _build/rst/ d2lbook build html -cp -r static/image/* _build/html/_images/ -python3 tools/format_tables.py \ No newline at end of file +cp -r static/image/* _build/html/_images/ 2>/dev/null || true +python3 "$ROOT/tools/format_tables.py" diff --git a/build_html_zh.sh b/build_html_zh.sh new file mode 100755 index 0000000..5d9ec90 --- /dev/null +++ b/build_html_zh.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Build the Chinese (zh) version of the book from zh_chapters/. +# Output: zh_chapters/_build/html/ +# +# Resources (img/, references/, static/, mlsys.bib) live at the repo root and +# are symlinked into zh_chapters/ so d2lbook can find them at relative paths. + +set -e + +ROOT="$(cd "$(dirname "$0")" && pwd)" + +# ── Create resource symlinks ────────────────────────────────────────────────── +for target in img references static mlsys.bib; do + link="$ROOT/zh_chapters/$target" + if [ ! 
-e "$link" ]; then + ln -sf "$ROOT/$target" "$link" + fi +done + +# ── Build ───────────────────────────────────────────────────────────────────── +cd "$ROOT/zh_chapters" + +rm -rf _build/rst _build/html +d2lbook build rst +cp static/frontpage.html _build/rst/ +d2lbook build html +cp -r static/image/* _build/html/_images/ 2>/dev/null || true +python3 "$ROOT/tools/format_tables.py" diff --git a/chapter_reinforcement_learning/distributed_node_rl.md b/chapter_reinforcement_learning/distributed_node_rl.md deleted file mode 100644 index 2b6cf89..0000000 --- a/chapter_reinforcement_learning/distributed_node_rl.md +++ /dev/null @@ -1,43 +0,0 @@ -## 分布式强化学习系统 - -分布式强化学习系统是比上面介绍的单节点强化学习系统更强大的一种。它能支持多环境多模型并行处理,主要是能同时在多个实际计算机系统上对多个模型进行更新,将大大提高强化学习系统的学习速度和整体表现。我们这里介绍分布式强化学习常见的算法和系统。 - -异步优势行动-批判者(Asynchronous Advantage Actor-Critic,A3C)是由DeepMind研究人员 :cite:`mnih2016asynchronous`于2016年提出的可以在多个计算设备上并行更新网络的学习算法。相比于 :numref:`ch12/ch12-rlzoo`中的单节点强化学习系统,A3C通过创建一组工作者(Worker),并将每个工作者分配到不同的计算设备上且为他们各自创建可以交互的环境来实现并行采样和模型更新,同时用一个主(Master)节点维护这些行动者(Actor)和批判者(Critic)网络的更新。行动者是策略网络,批判者是价值网络,分别对应强化学习中的策略和价值函数。通过这样的设计,整个算法的各个工作者可以实时将所采集到样本计算出的梯度回传到主节点,来更新主节点的模型参数,并在主节点模型更新后即时下发到各个工作者进行模型更新。每个工作者可以单独在一个 GPU 上进行运算,从而整个算法可以在一个 GPU 集群上并行更新模型,算法结构由 :numref:`ch12/ch12-a3c`所示。研究表明,分布式强化学习训练除加速模型学习之外,由于其更新梯度是由多个计算节点各自对环境采样计算得到的,还有利于稳定学习表现。 - -![A3C分布式算法架构](../img/ch12/ch12-a3c.png) - -:width:`800px` - -:label:`ch12/ch12-a3c` - -重要性加权行动-学习者架构(Importance Weighted Actor-Learner Architecture,IMPALA) 是由Lasse Espeholt等人于2018年 :cite:`espeholt2018impala`提出的能够实现多机集群训练的强化学习框架,如:numref:`ch12/ch12-impala`所示。与 A3C 算法类似,IMPALA 能够在多个 GPU 上并行进行梯度计算。具体地,IMPALA 并行多个行动者(Actor)和学习者(Learner),每个行动者包含一个策略网络,并用这个策略网络与一个环境进行交互,以收集样本。所收集到的样本轨迹由行动者发送到各自的学习者,进行梯度计算。所有的学习者中有一个称为主学习者,它可以和其他所有学习者通信获取他们计算的梯度,从而在主学习者内部对模型进行更新,随后下发到各个学习者及行动者,做新一轮的采样和梯度计算。IMPALA 被证明是比 A3C 更高效的分布式计算架构,它同时得益于一个特殊设计的学习者内的梯度计算函数,称为 V-轨迹目标(V-trace Target),通过重要性加权来稳定训练。我们这里侧重对分布式强化学习结构的介绍,对此不再赘述。感兴趣的读者可以参考原论文 - 
-![IMPALA分布式算法架构](../img/ch12/ch12-impala.png) - -:width:`800px` - -:label:`ch12/ch12-impala` - -以上是两个著名的分布式强化学习算法A3C和IMPALA,最近研究中还有许多其他成果,如SEED :cite:`espeholt2019seed`、Ape-X :cite:`horgan2018distributed`等都对分布式强化学习有更好的效果,我们不再做过多介绍。下面我们将讨论几个典型的分布式强化学习算法库。 - -![RLlib系统架构](../img/ch12/ch12-rllib-arch.svg) - -:width:`800px` - -:label:`ch12/ch12-rllib` - -Ray :cite:`moritz2018ray`是由伯克利大学几名研究人员发起的一个分布式计算框架,基于Ray之上构建了一个专门针对强化学习的系统RLlib :cite:`liang2017ray`。RLlib 是一个面向工业级应用的开源强化学习框架,同时 -包含了强化学习的算法库,没有太多强化学习经验的人也可以很方便地使用 RLlib。 - -![RLlib分布式训练](../img/ch12/ch12-rllib-distributed.svg) - -:width:`600px` - -:label:`ch12/ch12-rllib_dist` - -RLlib的系统架构如 :numref:`ch12/ch12-rllib`所示,系统底层是构建在 Ray 的分布式计算和通信的基础组建之 -上,面向强化学习的领域概念,在 Python 层抽象了 Trainer, Environment, Policy 等基础组件,并为各个抽象组件提供了一些常用的内置实现,同时用户可以根据自己的算法场景对组件进行扩展,通过这些内置以及自定义的算法组件,研究人员可以方便快速地实现具体的强化学习算法。RLlib支持多种范式的分布式强化学习训练,如 :numref:`ch12/ch12-rllib_dist`所示为基于同步采样的强化学习算法的分布式训练架构。其中每一个 Rollout Worker 为一个独立进程,负责和对应的环境进行交互以完成经验采集,多个 Rollout Worker 可以并行地完成环境交互;Trainer 负责 Rollout Worker之间的协调,策略优化,以及将更新后的策略同步到 Rollout Worker 中。 - -强化学习中的策略通常可以采用深度神经网络,而基于深度神经网络的分布式强化学习训练,可以采用 RLlib 结合 PyTorch 或者 TensorFlow 等深度学习框架协同完成,深度学习框架负责策略网络的训练和更新,RLlib 负责强化学习的算法计算。此外 RLlib 支持与并行的向量化(Vectorized)环境交互,允许外接模拟器,以及可以进行离线(Offline)强化学习。 - -对于分布式系统中样本回放缓冲池的管理,我们会提到另一个工作Reverb :cite:`cassirer2021reverb`。回忆本章开头,我们介绍了强化学习中的状态、动作、奖励等概念,实际强化学习算法进行训练所使用的数据正是存放在经验缓冲池中的这些数据元组,而每种数据自身的格式可能又有不同,实际使用时也需要对不同的数据做不同类型的操作。常见的数据操作类型如拼接、截取、乘积、转置、部分乘积、取均值、取极值等,而每种操作都可能需要对特定数据的特定维度进行,这常常给现有的强化学习框架在实践中产生一定的困难。为了方便强化学习过程中灵活使用不同的数据形式,Reverb 设计了数据块的概念(Chunks),所有使用的训练数据在缓冲池中都使用数据块的格式进行管理和调用,这一设计基于数据是多维张量的特点,增大了数据使用的灵活性和访问速度。Acme :cite:`hoffman2020acme`是近年来由DeepMind提出的一个分布式强化学习框架,同样是针对学术界的研究和工业界的应用,它基于 Reverb 对样本缓冲池的数据管理,结合分布式采样的结构,给出了一个更快的分布式强化学习解决方案。Reverb 帮助解决了数据管理和传输的效率问题,使得 Acme得以将分布式计算的效力充分发挥,研究人员用 Acme 在大量强化学习基准测试中取得了显著的速度提升。 \ No newline at end of file diff --git a/en_chapters/appendix_machine_learning_introduction/classic_machine_learning.md 
b/en_chapters/appendix_machine_learning_introduction/classic_machine_learning.md new file mode 100644 index 0000000..2086545 --- /dev/null +++ b/en_chapters/appendix_machine_learning_introduction/classic_machine_learning.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/appendix_machine_learning_introduction/classic_machine_learning.md] diff --git a/en_chapters/appendix_machine_learning_introduction/gradient_descent.md b/en_chapters/appendix_machine_learning_introduction/gradient_descent.md new file mode 100644 index 0000000..d642e9c --- /dev/null +++ b/en_chapters/appendix_machine_learning_introduction/gradient_descent.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/appendix_machine_learning_introduction/gradient_descent.md] diff --git a/en_chapters/appendix_machine_learning_introduction/index.md b/en_chapters/appendix_machine_learning_introduction/index.md new file mode 100644 index 0000000..8b1eaa2 --- /dev/null +++ b/en_chapters/appendix_machine_learning_introduction/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/appendix_machine_learning_introduction/index.md] diff --git a/en_chapters/appendix_machine_learning_introduction/neural_network.md b/en_chapters/appendix_machine_learning_introduction/neural_network.md new file mode 100644 index 0000000..f248fc9 --- /dev/null +++ b/en_chapters/appendix_machine_learning_introduction/neural_network.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/appendix_machine_learning_introduction/neural_network.md] diff --git a/en_chapters/chapter_accelerator/accelerator_architecture.md b/en_chapters/chapter_accelerator/accelerator_architecture.md new file mode 100644 index 0000000..5805e4d --- /dev/null +++ b/en_chapters/chapter_accelerator/accelerator_architecture.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_accelerator/accelerator_architecture.md] diff --git a/en_chapters/chapter_accelerator/accelerator_introduction.md b/en_chapters/chapter_accelerator/accelerator_introduction.md new file mode 100644 index 0000000..44b0a73 --- /dev/null 
+++ b/en_chapters/chapter_accelerator/accelerator_introduction.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_accelerator/accelerator_introduction.md] diff --git a/en_chapters/chapter_accelerator/accelerator_practise.md b/en_chapters/chapter_accelerator/accelerator_practise.md new file mode 100644 index 0000000..a18225b --- /dev/null +++ b/en_chapters/chapter_accelerator/accelerator_practise.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_accelerator/accelerator_practise.md] diff --git a/en_chapters/chapter_accelerator/accelerator_programming.md b/en_chapters/chapter_accelerator/accelerator_programming.md new file mode 100644 index 0000000..d3eed66 --- /dev/null +++ b/en_chapters/chapter_accelerator/accelerator_programming.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_accelerator/accelerator_programming.md] diff --git a/en_chapters/chapter_accelerator/index.md b/en_chapters/chapter_accelerator/index.md new file mode 100644 index 0000000..aa6d859 --- /dev/null +++ b/en_chapters/chapter_accelerator/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_accelerator/index.md] diff --git a/en_chapters/chapter_accelerator/summary.md b/en_chapters/chapter_accelerator/summary.md new file mode 100644 index 0000000..12fb41a --- /dev/null +++ b/en_chapters/chapter_accelerator/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_accelerator/summary.md] diff --git a/en_chapters/chapter_backend_and_runtime/compute_schedule_and_execute.md b/en_chapters/chapter_backend_and_runtime/compute_schedule_and_execute.md new file mode 100644 index 0000000..627d9b8 --- /dev/null +++ b/en_chapters/chapter_backend_and_runtime/compute_schedule_and_execute.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_backend_and_runtime/compute_schedule_and_execute.md] diff --git a/en_chapters/chapter_backend_and_runtime/graph_optimizer.md b/en_chapters/chapter_backend_and_runtime/graph_optimizer.md new file mode 100644 index 0000000..139673f --- /dev/null +++ 
b/en_chapters/chapter_backend_and_runtime/graph_optimizer.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_backend_and_runtime/graph_optimizer.md] diff --git a/en_chapters/chapter_backend_and_runtime/index.md b/en_chapters/chapter_backend_and_runtime/index.md new file mode 100644 index 0000000..aeca214 --- /dev/null +++ b/en_chapters/chapter_backend_and_runtime/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_backend_and_runtime/index.md] diff --git a/en_chapters/chapter_backend_and_runtime/kernel_selecter.md b/en_chapters/chapter_backend_and_runtime/kernel_selecter.md new file mode 100644 index 0000000..9e1f32c --- /dev/null +++ b/en_chapters/chapter_backend_and_runtime/kernel_selecter.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_backend_and_runtime/kernel_selecter.md] diff --git a/en_chapters/chapter_backend_and_runtime/memory_allocator.md b/en_chapters/chapter_backend_and_runtime/memory_allocator.md new file mode 100644 index 0000000..73479d1 --- /dev/null +++ b/en_chapters/chapter_backend_and_runtime/memory_allocator.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_backend_and_runtime/memory_allocator.md] diff --git a/en_chapters/chapter_backend_and_runtime/op_compiler.md b/en_chapters/chapter_backend_and_runtime/op_compiler.md new file mode 100644 index 0000000..95dd3be --- /dev/null +++ b/en_chapters/chapter_backend_and_runtime/op_compiler.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_backend_and_runtime/op_compiler.md] diff --git a/en_chapters/chapter_backend_and_runtime/overview.md b/en_chapters/chapter_backend_and_runtime/overview.md new file mode 100644 index 0000000..3c2539e --- /dev/null +++ b/en_chapters/chapter_backend_and_runtime/overview.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_backend_and_runtime/overview.md] diff --git a/en_chapters/chapter_backend_and_runtime/summary.md b/en_chapters/chapter_backend_and_runtime/summary.md new file mode 100644 index 0000000..11383e4 --- /dev/null +++ 
b/en_chapters/chapter_backend_and_runtime/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_backend_and_runtime/summary.md] diff --git a/en_chapters/chapter_computational_graph/background_and_functionality.md b/en_chapters/chapter_computational_graph/background_and_functionality.md new file mode 100644 index 0000000..669f0c6 --- /dev/null +++ b/en_chapters/chapter_computational_graph/background_and_functionality.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_computational_graph/background_and_functionality.md] diff --git a/en_chapters/chapter_computational_graph/components_of_computational_graph.md b/en_chapters/chapter_computational_graph/components_of_computational_graph.md new file mode 100644 index 0000000..90aaa2a --- /dev/null +++ b/en_chapters/chapter_computational_graph/components_of_computational_graph.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_computational_graph/components_of_computational_graph.md] diff --git a/en_chapters/chapter_computational_graph/generation_of_computational_graph.md b/en_chapters/chapter_computational_graph/generation_of_computational_graph.md new file mode 100644 index 0000000..24ef339 --- /dev/null +++ b/en_chapters/chapter_computational_graph/generation_of_computational_graph.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_computational_graph/generation_of_computational_graph.md] diff --git a/en_chapters/chapter_computational_graph/index.md b/en_chapters/chapter_computational_graph/index.md new file mode 100644 index 0000000..910fe59 --- /dev/null +++ b/en_chapters/chapter_computational_graph/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_computational_graph/index.md] diff --git a/en_chapters/chapter_computational_graph/schedule_of_computational_graph.md b/en_chapters/chapter_computational_graph/schedule_of_computational_graph.md new file mode 100644 index 0000000..86f68ce --- /dev/null +++ b/en_chapters/chapter_computational_graph/schedule_of_computational_graph.md @@ -0,0 +1 @@ +[TODO: src = 
zh_chapters/chapter_computational_graph/schedule_of_computational_graph.md] diff --git a/en_chapters/chapter_computational_graph/summary.md b/en_chapters/chapter_computational_graph/summary.md new file mode 100644 index 0000000..3ed7400 --- /dev/null +++ b/en_chapters/chapter_computational_graph/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_computational_graph/summary.md] diff --git a/en_chapters/chapter_data_processing/data_order.md b/en_chapters/chapter_data_processing/data_order.md new file mode 100644 index 0000000..c202234 --- /dev/null +++ b/en_chapters/chapter_data_processing/data_order.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_data_processing/data_order.md] diff --git a/en_chapters/chapter_data_processing/extension.md b/en_chapters/chapter_data_processing/extension.md new file mode 100644 index 0000000..29f19a5 --- /dev/null +++ b/en_chapters/chapter_data_processing/extension.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_data_processing/extension.md] diff --git a/en_chapters/chapter_data_processing/index.md b/en_chapters/chapter_data_processing/index.md new file mode 100644 index 0000000..3af1318 --- /dev/null +++ b/en_chapters/chapter_data_processing/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_data_processing/index.md] diff --git a/en_chapters/chapter_data_processing/performance.md b/en_chapters/chapter_data_processing/performance.md new file mode 100644 index 0000000..75c7193 --- /dev/null +++ b/en_chapters/chapter_data_processing/performance.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_data_processing/performance.md] diff --git a/en_chapters/chapter_data_processing/program_model.md b/en_chapters/chapter_data_processing/program_model.md new file mode 100644 index 0000000..0e4f5b7 --- /dev/null +++ b/en_chapters/chapter_data_processing/program_model.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_data_processing/program_model.md] diff --git a/en_chapters/chapter_data_processing/requirements.md 
b/en_chapters/chapter_data_processing/requirements.md new file mode 100644 index 0000000..7085c33 --- /dev/null +++ b/en_chapters/chapter_data_processing/requirements.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_data_processing/requirements.md] diff --git a/en_chapters/chapter_data_processing/summary.md b/en_chapters/chapter_data_processing/summary.md new file mode 100644 index 0000000..3836106 --- /dev/null +++ b/en_chapters/chapter_data_processing/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_data_processing/summary.md] diff --git a/en_chapters/chapter_distributed_training/cluster.md b/en_chapters/chapter_distributed_training/cluster.md new file mode 100644 index 0000000..cb74a6a --- /dev/null +++ b/en_chapters/chapter_distributed_training/cluster.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_distributed_training/cluster.md] diff --git a/en_chapters/chapter_distributed_training/collective.md b/en_chapters/chapter_distributed_training/collective.md new file mode 100644 index 0000000..1b51153 --- /dev/null +++ b/en_chapters/chapter_distributed_training/collective.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_distributed_training/collective.md] diff --git a/en_chapters/chapter_distributed_training/index.md b/en_chapters/chapter_distributed_training/index.md new file mode 100644 index 0000000..c71a92a --- /dev/null +++ b/en_chapters/chapter_distributed_training/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_distributed_training/index.md] diff --git a/en_chapters/chapter_distributed_training/methods.md b/en_chapters/chapter_distributed_training/methods.md new file mode 100644 index 0000000..0c704ec --- /dev/null +++ b/en_chapters/chapter_distributed_training/methods.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_distributed_training/methods.md] diff --git a/en_chapters/chapter_distributed_training/overview.md b/en_chapters/chapter_distributed_training/overview.md new file mode 100644 index 0000000..2f76c7e --- /dev/null +++ 
b/en_chapters/chapter_distributed_training/overview.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_distributed_training/overview.md] diff --git a/en_chapters/chapter_distributed_training/parameter_servers.md b/en_chapters/chapter_distributed_training/parameter_servers.md new file mode 100644 index 0000000..72f23e2 --- /dev/null +++ b/en_chapters/chapter_distributed_training/parameter_servers.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_distributed_training/parameter_servers.md] diff --git a/en_chapters/chapter_distributed_training/summary.md b/en_chapters/chapter_distributed_training/summary.md new file mode 100644 index 0000000..662eb3b --- /dev/null +++ b/en_chapters/chapter_distributed_training/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_distributed_training/summary.md] diff --git a/en_chapters/chapter_explainable_AI/explainable_ai.md b/en_chapters/chapter_explainable_AI/explainable_ai.md new file mode 100644 index 0000000..3fa9fba --- /dev/null +++ b/en_chapters/chapter_explainable_AI/explainable_ai.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_explainable_AI/explainable_ai.md] diff --git a/en_chapters/chapter_explainable_AI/index.md b/en_chapters/chapter_explainable_AI/index.md new file mode 100644 index 0000000..86c298c --- /dev/null +++ b/en_chapters/chapter_explainable_AI/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_explainable_AI/index.md] diff --git a/en_chapters/chapter_federated_learning/horizontal_fl.md b/en_chapters/chapter_federated_learning/horizontal_fl.md new file mode 100644 index 0000000..21fc87e --- /dev/null +++ b/en_chapters/chapter_federated_learning/horizontal_fl.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_federated_learning/horizontal_fl.md] diff --git a/en_chapters/chapter_federated_learning/index.md b/en_chapters/chapter_federated_learning/index.md new file mode 100644 index 0000000..f059db0 --- /dev/null +++ b/en_chapters/chapter_federated_learning/index.md @@ -0,0 +1 @@ +[TODO: src = 
zh_chapters/chapter_federated_learning/index.md] diff --git a/en_chapters/chapter_federated_learning/outlook.md b/en_chapters/chapter_federated_learning/outlook.md new file mode 100644 index 0000000..474e227 --- /dev/null +++ b/en_chapters/chapter_federated_learning/outlook.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_federated_learning/outlook.md] diff --git a/en_chapters/chapter_federated_learning/overview.md b/en_chapters/chapter_federated_learning/overview.md new file mode 100644 index 0000000..b401351 --- /dev/null +++ b/en_chapters/chapter_federated_learning/overview.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_federated_learning/overview.md] diff --git a/en_chapters/chapter_federated_learning/privacy_encryption_algorithm.md b/en_chapters/chapter_federated_learning/privacy_encryption_algorithm.md new file mode 100644 index 0000000..a5f36bc --- /dev/null +++ b/en_chapters/chapter_federated_learning/privacy_encryption_algorithm.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_federated_learning/privacy_encryption_algorithm.md] diff --git a/en_chapters/chapter_federated_learning/summary.md b/en_chapters/chapter_federated_learning/summary.md new file mode 100644 index 0000000..a6231d9 --- /dev/null +++ b/en_chapters/chapter_federated_learning/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_federated_learning/summary.md] diff --git a/en_chapters/chapter_federated_learning/vertical_fl.md b/en_chapters/chapter_federated_learning/vertical_fl.md new file mode 100644 index 0000000..46124ef --- /dev/null +++ b/en_chapters/chapter_federated_learning/vertical_fl.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_federated_learning/vertical_fl.md] diff --git a/en_chapters/chapter_frontend_and_ir/ad.md b/en_chapters/chapter_frontend_and_ir/ad.md new file mode 100644 index 0000000..5d4d543 --- /dev/null +++ b/en_chapters/chapter_frontend_and_ir/ad.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_frontend_and_ir/ad.md] diff --git 
a/en_chapters/chapter_frontend_and_ir/ai_compiler_design_principle.md b/en_chapters/chapter_frontend_and_ir/ai_compiler_design_principle.md new file mode 100644 index 0000000..c57a470 --- /dev/null +++ b/en_chapters/chapter_frontend_and_ir/ai_compiler_design_principle.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_frontend_and_ir/ai_compiler_design_principle.md] diff --git a/en_chapters/chapter_frontend_and_ir/common_frontend_optimization_pass.md b/en_chapters/chapter_frontend_and_ir/common_frontend_optimization_pass.md new file mode 100644 index 0000000..4e35b87 --- /dev/null +++ b/en_chapters/chapter_frontend_and_ir/common_frontend_optimization_pass.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_frontend_and_ir/common_frontend_optimization_pass.md] diff --git a/en_chapters/chapter_frontend_and_ir/index.md b/en_chapters/chapter_frontend_and_ir/index.md new file mode 100644 index 0000000..fa859b7 --- /dev/null +++ b/en_chapters/chapter_frontend_and_ir/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_frontend_and_ir/index.md] diff --git a/en_chapters/chapter_frontend_and_ir/intermediate_representation.md b/en_chapters/chapter_frontend_and_ir/intermediate_representation.md new file mode 100644 index 0000000..5f013d0 --- /dev/null +++ b/en_chapters/chapter_frontend_and_ir/intermediate_representation.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_frontend_and_ir/intermediate_representation.md] diff --git a/en_chapters/chapter_frontend_and_ir/overview_of_frontend.md b/en_chapters/chapter_frontend_and_ir/overview_of_frontend.md new file mode 100644 index 0000000..7a77be1 --- /dev/null +++ b/en_chapters/chapter_frontend_and_ir/overview_of_frontend.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_frontend_and_ir/overview_of_frontend.md] diff --git a/en_chapters/chapter_frontend_and_ir/summary.md b/en_chapters/chapter_frontend_and_ir/summary.md new file mode 100644 index 0000000..f5afa7b --- /dev/null +++ 
b/en_chapters/chapter_frontend_and_ir/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_frontend_and_ir/summary.md] diff --git a/en_chapters/chapter_frontend_and_ir/type_system_and_static_analysis.md b/en_chapters/chapter_frontend_and_ir/type_system_and_static_analysis.md new file mode 100644 index 0000000..e3aed85 --- /dev/null +++ b/en_chapters/chapter_frontend_and_ir/type_system_and_static_analysis.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_frontend_and_ir/type_system_and_static_analysis.md] diff --git a/en_chapters/chapter_introduction/applications.md b/en_chapters/chapter_introduction/applications.md new file mode 100644 index 0000000..56382e3 --- /dev/null +++ b/en_chapters/chapter_introduction/applications.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_introduction/applications.md] diff --git a/en_chapters/chapter_introduction/architecture.md b/en_chapters/chapter_introduction/architecture.md new file mode 100644 index 0000000..ccc2e96 --- /dev/null +++ b/en_chapters/chapter_introduction/architecture.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_introduction/architecture.md] diff --git a/en_chapters/chapter_introduction/design.md b/en_chapters/chapter_introduction/design.md new file mode 100644 index 0000000..9f8575d --- /dev/null +++ b/en_chapters/chapter_introduction/design.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_introduction/design.md] diff --git a/en_chapters/chapter_introduction/ecosystem.md b/en_chapters/chapter_introduction/ecosystem.md new file mode 100644 index 0000000..0580fba --- /dev/null +++ b/en_chapters/chapter_introduction/ecosystem.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_introduction/ecosystem.md] diff --git a/en_chapters/chapter_introduction/index.md b/en_chapters/chapter_introduction/index.md new file mode 100644 index 0000000..554fdd7 --- /dev/null +++ b/en_chapters/chapter_introduction/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_introduction/index.md] diff --git 
a/en_chapters/chapter_introduction/readers.md b/en_chapters/chapter_introduction/readers.md new file mode 100644 index 0000000..a49bb6e --- /dev/null +++ b/en_chapters/chapter_introduction/readers.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_introduction/readers.md] diff --git a/en_chapters/chapter_model_deployment/index.md b/en_chapters/chapter_model_deployment/index.md new file mode 100644 index 0000000..03bd0b1 --- /dev/null +++ b/en_chapters/chapter_model_deployment/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_model_deployment/index.md] diff --git a/en_chapters/chapter_model_deployment/model_compression.md b/en_chapters/chapter_model_deployment/model_compression.md new file mode 100644 index 0000000..a6f7d82 --- /dev/null +++ b/en_chapters/chapter_model_deployment/model_compression.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_model_deployment/model_compression.md] diff --git a/en_chapters/chapter_model_deployment/model_converter_and_optimizer.md b/en_chapters/chapter_model_deployment/model_converter_and_optimizer.md new file mode 100644 index 0000000..7d29e75 --- /dev/null +++ b/en_chapters/chapter_model_deployment/model_converter_and_optimizer.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_model_deployment/model_converter_and_optimizer.md] diff --git a/en_chapters/chapter_model_deployment/model_deployment_introduction.md b/en_chapters/chapter_model_deployment/model_deployment_introduction.md new file mode 100644 index 0000000..3c63a17 --- /dev/null +++ b/en_chapters/chapter_model_deployment/model_deployment_introduction.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_model_deployment/model_deployment_introduction.md] diff --git a/en_chapters/chapter_model_deployment/model_inference.md b/en_chapters/chapter_model_deployment/model_inference.md new file mode 100644 index 0000000..2070a80 --- /dev/null +++ b/en_chapters/chapter_model_deployment/model_inference.md @@ -0,0 +1 @@ +[TODO: src = 
zh_chapters/chapter_model_deployment/model_inference.md] diff --git a/en_chapters/chapter_model_deployment/model_security.md b/en_chapters/chapter_model_deployment/model_security.md new file mode 100644 index 0000000..b8c6277 --- /dev/null +++ b/en_chapters/chapter_model_deployment/model_security.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_model_deployment/model_security.md] diff --git a/en_chapters/chapter_model_deployment/summary.md b/en_chapters/chapter_model_deployment/summary.md new file mode 100644 index 0000000..13e4935 --- /dev/null +++ b/en_chapters/chapter_model_deployment/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_model_deployment/summary.md] diff --git a/en_chapters/chapter_preface/index.md b/en_chapters/chapter_preface/index.md new file mode 100644 index 0000000..4bc0a55 --- /dev/null +++ b/en_chapters/chapter_preface/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_preface/index.md] diff --git a/en_chapters/chapter_preface_advanced/index.md b/en_chapters/chapter_preface_advanced/index.md new file mode 100644 index 0000000..1329884 --- /dev/null +++ b/en_chapters/chapter_preface_advanced/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_preface_advanced/index.md] diff --git a/en_chapters/chapter_preface_extension/index.md b/en_chapters/chapter_preface_extension/index.md new file mode 100644 index 0000000..084f4fb --- /dev/null +++ b/en_chapters/chapter_preface_extension/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_preface_extension/index.md] diff --git a/en_chapters/chapter_programming_interface/c_python_interaction.md b/en_chapters/chapter_programming_interface/c_python_interaction.md new file mode 100644 index 0000000..5a9b9b1 --- /dev/null +++ b/en_chapters/chapter_programming_interface/c_python_interaction.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_programming_interface/c_python_interaction.md] diff --git a/en_chapters/chapter_programming_interface/development_history.md 
b/en_chapters/chapter_programming_interface/development_history.md new file mode 100644 index 0000000..9bb7490 --- /dev/null +++ b/en_chapters/chapter_programming_interface/development_history.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_programming_interface/development_history.md] diff --git a/en_chapters/chapter_programming_interface/index.md b/en_chapters/chapter_programming_interface/index.md new file mode 100644 index 0000000..80eea58 --- /dev/null +++ b/en_chapters/chapter_programming_interface/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_programming_interface/index.md] diff --git a/en_chapters/chapter_programming_interface/ml_programming_paradigm.md b/en_chapters/chapter_programming_interface/ml_programming_paradigm.md new file mode 100644 index 0000000..36e9b84 --- /dev/null +++ b/en_chapters/chapter_programming_interface/ml_programming_paradigm.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_programming_interface/ml_programming_paradigm.md] diff --git a/en_chapters/chapter_programming_interface/ml_workflow.md b/en_chapters/chapter_programming_interface/ml_workflow.md new file mode 100644 index 0000000..fc5d572 --- /dev/null +++ b/en_chapters/chapter_programming_interface/ml_workflow.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_programming_interface/ml_workflow.md] diff --git a/en_chapters/chapter_programming_interface/neural_network_layer.md b/en_chapters/chapter_programming_interface/neural_network_layer.md new file mode 100644 index 0000000..1c5a55e --- /dev/null +++ b/en_chapters/chapter_programming_interface/neural_network_layer.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_programming_interface/neural_network_layer.md] diff --git a/en_chapters/chapter_programming_interface/summary.md b/en_chapters/chapter_programming_interface/summary.md new file mode 100644 index 0000000..23643fe --- /dev/null +++ b/en_chapters/chapter_programming_interface/summary.md @@ -0,0 +1 @@ +[TODO: src = 
zh_chapters/chapter_programming_interface/summary.md] diff --git a/en_chapters/chapter_recommender_system/case_study.md b/en_chapters/chapter_recommender_system/case_study.md new file mode 100644 index 0000000..e1ff369 --- /dev/null +++ b/en_chapters/chapter_recommender_system/case_study.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_recommender_system/case_study.md] diff --git a/en_chapters/chapter_recommender_system/index.md b/en_chapters/chapter_recommender_system/index.md new file mode 100644 index 0000000..c89b484 --- /dev/null +++ b/en_chapters/chapter_recommender_system/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_recommender_system/index.md] diff --git a/en_chapters/chapter_recommender_system/model_update.md b/en_chapters/chapter_recommender_system/model_update.md new file mode 100644 index 0000000..77eb824 --- /dev/null +++ b/en_chapters/chapter_recommender_system/model_update.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_recommender_system/model_update.md] diff --git a/en_chapters/chapter_recommender_system/multi_stage_recommender_system.md b/en_chapters/chapter_recommender_system/multi_stage_recommender_system.md new file mode 100644 index 0000000..17a8561 --- /dev/null +++ b/en_chapters/chapter_recommender_system/multi_stage_recommender_system.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_recommender_system/multi_stage_recommender_system.md] diff --git a/en_chapters/chapter_recommender_system/summary.md b/en_chapters/chapter_recommender_system/summary.md new file mode 100644 index 0000000..48a6aa7 --- /dev/null +++ b/en_chapters/chapter_recommender_system/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_recommender_system/summary.md] diff --git a/en_chapters/chapter_recommender_system/system_architecture.md b/en_chapters/chapter_recommender_system/system_architecture.md new file mode 100644 index 0000000..7303366 --- /dev/null +++ b/en_chapters/chapter_recommender_system/system_architecture.md @@ -0,0 +1 @@ +[TODO: 
src = zh_chapters/chapter_recommender_system/system_architecture.md] diff --git a/en_chapters/chapter_reinforcement_learning/index.md b/en_chapters/chapter_reinforcement_learning/index.md new file mode 100644 index 0000000..878ef04 --- /dev/null +++ b/en_chapters/chapter_reinforcement_learning/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_reinforcement_learning/index.md] diff --git a/en_chapters/chapter_reinforcement_learning/marl.md b/en_chapters/chapter_reinforcement_learning/marl.md new file mode 100644 index 0000000..2e2d97f --- /dev/null +++ b/en_chapters/chapter_reinforcement_learning/marl.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_reinforcement_learning/marl.md] diff --git a/en_chapters/chapter_reinforcement_learning/marl_sys.md b/en_chapters/chapter_reinforcement_learning/marl_sys.md new file mode 100644 index 0000000..067f422 --- /dev/null +++ b/en_chapters/chapter_reinforcement_learning/marl_sys.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_reinforcement_learning/marl_sys.md] diff --git a/en_chapters/chapter_reinforcement_learning/rl_introduction.md b/en_chapters/chapter_reinforcement_learning/rl_introduction.md new file mode 100644 index 0000000..fccaed6 --- /dev/null +++ b/en_chapters/chapter_reinforcement_learning/rl_introduction.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_reinforcement_learning/rl_introduction.md] diff --git a/en_chapters/chapter_reinforcement_learning/single_node_rl.md b/en_chapters/chapter_reinforcement_learning/single_node_rl.md new file mode 100644 index 0000000..80648ff --- /dev/null +++ b/en_chapters/chapter_reinforcement_learning/single_node_rl.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_reinforcement_learning/single_node_rl.md] diff --git a/en_chapters/chapter_reinforcement_learning/summary.md b/en_chapters/chapter_reinforcement_learning/summary.md new file mode 100644 index 0000000..03373f9 --- /dev/null +++ b/en_chapters/chapter_reinforcement_learning/summary.md @@ -0,0 +1 @@ +[TODO: src 
= zh_chapters/chapter_reinforcement_learning/summary.md] diff --git a/en_chapters/chapter_rl_sys/control.md b/en_chapters/chapter_rl_sys/control.md new file mode 100644 index 0000000..f172c22 --- /dev/null +++ b/en_chapters/chapter_rl_sys/control.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/control.md] diff --git a/en_chapters/chapter_rl_sys/control_code_ex.md b/en_chapters/chapter_rl_sys/control_code_ex.md new file mode 100644 index 0000000..1f9d80c --- /dev/null +++ b/en_chapters/chapter_rl_sys/control_code_ex.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/control_code_ex.md] diff --git a/en_chapters/chapter_rl_sys/index.md b/en_chapters/chapter_rl_sys/index.md new file mode 100644 index 0000000..e5e7aad --- /dev/null +++ b/en_chapters/chapter_rl_sys/index.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/index.md] diff --git a/en_chapters/chapter_rl_sys/perception.md b/en_chapters/chapter_rl_sys/perception.md new file mode 100644 index 0000000..f820531 --- /dev/null +++ b/en_chapters/chapter_rl_sys/perception.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/perception.md] diff --git a/en_chapters/chapter_rl_sys/perception_code_ex.md b/en_chapters/chapter_rl_sys/perception_code_ex.md new file mode 100644 index 0000000..9656c9f --- /dev/null +++ b/en_chapters/chapter_rl_sys/perception_code_ex.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/perception_code_ex.md] diff --git a/en_chapters/chapter_rl_sys/planning.md b/en_chapters/chapter_rl_sys/planning.md new file mode 100644 index 0000000..8004124 --- /dev/null +++ b/en_chapters/chapter_rl_sys/planning.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/planning.md] diff --git a/en_chapters/chapter_rl_sys/planning_code_ex.md b/en_chapters/chapter_rl_sys/planning_code_ex.md new file mode 100644 index 0000000..a2fe6ac --- /dev/null +++ b/en_chapters/chapter_rl_sys/planning_code_ex.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/planning_code_ex.md] 
diff --git a/en_chapters/chapter_rl_sys/rl_sys_intro.md b/en_chapters/chapter_rl_sys/rl_sys_intro.md new file mode 100644 index 0000000..90b1f5b --- /dev/null +++ b/en_chapters/chapter_rl_sys/rl_sys_intro.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/rl_sys_intro.md] diff --git a/en_chapters/chapter_rl_sys/robot_learning.md b/en_chapters/chapter_rl_sys/robot_learning.md new file mode 100644 index 0000000..62f7980 --- /dev/null +++ b/en_chapters/chapter_rl_sys/robot_learning.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/robot_learning.md] diff --git a/en_chapters/chapter_rl_sys/robot_safety.md b/en_chapters/chapter_rl_sys/robot_safety.md new file mode 100644 index 0000000..08065dc --- /dev/null +++ b/en_chapters/chapter_rl_sys/robot_safety.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/robot_safety.md] diff --git a/en_chapters/chapter_rl_sys/ros.md b/en_chapters/chapter_rl_sys/ros.md new file mode 100644 index 0000000..cb5c9d5 --- /dev/null +++ b/en_chapters/chapter_rl_sys/ros.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/ros.md] diff --git a/en_chapters/chapter_rl_sys/ros_code_ex.md b/en_chapters/chapter_rl_sys/ros_code_ex.md new file mode 100644 index 0000000..0ddec7a --- /dev/null +++ b/en_chapters/chapter_rl_sys/ros_code_ex.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/ros_code_ex.md] diff --git a/en_chapters/chapter_rl_sys/summary.md b/en_chapters/chapter_rl_sys/summary.md new file mode 100644 index 0000000..5b26b41 --- /dev/null +++ b/en_chapters/chapter_rl_sys/summary.md @@ -0,0 +1 @@ +[TODO: src = zh_chapters/chapter_rl_sys/summary.md] diff --git a/config.ini b/en_chapters/config.ini similarity index 80% rename from config.ini rename to en_chapters/config.ini index 9866d7b..de5ecae 100644 --- a/config.ini +++ b/en_chapters/config.ini @@ -22,7 +22,7 @@ notebooks = *.md */*.md resources = img/ references/ # Files that will be skipped. 
-exclusions = */*_origin.md README.md info/* contrib/*md +exclusions = */*_origin.md README.md README_EN.md info/* contrib/*md zh_chapters/*/* # If True (default), then will evaluate the notebook to obtain outputs. eval_notebook = True @@ -65,6 +65,7 @@ sphinx_configs = numfig_format = {'figure': '图%%s', 'table': '表%%s', 'code-b 'figure_align': 'H', 'fncychap': '\\usepackage[Sonny]{fncychap}', } + bibtex_bibfiles = ['references/accelerator.bib', 'references/appendix.bib', 'references/backend.bib', 'references/data.bib', 'references/explainable.bib', 'references/extension.bib', 'references/federated.bib', 'references/frontend.bib', 'references/graph.bib', 'references/interface.bib', 'references/introduction.bib', 'references/model.bib', 'references/model_deployment.bib', 'references/recommender.bib', 'references/reinforcement.bib', 'references/rlsys.bib', 'references/training.bib'] @@ -74,7 +75,7 @@ sphinx_configs = numfig_format = {'figure': '图%%s', 'table': '表%%s', 'code-b # items: name, URL, and a fontawesome icon # (https://fontawesome.com/icons?d=gallery). Items are separated by commas. header_links = GitHub, https://github.com/openmlsys/openmlsys-zh, fab fa-github, - English, https://openmlsys.github.io/html-en, fas fa-external-link-alt + 中文, https://openmlsys.github.io/cn/, fas fa-language favicon = static/favicon.png diff --git a/en_chapters/img b/en_chapters/img new file mode 120000 index 0000000..0af1dd5 --- /dev/null +++ b/en_chapters/img @@ -0,0 +1 @@ +/chivier-disk/hyq-home/Projects/openmlsys-zh/img \ No newline at end of file diff --git a/en_chapters/index.md b/en_chapters/index.md new file mode 100644 index 0000000..01d9a27 --- /dev/null +++ b/en_chapters/index.md @@ -0,0 +1,41 @@ +Machine Learning Systems: Design and Implementation +==================================================== + +```eval_rst +.. 
raw:: html + :file: frontpage.html +``` + +```toc +:maxdepth: 2 +:numbered: + +chapter_preface/index +chapter_introduction/index +chapter_programming_interface/index +chapter_computational_graph/index + +chapter_preface_advanced/index + +chapter_frontend_and_ir/index +chapter_backend_and_runtime/index +chapter_accelerator/index +chapter_data_processing/index +chapter_model_deployment/index +chapter_distributed_training/index + +chapter_preface_extension/index + +chapter_recommender_system/index +chapter_federated_learning/index +chapter_reinforcement_learning/index +chapter_explainable_AI/index +chapter_rl_sys/index + +``` + +```toc +:maxdepth: 1 + +appendix_machine_learning_introduction/index +``` diff --git a/en_chapters/mlsys.bib b/en_chapters/mlsys.bib new file mode 100644 index 0000000..b70a7a8 --- /dev/null +++ b/en_chapters/mlsys.bib @@ -0,0 +1,1308 @@ +@article{rosenblatt1958perceptron, + title={The perceptron: a probabilistic model for information storage and organization in the brain.}, + author={Rosenblatt, Frank}, + journal={Psychological Review}, + volume={65}, + number={6}, + pages={386}, + year={1958}, + publisher={American Psychological Association} +} + +@article{lecun1989backpropagation, + title={Backpropagation applied to handwritten zip code recognition}, + author={LeCun, Yann and Boser, Bernhard and Denker, John S and Henderson, Donnie and Howard, Richard E and Hubbard, Wayne and Jackel, Lawrence D}, + journal={Neural computation}, + volume={1}, + number={4}, + pages={541--551}, + year={1989}, + publisher={MIT Press} +} + +@article{lanctot2017unified, + title={A unified game-theoretic approach to multiagent reinforcement learning}, + author={Lanctot, Marc and Zambaldi, Vinicius and Gruslys, Audrunas and Lazaridou, Angeliki and Tuyls, Karl and P{\'e}rolat, Julien and Silver, David and Graepel, Thore}, + journal={Advances in neural information processing systems}, + volume={30}, + year={2017} +} + + +@article{mnih2013playing, + title={Playing 
atari with deep reinforcement learning}, + author={Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin}, + journal={arXiv preprint arXiv:1312.5602}, + year={2013} +} + +@article{sunehag2017value, + title={Value-decomposition networks for cooperative multi-agent learning}, + author={Sunehag, Peter and Lever, Guy and Gruslys, Audrunas and Czarnecki, Wojciech Marian and Zambaldi, Vinicius and Jaderberg, Max and Lanctot, Marc and Sonnerat, Nicolas and Leibo, Joel Z and Tuyls, Karl and others}, + journal={arXiv preprint arXiv:1706.05296}, + year={2017} +} + + +@inproceedings{rashid2018qmix, + title={Qmix: Monotonic value function factorisation for deep multi-agent reinforcement learning}, + author={Rashid, Tabish and Samvelyan, Mikayel and Schroeder, Christian and Farquhar, Gregory and Foerster, Jakob and Whiteson, Shimon}, + booktitle={International Conference on Machine Learning}, + pages={4295--4304}, + year={2018}, + organization={PMLR} +} + +@inproceedings{foerster2018counterfactual, + title={Counterfactual multi-agent policy gradients}, + author={Foerster, Jakob and Farquhar, Gregory and Afouras, Triantafyllos and Nardelli, Nantas and Whiteson, Shimon}, + booktitle={Proceedings of the AAAI conference on artificial intelligence}, + volume={32}, + number={1}, + year={2018} +} + + +@inproceedings{krizhevsky2012imagenet, + title={Imagenet classification with deep convolutional neural networks}, + author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + booktitle={Advances in Neural Information Processing Systems}, + pages={1097--1105}, + year={2012} +} + +@inproceedings{he2016deep, + title={{Deep Residual Learning for Image Recognition}}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2016} +} + +@article{rumelhart1986learning, + 
title={Learning representations by back-propagating errors}, + author={Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J}, + journal={Nature}, + volume={323}, + number={6088}, + pages={533}, + year={1986}, + publisher={Nature Publishing Group} +} + +@article{Hochreiter1997lstm, + author = {Hochreiter, Sepp and Hochreiter, S and Schmidhuber, J{\"{u}}rgen and Schmidhuber, J}, + isbn = {08997667 (ISSN)}, + issn = {0899-7667}, + journal = {Neural Computation}, + number = {8}, + pages = {1735--80}, + pmid = {9377276}, + title = {{Long Short-Term Memory.}}, + volume = {9}, + year = {1997} +} + +@inproceedings{vaswani2017attention, + title={Attention is all you need}, + author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, + booktitle={Advances in Neural Information Processing Systems}, + pages={5998--6008}, + year={2017} +} + +@article{lecun2015deep, + title={Deep learning}, + author={LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey}, + journal={Nature}, + volume={521}, + number={7553}, + pages={436}, + year={2015}, + publisher={Nature Publishing Group} +} + +@inproceedings{KingmaAdam2014, + title = {{Adam}: A Method for Stochastic Optimization}, + author = {Kingma, Diederik and Ba, Jimmy}, + booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)}, + year = {2014} +} + +@techreport{tieleman2012rmsprop, + title={Divide the gradient by a running average of its recent magnitude. 
COURSERA: Neural networks for machine learning}, + author={Tieleman, T and Hinton, G}, + year={2017}, + institution={Technical Report} +} + +@article{duchi2011adagrad, + title={Adaptive subgradient methods for online learning and stochastic optimization}, + author={Duchi, John and Hazan, Elad and Singer, Yoram}, + journal={Journal of Machine Learning Research (JMLR)}, + volume={12}, + number={Jul}, + pages={2121--2159}, + year={2011} +} + +@inproceedings{meijer2006linq, + title={Linq: reconciling object, relations and xml in the. net framework}, + author={Meijer, Erik and Beckman, Brian and Bierman, Gavin}, + booktitle={Proceedings of the 2006 ACM SIGMOD international conference on Management of data}, + pages={706--706}, + year={2006} +} + +@inproceedings{murray2013naiad, + title={Naiad: a timely dataflow system}, + author={Murray, Derek G and McSherry, Frank and Isaacs, Rebecca and Isard, Michael and Barham, Paul and Abadi, Mart{\'\i}n}, + booktitle={Proceedings of the Twenty-Fourth ACM Symposium on Operating Systems Principles}, + pages={439--455}, + year={2013} +} + +@inproceedings{mnih2016asynchronous, + title={Asynchronous methods for deep reinforcement learning}, + author={Mnih, Volodymyr and Badia, Adria Puigdomenech and Mirza, Mehdi and Graves, Alex and Lillicrap, Timothy and Harley, Tim and Silver, David and Kavukcuoglu, Koray}, + booktitle={International Conference on Machine Learning (ICML)}, + pages={1928--1937}, + year={2016} +} + +@article{espeholt2018impala, + title={Impala: Scalable distributed deep-rl with importance weighted actor-learner architectures}, + author={Espeholt, Lasse and Soyer, Hubert and Munos, Remi and Simonyan, Karen and Mnih, Volodymir and Ward, Tom and Doron, Yotam and Firoiu, Vlad and Harley, Tim and Dunning, Iain and others}, + journal={arXiv preprint arXiv:1802.01561}, + year={2018} +} + +@article{espeholt2019seed, + title={Seed rl: Scalable and efficient deep-rl with accelerated central inference}, + author={Espeholt, Lasse 
and Marinier, Rapha{\"e}l and Stanczyk, Piotr and Wang, Ke and Michalski, Marcin}, + journal={arXiv preprint arXiv:1910.06591}, + year={2019} +} + +@misc{horgan2018distributed, + title={Distributed Prioritized Experience Replay}, + author={Dan Horgan and John Quan and David Budden and Gabriel Barth-Maron and Matteo Hessel and Hado van Hasselt and David Silver}, + year={2018}, + eprint={1803.00933}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +@inproceedings{moritz2018ray, + title={Ray: A distributed framework for emerging $\{$AI$\}$ applications}, + author={Moritz, Philipp and Nishihara, Robert and Wang, Stephanie and Tumanov, Alexey and Liaw, Richard and Liang, Eric and Elibol, Melih and Yang, Zongheng and Paul, William and Jordan, Michael I and others}, + booktitle={13th $\{$USENIX$\}$ Symposium on Operating Systems Design and Implementation ($\{$OSDI$\}$ 18)}, + pages={561--577}, + year={2018} +} + +@inproceedings{zaharia2010spark, + title={Spark: Cluster computing with working sets}, + author={Zaharia, Matei and Chowdhury, Mosharaf and Franklin, Michael J and Shenker, Scott and Stoica, Ion}, + booktitle={2nd USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 10)}, + year={2010} +} + +@article{fetterly2009dryadlinq, + title={DryadLINQ: A system for general-purpose distributed data-parallel computing using a high-level language}, + author={Fetterly, Yuan Yu Michael Isard Dennis and Budiu, Mihai and Erlingsson, {\'U}lfar and Currey, Pradeep Kumar Gunda Jon}, + journal={Proc. LSDS-IR}, + volume={8}, + year={2009} +} + +@article{murray2021tf, + title={tf. 
data: A machine learning data processing framework}, + author={Murray, Derek G and Simsa, Jiri and Klimovic, Ana and Indyk, Ihor}, + journal={arXiv preprint arXiv:2101.12127}, + year={2021} +} + +@article{mohan2020analyzing, + title={Analyzing and mitigating data stalls in dnn training}, + author={Mohan, Jayashree and Phanishayee, Amar and Raniwala, Ashish and Chidambaram, Vijay}, + journal={arXiv preprint arXiv:2007.06775}, + year={2020} +} + +@misc{rmpygil + author = "Sam Gross", + title = "Multithreaded Python without the GIL", + howpublished = "Website", + year = {2021}, + note = {\url{https://docs.google.com/document/d/18CXhDb1ygxg-YXNBJNzfzZsDFosB5e6BfnXLlejd9l0/edit#heading=h.kcngwrty1lv}} +} + +@misc{nvidia_dali + author = "NVIDIA", + title = "DALI", + howpublished = "Website", + year = {2018}, + note = {\url{https://github.com/NVIDIA/DALI}} +} + +@misc{minddata + author = "HuaWei", + title = "Dataset Plugin", + howpublished = "Website", + year = {2020}, + note = {\url{https://gitee.com/mindspore/dataset-plugin}} +} + +@article{liang2017ray, + title={Ray rllib: A composable and scalable reinforcement learning library}, + author={Liang, Eric and Liaw, Richard and Nishihara, Robert and Moritz, Philipp and Fox, Roy and Gonzalez, Joseph and Goldberg, Ken and Stoica, Ion}, + journal={arXiv preprint arXiv:1712.09381}, + pages={85}, + year={2017} +} + +@article{cassirer2021reverb, + title={Reverb: A Framework For Experience Replay}, + author={Cassirer, Albin and Barth-Maron, Gabriel and Brevdo, Eugene and Ramos, Sabela and Boyd, Toby and Sottiaux, Thibault and Kroiss, Manuel}, + journal={arXiv preprint arXiv:2102.04736}, + year={2021} +} + +@article{hoffman2020acme, + title={Acme: A research framework for distributed reinforcement learning}, + author={Hoffman, Matt and Shahriari, Bobak and Aslanides, John and Barth-Maron, Gabriel and Behbahani, Feryal and Norman, Tamara and Abdolmaleki, Abbas and Cassirer, Albin and Yang, Fan and Baumli, Kate and others}, + 
journal={arXiv preprint arXiv:2006.00979}, + year={2020} +} + +@article{ding2020efficient, + title={Efficient Reinforcement Learning Development with RLzoo}, + author={Ding, Zihan and Yu, Tianyang and Huang, Yanhua and Zhang, Hongming and Li, Guo and Guo, Quancheng and Mai, Luo and Dong, Hao}, + journal={arXiv preprint arXiv:2009.08644}, + year={2020} +} + +@article{makoviychuk2021isaac, + title={Isaac Gym: High Performance GPU-Based Physics Simulation For Robot Learning}, + author={Makoviychuk, Viktor and Wawrzyniak, Lukasz and Guo, Yunrong and Lu, Michelle and Storey, Kier and Macklin, Miles and Hoeller, David and Rudin, Nikita and Allshire, Arthur and Handa, Ankur and others}, + journal={arXiv preprint arXiv:2108.10470}, + year={2021} +} + +@article{vinyals2019grandmaster, + title={Grandmaster level in StarCraft II using multi-agent reinforcement learning}, + author={Vinyals, Oriol and Babuschkin, Igor and Czarnecki, Wojciech M and Mathieu, Micha{\"e}l and Dudzik, Andrew and Chung, Junyoung and Choi, David H and Powell, Richard and Ewalds, Timo and Georgiev, Petko and others}, + journal={Nature}, + volume={575}, + number={7782}, + pages={350--354}, + year={2019}, + publisher={Nature Publishing Group} +} + +@article{berner2019dota, + title={Dota 2 with large scale deep reinforcement learning}, + author={Berner, Christopher and Brockman, Greg and Chan, Brooke and Cheung, Vicki and D{\k{e}}biak, Przemys{\l}aw and Dennison, Christy and Farhi, David and Fischer, Quirin and Hashme, Shariq and Hesse, Chris and others}, + journal={arXiv preprint arXiv:1912.06680}, + year={2019} +} + +@article{han2020tstarbot, + title={Tstarbot-x: An open-sourced and comprehensive study for efficient league training in starcraft ii full game}, + author={Han, Lei and Xiong, Jiechao and Sun, Peng and Sun, Xinghai and Fang, Meng and Guo, Qingwei and Chen, Qiaobo and Shi, Tengfei and Yu, Hongsheng and Wu, Xipeng and others}, + journal={arXiv preprint arXiv:2011.13729}, + year={2020} +} + 
+@inproceedings{wang2021scc, + title={SCC: an efficient deep reinforcement learning agent mastering the game of StarCraft II}, + author={Wang, Xiangjun and Song, Junxiao and Qi, Penghui and Peng, Peng and Tang, Zhenkun and Zhang, Wei and Li, Weimin and Pi, Xiongjun and He, Jujie and Gao, Chao and others}, + booktitle={International Conference on Machine Learning}, + pages={10905--10915}, + year={2021}, + organization={PMLR} +} + +@inproceedings{MLSYS2021_979d472a, + author = {Yin, Chunxing and Acun, Bilge and Wu, Carole-Jean and Liu, Xing}, + booktitle = {Proceedings of Machine Learning and Systems}, + editor = {A. Smola and A. Dimakis and I. Stoica}, + pages = {448--462}, + title = {TT-Rec: Tensor Train Compression for Deep Learning Recommendation Models}, + url = {https://proceedings.mlsys.org/paper/2021/file/979d472a84804b9f647bc185a877a8b5-Paper.pdf}, + volume = {3}, + year = {2021} +} + +@inproceedings{MLSYS2020_f7e6c855, + author = {Zhao, Weijie and Xie, Deping and Jia, Ronglai and Qian, Yulei and Ding, Ruiquan and Sun, Mingming and Li, Ping}, + booktitle = {Proceedings of Machine Learning and Systems}, + editor = {I. Dhillon and D. Papailiopoulos and V. 
Sze}, + pages = {412--428}, + title = {Distributed Hierarchical GPU Parameter Server for Massive Scale Deep Learning Ads Systems}, + url = {https://proceedings.mlsys.org/paper/2020/file/f7e6c85504ce6e82442c770f7c8606f0-Paper.pdf}, + volume = {2}, + year = {2020} +} + +@article{zionex, + title={Software-Hardware Co-design for Fast and Scalable Training of Deep Learning Recommendation Models}, + author={Mudigere, Dheevatsa and Hao, Yuchen and Huang, Jianyu and Jia, Zhihao and Tulloch, Andrew and Sridharan, Srinivas and Liu, Xing and Ozdal, Mustafa and Nie, Jade and Park, Jongsoo and others}, + journal={arXiv preprint arXiv:2104.05158}, + year={2021} +} + +@inproceedings{gong2020edgerec, + title={EdgeRec: Recommender System on Edge in Mobile Taobao}, + author={Gong, Yu and Jiang, Ziwen and Feng, Yufei and Hu, Binbin and Zhao, Kaiqi and Liu, Qingwen and Ou, Wenwu}, + booktitle={Proceedings of the 29th ACM International Conference on Information \& Knowledge Management}, + pages={2477--2484}, + year={2020} +} + +@inproceedings{NEURIPS2020_a1d4c20b, + author = {He, Chaoyang and Annavaram, Murali and Avestimehr, Salman}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin}, + pages = {14068--14080}, + publisher = {Curran Associates, Inc.}, + title = {Group Knowledge Transfer: Federated Learning of Large CNNs at the Edge}, + url = {https://proceedings.neurips.cc/paper/2020/file/a1d4c20b182ad7137ab3606f0e3fc8a4-Paper.pdf}, + volume = {33}, + year = {2020} +} + +@INPROCEEDINGS{9355295, + author={Xie, Minhui and Ren, Kai and Lu, Youyou and Yang, Guangxu and Xu, Qingxing and Wu, Bihai and Lin, Jiazhen and Ao, Hongbo and Xu, Wanhong and Shu, Jiwu}, + booktitle={SC20: International Conference for High Performance Computing, Networking, Storage and Analysis}, + title={Kraken: Memory-Efficient Continual Learning for Large-Scale Real-Time Recommendations}, + year={2020}, + volume={}, + number={}, + pages={1-17}, + doi={10.1109/SC41405.2020.00025} +} + +@inproceedings{MLSYS2021_ec895663, + author = {Jiang, Wenqi and He, Zhenhao and Zhang, Shuai and Preu\ss er, Thomas B. and Zeng, Kai and Feng, Liang and Zhang, Jiansong and Liu, Tongxuan and Li, Yong and Zhou, Jingren and Zhang, Ce and Alonso, Gustavo}, + booktitle = {Proceedings of Machine Learning and Systems}, + editor = {A. Smola and A. Dimakis and I. 
Stoica}, + pages = {845--859}, + title = {MicroRec: Efficient Recommendation Inference by Hardware and Data Structure Solutions}, + url = {https://proceedings.mlsys.org/paper/2021/file/ec8956637a99787bd197eacd77acce5e-Paper.pdf}, + volume = {3}, + year = {2021} +} + +@inproceedings{10.1145/3394486.3403059, +author = {Shi, Hao-Jun Michael and Mudigere, Dheevatsa and Naumov, Maxim and Yang, Jiyan}, +title = {Compositional Embeddings Using Complementary Partitions for Memory-Efficient Recommendation Systems}, +year = {2020}, +isbn = {9781450379984}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3394486.3403059}, +doi = {10.1145/3394486.3403059}, +abstract = {}, +booktitle = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining}, +pages = {165–175}, +numpages = {11}, +keywords = {model compression, recommendation systems, embeddings}, +location = {Virtual Event, CA, USA}, +series = {KDD '20} +} + +@misc{ginart2021mixed, + title={Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation Systems}, + author={Antonio Ginart and Maxim Naumov and Dheevatsa Mudigere and Jiyan Yang and James Zou}, + year={2021}, + eprint={1909.11810}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +@inproceedings{10.1145/2020408.2020444, +author = {Chu, Wei and Zinkevich, Martin and Li, Lihong and Thomas, Achint and Tseng, Belle}, +title = {Unbiased Online Active Learning in Data Streams}, +year = {2011}, +isbn = {9781450308137}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/2020408.2020444}, +doi = {10.1145/2020408.2020444}, +abstract = {Unlabeled samples can be intelligently selected for labeling to minimize classification error. 
In many real-world applications, a large number of unlabeled samples arrive in a streaming manner, making it impossible to maintain all the data in a candidate pool. In this work, we focus on binary classification problems and study selective labeling in data streams where a decision is required on each sample sequentially. We consider the unbiasedness property in the sampling process, and design optimal instrumental distributions to minimize the variance in the stochastic process. Meanwhile, Bayesian linear classifiers with weighted maximum likelihood are optimized online to estimate parameters. In empirical evaluation, we collect a data stream of user-generated comments on a commercial news portal in 30 consecutive days, and carry out offline evaluation to compare various sampling strategies, including unbiased active learning, biased variants, and random sampling. Experimental results verify the usefulness of online active learning, especially in the non-stationary situation with concept drift.}, +booktitle = {Proceedings of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, +pages = {195–203}, +numpages = {9}, +keywords = {unbiasedness, bayesian online learning, active learning, data streaming, adaptive importance sampling}, +location = {San Diego, California, USA}, +series = {KDD '11} +} + +@inproceedings{10.1145/3267809.3267817, +author = {Tian, Huangshi and Yu, Minchen and Wang, Wei}, +title = {Continuum: A Platform for Cost-Aware, Low-Latency Continual Learning}, +year = {2018}, +isbn = {9781450360111}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3267809.3267817}, +doi = {10.1145/3267809.3267817}, +abstract = {Many machine learning applications operate in dynamic environments that change over time, in which models must be continually updated to capture the recent trend in data. 
However, most of today's learning frameworks perform training offline, without a system support for continual model updating.In this paper, we design and implement Continuum, a general-purpose platform that streamlines the implementation and deployment of continual model updating across existing learning frameworks. In pursuit of fast data incorporation, we further propose two update policies, cost-aware and best-effort, that judiciously determine when to perform model updating, with and without accounting for the training cost (machine-time), respectively. Theoretical analysis shows that cost-aware policy is 2-competitive. We implement both polices in Continuum, and evaluate their performance through EC2 deployment and trace-driven simulations. The evaluation shows that Continuum results in reduced data incorporation latency, lower training cost, and improved model quality in a number of popular online learning applications that span multiple application domains, programming languages, and frameworks.}, +booktitle = {Proceedings of the ACM Symposium on Cloud Computing}, +pages = {26–40}, +numpages = {15}, +keywords = {Competitive Analysis, Continual Learning System, Online Algorithm}, +location = {Carlsbad, CA, USA}, +series = {SoCC '18} +} + +@inproceedings{10.1145/2648584.2648589, +author = {He, Xinran and Pan, Junfeng and Jin, Ou and Xu, Tianbing and Liu, Bo and Xu, Tao and Shi, Yanxin and Atallah, Antoine and Herbrich, Ralf and Bowers, Stuart and Candela, Joaquin Qui\~{n}onero}, +title = {Practical Lessons from Predicting Clicks on Ads at Facebook}, +year = {2014}, +isbn = {9781450329996}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/2648584.2648589}, +doi = {10.1145/2648584.2648589}, +abstract = {Online advertising allows advertisers to only bid and pay for measurable user responses, such as clicks on ads. 
As a consequence, click prediction systems are central to most online advertising systems. With over 750 million daily active users and over 1 million active advertisers, predicting clicks on Facebook ads is a challenging machine learning task. In this paper we introduce a model which combines decision trees with logistic regression, outperforming either of these methods on its own by over 3%, an improvement with significant impact to the overall system performance. We then explore how a number of fundamental parameters impact the final prediction performance of our system. Not surprisingly, the most important thing is to have the right features: those capturing historical information about the user or ad dominate other types of features. Once we have the right features and the right model (decisions trees plus logistic regression), other factors play small roles (though even small improvements are important at scale). Picking the optimal handling for data freshness, learning rate schema and data sampling improve the model slightly, though much less than adding a high-value feature, or picking the right model to begin with.}, +booktitle = {Proceedings of the Eighth International Workshop on Data Mining for Online Advertising}, +pages = {1–9}, +numpages = {9}, +location = {New York, NY, USA}, +series = {ADKDD'14} +} + +@misc{2017NVIDIA, + author={NVIDIA}, + title={NVIDIA Tesla V100 GPU Architecture: The World's Most Advanced Datacenter GPU}, + year={2017}, + howpublished = "Website", + note = {\url{http://www.nvidia.com/object/volta-architecture-whitepaper.html}} +} + +@inproceedings{2021Ascend, + title={Ascend: a Scalable and Unified Architecture for Ubiquitous Deep Neural Network Computing : Industry Track Paper}, + author={Liao, Heng and Tu, Jiajin and Xia, Jing and Liu, Hu and Zhou, Xiping and Yuan, Honghui and Hu, Yuxing}, + booktitle={2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA)}, + year={2021}, + pages = {789–801}, + doi 
= {10.1109/HPCA51647.2021.00071}, +} + +@article{2018Modeling, + title={Modeling Deep Learning Accelerator Enabled GPUs}, + author={Raihan, M. A. and Goli, N. and Aamodt, T.}, + journal={arXiv e-prints arXiv:1811.08309}, + year={2018} +} + +@book{2007Engineering, + title={Engineering a Compiler}, + author={ Cooper, Keith D. and Torczon, Linda }, + publisher={Morgan Kaufmann}, + year={2007}, +} + +@article{ragan2013halide, + title={Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines}, + author={Ragan-Kelley, Jonathan and Barnes, Connelly and Adams, Andrew and Paris, Sylvain and Durand, Fr{\'e}do and Amarasinghe, Saman}, + journal={Acm Sigplan Notices}, + volume={48}, + number={6}, + pages={519--530}, + year={2013}, + publisher={ACM New York, NY, USA} +} + +@inproceedings{verdoolaege2010isl, + title={isl: An integer set library for the polyhedral model}, + author={Verdoolaege, Sven}, + booktitle={International Congress on Mathematical Software}, + pages={299--302}, + year={2010}, + organization={Springer} +} + +@article{chen2018tvm, + title={TVM: end-to-end optimization stack for deep learning}, + author={Chen, Tianqi and Moreau, Thierry and Jiang, Ziheng and Shen, Haichen and Yan, Eddie Q and Wang, Leyuan and Hu, Yuwei and Ceze, Luis and Guestrin, Carlos and Krishnamurthy, Arvind}, + journal={arXiv preprint arXiv:1802.04799}, + volume={11}, + pages={20}, + year={2018}, + publisher={CoRR} +} + +@inproceedings{zheng2020ansor, + title={Ansor: Generating $\{$High-Performance$\}$ Tensor Programs for Deep Learning}, + author={Zheng, Lianmin and Jia, Chengfan and Sun, Minmin and Wu, Zhao and Yu, Cody Hao and Haj-Ali, Ameer and Wang, Yida and Yang, Jun and Zhuo, Danyang and Sen, Koushik and others}, + booktitle={14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)}, + pages={863--879}, + year={2020} +} + +@inproceedings{zhao2021akg, + title={AKG: automatic kernel generation 
for neural processing units using polyhedral transformations}, + author={Zhao, Jie and Li, Bojie and Nie, Wang and Geng, Zhen and Zhang, Renwei and Gao, Xiong and Cheng, Bin and Wu, Chen and Cheng, Yun and Li, Zheng and others}, + booktitle={Proceedings of the 42nd ACM SIGPLAN International Conference on Programming Language Design and Implementation}, + pages={1233--1248}, + year={2021} +} + +@article{lattner2020mlir, + title={MLIR: A compiler infrastructure for the end of Moore's law}, + author={Lattner, Chris and Amini, Mehdi and Bondhugula, Uday and Cohen, Albert and Davis, Andy and Pienaar, Jacques and Riddle, River and Shpeisman, Tatiana and Vasilache, Nicolas and Zinenko, Oleksandr}, + journal={arXiv preprint arXiv:2002.11054}, + year={2020} +} + +@article{vasilache2022composable, + title={Composable and Modular Code Generation in MLIR: A Structured and Retargetable Approach to Tensor Compiler Construction}, + author={Vasilache, Nicolas and Zinenko, Oleksandr and Bik, Aart JC and Ravishankar, Mahesh and Raoux, Thomas and Belyaev, Alexander and Springer, Matthias and Gysi, Tobias and Caballero, Diego and Herhut, Stephan and others}, + journal={arXiv preprint arXiv:2202.03293}, + year={2022} +} + +@inproceedings{bastoul2004code, + title={Code generation in the polyhedral model is easier than you think}, + author={Bastoul, C{\'e}dric}, + booktitle={Proceedings. 13th International Conference on Parallel Architecture and Compilation Techniques, 2004. 
PACT 2004.}, + pages={7--16}, + year={2004}, + organization={IEEE} +} + +@ARTICLE{2020tkde_li, + author={Li, Xiao-Hui and Cao, Caleb Chen and Shi, Yuhan and Bai, Wei and Gao, Han and Qiu, Luyu and Wang, Cong and Gao, Yuanyuan and Zhang, Shenjia and Xue, Xun and Chen, Lei}, + journal={IEEE Transactions on Knowledge and Data Engineering}, + title={A Survey of Data-driven and Knowledge-aware eXplainable AI}, + year={2020}, + volume={}, + number={}, + pages={1-1}, + doi={10.1109/TKDE.2020.2983930} +} + +@article{erhan2009visualizing, + title={Visualizing higher-layer features of a deep network}, + author={Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Vincent, Pascal}, + journal={University of Montreal}, + volume={1341}, + number={3}, + pages={1}, + year={2009} +} + +@misc{kim2018interpretability, + title={Interpretability Beyond Feature Attribution: Quantitative Testing with Concept Activation Vectors (TCAV)}, + author={Been Kim and Martin Wattenberg and Justin Gilmer and Carrie Cai and James Wexler and Fernanda Viegas and Rory Sayres}, + year={2018}, + eprint={1711.11279}, + archivePrefix={arXiv}, + primaryClass={stat.ML} +} + +@article{riedl2019human, + title={Human-centered artificial intelligence and machine learning}, + author={Riedl, Mark O.}, + journal={Human Behavior and Emerging Technologies}, + volume={1}, + number={1}, + pages={33--36}, + year={2019}, + publisher={Wiley Online Library} + +} + +@inproceedings{10.1145/3460231.3474255, +author = {de Souza Pereira Moreira, Gabriel and Rabhi, Sara and Lee, Jeong Min and Ak, Ronay and Oldridge, Even}, +title = {Transformers4Rec: Bridging the Gap between NLP and Sequential / Session-Based Recommendation}, +year = {2021}, +isbn = {9781450384582}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3460231.3474255}, +doi = {10.1145/3460231.3474255}, +abstract = {}, +booktitle = {Fifteenth ACM Conference on Recommender Systems}, +pages = 
{143–153}, +numpages = {11}, +location = {Amsterdam, Netherlands}, +series = {RecSys '21} +} + +@inproceedings{10.1145/3124749.3124754, +author = {Wang, Ruoxi and Fu, Bin and Fu, Gang and Wang, Mingliang}, +title = {Deep & Cross Network for Ad Click Predictions}, +year = {2017}, +isbn = {9781450351942}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3124749.3124754}, +doi = {10.1145/3124749.3124754}, +abstract = {Feature engineering has been the key to the success of many prediction models. However, the process is nontrivial and often requires manual feature engineering or exhaustive searching. DNNs are able to automatically learn feature interactions; however, they generate all the interactions implicitly, and are not necessarily efficient in learning all types of cross features. In this paper, we propose the Deep & Cross Network (DCN) which keeps the benefits of a DNN model, and beyond that, it introduces a novel cross network that is more efficient in learning certain bounded-degree feature interactions. In particular, DCN explicitly applies feature crossing at each layer, requires no manual feature engineering, and adds negligible extra complexity to the DNN model. 
Our experimental results have demonstrated its superiority over the state-of-art algorithms on the CTR prediction dataset and dense classification dataset, in terms of both model accuracy and memory usage.}, +booktitle = {Proceedings of the ADKDD'17}, +articleno = {12}, +numpages = {7}, +keywords = {CTR Prediction, Deep Learning, Neural Networks, Feature Crossing}, +location = {Halifax, NS, Canada}, +series = {ADKDD'17} +} + +@inproceedings{ijcai2017-239, + author = {Huifeng Guo and Ruiming TANG and Yunming Ye and Zhenguo Li and Xiuqiang He}, + title = {DeepFM: A Factorization-Machine based Neural Network for CTR Prediction}, + booktitle = {Proceedings of the Twenty-Sixth International Joint Conference on + Artificial Intelligence, {IJCAI-17}}, + pages = {1725--1731}, + year = {2017}, + doi = {10.24963/ijcai.2017/239}, + url = {https://doi.org/10.24963/ijcai.2017/239}, +} + +@article{naumov2019deep, + title={Deep learning recommendation model for personalization and recommendation systems}, + author={Naumov, Maxim and Mudigere, Dheevatsa and Shi, Hao-Jun Michael and Huang, Jianyu and Sundaraman, Narayanan and Park, Jongsoo and Wang, Xiaodong and Gupta, Udit and Wu, Carole-Jean and Azzolini, Alisson G and others}, + journal={arXiv preprint arXiv:1906.00091}, + year={2019} +} + +@inproceedings{NIPS2015_86df7dcf, + author = {Sculley, D. and Holt, Gary and Golovin, Daniel and Davydov, Eugene and Phillips, Todd and Ebner, Dietmar and Chaudhary, Vinay and Young, Michael and Crespo, Jean-Fran\c{c}ois and Dennison, Dan}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. 
Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Hidden Technical Debt in Machine Learning Systems}, + url = {https://proceedings.neurips.cc/paper/2015/file/86df7dcfd896fcaf2674f757a2463eba-Paper.pdf}, + volume = {28}, + year = {2015} +} + +@misc{Merlin, + note={Accessed on 2022-03-24}, + author = {NVIDIA}, + year = {2022}, + title = {{{NVIDIA Merlin}}}, + howpublished = {\url{https://github.com/NVIDIA-Merlin/Merlin}}, +} + +@misc{NVTabular, + note={Accessed on 2022-03-24}, + author = {NVIDIA}, + year = {2022}, + title = {{{NVIDIA NVTabular}}}, + howpublished = {\url{https://github.com/NVIDIA-Merlin/NVTabular}}, +} + +@misc{HugeCTR, + note={Accessed on 2022-03-24}, + author = {NVIDIA}, + year = {2022}, + title = {{{NVIDIA HugeCTR}}}, + howpublished = {\url{https://github.com/NVIDIA-Merlin/HugeCTR}}, +} + +@misc{Triton, + note={Accessed on 2022-03-24}, + author = {NVIDIA}, + year = {2022}, + title = {{{NVIDIA Triton}}}, + howpublished = {\url{https://github.com/triton-inference-server/server}}, +} + +@inproceedings{10.1145/3437801.3441578, +author = {Fang, Jiarui and Yu, Yang and Zhao, Chengduo and Zhou, Jie}, +title = {TurboTransformers: An Efficient GPU Serving System for Transformer Models}, +year = {2021}, +isbn = {9781450382946}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3437801.3441578}, +doi = {10.1145/3437801.3441578}, +abstract = {The transformer is the most critical algorithm innovation of the Nature Language Processing (NLP) field in recent years. Unlike the Recurrent Neural Network (RNN) models, transformers are able to process on dimensions of sequence lengths in parallel, therefore leads to better accuracy on long sequences. However, efficient deployments of them for online services in data centers equipped with GPUs are not easy. 
First, more computation introduced by transformer structures makes it more challenging to meet the latency and throughput constraints of serving. Second, NLP tasks take in sentences of variable length. The variability of input dimensions brings a severe problem to efficient memory management and serving optimization.To solve the above challenges, this paper designed a transformer serving system called TurboTransformers, which consists of a computing runtime and a serving framework. Three innovative features make it stand out from other similar works. An efficient parallel algorithm is proposed for GPU-based batch reduction operations, like Softmax and LayerNorm, which are major hot spots besides BLAS routines. A memory allocation algorithm, which better balances the memory footprint and allocation/free efficiency, is designed for variable-length input situations. A serving framework equipped with a new batch scheduler using dynamic programming achieves the optimal throughput on variable-length requests. 
The system can achieve the state-of-the-art transformer model serving performance on GPU platforms and can be seamlessly integrated into your PyTorch code with a few lines of code.}, +booktitle = {Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, +pages = {389–402}, +numpages = {14}, +keywords = {serving system, deep learning runtime, GPU, transformers}, +location = {Virtual Event, Republic of Korea}, +series = {PPoPP '21} +} + +@inproceedings{wang-etal-2021-lightseq, + title = "{L}ight{S}eq: A High Performance Inference Library for Transformers", + author = "Wang, Xiaohui and + Xiong, Ying and + Wei, Yang and + Wang, Mingxuan and + Li, Lei", + booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers", + month = jun, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.naacl-industry.15", + doi = "10.18653/v1/2021.naacl-industry.15", + pages = "113--120", + abstract = "Transformer and its variants have achieved great success in natural language processing. Since Transformer models are huge in size, serving these models is a challenge for real industrial applications. In this paper, we propose , a highly efficient inference library for models in the Transformer family. includes a series of GPU optimization techniques to both streamline the computation of Transformer layers and reduce memory footprint. supports models trained using PyTorch and Tensorflow. Experimental results on standard machine translation benchmarks show that achieves up to 14x speedup compared with TensorFlow and 1.4x speedup compared with , a concurrent CUDA implementation. 
The code will be released publicly after the review.", +} + +@inproceedings{quigley2009ros, + title={ROS: an open-source Robot Operating System}, + author={Quigley, Morgan and Conley, Ken and Gerkey, Brian and Faust, Josh and Foote, Tully and Leibs, Jeremy and Wheeler, Rob and Ng, Andrew Y and others}, + booktitle={ICRA workshop on open source software}, + volume={3}, + number={3.2}, + pages={5}, + year={2009}, + organization={Kobe, Japan} +} + +@inproceedings{maruyama2016exploring, + title={Exploring the performance of ROS2}, + author={Maruyama, Yuya and Kato, Shinpei and Azumi, Takuya}, + booktitle={Proceedings of the 13th ACM SIGBED International Conference on Embedded Software (EMSOFT)}, + pages={1--10}, + year={2016} +} + +@inproceedings{ding2019camnet, + title={CamNet: Coarse-to-fine retrieval for camera re-localization}, + author={Ding, Mingyu and Wang, Zhe and Sun, Jiankai and Shi, Jianping and Luo, Ping}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={2871--2880}, + year={2019} +} + +@inproceedings{yi2020segvoxelnet, + title={Segvoxelnet: Exploring semantic context and depth-aware features for 3d vehicle detection from point cloud}, + author={Yi, Hongwei and Shi, Shaoshuai and Ding, Mingyu and Sun, Jiankai and Xu, Kui and Zhou, Hui and Wang, Zhe and Li, Sheng and Wang, Guoping}, + booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)}, + pages={2274--2280}, + year={2020}, + organization={IEEE} +} + +@ARTICLE{9712373, author={Sun, Jiankai and Huang, De-An and Lu, Bo and Liu, Yun-Hui and Zhou, Bolei and Garg, Animesh}, journal={IEEE Robotics and Automation Letters}, title={PlaTe: Visually-Grounded Planning With Transformers in Procedural Tasks}, year={2022}, volume={7}, number={2}, pages={4924-4930}, doi={10.1109/LRA.2022.3150855}} + +@inproceedings{li2018undeepvo, + title={Undeepvo: Monocular visual odometry through unsupervised deep learning}, + author={Li, Ruihao and Wang, Sen and 
Long, Zhiqiang and Gu, Dongbing}, + booktitle={2018 IEEE international conference on robotics and automation (ICRA)}, + pages={7286--7291}, + year={2018}, + organization={IEEE} +} + +@inproceedings{quintero2021motion, + title={Motion planning via bayesian learning in the dark}, + author={Quintero-Pena, Carlos and Chamzas, Constantinos and Unhelkar, Vaibhav and Kavraki, Lydia E}, + booktitle={ICRA: Workshop on Machine Learning for Motion Planning}, + year={2021} +} + +@MISC{ML4KP, +author = {Edgar Granados and Aravind Sivaramakrishnan and Troy McMahon and Zakary Littlefield and Kostas E. Bekris}, +title = {Machine Learning for Kinodynamic Planning (ML4KP)}, +howpublished = {\url{https://github.com/PRX-Kinodynamic/ML4KP}}, +year = {2021} +} + + + +@article{aradi2020survey, + title={Survey of deep reinforcement learning for motion planning of autonomous vehicles}, + author={Aradi, Szil{\'a}rd}, + journal={IEEE Transactions on Intelligent Transportation Systems}, + year={2020}, + publisher={IEEE} +} + +@article{vianna2021neural, + title={Neural Network Based Model Predictive Control for an Autonomous Vehicle}, + author={Vianna, Maria Luiza Costa and Goubault, Eric and Putot, Sylvie}, + journal={arXiv preprint arXiv:2107.14573}, + year={2021} +} + +@article{qiu2021egocentric, + title={Egocentric Human Trajectory Forecasting with a Wearable Camera and Multi-Modal Fusion}, + author={Qiu, Jianing and Chen, Lipeng and Gu, Xiao and Lo, Frank P-W and Tsai, Ya-Yen and Sun, Jiankai and Liu, Jiaqi and Lo, Benny}, + journal={arXiv preprint arXiv:2111.00993}, + year={2021} +} + +@InProceedings{pmlr-v155-huang21a, + title = {Learning a Decision Module by Imitating Driver’s Control Behaviors}, + author = {Huang, Junning and Xie, Sirui and Sun, Jiankai and Ma, Qiurui and Liu, Chunxiao and Lin, Dahua and Zhou, Bolei}, + booktitle = {Proceedings of the 2020 Conference on Robot Learning}, + pages = {1--10}, + year = {2021}, + editor = {Kober, Jens and Ramos, Fabio and Tomlin, 
Claire}, + volume = {155}, + series = {Proceedings of Machine Learning Research}, + month = {16--18 Nov}, + publisher = {PMLR}, + pdf = {https://proceedings.mlr.press/v155/huang21a/huang21a.pdf}, + url = {https://proceedings.mlr.press/v155/huang21a.html}, + abstract = {Autonomous driving systems have a pipeline of perception, decision, planning, and control. The decision module processes information from the perception module and directs the execution of downstream planning and control modules. On the other hand, the recent success of deep learning suggests that this pipeline could be replaced by end-to-end neural control policies, however, safety cannot be well guaranteed for the data-driven neural networks. In this work, we propose a hybrid framework to learn neural decisions in the classical modular pipeline through end-to-end imitation learning. This hybrid framework can preserve the merits of the classical pipeline such as the strict enforcement of physical and logical constraints while learning complex driving decisions from data. To circumvent the ambiguous annotation of human driving decisions, our method learns high-level driving decisions by imitating low-level control behaviors. We show in the simulation experiments that our modular driving agent can generalize its driving decision and control to various complex scenarios where the rule-based programs fail. It can also generate smoother and safer driving trajectories than end-to-end neural policies. 
Demo and code are available at https://decisionforce.github.io/modulardecision/.} +} + + +@InProceedings{pmlr-v155-sun21a, + title = {Neuro-Symbolic Program Search for Autonomous Driving Decision Module Design}, + author = {Sun, Jiankai and Sun, Hao and Han, Tian and Zhou, Bolei}, + booktitle = {Proceedings of the 2020 Conference on Robot Learning}, + pages = {21--30}, + year = {2021}, + editor = {Kober, Jens and Ramos, Fabio and Tomlin, Claire}, + volume = {155}, + series = {Proceedings of Machine Learning Research}, + month = {16--18 Nov}, + publisher = {PMLR}, + pdf = {https://proceedings.mlr.press/v155/sun21a/sun21a.pdf}, + url = {https://proceedings.mlr.press/v155/sun21a.html}, + abstract = {As a promising topic in cognitive robotics, neuro-symbolic modeling integrates symbolic reasoning and neural representation altogether. However, previous neuro-symbolic models usually wire their structures and the connections manually, making the underlying parameters sub-optimal. In this work, we propose the Neuro-Symbolic Program Search (NSPS) to improve the autonomous driving system design. NSPS is a novel automated search method that synthesizes the Neuro-Symbolic Programs. It can produce robust and expressive Neuro-Symbolic Programs and automatically tune the hyper-parameters. We validate NSPS in the CARLA driving simulation environment. The resulting Neuro-Symbolic Decision Programs successfully handle multiple traffic scenarios. 
Compared with previous neural-network-based driving and rule-based methods, our neuro-symbolic driving pipeline achieves more stable and safer behaviors in complex driving scenarios while maintaining an interpretable symbolic decision-making process.} +} + +@ARTICLE{9491826, author={Lu, Sidi and Shi, Weisong}, journal={IEEE Internet Computing}, title={The Emergence of Vehicle Computing}, year={2021}, volume={25}, number={3}, pages={18-22}, doi={10.1109/MIC.2021.3066076}} + +@article{benekohal1988carsim, + title={CARSIM: Car-following model for simulation of traffic in normal and stop-and-go conditions}, + author={Benekohal, Rahim F and Treiterer, Joseph}, + journal={Transportation research record}, + volume={1194}, + pages={99--111}, + year={1988}, + publisher={SAGE Publishing} +} + +@book{buehler2009darpa, + title={The DARPA urban challenge: autonomous vehicles in city traffic}, + author={Buehler, Martin and Iagnemma, Karl and Singh, Sanjiv}, + volume={56}, + year={2009}, + publisher={springer} +} + + +@InProceedings{pmlr-v100-bansal20a, + title = {Combining Optimal Control and Learning for Visual Navigation in Novel Environments}, + author = {Bansal, Somil and Tolani, Varun and Gupta, Saurabh and Malik, Jitendra and Tomlin, Claire}, + booktitle = {Proceedings of the Conference on Robot Learning}, + pages = {420--429}, + year = {2020}, + editor = {Kaelbling, Leslie Pack and Kragic, Danica and Sugiura, Komei}, + volume = {100}, + series = {Proceedings of Machine Learning Research}, + month = {30 Oct--01 Nov}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v100/bansal20a/bansal20a.pdf}, + url = {https://proceedings.mlr.press/v100/bansal20a.html}, + abstract = {Model-based control is a popular paradigm for robot navigation because it can leverage a known dynamics model to efficiently plan robust robot trajectories. 
However, it is challenging to use model-based methods in settings where the environment is a priori unknown and can only be observed partially through onboard sensors on the robot. In this work, we address this short-coming by coupling model-based control with learning-based perception. The learning-based perception module produces a series of waypoints that guide the robot to the goal via a collision-free path. These waypoints are used by a model-based planner to generate a smooth and dynamically feasible trajectory that is executed on the physical system using feedback control. Our experiments in simulated real-world cluttered environments and on an actual ground vehicle demonstrate that the proposed approach can reach goal locations more reliably and efficiently in novel environments as compared to purely geometric mapping-based or end-to-end learning-based alternatives. Our approach does not rely on detailed explicit 3D maps of the environment, works well with low frame rates, and generalizes well from simulation to the real world. 
Videos describing our approach and experiments are available on the project website4.} +} + +@article{levine2018learning, + title={Learning hand-eye coordination for robotic grasping with deep learning and large-scale data collection}, + author={Levine, Sergey and Pastor, Peter and Krizhevsky, Alex and Ibarz, Julian and Quillen, Deirdre}, + journal={The International journal of robotics research}, + volume={37}, + number={4-5}, + pages={421--436}, + year={2018}, + publisher={SAGE Publications Sage UK: London, England} +} + +@incollection{peters2016robot, + title={Robot learning}, + author={Peters, Jan and Lee, Daniel D and Kober, Jens and Nguyen-Tuong, Duy and Bagnell, J Andrew and Schaal, Stefan}, + booktitle={Springer Handbook of Robotics}, + pages={357--398}, + year={2016}, + publisher={Springer} +} + +@article{saxena2014robobrain, + title={Robobrain: Large-scale knowledge engine for robots}, + author={Saxena, Ashutosh and Jain, Ashesh and Sener, Ozan and Jami, Aditya and Misra, Dipendra K and Koppula, Hema S}, + journal={arXiv preprint arXiv:1412.0691}, + year={2014} +} + +@inproceedings{zhu2017target, + title={Target-driven visual navigation in indoor scenes using deep reinforcement learning}, + author={Zhu, Yuke and Mottaghi, Roozbeh and Kolve, Eric and Lim, Joseph J and Gupta, Abhinav and Fei-Fei, Li and Farhadi, Ali}, + booktitle={2017 IEEE international conference on robotics and automation (ICRA)}, + pages={3357--3364}, + year={2017}, + organization={IEEE} +} + +@ARTICLE{9123682, author={Pan, Bowen and Sun, Jiankai and Leung, Ho Yin Tiga and Andonian, Alex and Zhou, Bolei}, journal={IEEE Robotics and Automation Letters}, title={Cross-View Semantic Segmentation for Sensing Surroundings}, year={2020}, volume={5}, number={3}, pages={4867-4873}, doi={10.1109/LRA.2020.3004325}} + +@article{tang2018ba, + title={Ba-net: Dense bundle adjustment network}, + author={Tang, Chengzhou and Tan, Ping}, + journal={arXiv preprint arXiv:1806.04807}, + year={2018} +} + 
+@inproceedings{tanaka2021learning, + title={Learning To Bundle-Adjust: A Graph Network Approach to Faster Optimization of Bundle Adjustment for Vehicular SLAM}, + author={Tanaka, Tetsuya and Sasagawa, Yukihiro and Okatani, Takayuki}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={6250--6259}, + year={2021} +} + +@inproceedings{tobin2017domain, + title={Domain randomization for transferring deep neural networks from simulation to the real world}, + author={Tobin, Josh and Fong, Rachel and Ray, Alex and Schneider, Jonas and Zaremba, Wojciech and Abbeel, Pieter}, + booktitle={2017 IEEE/RSJ international conference on intelligent robots and systems (IROS)}, + pages={23--30}, + year={2017}, + organization={IEEE} +} + +@inproceedings{finn2017deep, + title={Deep visual foresight for planning robot motion}, + author={Finn, Chelsea and Levine, Sergey}, + booktitle={2017 IEEE International Conference on Robotics and Automation (ICRA)}, + pages={2786--2793}, + year={2017}, + organization={IEEE} +} + +@article{duan2017one, + title={One-shot imitation learning}, + author={Duan, Yan and Andrychowicz, Marcin and Stadie, Bradly and Jonathan Ho, OpenAI and Schneider, Jonas and Sutskever, Ilya and Abbeel, Pieter and Zaremba, Wojciech}, + journal={Advances in neural information processing systems}, + volume={30}, + year={2017} +} + +@book{koubaa2017robot, + title={Robot Operating System (ROS).}, + author={Koub{\^a}a, Anis and others}, + volume={1}, + year={2017}, + publisher={Springer} +} + +@article{coleman2014reducing, + title={Reducing the barrier to entry of complex robotic software: a moveit! 
case study}, + author={Coleman, David and Sucan, Ioan and Chitta, Sachin and Correll, Nikolaus}, + journal={arXiv preprint arXiv:1404.3785}, + year={2014} +} + +@inproceedings{salzmann2020trajectron++, + title={Trajectron++: Dynamically-feasible trajectory forecasting with heterogeneous data}, + author={Salzmann, Tim and Ivanovic, Boris and Chakravarty, Punarjay and Pavone, Marco}, + booktitle={European Conference on Computer Vision}, + pages={683--700}, + year={2020}, + organization={Springer} +} + +@inproceedings{gog2021pylot, + title={Pylot: A modular platform for exploring latency-accuracy tradeoffs in autonomous vehicles}, + author={Gog, Ionel and Kalra, Sukrit and Schafhalter, Peter and Wright, Matthew A and Gonzalez, Joseph E and Stoica, Ion}, + booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)}, + pages={8806--8813}, + year={2021}, + organization={IEEE} +} + +@inproceedings{Dosovitskiy17, + title = { {CARLA}: {An} Open Urban Driving Simulator}, + author = {Alexey Dosovitskiy and German Ros and Felipe Codevilla and Antonio Lopez and Vladlen Koltun}, + booktitle = {Proceedings of the 1st Annual Conference on Robot Learning}, + pages = {1--16}, + year = {2017} +} + +@inproceedings{10.1145/3492321.3519576, +author = {Gog, Ionel and Kalra, Sukrit and Schafhalter, Peter and Gonzalez, Joseph E. and Stoica, Ion}, +title = {D3: A Dynamic Deadline-Driven Approach for Building Autonomous Vehicles}, +year = {2022}, +isbn = {9781450391627}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3492321.3519576}, +doi = {10.1145/3492321.3519576}, +abstract = {Autonomous vehicles (AVs) must drive across a variety of challenging environments that impose continuously-varying deadlines and runtime-accuracy tradeoffs on their software pipelines. 
A deadline-driven execution of such AV pipelines requires a new class of systems that enable the computation to maximize accuracy under dynamically-varying deadlines. Designing these systems presents interesting challenges that arise from combining ease-of-development of AV pipelines with deadline specification and enforcement mechanisms.Our work addresses these challenges through D3 (Dynamic Deadline-Driven), a novel execution model that centralizes the deadline management, and allows applications to adjust their computation by modeling missed deadlines as exceptions. Further, we design and implement ERDOS, an open-source realization of D3 for AV pipelines that exposes finegrained execution events to applications, and provides mechanisms to speculatively execute computation and enforce deadlines between an arbitrary set of events. Finally, we address the crucial lack of AV benchmarks through our state-of-the-art open-source AV pipeline, Pylot, that works seamlessly across simulators and real AVs. 
We evaluate the efficacy of D3 and ERDOS by driving Pylot across challenging driving scenarios spanning 50km, and observe a 68% reduction in collisions as compared to prior execution models.}, +booktitle = {Proceedings of the Seventeenth European Conference on Computer Systems}, +pages = {453–471}, +numpages = {19}, +location = {Rennes, France}, +series = {EuroSys '22} +} + +@article{li2021metadrive, + author = {Li, Quanyi and Peng, Zhenghao and Xue, Zhenghai and Zhang, Qihang and Zhou, Bolei}, + journal = {ArXiv preprint}, + title = {Metadrive: Composing diverse driving scenarios for generalizable reinforcement learning}, + url = {https://arxiv.org/abs/2109.12674}, + volume = {abs/2109.12674}, + year = {2021} +} + +@article{peng2021learning, + author = {Peng, Zhenghao and Li, Quanyi and Hui, Ka Ming and Liu, Chunxiao and Zhou, Bolei}, + journal = {Advances in Neural Information Processing Systems}, + title = {Learning to Simulate Self-Driven Particles System with Coordinated Policy Optimization}, + volume = {34}, + year = {2021} +} + + +@inproceedings{peng2021safe, + author = {Peng, Zhenghao and Li, Quanyi and Liu, Chunxiao and Zhou, Bolei}, + booktitle = {5th Annual Conference on Robot Learning}, + title = {Safe Driving via Expert Guided Policy Optimization}, + year = {2021} +} + +@ARTICLE{8421746, author={Qin, Tong and Li, Peiliang and Shen, Shaojie}, journal={IEEE Transactions on Robotics}, title={VINS-Mono: A Robust and Versatile Monocular Visual-Inertial State Estimator}, year={2018}, volume={34}, number={4}, pages={1004-1020}, doi={10.1109/TRO.2018.2853729}} + +@article{campos2021orb, + title={Orb-slam3: An accurate open-source library for visual, visual--inertial, and multimap slam}, + author={Campos, Carlos and Elvira, Richard and Rodr{\'\i}guez, Juan J G{\'o}mez and Montiel, Jos{\'e} MM and Tard{\'o}s, Juan D}, + journal={IEEE Transactions on Robotics}, + volume={37}, + number={6}, + pages={1874--1890}, + year={2021}, + publisher={IEEE} +} + 
+@inproceedings{li2021efficient, + author = {Li, Quanyi and Peng, Zhenghao and Zhou, Bolei}, + booktitle = {International Conference on Learning Representations}, + title = {Efficient Learning of Safe Driving Policy via Human-AI Copilot Optimization}, + year = {2021} +} + +@article{chaplot2020learning, + title={Learning to explore using active neural slam}, + author={Chaplot, Devendra Singh and Gandhi, Dhiraj and Gupta, Saurabh and Gupta, Abhinav and Salakhutdinov, Ruslan}, + journal={arXiv preprint arXiv:2004.05155}, + year={2020} +} + +@article{teed2021droid, + title={Droid-slam: Deep visual slam for monocular, stereo, and rgb-d cameras}, + author={Teed, Zachary and Deng, Jia}, + journal={Advances in Neural Information Processing Systems}, + volume={34}, + year={2021} +} + +@article{brunke2021safe, + title={Safe learning in robotics: From learning-based control to safe reinforcement learning}, + author={Brunke, Lukas and Greeff, Melissa and Hall, Adam W and Yuan, Zhaocong and Zhou, Siqi and Panerati, Jacopo and Schoellig, Angela P}, + journal={Annual Review of Control, Robotics, and Autonomous Systems}, + volume={5}, + year={2021}, + publisher={Annual Reviews} +} + + +@InProceedings{pmlr-v144-gama21a, + title = {Graph Neural Networks for Distributed Linear-Quadratic Control}, + author = {Gama, Fernando and Sojoudi, Somayeh}, + booktitle = {Proceedings of the 3rd Conference on Learning for Dynamics and Control}, + pages = {111--124}, + year = {2021}, + editor = {Jadbabaie, Ali and Lygeros, John and Pappas, George J. and A. Parrilo, Pablo and Recht, Benjamin and Tomlin, Claire J. and Zeilinger, Melanie N.}, + volume = {144}, + series = {Proceedings of Machine Learning Research}, + month = {07 -- 08 June}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v144/gama21a/gama21a.pdf}, + url = {https://proceedings.mlr.press/v144/gama21a.html}, + abstract = {The linear-quadratic controller is one of the fundamental problems in control theory. 
The optimal solution is a linear controller that requires access to the state of the entire system at any given time. When considering a network system, this renders the optimal controller a centralized one. The interconnected nature of a network system often demands a distributed controller, where different components of the system are controlled based only on local information. Unlike the classical centralized case, obtaining the optimal distributed controller is usually an intractable problem. Thus, we adopt a graph neural network (GNN) as a parametrization of distributed controllers. GNNs are naturally local and have distributed architectures, making them well suited for learning nonlinear distributed controllers. By casting the linear-quadratic problem as a self-supervised learning problem, we are able to find the best GNN-based distributed controller. We also derive sufficient conditions for the resulting closed-loop system to be stable. We run extensive simulations to study the performance of GNN-based distributed controllers and showcase that they are a computationally efficient parametrization with scalability and transferability capabilities.} +} + + +@InProceedings{pmlr-v144-mehrjou21a, + title = {Neural Lyapunov Redesign}, + author = {Mehrjou, Arash and Ghavamzadeh, Mohammad and Sch\"olkopf, Bernhard}, + booktitle = {Proceedings of the 3rd Conference on Learning for Dynamics and Control}, + pages = {459--470}, + year = {2021}, + editor = {Jadbabaie, Ali and Lygeros, John and Pappas, George J. and A. Parrilo, Pablo and Recht, Benjamin and Tomlin, Claire J. 
and Zeilinger, Melanie N.}, + volume = {144}, + series = {Proceedings of Machine Learning Research}, + month = {07 -- 08 June}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v144/mehrjou21a/mehrjou21a.pdf}, + url = {https://proceedings.mlr.press/v144/mehrjou21a.html}, + abstract = {Learning controllers merely based on a performance metric has been proven effective in many physical and non-physical tasks in both control theory and reinforcement learning. However, in practice, the controller must guarantee some notion of safety to ensure that it does not harm either the agent or the environment. Stability is a crucial notion of safety, whose violation can certainly cause unsafe behaviors. Lyapunov functions are effective tools to assess stability in nonlinear dynamical systems. In this paper, we combine an improving Lyapunov function with automatic controller synthesis in an iterative fashion to obtain control policies with large safe regions. We propose a two-player collaborative algorithm that alternates between estimating a Lyapunov function and deriving a controller that gradually enlarges the stability region of the closed-loop system. We provide theoretical results on the class of systems that can be treated with the proposed algorithm and empirically evaluate the effectiveness of our method using an exemplary dynamical system.} +} + + +@InProceedings{pmlr-v144-zhang21b, + title = {{LEOC}: A Principled Method in Integrating Reinforcement Learning and Classical Control Theory}, + author = {Zhang, Naifu and Capel, Nicholas}, + booktitle = {Proceedings of the 3rd Conference on Learning for Dynamics and Control}, + pages = {689--701}, + year = {2021}, + editor = {Jadbabaie, Ali and Lygeros, John and Pappas, George J. and A. Parrilo, Pablo and Recht, Benjamin and Tomlin, Claire J. 
and Zeilinger, Melanie N.}, + volume = {144}, + series = {Proceedings of Machine Learning Research}, + month = {07 -- 08 June}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v144/zhang21b/zhang21b.pdf}, + url = {https://proceedings.mlr.press/v144/zhang21b.html}, + abstract = {There have been attempts in reinforcement learning to exploit a priori knowledge about the structure of the system. This paper proposes a hybrid reinforcement learning controller which dynamically interpolates a model-based linear controller and an arbitrary differentiable policy. The linear controller is designed based on local linearised model knowledge, and stabilises the system in a neighbourhood about an operating point. The coefficients of interpolation between the two controllers are determined by a scaled distance function measuring the distance between the current state and the operating point. The overall hybrid controller is proven to maintain the stability guarantee around the neighborhood of the operating point and still possess the universal function approximation property of the arbitrary non-linear policy. Learning has been done on both model-based (PILCO) and model-free (DDPG) frameworks. Simulation experiments performed in OpenAI gym demonstrate stability and robustness of the proposed hybrid controller. This paper thus introduces a principled method allowing for the direct importing of control methodology into reinforcement learning.} +} + + +@InProceedings{pmlr-v144-rafailov21a, + title = {Offline Reinforcement Learning from Images with Latent Space Models}, + author = {Rafailov, Rafael and Yu, Tianhe and Rajeswaran, Aravind and Finn, Chelsea}, + booktitle = {Proceedings of the 3rd Conference on Learning for Dynamics and Control}, + pages = {1154--1168}, + year = {2021}, + editor = {Jadbabaie, Ali and Lygeros, John and Pappas, George J. and A. Parrilo, Pablo and Recht, Benjamin and Tomlin, Claire J. 
and Zeilinger, Melanie N.}, + volume = {144}, + series = {Proceedings of Machine Learning Research}, + month = {07 -- 08 June}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v144/rafailov21a/rafailov21a.pdf}, + url = {https://proceedings.mlr.press/v144/rafailov21a.html}, + abstract = {Offline reinforcement learning (RL) refers to the task of learning policies from a static dataset of environment interactions. Offline RL enables extensive utilization and re-use of historical datasets, while also alleviating safety concerns associated with online exploration, thereby expanding the real-world applicability of RL. Most prior work in offline RL has focused on tasks with compact state representations. However, the ability to learn directly from rich observation spaces like images is critical for real-world applications like robotics. In this work, we build on recent advances in model-based algorithms for offline RL, and extend them to high-dimensional visual observation spaces. Model-based offline RL algorithms have achieved state of the art results in state based tasks and are minimax optimal. However, they rely crucially on the ability to quantify uncertainty in the model predictions. This is particularly challenging with image observations. To overcome this challenge, we propose to learn a latent-state dynamics model, and represent the uncertainty in the latent space. Our approach is both tractable in practice and corresponds to maximizing a lower bound of the ELBO in the unknown POMDP. Through experiments on a range of challenging image-based locomotion and robotic manipulation tasks, we find that our algorithm significantly outperforms previous offline model-free RL methods as well as state-of-the-art online visual model-based RL methods. Moreover, we also find that our approach excels on an image-based drawer closing task on a real robot using a pre-existing dataset. 
All results including videos can be found online at \url{https://sites.google.com/view/lompo/}.} +} + +@inproceedings{chen2020transferable, + title={Transferable active grasping and real embodied dataset}, + author={Chen, Xiangyu and Ye, Zelin and Sun, Jiankai and Fan, Yuda and Hu, Fang and Wang, Chenxi and Lu, Cewu}, + booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)}, + pages={3611--3618}, + year={2020}, + organization={IEEE} +} + +@article{sun2021adversarial, + title={Adversarial inverse reinforcement learning with self-attention dynamics model}, + author={Sun, Jiankai and Yu, Lantao and Dong, Pinqian and Lu, Bo and Zhou, Bolei}, + journal={IEEE Robotics and Automation Letters}, + volume={6}, + number={2}, + pages={1880--1886}, + year={2021}, + publisher={IEEE} +} + +@article{huang2018navigationnet, + title={NavigationNet: A large-scale interactive indoor navigation dataset}, + author={Huang, He and Shen, Yujing and Sun, Jiankai and Lu, Cewu}, + journal={arXiv preprint arXiv:1808.08374}, + year={2018} +} + +@inproceedings{xu2019depth, + title={Depth completion from sparse lidar data with depth-normal constraints}, + author={Xu, Yan and Zhu, Xinge and Shi, Jianping and Zhang, Guofeng and Bao, Hujun and Li, Hongsheng}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={2811--2820}, + year={2019} +} + +@inproceedings{zhu2020ssn, + title={Ssn: Shape signature networks for multi-class object detection from point clouds}, + author={Zhu, Xinge and Ma, Yuexin and Wang, Tai and Xu, Yan and Shi, Jianping and Lin, Dahua}, + booktitle={European Conference on Computer Vision}, + pages={581--597}, + year={2020}, + organization={Springer} +} + +@inproceedings{huang2019prior, + title={Prior guided dropout for robust visual localization in dynamic environments}, + author={Huang, Zhaoyang and Xu, Yan and Shi, Jianping and Zhou, Xiaowei and Bao, Hujun and Zhang, Guofeng}, + booktitle={Proceedings of the 
IEEE/CVF International Conference on Computer Vision}, + pages={2791--2800}, + year={2019} +} + +@article{xu2020selfvoxelo, + title={Selfvoxelo: Self-supervised lidar odometry with voxel-based deep neural networks}, + author={Xu, Yan and Huang, Zhaoyang and Lin, Kwan-Yee and Zhu, Xinge and Shi, Jianping and Bao, Hujun and Zhang, Guofeng and Li, Hongsheng}, + journal={arXiv preprint arXiv:2010.09343}, + year={2020} +} + +@article{huang2021life, + title={LIFE: Lighting Invariant Flow Estimation}, + author={Huang, Zhaoyang and Pan, Xiaokun and Xu, Runsen and Xu, Yan and Zhang, Guofeng and Li, Hongsheng and others}, + journal={arXiv preprint arXiv:2104.03097}, + year={2021} +} + +@inproceedings{huang2021vs, + title={VS-Net: Voting with Segmentation for Visual Localization}, + author={Huang, Zhaoyang and Zhou, Han and Li, Yijin and Yang, Bangbang and Xu, Yan and Zhou, Xiaowei and Bao, Hujun and Zhang, Guofeng and Li, Hongsheng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={6101--6111}, + year={2021} +} + +@article{yang2021pdnet, + title={PDNet: Towards Better One-stage Object Detection with Prediction Decoupling}, + author={Yang, Li and Xu, Yan and Wang, Shaoru and Yuan, Chunfeng and Zhang, Ziqi and Li, Bing and Hu, Weiming}, + journal={arXiv preprint arXiv:2104.13876}, + year={2021} +} + +@article{xu2022robust, + title={Robust Self-supervised LiDAR Odometry via Representative Structure Discovery and 3D Inherent Error Modeling}, + author={Xu, Yan and Lin, Junyi and Shi, Jianping and Zhang, Guofeng and Wang, Xiaogang and Li, Hongsheng}, + journal={IEEE Robotics and Automation Letters}, + year={2022}, + publisher={IEEE} +} + +@article{xu2022rnnpose, + title={RNNPose: Recurrent 6-DoF Object Pose Refinement with Robust Correspondence Field Estimation and Pose Optimization}, + author={Xu, Yan and Lin, Junyi and Zhang, Guofeng and Wang, Xiaogang and Li, Hongsheng}, + journal={arXiv preprint arXiv:2203.12870}, + 
year={2022} +} + +@article{Sun2022SelfSupervisedTA, + title={Self-Supervised Traffic Advisors: Distributed, Multi-view Traffic Prediction for Smart Cities}, + author={Jiankai Sun and Shreyas Kousik and David Fridovich-Keil and Mac Schwager}, + journal={arXiv preprint}, + year={2022} +} + +@ARTICLE{9813561, author={Qiu, Jianing and Chen, Lipeng and Gu, Xiao and Lo, Frank P.-W. and Tsai, Ya-Yen and Sun, Jiankai and Liu, Jiaqi and Lo, Benny}, journal={IEEE Robotics and Automation Letters}, title={Egocentric Human Trajectory Forecasting with a Wearable Camera and Multi-Modal Fusion}, year={2022}, volume={}, number={}, pages={1-8}, doi={10.1109/LRA.2022.3188101}} + +@article{MegBA, + title={MegBA: A High-Performance and Distributed Library for Large-Scale Bundle Adjustment}, + author={Ren, Jie and Liang, Wenteng and Yan, Ran and Mai, Luo and Liu, Shiwen and Liu, Xiao}, + journal={European Conference on Computer Vision}, + year={2022} +} + +@inproceedings{li2023behavior, + title={Behavior-1k: A benchmark for embodied ai with 1,000 everyday activities and realistic simulation}, + author={Li, Chengshu and Zhang, Ruohan and Wong, Josiah and Gokmen, Cem and Srivastava, Sanjana and Mart{\'\i}n-Mart{\'\i}n, Roberto and Wang, Chen and Levine, Gabrael and Lingelbach, Michael and Sun, Jiankai and others}, + booktitle={Conference on Robot Learning}, + pages={80--93}, + year={2023}, + organization={PMLR} +} + +@article{wang2023mimicplay, + title={MimicPlay: Long-Horizon Imitation Learning by Watching Human Play}, + author={Wang, Chen and Fan, Linxi and Sun, Jiankai and Zhang, Ruohan and Fei-Fei, Li and Xu, Danfei and Zhu, Yuke and Anandkumar, Anima}, + journal={arXiv preprint arXiv:2302.12422}, + year={2023} +} + diff --git a/en_chapters/references b/en_chapters/references new file mode 120000 index 0000000..543a78f --- /dev/null +++ b/en_chapters/references @@ -0,0 +1 @@ +../references \ No newline at end of file diff --git 
a/en_chapters/static b/en_chapters/static new file mode 120000 index 0000000..1ca9b6a --- /dev/null +++ b/en_chapters/static @@ -0,0 +1 @@ +../static \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5fcac34..634113e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,7 @@ bs4 -sphinx==4.4.0 +# pin setuptools<81 so pkg_resources (used by notedown) remains available +setuptools<81 +# d2lbook pins sphinxcontrib-bibtex<2.0.0, but that version's oset dep +# uses collections.MutableSet which was removed in Python 3.10. +# We override it here (installed after d2lbook to win the conflict). +sphinxcontrib-bibtex>=2.5.0 diff --git a/appendix_machine_learning_introduction/classic_machine_learning.md b/zh_chapters/appendix_machine_learning_introduction/classic_machine_learning.md similarity index 100% rename from appendix_machine_learning_introduction/classic_machine_learning.md rename to zh_chapters/appendix_machine_learning_introduction/classic_machine_learning.md diff --git a/appendix_machine_learning_introduction/gradient_descent.md b/zh_chapters/appendix_machine_learning_introduction/gradient_descent.md similarity index 100% rename from appendix_machine_learning_introduction/gradient_descent.md rename to zh_chapters/appendix_machine_learning_introduction/gradient_descent.md diff --git a/appendix_machine_learning_introduction/index.md b/zh_chapters/appendix_machine_learning_introduction/index.md similarity index 100% rename from appendix_machine_learning_introduction/index.md rename to zh_chapters/appendix_machine_learning_introduction/index.md diff --git a/appendix_machine_learning_introduction/neural_network.md b/zh_chapters/appendix_machine_learning_introduction/neural_network.md similarity index 100% rename from appendix_machine_learning_introduction/neural_network.md rename to zh_chapters/appendix_machine_learning_introduction/neural_network.md diff --git 
a/chapter_accelerator/accelerator_architecture.md b/zh_chapters/chapter_accelerator/accelerator_architecture.md similarity index 100% rename from chapter_accelerator/accelerator_architecture.md rename to zh_chapters/chapter_accelerator/accelerator_architecture.md diff --git a/chapter_accelerator/accelerator_introduction.md b/zh_chapters/chapter_accelerator/accelerator_introduction.md similarity index 100% rename from chapter_accelerator/accelerator_introduction.md rename to zh_chapters/chapter_accelerator/accelerator_introduction.md diff --git a/chapter_accelerator/accelerator_practise.md b/zh_chapters/chapter_accelerator/accelerator_practise.md similarity index 100% rename from chapter_accelerator/accelerator_practise.md rename to zh_chapters/chapter_accelerator/accelerator_practise.md diff --git a/chapter_accelerator/accelerator_programming.md b/zh_chapters/chapter_accelerator/accelerator_programming.md similarity index 100% rename from chapter_accelerator/accelerator_programming.md rename to zh_chapters/chapter_accelerator/accelerator_programming.md diff --git a/chapter_accelerator/index.md b/zh_chapters/chapter_accelerator/index.md similarity index 100% rename from chapter_accelerator/index.md rename to zh_chapters/chapter_accelerator/index.md diff --git a/chapter_accelerator/summary.md b/zh_chapters/chapter_accelerator/summary.md similarity index 100% rename from chapter_accelerator/summary.md rename to zh_chapters/chapter_accelerator/summary.md diff --git a/chapter_backend_and_runtime/compute_schedule_and_execute.md b/zh_chapters/chapter_backend_and_runtime/compute_schedule_and_execute.md similarity index 98% rename from chapter_backend_and_runtime/compute_schedule_and_execute.md rename to zh_chapters/chapter_backend_and_runtime/compute_schedule_and_execute.md index bfa1131..3b3507f 100644 --- a/chapter_backend_and_runtime/compute_schedule_and_execute.md +++ b/zh_chapters/chapter_backend_and_runtime/compute_schedule_and_execute.md @@ -1,231 +1,231 @@ -## 
计算调度与执行 - -经过算子选择与内存分配之后,计算任务可以通过运行时完成计算的调度与在硬件上的执行。根据是否将算子编译为计算图,计算的调度可以分为单算子调度与计算图调度两种方式,例如在MindSpore中分别提供了PyNative模式和Graph模式。而根据硬件提供的能力差异,计算图的执行方式又可以分为逐算子下发执行的交互式执行以及将整个计算图或者部分子图一次性下发到硬件的下沉式执行两种模式。 - -### 单算子调度 - -单算子调度是相对于计算图而言,算法或者模型中包含的算子通过Python语言的运行时被逐个调度执行。例如PyTorch的默认执行方式,TensorFlow的eager模式,以及MindSpore的PyNative模式。以MindSpore为例,如代码所示。 - -```python -import mindspore.nn as nn -from mindspore import context - -class Computation(nn.Cell): - def construct(self, x, y): - m = x * y - n = x - y - print(m) - z = m + n - return z - -compute = Computation() -c = compute(1, 2) -print(c) -``` - -上述脚本将所有的计算逻辑定义在Computation类的construct方法中,由于在脚本开头的context中预先设置了单算子执行模式,construct中的计算将被Python的运行时逐行调用执行,同时可以在代码中的任意位置添加print命令以便打印中间的计算结果。 - -单算子执行的调用链路如 :numref:`single_op_exec`所示,算子在Python侧被触发执行后,会经过机器学习框架初始化,其中需要确定包括算子的精度,输入与输出的类型和大小以及对应的硬件设备等信息,接着框架会为该算子分配计算所需的内存,最后交给具体的硬件计算设备完成计算的执行。 - -![单算子执行](../img/ch05/single_op_exec.PNG) -:width:`800px` -:label:`single_op_exec` - -单算子调度方式的好处在于其灵活性,由于算子直接通过Python运行时调度,一方面可以表达任意复杂的计算逻辑,尤其是在需要复杂控制流以及需要Python原生数据结构支持来实现复杂算法的场景;另一方面单算子调度对于程序正确性的调试非常便利,开发人员可以在代码执行过程中打印任意需要调试的变量;最后一点是通过Python运行时驱动算子的方式,可以在计算中与Python庞大而丰富的生态库协同完成计算任务。 - -### 计算图调度 - -虽然单算子调度具有如上所述的优点,其缺点也很明显。一方面是难于进行计算性能的优化,原因是由于缺乏计算图的全局信息,单算子执行时无法根据上下文完成算子融合,代数化简等优化;另一方面由于缺乏计算的拓扑关系,整个计算只能串行调度执行,即无法通过运行时完成并行计算。例如上述示例代码的计算逻辑可以表达为 :numref:`graph_exec`所示。由该计算图可以看出,其中乘法和减法之间并没有依赖关系,因此这两个计算可以并行执行,而这样的并行执行信息只有将计算表达为计算图后才能完成分析,这也是计算图调度相对于单算子调度的优势之一。 - -![计算图](../img/ch05/graph_exec.png) -:width:`800px` -:label:`graph_exec` - -下面我们开始介绍计算图的调度方式,在一个典型的异构计算环境中,主要存在CPU、GPU以及NPU等多种计算设备,因此一张计算图可以由运行在不同设备上的算子组成为异构计算图。 :numref:`computation_graph`展示了一个典型的由异构硬件共同参与的计算图。 - -![异构硬件计算图](../img/ch05/computation_graph.png) -:width:`800px` -:label:`computation_graph` - -所述计算图由如下几类异构硬件对应的算子组成: - -- **CPU算子**:由C++语言编写实现并在主机上通过CPU执行的算子,CPU计算的性能取决于是否能够充分利用CPU多核心的计算能力。 - -- **GPU算子**:以英伟达GPU芯片为例,通过在主机侧将GPU - Kernel逐个下发到GPU设备上,由GPU芯片执行算子的计算逻辑,由于芯片上具备大量的并行执行单元,可以为高度并行的算法提供强大的加速能力。 - -- 
**NPU算子**:以华为Ascend芯片为例, - Ascend是一个高度集成的SoC芯片,NPU的优势是支持将部分或整个计算图下沉到芯片中完成计算,计算过程中不与Host发生交互,因此具备较高的计算性能。 - -- **Python算子**:在执行模式上与CPU算子类似,都是由主机上的CPU执行计算,区别在于计算逻辑是由Python语言的运行时通过Python解释器解释执行。 - -异构计算图能够被正确表达的首要条件是准确标识算子执行所在的设备,例如异构计算图 :numref:`computation_graph`中所标识的CPU、GPU和Ascend -Kernel,以及被标记为被Python语言运行时执行的Python -Kernel。主流框架均提供了指定算子所在运行设备的能力,以MindSpore为例,一段简单的异构计算代码如下所示。 - -```python -import numpy as np -from mindspore import Tensor -import mindspore.ops.operations as ops -from mindspore.common.api import jit - -# 创建算子并指定执行算子的硬件设备 -add = ops.Add().add_prim_attr('primitive_target', 'CPU') -sub = ops.Sub().add_prim_attr('primitive_target', 'GPU') - -# 指定按照静态计算图模式执行函数 -@jit -def compute(x, y, z): - r = add(x, y) - return sub(r, z) - -# 创建实参 -x = Tensor(np.ones([2, 2]).astype(np.float32)) -y = Tensor(np.ones([2, 2]).astype(np.float32)) -z = Tensor(np.ones([2, 2]).astype(np.float32)) - -# 执行计算 -output = compute(x, y, z) -``` - -上述代码片段完成了x + y - z的计算逻辑,其中Add算子被设置为在CPU上执行,Sub算子被设置为在GPU上执行,从而形成了CPU与GPU协同的异构计算,通过类似的标签机制,可以实现任意复杂的多硬件协同的异构计算表达。 -另外一类较为特殊的异构是Python算子,Python语言的优势在于表达的灵活性和开发效率,以及丰富的周边生态,因此将Python算子引入到计算图中和其他异构硬件的算子协同计算,对计算的灵活性会产生非常大的帮助。与CPU、GPU分别执行在不同设备上的异构不同,Python算子和C++实现的CPU算子都是通过主机侧的CPU核执行,差异在于Python算子是通过统一的计算图进行描述,因此也需要在后端运行时中触发执行。为了在计算图中能够表达Python算子,框架需要提供相应的支持。 - -完成计算图中算子对应设备的标记以后,计算图已经准备好被调度与执行,根据硬件能力的差异,可以将异构计算图的执行分为三种模式,分别是逐算子交互式执行,整图下沉执行与子图下沉执行。交互式执行主要针对CPU和GPU的场景,计算图中的算子按照输入和输出的依赖关系被逐个调度与执行;而整图下沉执行模式主要是针对NPU芯片而言,这类芯片主要的优势是能够将整个神经网络的计算图一次性下发到设备上,无需借助主机的CPU能力而独立完成计算图中所有算子的调度与执行,减少了主机和芯片的交互次数,借助NPU的张量加速能力,提高了计算效率和性能;子图下沉执行模式是前面两种执行模式的结合,由于计算图自身表达的灵活性,对于复杂场景的计算图在NPU芯片上进行整图下沉执行的效率不一定能达到最优,因此可以将对于NPU芯片执行效率低下的部分分离出来,交给CPU或者GPU等执行效率更高的设备处理,而将部分更适合NPU计算的子图下沉到NPU进行计算,这样可以兼顾性能和灵活性两方面。 - -上述异构计算图可以实现两个目的,一个是异构硬件加速,将特定的计算放置到合适的硬件上执行;第二个是实现算子间的并发执行,从计算图上可以看出,kernel_1和kernel_2之间没有依赖关系,kernel_3和kernel_4之间也没有依赖关系,因此这两组CPU和GPU算子在逻辑上可以被框架并发调用,而kernel_5依赖kernel_3和kernel_4的输出作为输入,因此kernel_5需要等待kernel_3和kernel_4执行完成后再被触发执行。 - 
-虽然在计算图上可以充分表达算子间的并发关系,在实际代码中会产生由于并发而引起的一些不预期的副作用场景,例如如下代码所示: - -```python -import mindspore as ms -from mindspore import Parameter, Tensor -import mindspore.ops.operations as ops -from mindspore.common.api import jit - -# 定义全局变量 -x = Parameter(Tensor([1.0], ms.float32), name="x") -y = Tensor([0.2], ms.float32) -z = Tensor([0.3], ms.float32) - -# 指定按照静态计算图模式执行函数 -@jit -def compute(y, z): - ops.Assign()(x, y) - ops.Assign()(x, z) - r = ops.Sub()(x, y) - return r - -compute(y, z) -``` - -上述代码表达了如下计算逻辑: - -```text -x = y -x = z -x = x - y -``` - -这段简单的计算逻辑翻译到计算图上可以表示为 :numref:`side_effect_1`所示。 - -![并发算子执行](../img/ch05/side_effect_1.png) -:width:`800px` -:label:`side_effect_1` - -代码中所示三行计算之间并没有依赖关系,因此这三个算子在计算图的逻辑上可以被并发执行,然而根据代码的语义,显而易见是需要确保程序能够被顺序执行,这里引入的问题被称为副作用,副作用是指修改了在函数外部定义的状态变量的行为。由于副作用的引入而导致了错误并发关系的发生,一种解决方案是在计算图编译阶段通过添加算子间的依赖,将并发执行逻辑转换为顺序执行逻辑,转换后的计算图如 :numref:`side_effect_2`所示。 - -![消除副作用](../img/ch05/side_effect_2.png) -:width:`800px` -:label:`side_effect_2` - - -图中虚线箭头表达了算子之间的依赖关系,添加依赖关系后,算子会按照Assign_1、Assign_2、Sub_1的顺序串行执行,与代码原本的语义保持一致。 - -### 交互式执行 - -如上所述,交互式执行模式下,框架的运行时根据计算图中算子的依赖关系,按照某种执行序(例如广度优先序)逐个将算子下发到硬件上执行。为了助于理解和对比,先引入非异构计算图(计算图中的算子都是在同一类设备上)的执行方式,异构计算图的执行是基于非异构计算图基础之上的。 - -1、非异构计算图的执行方式 - -![非异构计算图](../img/ch05/graph_exec_1.png) -:width:`800px` -:label:`graph_exec_1` - -如 :numref:`graph_exec_1`是一张非异构计算图,计算图上全部Kernel均为GPU算子,执行方式一般分为串行执行和并行执行: - -![串行执行](../img/ch05/graph_exec_2.png) -:width:`800px` -:label:`graph_exec_2` - -![并行执行](../img/ch05/graph_exec_3.png) -:width:`800px` -:label:`graph_exec_3` - -- **串行执行**:将计算图展开为执行序列,按照执行序逐个串行执行,如 :numref:`graph_exec_2`所示。其特点为执行顺序固定,单线程执行,对系统资源要求相对较低。 - -- **并行执行**:将计算图按照算子之间的依赖关系展开,有依赖关系的算子通过输入依赖保证执行顺序,没有依赖关系的算子则可以并行执行,如 :numref:`graph_exec_3`所示,Kernel_1和Kernel_2没有依赖可以并行执行,Kernel_3和Kernel_4没有依赖可以并行执行。其特点为执行顺序不固定,每轮执行的算子顺序大概率不一样,多线程执行,对系统资源要求相对较高。 - -串行执行和并行执行各有优点和缺点,总结对比见 :numref:`serial_vs_parallel`。 - -:串行执行和并行执行之对比 - -| 执行方式 | 串行执行 | 并行执行 | -|--------------|----------|------| -|算子执行顺序 | 固定 | 不固定 | 
-|算子执行线程 |单线程 | 多线程 | -|所需执行资源 | 较低 | 较高 | -:label:`serial_vs_parallel` - -2、异构计算图的执行方式 - -![异构计算图](../img/ch05/graph_exec_4.png) -:width:`800px` -:label:`graph_exec_4` - -如 :numref:`graph_exec_4`是一张异构计算图,其中Kernel_1、Kernel_2、Kernel_5、Kernel_9为CPU算子,Kernel_6为python算子(执行也是在CPU上),Kernel_3和Kernel_4为GPU算子,Kernel_7和Kernel_8为GPU算子。 -一般来说计算图的优化都是基于非异构计算图来实现的,要求计算图中的算子为同一设备上的,方便算子间的融合替换等优化操作,因此需要将一张异构计算图切分为多个非异构计算图,这里切分就比较灵活了,可以定义各种切分规则,一般按照产生尽量少的子图的切分规则来切分,尽量将多的同一设备上的算子放在一张子图中,如 :numref:`graph_exec_5`所示,最后产生5张子图:Graph_1\_CPU、Graph_2\_GPU、Graph_3\_CPU、Graph_4\_Ascend、Graph_5\_CPU。 - -![异构计算图切分](../img/ch05/graph_exec_5.png) -:width:`800px` -:label:`graph_exec_5` - -将一张异构计算图切分为多个子计算图后,执行方式一般分为子图拆分执行和子图合并执行: - -- **子图拆分执行**:将切分后的多个子图分开执行,即一个子图执行完再执行另一个子图,如 :numref:`graph_exec_6`所示,上一个子图的输出数据会传输给下一个子图的输入数据,并且下一个子图需要将输入数据拷贝为本图的device数据,如Graph_2\_GPU需要将Graph_1\_CPU的输出数据从CPU拷贝到GPU,反过来Graph_3\_CPU需要将Graph2GPU的输出数据从GPU拷贝到CPU,子图之间互相切换执行有一定的开销。 - -- **子图合并执行**:将切分后的多个子图进行合并,合并为一个整体的DAG执行,如 :numref:`graph_exec_7`所示,通过算子的设备属性来插入拷贝算子以实现不同设备上的算子数据传输,并且拷贝算子也是进入整图中的,从而形成一个大的整图执行,减少子图之间的切换执行开销。 - -![子图拆分](../img/ch05/graph_exec_6.png) -:width:`800px` -:label:`graph_exec_6` - -![子图合并](../img/ch05/graph_exec_7.png) -:width:`800px` -:label:`graph_exec_7` - -由于子图合并执行能够减少子图之间的切换执行开销,因此一般来说子图合并执行性能较高,总结对比见 :numref:`partitioning_vs_merging`。 - -:子图拆分和子图合并之对比 - -| 执行方式 | 子图拆分 | 子图合并| -| --------------|------------------|--------------| -| 异构数据传输 | 子图之间拷贝 | 算子之间拷贝| -| 执行额外开销 | 子图切换执行开销 | 无| -| 执行并发粒度 | 子图并发 | 算子原生并发| -:label:`partitioning_vs_merging` - - -3、异构计算图的执行加速 - -前面讲述了非异构计算图的两种执行方式和异构计算图的两种执行方式,其中异构计算图又是在非异构计算图的基础之上,因此异构计算图按照两两组合共有四种执行方式,以MindSpore为例,采用的是子图合并并行执行,示例图如 :numref:`graph_exec_5`所示,首先是作为一张整图来执行可以避免子图切换的执行开销,然后在整图内并行执行,可以最大粒度的发挥并发执行优势,达到最优的执行性能。 - -![异构硬件加速](../img/ch05/graph_exec_8.png) -:width:`800px` -:label:`graph_exec_8` - -### 下沉式执行 - 
-下沉式执行是通过专用芯片的SoC架构,将整个或部分计算图一次性调度到芯片上以完成全量数据的计算。例如对于Ascend芯片,多个Ascend算子组成的计算图可以在执行前被编译成为一个Task,通过Ascend驱动程序提供的接口,将包含多个算子的Task一次性下发到硬件上调度执行。因此上例中可以将Ascend的算子Kernel_7和Kernel_8优化为一个子图Graph_4\_Ascend,再将该子图编译成为一个Task,并下沉到Ascend上执行,如 :numref:`graph_exec_8`所示。 - -下沉式执行由于避免了在计算过程中主机侧和设备侧的交互,因此可以获得更好的整体计算性能。然而下沉式执行也存在一些局限,例如在动态shape算子,复杂控制流等场景下会面临较大的技术挑战。 +## 计算调度与执行 + +经过算子选择与内存分配之后,计算任务可以通过运行时完成计算的调度与在硬件上的执行。根据是否将算子编译为计算图,计算的调度可以分为单算子调度与计算图调度两种方式,例如在MindSpore中分别提供了PyNative模式和Graph模式。而根据硬件提供的能力差异,计算图的执行方式又可以分为逐算子下发执行的交互式执行以及将整个计算图或者部分子图一次性下发到硬件的下沉式执行两种模式。 + +### 单算子调度 + +单算子调度是相对于计算图而言,算法或者模型中包含的算子通过Python语言的运行时被逐个调度执行。例如PyTorch的默认执行方式,TensorFlow的eager模式,以及MindSpore的PyNative模式。以MindSpore为例,如代码所示。 + +```python +import mindspore.nn as nn +from mindspore import context + +class Computation(nn.Cell): + def construct(self, x, y): + m = x * y + n = x - y + print(m) + z = m + n + return z + +compute = Computation() +c = compute(1, 2) +print(c) +``` + +上述脚本将所有的计算逻辑定义在Computation类的construct方法中,由于在脚本开头的context中预先设置了单算子执行模式,construct中的计算将被Python的运行时逐行调用执行,同时可以在代码中的任意位置添加print命令以便打印中间的计算结果。 + +单算子执行的调用链路如 :numref:`single_op_exec`所示,算子在Python侧被触发执行后,会经过机器学习框架初始化,其中需要确定包括算子的精度,输入与输出的类型和大小以及对应的硬件设备等信息,接着框架会为该算子分配计算所需的内存,最后交给具体的硬件计算设备完成计算的执行。 + +![单算子执行](../img/ch05/single_op_exec.PNG) +:width:`800px` +:label:`single_op_exec` + +单算子调度方式的好处在于其灵活性,由于算子直接通过Python运行时调度,一方面可以表达任意复杂的计算逻辑,尤其是在需要复杂控制流以及需要Python原生数据结构支持来实现复杂算法的场景;另一方面单算子调度对于程序正确性的调试非常便利,开发人员可以在代码执行过程中打印任意需要调试的变量;最后一点是通过Python运行时驱动算子的方式,可以在计算中与Python庞大而丰富的生态库协同完成计算任务。 + +### 计算图调度 + +虽然单算子调度具有如上所述的优点,其缺点也很明显。一方面是难于进行计算性能的优化,原因是由于缺乏计算图的全局信息,单算子执行时无法根据上下文完成算子融合,代数化简等优化;另一方面由于缺乏计算的拓扑关系,整个计算只能串行调度执行,即无法通过运行时完成并行计算。例如上述示例代码的计算逻辑可以表达为 :numref:`graph_exec`所示。由该计算图可以看出,其中乘法和减法之间并没有依赖关系,因此这两个计算可以并行执行,而这样的并行执行信息只有将计算表达为计算图后才能完成分析,这也是计算图调度相对于单算子调度的优势之一。 + +![计算图](../img/ch05/graph_exec.png) +:width:`800px` +:label:`graph_exec` + +下面我们开始介绍计算图的调度方式,在一个典型的异构计算环境中,主要存在CPU、GPU以及NPU等多种计算设备,因此一张计算图可以由运行在不同设备上的算子组成为异构计算图。 
:numref:`computation_graph`展示了一个典型的由异构硬件共同参与的计算图。 + +![异构硬件计算图](../img/ch05/computation_graph.png) +:width:`800px` +:label:`computation_graph` + +所述计算图由如下几类异构硬件对应的算子组成: + +- **CPU算子**:由C++语言编写实现并在主机上通过CPU执行的算子,CPU计算的性能取决于是否能够充分利用CPU多核心的计算能力。 + +- **GPU算子**:以英伟达GPU芯片为例,通过在主机侧将GPU + Kernel逐个下发到GPU设备上,由GPU芯片执行算子的计算逻辑,由于芯片上具备大量的并行执行单元,可以为高度并行的算法提供强大的加速能力。 + +- **NPU算子**:以华为Ascend芯片为例, + Ascend是一个高度集成的SoC芯片,NPU的优势是支持将部分或整个计算图下沉到芯片中完成计算,计算过程中不与Host发生交互,因此具备较高的计算性能。 + +- **Python算子**:在执行模式上与CPU算子类似,都是由主机上的CPU执行计算,区别在于计算逻辑是由Python语言的运行时通过Python解释器解释执行。 + +异构计算图能够被正确表达的首要条件是准确标识算子执行所在的设备,例如异构计算图 :numref:`computation_graph`中所标识的CPU、GPU和Ascend +Kernel,以及被标记为被Python语言运行时执行的Python +Kernel。主流框架均提供了指定算子所在运行设备的能力,以MindSpore为例,一段简单的异构计算代码如下所示。 + +```python +import numpy as np +from mindspore import Tensor +import mindspore.ops.operations as ops +from mindspore.common.api import jit + +# 创建算子并指定执行算子的硬件设备 +add = ops.Add().add_prim_attr('primitive_target', 'CPU') +sub = ops.Sub().add_prim_attr('primitive_target', 'GPU') + +# 指定按照静态计算图模式执行函数 +@jit +def compute(x, y, z): + r = add(x, y) + return sub(r, z) + +# 创建实参 +x = Tensor(np.ones([2, 2]).astype(np.float32)) +y = Tensor(np.ones([2, 2]).astype(np.float32)) +z = Tensor(np.ones([2, 2]).astype(np.float32)) + +# 执行计算 +output = compute(x, y, z) +``` + +上述代码片段完成了x + y - z的计算逻辑,其中Add算子被设置为在CPU上执行,Sub算子被设置为在GPU上执行,从而形成了CPU与GPU协同的异构计算,通过类似的标签机制,可以实现任意复杂的多硬件协同的异构计算表达。 +另外一类较为特殊的异构是Python算子,Python语言的优势在于表达的灵活性和开发效率,以及丰富的周边生态,因此将Python算子引入到计算图中和其他异构硬件的算子协同计算,对计算的灵活性会产生非常大的帮助。与CPU、GPU分别执行在不同设备上的异构不同,Python算子和C++实现的CPU算子都是通过主机侧的CPU核执行,差异在于Python算子是通过统一的计算图进行描述,因此也需要在后端运行时中触发执行。为了在计算图中能够表达Python算子,框架需要提供相应的支持。 + 
+完成计算图中算子对应设备的标记以后,计算图已经准备好被调度与执行,根据硬件能力的差异,可以将异构计算图的执行分为三种模式,分别是逐算子交互式执行,整图下沉执行与子图下沉执行。交互式执行主要针对CPU和GPU的场景,计算图中的算子按照输入和输出的依赖关系被逐个调度与执行;而整图下沉执行模式主要是针对NPU芯片而言,这类芯片主要的优势是能够将整个神经网络的计算图一次性下发到设备上,无需借助主机的CPU能力而独立完成计算图中所有算子的调度与执行,减少了主机和芯片的交互次数,借助NPU的张量加速能力,提高了计算效率和性能;子图下沉执行模式是前面两种执行模式的结合,由于计算图自身表达的灵活性,对于复杂场景的计算图在NPU芯片上进行整图下沉执行的效率不一定能达到最优,因此可以将对于NPU芯片执行效率低下的部分分离出来,交给CPU或者GPU等执行效率更高的设备处理,而将部分更适合NPU计算的子图下沉到NPU进行计算,这样可以兼顾性能和灵活性两方面。 + +上述异构计算图可以实现两个目的,一个是异构硬件加速,将特定的计算放置到合适的硬件上执行;第二个是实现算子间的并发执行,从计算图上可以看出,kernel_1和kernel_2之间没有依赖关系,kernel_3和kernel_4之间也没有依赖关系,因此这两组CPU和GPU算子在逻辑上可以被框架并发调用,而kernel_5依赖kernel_3和kernel_4的输出作为输入,因此kernel_5需要等待kernel_3和kernel_4执行完成后再被触发执行。 + +虽然在计算图上可以充分表达算子间的并发关系,在实际代码中会产生由于并发而引起的一些不预期的副作用场景,例如如下代码所示: + +```python +import mindspore as ms +from mindspore import Parameter, Tensor +import mindspore.ops.operations as ops +from mindspore.common.api import jit + +# 定义全局变量 +x = Parameter(Tensor([1.0], ms.float32), name="x") +y = Tensor([0.2], ms.float32) +z = Tensor([0.3], ms.float32) + +# 指定按照静态计算图模式执行函数 +@jit +def compute(y, z): + ops.Assign()(x, y) + ops.Assign()(x, z) + r = ops.Sub()(x, y) + return r + +compute(y, z) +``` + +上述代码表达了如下计算逻辑: + +```text +x = y +x = z +x = x - y +``` + +这段简单的计算逻辑翻译到计算图上可以表示为 :numref:`side_effect_1`所示。 + +![并发算子执行](../img/ch05/side_effect_1.png) +:width:`800px` +:label:`side_effect_1` + +代码中所示三行计算之间并没有依赖关系,因此这三个算子在计算图的逻辑上可以被并发执行,然而根据代码的语义,显而易见是需要确保程序能够被顺序执行,这里引入的问题被称为副作用,副作用是指修改了在函数外部定义的状态变量的行为。由于副作用的引入而导致了错误并发关系的发生,一种解决方案是在计算图编译阶段通过添加算子间的依赖,将并发执行逻辑转换为顺序执行逻辑,转换后的计算图如 :numref:`side_effect_2`所示。 + +![消除副作用](../img/ch05/side_effect_2.png) +:width:`800px` +:label:`side_effect_2` + + +图中虚线箭头表达了算子之间的依赖关系,添加依赖关系后,算子会按照Assign_1、Assign_2、Sub_1的顺序串行执行,与代码原本的语义保持一致。 + +### 交互式执行 + +如上所述,交互式执行模式下,框架的运行时根据计算图中算子的依赖关系,按照某种执行序(例如广度优先序)逐个将算子下发到硬件上执行。为了助于理解和对比,先引入非异构计算图(计算图中的算子都是在同一类设备上)的执行方式,异构计算图的执行是基于非异构计算图基础之上的。 + +1、非异构计算图的执行方式 + +![非异构计算图](../img/ch05/graph_exec_1.png) +:width:`800px` +:label:`graph_exec_1` + +如 
:numref:`graph_exec_1`是一张非异构计算图,计算图上全部Kernel均为GPU算子,执行方式一般分为串行执行和并行执行: + +![串行执行](../img/ch05/graph_exec_2.png) +:width:`800px` +:label:`graph_exec_2` + +![并行执行](../img/ch05/graph_exec_3.png) +:width:`800px` +:label:`graph_exec_3` + +- **串行执行**:将计算图展开为执行序列,按照执行序逐个串行执行,如 :numref:`graph_exec_2`所示。其特点为执行顺序固定,单线程执行,对系统资源要求相对较低。 + +- **并行执行**:将计算图按照算子之间的依赖关系展开,有依赖关系的算子通过输入依赖保证执行顺序,没有依赖关系的算子则可以并行执行,如 :numref:`graph_exec_3`所示,Kernel_1和Kernel_2没有依赖可以并行执行,Kernel_3和Kernel_4没有依赖可以并行执行。其特点为执行顺序不固定,每轮执行的算子顺序大概率不一样,多线程执行,对系统资源要求相对较高。 + +串行执行和并行执行各有优点和缺点,总结对比见 :numref:`serial_vs_parallel`。 + +:串行执行和并行执行之对比 + +| 执行方式 | 串行执行 | 并行执行 | +|--------------|----------|------| +|算子执行顺序 | 固定 | 不固定 | +|算子执行线程 |单线程 | 多线程 | +|所需执行资源 | 较低 | 较高 | +:label:`serial_vs_parallel` + +2、异构计算图的执行方式 + +![异构计算图](../img/ch05/graph_exec_4.png) +:width:`800px` +:label:`graph_exec_4` + +如 :numref:`graph_exec_4`是一张异构计算图,其中Kernel_1、Kernel_2、Kernel_5、Kernel_9为CPU算子,Kernel_6为python算子(执行也是在CPU上),Kernel_3和Kernel_4为GPU算子,Kernel_7和Kernel_8为GPU算子。 +一般来说计算图的优化都是基于非异构计算图来实现的,要求计算图中的算子为同一设备上的,方便算子间的融合替换等优化操作,因此需要将一张异构计算图切分为多个非异构计算图,这里切分就比较灵活了,可以定义各种切分规则,一般按照产生尽量少的子图的切分规则来切分,尽量将多的同一设备上的算子放在一张子图中,如 :numref:`graph_exec_5`所示,最后产生5张子图:Graph_1\_CPU、Graph_2\_GPU、Graph_3\_CPU、Graph_4\_Ascend、Graph_5\_CPU。 + +![异构计算图切分](../img/ch05/graph_exec_5.png) +:width:`800px` +:label:`graph_exec_5` + +将一张异构计算图切分为多个子计算图后,执行方式一般分为子图拆分执行和子图合并执行: + +- **子图拆分执行**:将切分后的多个子图分开执行,即一个子图执行完再执行另一个子图,如 :numref:`graph_exec_6`所示,上一个子图的输出数据会传输给下一个子图的输入数据,并且下一个子图需要将输入数据拷贝为本图的device数据,如Graph_2\_GPU需要将Graph_1\_CPU的输出数据从CPU拷贝到GPU,反过来Graph_3\_CPU需要将Graph2GPU的输出数据从GPU拷贝到CPU,子图之间互相切换执行有一定的开销。 + +- **子图合并执行**:将切分后的多个子图进行合并,合并为一个整体的DAG执行,如 :numref:`graph_exec_7`所示,通过算子的设备属性来插入拷贝算子以实现不同设备上的算子数据传输,并且拷贝算子也是进入整图中的,从而形成一个大的整图执行,减少子图之间的切换执行开销。 + +![子图拆分](../img/ch05/graph_exec_6.png) +:width:`800px` +:label:`graph_exec_6` + +![子图合并](../img/ch05/graph_exec_7.png) +:width:`800px` +:label:`graph_exec_7` + +由于子图合并执行能够减少子图之间的切换执行开销,因此一般来说子图合并执行性能较高,总结对比见 
:numref:`partitioning_vs_merging`。 + +:子图拆分和子图合并之对比 + +| 执行方式 | 子图拆分 | 子图合并| +| --------------|------------------|--------------| +| 异构数据传输 | 子图之间拷贝 | 算子之间拷贝| +| 执行额外开销 | 子图切换执行开销 | 无| +| 执行并发粒度 | 子图并发 | 算子原生并发| +:label:`partitioning_vs_merging` + + +3、异构计算图的执行加速 + +前面讲述了非异构计算图的两种执行方式和异构计算图的两种执行方式,其中异构计算图又是在非异构计算图的基础之上,因此异构计算图按照两两组合共有四种执行方式,以MindSpore为例,采用的是子图合并并行执行,示例图如 :numref:`graph_exec_5`所示,首先是作为一张整图来执行可以避免子图切换的执行开销,然后在整图内并行执行,可以最大粒度的发挥并发执行优势,达到最优的执行性能。 + +![异构硬件加速](../img/ch05/graph_exec_8.png) +:width:`800px` +:label:`graph_exec_8` + +### 下沉式执行 + +下沉式执行是通过专用芯片的SoC架构,将整个或部分计算图一次性调度到芯片上以完成全量数据的计算。例如对于Ascend芯片,多个Ascend算子组成的计算图可以在执行前被编译成为一个Task,通过Ascend驱动程序提供的接口,将包含多个算子的Task一次性下发到硬件上调度执行。因此上例中可以将Ascend的算子Kernel_7和Kernel_8优化为一个子图Graph_4\_Ascend,再将该子图编译成为一个Task,并下沉到Ascend上执行,如 :numref:`graph_exec_8`所示。 + +下沉式执行由于避免了在计算过程中主机侧和设备侧的交互,因此可以获得更好的整体计算性能。然而下沉式执行也存在一些局限,例如在动态shape算子,复杂控制流等场景下会面临较大的技术挑战。 diff --git a/chapter_backend_and_runtime/graph_optimizer.md b/zh_chapters/chapter_backend_and_runtime/graph_optimizer.md similarity index 98% rename from chapter_backend_and_runtime/graph_optimizer.md rename to zh_chapters/chapter_backend_and_runtime/graph_optimizer.md index 7ac11da..fb3df76 100644 --- a/chapter_backend_and_runtime/graph_optimizer.md +++ b/zh_chapters/chapter_backend_and_runtime/graph_optimizer.md @@ -1,56 +1,56 @@ -## 计算图优化 - -后端的计算图优化主要是针对硬件的优化,根据优化适用于所有硬件还是只适合特定硬件,可以分为通用硬件优化和特定硬件优化,例如为了适配硬件指令限制而做的子图变换和与特定硬件无关的算子内存IO优化。 - -### 通用硬件优化 - -通用硬件优化主要指与特定硬件类型无关的计算图优化,优化的核心是子图的等价变换:在计算图中尝试匹配特定的子图结构,找到目标子图结构后,通过等价替换方式,将其替换成对硬件更友好的子图结构。 - -以优化内存IO为例。深度学习算子按其对资源的需求可以分为两类: -计算密集型算子,这些算子的时间绝大部分花在计算上,如卷积、全连接等; -访存密集型算子,这些算子的时间绝大部分花在访存上,他们大部分是Element-Wise算子,例如 ReLU、Element-Wise Sum等。 -在典型的深度学习模型中,一般计算密集型和访存密集型算子是相伴出现的,最简单的例子是“Conv + ReLU”。Conv卷积算子是计算密集型,ReLU算子是访存密集型算子,ReLU算子可以直接取Conv算子的计算结果进行计算,因此可以将二者融合成一个算子来进行计算,从而减少内存访问延时和带宽压力,提高执行效率。 - -例如:“Conv + Conv + Sum + ReLU”的融合,从 :numref:`conv_sum_relu`中可以看到融合后的算子减少了两个内存的读和写的操作,优化了Conv的输出和Sum的输出的读和写的操作。 - 
-![Elementwise算子融合](../img/ch05/conv_sum_relu.png) -:width:`800px` -:label:`conv_sum_relu` - -除了上述针对特定算子类型结构的融合优化外,基于自动算子生成技术,还可以实现更灵活、更极致的通用优化。以 MindSpore 的图算融合技术为例,图算融合通过“算子拆解、算子聚合、算子重建”三个主要阶段让计算图中的计算更密集,并进一步减少低效的内存访问。 - -![图算融合](../img/ch05/graph_kernel.png) -:width:`800px` -:label:`graph_kernel` - - :numref:`graph_kernel`中,算子拆解阶段(Expander)将计算图中一些复杂算子(composite -op,图中Op1、Op3、Op4)展开为计算等价的基本算子组合( -图中虚线正方形框包围着的部分);在算子聚合阶段(Aggregation),将计算图中将基本算子(basic -op,如图中Op2)、拆解后的算子(expanded -op)组合融合,形成一个更大范围的算子组合;在算子重建阶段(Reconstruction)中,按照输入tensor到输出tensor的仿射关系将基本算子进行分类:elemwise、 -broadcast、reduce、transform等,并在这基础上归纳出不同的通用计算规则(如 -elemwise + reduce 规则:elemwise + -reduce在满足一定条件后可以高效执行),根据这些计算规则不断地从这个大的算子组合上进行分析、筛选,最终重新构建成新的算子(如图中虚线正方形包围的两个算子 -New Op1 和 New -Op2)。图算融合通过对计算图结构的拆解和聚合,可以实现跨算子边界的联合优化;并在算子重建中,通过通用的计算规则,以必要的访存作为代价,生成对硬件更友好、执行更高效的新算子。 - -### 特定硬件优化 - -特定硬件优化是指该计算图的优化是在特定硬件上才能做的优化,常见的基于硬件的优化包括由于硬件指令的限制而做的优化,特定硬件存储格式导致的优化等。 - -1、硬件指令限制 - -在一些特定的硬件上,IR中计算节点没有直接对应的硬件算子,只能通过子图的变换来达到子图中所有算子在对应的硬件上的存在。例如在MindSpore中,昇腾芯片上的Concat算子,只支持有限的输入个数(63个),因此当前端IR上的输入个数大于限制输入的时候,需要将该计算节点拆分成等价的多个Concat节点,如 :numref:`concat`所示: -当Concat有100个输入时,单个算子只支持最多63个输入,此时会将该计算节点拆分成两个Concat节点,分别为63个输入和37个输入的两个算子。 - -![Concat算子拆分](../img/ch05/concat.png) -:width:`800px` -:label:`concat` - -2、数据排布格式的限制 - -针对不同特点的计算平台和不同的算子,为了追求最好的性能,一般都需要选择不同的数据排布格式(Format),而这些排布格式可能跟框架缺省的排布格式是不一样的。在这种情况下,一般的做法是算子在执行完成后对输出插入一个格式转换操作,把排布格式转换回框架的缺省排布格式,这就引入了额外的内存操作。以 :numref:`transdata`为例,在昇腾平台上Conv算子在输入和输出的内存排布为5HD时是性能最优的,所以可以看到Conv算子输出结果的格式是5HD,然后通过一个转换操作转回了框架缺省的NCHW,紧接着,后面又是一个Conv算子,它需要5HD的输入,所以又做了一个NCHW到5HD的转换。我们很容易看出,虚线框内的两个转换操作互为逆操作,可以相互抵消。通过对计算图的模式匹配,可以将该类型的操作消除。 - -![数据排布格式转换消除](../img/ch05/transdata.png) -:width:`800px` -:label:`transdata` +## 计算图优化 + +后端的计算图优化主要是针对硬件的优化,根据优化适用于所有硬件还是只适合特定硬件,可以分为通用硬件优化和特定硬件优化,例如为了适配硬件指令限制而做的子图变换和与特定硬件无关的算子内存IO优化。 + +### 通用硬件优化 + +通用硬件优化主要指与特定硬件类型无关的计算图优化,优化的核心是子图的等价变换:在计算图中尝试匹配特定的子图结构,找到目标子图结构后,通过等价替换方式,将其替换成对硬件更友好的子图结构。 + +以优化内存IO为例。深度学习算子按其对资源的需求可以分为两类: 
+计算密集型算子,这些算子的时间绝大部分花在计算上,如卷积、全连接等; +访存密集型算子,这些算子的时间绝大部分花在访存上,他们大部分是Element-Wise算子,例如 ReLU、Element-Wise Sum等。 +在典型的深度学习模型中,一般计算密集型和访存密集型算子是相伴出现的,最简单的例子是“Conv + ReLU”。Conv卷积算子是计算密集型,ReLU算子是访存密集型算子,ReLU算子可以直接取Conv算子的计算结果进行计算,因此可以将二者融合成一个算子来进行计算,从而减少内存访问延时和带宽压力,提高执行效率。 + +例如:“Conv + Conv + Sum + ReLU”的融合,从 :numref:`conv_sum_relu`中可以看到融合后的算子减少了两个内存的读和写的操作,优化了Conv的输出和Sum的输出的读和写的操作。 + +![Elementwise算子融合](../img/ch05/conv_sum_relu.png) +:width:`800px` +:label:`conv_sum_relu` + +除了上述针对特定算子类型结构的融合优化外,基于自动算子生成技术,还可以实现更灵活、更极致的通用优化。以 MindSpore 的图算融合技术为例,图算融合通过“算子拆解、算子聚合、算子重建”三个主要阶段让计算图中的计算更密集,并进一步减少低效的内存访问。 + +![图算融合](../img/ch05/graph_kernel.png) +:width:`800px` +:label:`graph_kernel` + + :numref:`graph_kernel`中,算子拆解阶段(Expander)将计算图中一些复杂算子(composite +op,图中Op1、Op3、Op4)展开为计算等价的基本算子组合( +图中虚线正方形框包围着的部分);在算子聚合阶段(Aggregation),将计算图中将基本算子(basic +op,如图中Op2)、拆解后的算子(expanded +op)组合融合,形成一个更大范围的算子组合;在算子重建阶段(Reconstruction)中,按照输入tensor到输出tensor的仿射关系将基本算子进行分类:elemwise、 +broadcast、reduce、transform等,并在这基础上归纳出不同的通用计算规则(如 +elemwise + reduce 规则:elemwise + +reduce在满足一定条件后可以高效执行),根据这些计算规则不断地从这个大的算子组合上进行分析、筛选,最终重新构建成新的算子(如图中虚线正方形包围的两个算子 +New Op1 和 New +Op2)。图算融合通过对计算图结构的拆解和聚合,可以实现跨算子边界的联合优化;并在算子重建中,通过通用的计算规则,以必要的访存作为代价,生成对硬件更友好、执行更高效的新算子。 + +### 特定硬件优化 + +特定硬件优化是指该计算图的优化是在特定硬件上才能做的优化,常见的基于硬件的优化包括由于硬件指令的限制而做的优化,特定硬件存储格式导致的优化等。 + +1、硬件指令限制 + +在一些特定的硬件上,IR中计算节点没有直接对应的硬件算子,只能通过子图的变换来达到子图中所有算子在对应的硬件上的存在。例如在MindSpore中,昇腾芯片上的Concat算子,只支持有限的输入个数(63个),因此当前端IR上的输入个数大于限制输入的时候,需要将该计算节点拆分成等价的多个Concat节点,如 :numref:`concat`所示: +当Concat有100个输入时,单个算子只支持最多63个输入,此时会将该计算节点拆分成两个Concat节点,分别为63个输入和37个输入的两个算子。 + +![Concat算子拆分](../img/ch05/concat.png) +:width:`800px` +:label:`concat` + +2、数据排布格式的限制 + +针对不同特点的计算平台和不同的算子,为了追求最好的性能,一般都需要选择不同的数据排布格式(Format),而这些排布格式可能跟框架缺省的排布格式是不一样的。在这种情况下,一般的做法是算子在执行完成后对输出插入一个格式转换操作,把排布格式转换回框架的缺省排布格式,这就引入了额外的内存操作。以 
:numref:`transdata`为例,在昇腾平台上Conv算子在输入和输出的内存排布为5HD时是性能最优的,所以可以看到Conv算子输出结果的格式是5HD,然后通过一个转换操作转回了框架缺省的NCHW,紧接着,后面又是一个Conv算子,它需要5HD的输入,所以又做了一个NCHW到5HD的转换。我们很容易看出,虚线框内的两个转换操作互为逆操作,可以相互抵消。通过对计算图的模式匹配,可以将该类型的操作消除。 + +![数据排布格式转换消除](../img/ch05/transdata.png) +:width:`800px` +:label:`transdata` diff --git a/chapter_backend_and_runtime/index.md b/zh_chapters/chapter_backend_and_runtime/index.md similarity index 97% rename from chapter_backend_and_runtime/index.md rename to zh_chapters/chapter_backend_and_runtime/index.md index e52d5a8..e995880 100644 --- a/chapter_backend_and_runtime/index.md +++ b/zh_chapters/chapter_backend_and_runtime/index.md @@ -1,31 +1,31 @@ -# 编译器后端和运行时 - -在上一章节,详细讲述了一个AI编译器前端的主要功能,重点介绍了中间表示以及自动微分。在得到中间表示后,如何充分利用硬件资源高效地执行,是编译器后端和运行时要解决的问题。 - -在本章节中, 将会介绍AI编译器后端的一些基本概念,详细描述后端的计算图优化、算子选择等流程。通过对编译器前端提供的中间表示进行优化,充分发挥硬件能力,从而提高程序的执行效率。在此基础上,介绍运行时是如何对计算任务进行内存分配以及高效地调度执行。 - -本章的学习目标包括: - -- 了解编译器后端和运行时的作用 - -- 掌握计算图优化的常用方法 - -- 掌握算子选择的常用方法 - -- 掌握内存分配的常用方法 - -- 掌握计算图调度和执行的常用方法 - -- 了解目前算子编译器的基本特点以及其尚未收敛的几个问题 - -```toc -:maxdepth: 2 - -overview -graph_optimizer -kernel_selecter -memory_allocator -compute_schedule_and_execute -op_compiler -summary +# 编译器后端和运行时 + +在上一章节,详细讲述了一个AI编译器前端的主要功能,重点介绍了中间表示以及自动微分。在得到中间表示后,如何充分利用硬件资源高效地执行,是编译器后端和运行时要解决的问题。 + +在本章节中, 将会介绍AI编译器后端的一些基本概念,详细描述后端的计算图优化、算子选择等流程。通过对编译器前端提供的中间表示进行优化,充分发挥硬件能力,从而提高程序的执行效率。在此基础上,介绍运行时是如何对计算任务进行内存分配以及高效地调度执行。 + +本章的学习目标包括: + +- 了解编译器后端和运行时的作用 + +- 掌握计算图优化的常用方法 + +- 掌握算子选择的常用方法 + +- 掌握内存分配的常用方法 + +- 掌握计算图调度和执行的常用方法 + +- 了解目前算子编译器的基本特点以及其尚未收敛的几个问题 + +```toc +:maxdepth: 2 + +overview +graph_optimizer +kernel_selecter +memory_allocator +compute_schedule_and_execute +op_compiler +summary ``` \ No newline at end of file diff --git a/chapter_backend_and_runtime/kernel_selecter.md b/zh_chapters/chapter_backend_and_runtime/kernel_selecter.md similarity index 98% rename from chapter_backend_and_runtime/kernel_selecter.md rename to zh_chapters/chapter_backend_and_runtime/kernel_selecter.md index 
869f607..8793d10 100644 --- a/chapter_backend_and_runtime/kernel_selecter.md +++ b/zh_chapters/chapter_backend_and_runtime/kernel_selecter.md @@ -1,84 +1,84 @@ -## 算子选择 - -过计算图优化后,需要对IR图上的每个节点进行算子选择,才能生成真正在设备上执行的算子序列。由于IR图上的节点可能有后端的很多算子与其对应,不同规格的算子在不同的情况下执行效率各不相同,在算子选择阶段的主要任务就是如何根据IR图中的信息在众多算子中选择出最合适的一个算子去目标设备上执行。 - -### 算子选择的基础概念 - -经历了后端的图优化后,IR图中的每一个节点都有一组算子与之对应。此时的IR图中的每一个节点可以认为是用户可见的最小硬件执行单元,代表了用户代码的一个操作,对于这个操作还没有具体生成有关设备信息的细节描述。这些信息是算子选择所选择的内容信息,称之为算子信息。算子信息主要包括以下内容: - -1. 针对不同特点的计算平台和不同的算子,为了追求最好的性能,一般都需要选择不同的数据排布格式。机器学习系统常见的数据排布格式有NCHW和NHWC等。 - -2. 对于不同的硬件支持不同的计算精度,例如float32、float16和int32等。算子选择需要在所支持各种数据类型的算子中选择出用户所设定的数据类型最为相符的算子。 - -**数据排布格式** - -机器学习系统中很多运算都会转换成为矩阵的乘法,例如卷积运算。我们知道矩阵乘法$A\times B = C$ -是以A的一行乘以B的一列求和后得到C的一个元素。以 :numref:`matmuldatalayout`为例,在 :numref:`matmuldatalayout`的上方,矩阵数据的存储是按照行优先来进行存储,虽然B在存储时是按照行存储,但是读取数据时却按照列进行读取,假如我们能把B的格式进行转换转换为列存储,例如 :numref:`matmuldatalayout`下方所示,这样就可以通过访问连续内存的方式加快数据访问速度进而提升运算速度。由此可见不同的数据排布方式对性能有很大影响。 - -![矩阵乘法数据排布示意图](../img/ch05/matmuldatalayout.png) -:width:`800px` -:label:`matmuldatalayout` - -在机器学习系统中常见的数据格式一般有两种,分别为NCHW类型和NHWC类型。其中N代表了数据输入的批大小,C代表了图像的通道,H和W分别代表图像输入的高和宽。:numref:`data_format`展示了BatchSize为2,通道数16和大小为5\*4的数据逻辑示意图。 - -![常见数据格式](../img/ch05/data_format.png) -:width:`800px` -:label:`data_format` - -但是计算机的存储并不能够直接将这样的矩阵放到内存中,需要将其展平成1维后存储,这样就涉及逻辑上的索引如何映射成为内存中的索引,即如何根据逻辑数据索引来映射到内存中的1维数据索引。 - -对于NCHW的数据是先取W轴方向数据,再取H轴方向数据,再取C轴方向,最后取N轴方向。其中物理存储与逻辑存储的之间的映射关系为 -$$offsetnchw(n,c,h,w) = n*CHW + c*HW + h*W +w$$ -如 :numref:`nchw`所示,这种格式中,是按照最低维度W轴方向进行展开,W轴相邻的元素在内存排布中同样是相邻的。如果需要取下一个图片上的相同位置的元素,就必须跳过整个图像的尺寸($C*H*W$)。比如有8张32\*32的RGB图像,此时$N=8,C=3,H=32,W=32$。在内存中存储它们需要先按照W轴方向进行展开,然后按照H轴排列,这样之后便完成了一个通道的处理,之后按照同样的方式处理下一个通道。处理完全部通道后,处理下一张图片。PyTorch和MindSpore框架默认使用NCHW格式。 - -![RGB图片下的NHWC数据格式](../img/ch05/nchw.png) -:width:`800px` -:label:`nchw` - -类似的NHWC数据格式是先取C方向数据,再取W方向,然后是H方向,最后取N方向。NHWC是Tensorflow默认的数据格式。这种格式在PyTorch中称为Channel-Last。 -$$offsetnhwc(n,h,w,c) = n*HWC + h*WC + w*C +c$$ - 
:numref:`nchwandnhwc`展示了不同数据格式下逻辑排布到内存物理侧数据排布的映射。\[x:1\]代表从最内侧维度到最下一维度的索引变换。比如\[a:1\]表示当前行W轴结束后,下一个H轴排布。\[b:1\]表示最内侧C轴排布完成后进行按照W轴进行排列。 - -![NCHW与NHWC数据存储格式](../img/ch05/nchwandnhwc.png) -:width:`800px` -:label:`nchwandnhwc` - -上述的数据存储格式具有很大的灵活性,很多框架都采用上述的两种格式作为默认的数据排布格式。但是在硬件上对数据操作时,此时的数据排布可能还不是最优的。在机器学习系统中,用户输入的数据往往会远远大于计算部件一次性计算所能容纳的最大范围,所以此时必须将输入的数据进行切片分批送到运算部件中进行运算。为了加速运算很多框架又引入了一些块布局格式来进行进一步的优化,这种优化可以使用一些硬件的加速指令,对数据进行搬移和运算。比如oneDNN上的nChw16c -和nChw8c -格式,以及Ascend芯片的5HD等格式。这种特殊的数据格式与硬件更为贴合,可以快速的将矩阵向量化,并且极大的利用片内缓存。 - -**数据精度** - -通常深度学习的系统,使用的是单精度(float32)表示。这种数据类型占用32位内存。还有一种精度较低的数据类型为半精度(float16),其内部占用了16位的内存。由于很多硬件会对半精度数据类型进行优化,半精度的计算吞吐量可以是单精度的$2\sim 8$倍,且半精度占用的内存更小,这样可以输入更大的批大小(BatchSize),进而减少总体训练时间。接下来详细看一下半精度浮点数与精度浮点数的区别。 - -![浮点数的二进制表示](../img/ch05/floatdtype.png) -:width:`800px` -:label:`floatdtype` - -如 :numref:`floatdtype`中Sig代表符号位,占1位,表示了机器数的正负,Exponent表示指数位,Mantissa为尾数位。其中float16类型的数据采用二进制的科学计数法转换为十进制的计算方式如式$$(-1)^{sign}\times 2^{exponent-15}\times (\frac{mantissa}{1024}+1)$$所示。 -其中如果指数位全为0时,且尾数位全为0时表示数字0。 -如果指数位全为0,尾数位不全为0则表示一个非常小的数值。 -当指数全为1,尾数位全为0表示根据符号位正无穷大,或者负无穷大。 -若指数全为1,但是尾数位不为0,则表示NAN。 -其中bfloat16并不属于一个通用的数据类型,是Google提出的一种特殊的类型,现在一般只在一些TPU上训练使用,其指数位数与float32位数保持一致,可以较快的与float32进行数据转换。由于bfloat16并不是一种通用类型,IEEE中也并没有提出该类型的标准。 - -**算子信息库** - -前面讲述了数据格式和数据精度的概念,基于这两个概念,在不同硬件下会有不同的算子支持,一个硬件上支持的所有算子的集合定义为该硬件的算子信息库。算子选择过程就是从算子信息库中选择最合适算子的过程。 - -### 算子选择的过程 - -前文介绍了算子选择主要是针对IR图中的每一个操作节点选择出最为合适的算子。其中算子信息主要包括了支持设备类型、数据类型和数据排布格式三个方面。经过编译器前端类型推导与静态分析的阶段后,IR图中已经推导出了用户代码侧的数据类型。下面介绍算子选择的基本过程。 - -如图 :numref:`select_kernel`所示,展示了算子选择过程。首先,选择算子执行的硬件设备。不同的硬件设备上,算子的实现、支持数据类型、执行效率通常会有所差别。这一步往往是用户自己指定的,若用户未指定,则编译器后端会为用户匹配一个默认的设备。 -然后,后端会根据IR图中推导出的数据类型和内存排布格式选择对应的算子。 - -![算子选择过程](../img/ch05/select_kernel.png) -:width:`800px` -:label:`select_kernel` - -理想情况下算子选择所选择出的算子类型,应该与用户预期的类型保持一致。但是由于软硬件的限制,很可能算子的数据类型不能满足用户所期待的数据类型,此时需要对该节点进行升精度或者降精度处理才能匹配到合适的算子。比如在MindSpore 
的Ascend后端由于硬件限制导致Conv2D算子只存在float16一种数据类型。如果用户设置的整网使用的数据类型为float32数据,那么只能对Conv2D算子的输入数据进行降精度处理,即将输入数据类型从float32转换成float16。 - -算子的数据排布格式转换是一个比较耗时的操作,为了避免频繁的格式转换所带来的内存搬运开销,数据应该尽可能地以同样的格式在算子之间传递,算子和算子的衔接要尽可能少的出现数据排布格式不一致的现象。另外,数据类型不同导致的降精度可能会使得误差变大,收敛速度变慢甚至不收敛,所以数据类型的选择也要结合具体算子分析。 - -总的来说,一个好的算子选择算法应该尽可能的保持数据类型与用户设置的数据类型一致,且尽可能少的出现数据格式转换。 +## 算子选择 + +过计算图优化后,需要对IR图上的每个节点进行算子选择,才能生成真正在设备上执行的算子序列。由于IR图上的节点可能有后端的很多算子与其对应,不同规格的算子在不同的情况下执行效率各不相同,在算子选择阶段的主要任务就是如何根据IR图中的信息在众多算子中选择出最合适的一个算子去目标设备上执行。 + +### 算子选择的基础概念 + +经历了后端的图优化后,IR图中的每一个节点都有一组算子与之对应。此时的IR图中的每一个节点可以认为是用户可见的最小硬件执行单元,代表了用户代码的一个操作,对于这个操作还没有具体生成有关设备信息的细节描述。这些信息是算子选择所选择的内容信息,称之为算子信息。算子信息主要包括以下内容: + +1. 针对不同特点的计算平台和不同的算子,为了追求最好的性能,一般都需要选择不同的数据排布格式。机器学习系统常见的数据排布格式有NCHW和NHWC等。 + +2. 对于不同的硬件支持不同的计算精度,例如float32、float16和int32等。算子选择需要在所支持各种数据类型的算子中选择出用户所设定的数据类型最为相符的算子。 + +**数据排布格式** + +机器学习系统中很多运算都会转换成为矩阵的乘法,例如卷积运算。我们知道矩阵乘法$A\times B = C$ +是以A的一行乘以B的一列求和后得到C的一个元素。以 :numref:`matmuldatalayout`为例,在 :numref:`matmuldatalayout`的上方,矩阵数据的存储是按照行优先来进行存储,虽然B在存储时是按照行存储,但是读取数据时却按照列进行读取,假如我们能把B的格式进行转换转换为列存储,例如 :numref:`matmuldatalayout`下方所示,这样就可以通过访问连续内存的方式加快数据访问速度进而提升运算速度。由此可见不同的数据排布方式对性能有很大影响。 + +![矩阵乘法数据排布示意图](../img/ch05/matmuldatalayout.png) +:width:`800px` +:label:`matmuldatalayout` + +在机器学习系统中常见的数据格式一般有两种,分别为NCHW类型和NHWC类型。其中N代表了数据输入的批大小,C代表了图像的通道,H和W分别代表图像输入的高和宽。:numref:`data_format`展示了BatchSize为2,通道数16和大小为5\*4的数据逻辑示意图。 + +![常见数据格式](../img/ch05/data_format.png) +:width:`800px` +:label:`data_format` + +但是计算机的存储并不能够直接将这样的矩阵放到内存中,需要将其展平成1维后存储,这样就涉及逻辑上的索引如何映射成为内存中的索引,即如何根据逻辑数据索引来映射到内存中的1维数据索引。 + +对于NCHW的数据是先取W轴方向数据,再取H轴方向数据,再取C轴方向,最后取N轴方向。其中物理存储与逻辑存储的之间的映射关系为 +$$offsetnchw(n,c,h,w) = n*CHW + c*HW + h*W +w$$ +如 :numref:`nchw`所示,这种格式中,是按照最低维度W轴方向进行展开,W轴相邻的元素在内存排布中同样是相邻的。如果需要取下一个图片上的相同位置的元素,就必须跳过整个图像的尺寸($C*H*W$)。比如有8张32\*32的RGB图像,此时$N=8,C=3,H=32,W=32$。在内存中存储它们需要先按照W轴方向进行展开,然后按照H轴排列,这样之后便完成了一个通道的处理,之后按照同样的方式处理下一个通道。处理完全部通道后,处理下一张图片。PyTorch和MindSpore框架默认使用NCHW格式。 + +![RGB图片下的NHWC数据格式](../img/ch05/nchw.png) +:width:`800px` +:label:`nchw` 
+ +类似的NHWC数据格式是先取C方向数据,再取W方向,然后是H方向,最后取N方向。NHWC是Tensorflow默认的数据格式。这种格式在PyTorch中称为Channel-Last。 +$$offsetnhwc(n,h,w,c) = n*HWC + h*WC + w*C +c$$ + :numref:`nchwandnhwc`展示了不同数据格式下逻辑排布到内存物理侧数据排布的映射。\[x:1\]代表从最内侧维度到最下一维度的索引变换。比如\[a:1\]表示当前行W轴结束后,下一个H轴排布。\[b:1\]表示最内侧C轴排布完成后进行按照W轴进行排列。 + +![NCHW与NHWC数据存储格式](../img/ch05/nchwandnhwc.png) +:width:`800px` +:label:`nchwandnhwc` + +上述的数据存储格式具有很大的灵活性,很多框架都采用上述的两种格式作为默认的数据排布格式。但是在硬件上对数据操作时,此时的数据排布可能还不是最优的。在机器学习系统中,用户输入的数据往往会远远大于计算部件一次性计算所能容纳的最大范围,所以此时必须将输入的数据进行切片分批送到运算部件中进行运算。为了加速运算很多框架又引入了一些块布局格式来进行进一步的优化,这种优化可以使用一些硬件的加速指令,对数据进行搬移和运算。比如oneDNN上的nChw16c +和nChw8c +格式,以及Ascend芯片的5HD等格式。这种特殊的数据格式与硬件更为贴合,可以快速的将矩阵向量化,并且极大的利用片内缓存。 + +**数据精度** + +通常深度学习的系统,使用的是单精度(float32)表示。这种数据类型占用32位内存。还有一种精度较低的数据类型为半精度(float16),其内部占用了16位的内存。由于很多硬件会对半精度数据类型进行优化,半精度的计算吞吐量可以是单精度的$2\sim 8$倍,且半精度占用的内存更小,这样可以输入更大的批大小(BatchSize),进而减少总体训练时间。接下来详细看一下半精度浮点数与精度浮点数的区别。 + +![浮点数的二进制表示](../img/ch05/floatdtype.png) +:width:`800px` +:label:`floatdtype` + +如 :numref:`floatdtype`中Sig代表符号位,占1位,表示了机器数的正负,Exponent表示指数位,Mantissa为尾数位。其中float16类型的数据采用二进制的科学计数法转换为十进制的计算方式如式$$(-1)^{sign}\times 2^{exponent-15}\times (\frac{mantissa}{1024}+1)$$所示。 +其中如果指数位全为0时,且尾数位全为0时表示数字0。 +如果指数位全为0,尾数位不全为0则表示一个非常小的数值。 +当指数全为1,尾数位全为0表示根据符号位正无穷大,或者负无穷大。 +若指数全为1,但是尾数位不为0,则表示NAN。 +其中bfloat16并不属于一个通用的数据类型,是Google提出的一种特殊的类型,现在一般只在一些TPU上训练使用,其指数位数与float32位数保持一致,可以较快的与float32进行数据转换。由于bfloat16并不是一种通用类型,IEEE中也并没有提出该类型的标准。 + +**算子信息库** + +前面讲述了数据格式和数据精度的概念,基于这两个概念,在不同硬件下会有不同的算子支持,一个硬件上支持的所有算子的集合定义为该硬件的算子信息库。算子选择过程就是从算子信息库中选择最合适算子的过程。 + +### 算子选择的过程 + +前文介绍了算子选择主要是针对IR图中的每一个操作节点选择出最为合适的算子。其中算子信息主要包括了支持设备类型、数据类型和数据排布格式三个方面。经过编译器前端类型推导与静态分析的阶段后,IR图中已经推导出了用户代码侧的数据类型。下面介绍算子选择的基本过程。 + +如图 :numref:`select_kernel`所示,展示了算子选择过程。首先,选择算子执行的硬件设备。不同的硬件设备上,算子的实现、支持数据类型、执行效率通常会有所差别。这一步往往是用户自己指定的,若用户未指定,则编译器后端会为用户匹配一个默认的设备。 +然后,后端会根据IR图中推导出的数据类型和内存排布格式选择对应的算子。 + +![算子选择过程](../img/ch05/select_kernel.png) +:width:`800px` +:label:`select_kernel` + 
+理想情况下算子选择所选择出的算子类型,应该与用户预期的类型保持一致。但是由于软硬件的限制,很可能算子的数据类型不能满足用户所期待的数据类型,此时需要对该节点进行升精度或者降精度处理才能匹配到合适的算子。比如在MindSpore 的Ascend后端由于硬件限制导致Conv2D算子只存在float16一种数据类型。如果用户设置的整网使用的数据类型为float32数据,那么只能对Conv2D算子的输入数据进行降精度处理,即将输入数据类型从float32转换成float16。 + +算子的数据排布格式转换是一个比较耗时的操作,为了避免频繁的格式转换所带来的内存搬运开销,数据应该尽可能地以同样的格式在算子之间传递,算子和算子的衔接要尽可能少的出现数据排布格式不一致的现象。另外,数据类型不同导致的降精度可能会使得误差变大,收敛速度变慢甚至不收敛,所以数据类型的选择也要结合具体算子分析。 + +总的来说,一个好的算子选择算法应该尽可能的保持数据类型与用户设置的数据类型一致,且尽可能少的出现数据格式转换。 diff --git a/chapter_backend_and_runtime/memory_allocator.md b/zh_chapters/chapter_backend_and_runtime/memory_allocator.md similarity index 99% rename from chapter_backend_and_runtime/memory_allocator.md rename to zh_chapters/chapter_backend_and_runtime/memory_allocator.md index 59da24d..cab2825 100644 --- a/chapter_backend_and_runtime/memory_allocator.md +++ b/zh_chapters/chapter_backend_and_runtime/memory_allocator.md @@ -1,68 +1,68 @@ -## 内存分配 -:label:`ch05-sec-memory_pool` - -内存在传统计算机存储器层次结构中有着重要的地位,它是连接高速缓存和磁盘之间的桥梁,有着比高速缓存更大的空间,比磁盘更快的访问速度。随着深度学习的发展,深度神经网络的模型越来越复杂,AI芯片上的内存很可能无法容纳一个大型网络模型。因此,对内存进行复用是一个重要的优化手段。此外,通过连续内存分配和 In-Place内存分配还可以提高某些算子的执行效率。 - -### Device内存概念 - -在深度学习体系结构中,通常将与硬件加速器(如GPU、AI芯片等)相邻的内存称之为设备(Device)内存,而与CPU相邻的内存称之为主机(Host)内存。如 :numref:`host-device-memory`所示,CPU可以合法地访问主机上的内存,而无法直接访问设备上的内存;同理,AI芯片可以访问设备上的内存,却无法访问主机上的内存。因此,在网络训练过程中,往往需要从磁盘加载数据到主机内存中,然后在主机内存中做数据处理,再从主机内存拷贝到设备内存中,最后设备才能合法地访问数据。算子全部计算完成后,用户要获取训练结果,又需要把数据从设备内存拷贝到主机内存中。 - -![主机内存和设备内存](../img/ch05/host-device-memory.png) -:width:`800px` -:label:`host-device-memory` - -### 内存分配 {#内存分配-1} - -内存分配模块主要负责给图中算子的输入、输出分配Device内存。用户的前端脚本经过编译器前端处理后得到中间表达,后端根据中间表达进行算子选择和相关优化,可以得到算子最终的输入输出张量的形状、数据类型(Data Type)、格式(Format)等信息,根据这些信息可以计算出算子输入、输出张量的尺寸大小。基本的计算方法如式$$size=\left (\prod_{i=0}^{dimension}shape_i\right ) * sizeof\left ( data type \right )$$所示。得到张量的尺寸大小后,往往还需要对内存大小进行对齐操作。内存通常以4字节、8字节或16字节为一组进行访问,如果被搬运的内存大小不是这些值的倍数,内存后面会填充相应数量的空数据以使得内存长度达到这些值的倍数。因此,访问非对齐的内存可能会更加耗时。 - -![内存分配示例](../img/ch05/memory_allocate.png) -:width:`800px` 
-:label:`memory_allocate` - -下面以 :numref:`memory_allocate`为例介绍内存分配的大致流程。首先给输入张量、Conv2D的权重和Conv2D的输出分配内存地址。然后为BatchNorm的输入分配地址时,发现BatchNorm的输入就是Conv2D算子的输出,而该张量的地址已经在之前分配过了,因此只需要将Conv2D算子的输出地址共享给BatchNorm的输入,就可以避免内存的重复申请以及内存的冗余拷贝。以此类推,可以发现整个过程中可以将待分配的内存分成三种类型:一是整张图的输入张量,二是算子的权重或者属性,三是算子的输出张量,三种类型在训练过程中的生命周期有所不同。 - -在CPU上常常使用malloc函数直接申请内存,这种方式申请内存好处是随时申请随时释放,简单易用。然而在许多对性能要求严苛的计算场景中,由于所申请内存块的大小不定,频繁申请释放会降低性能。通常会使用内存池的方式去管理内存,先申请一定数量的内存块留作备用,当程序有内存申请需求时,直接从内存池中的内存块中申请。当程序释放该内存块时,内存池会进行回收并用作后续程序内存申请时使用。 -在深度学习框架中,设备内存的申请也是非常频繁的,往往也是通过内存池的方式去管理设备内存,并让设备内存的生命周期与张量的生命周期保持一致。不同的深度学习框架在内存池的设计上大同小异,以:numref:`device_malloc`的MindSpore框架内存申请为例,进程会从设备上申请足够大的内存,然后通过双游标从两端偏移为张量分配内存。首先从申请的首地址开始进行偏移,为算子权重的张量分配内存,这部分张量生命周期较长,往往持续整个训练过程。然后从申请设备地址的末尾开始偏移,为算子的输出张量分配内存,这部分内存的生命周期较短,往往在该算子计算结束并且后续计算过程中无需再次使用该算子的输出的情况下,其生命周期就可以结束。通过这种方式,只需要从设备上申请一次足够大的内存,后续算子的内存分配都是通过指针偏移进行分配,减少了直接从设备申请内存的耗时。 - - -![双游标法分配内存](../img/ch05/device_malloc.png) -:width:`800px` -:label:`device_malloc` - -### 内存复用 - -在机器学习系统中,内存复用是指分析张量的生命周期,将生命周期结束的张量的设备内存释放回内存池并用于后续张量的内存分配。内存复用的目的是提高内存的利用率,让有限的设备内存容纳更大的模型。 -以 :numref:`memory_allocate`为例,当BatchNorm算子计算结束后,输出1不再被任何算子使用,则该张量的设备内存可以被回收,并且如果输出1的内存尺寸大于等于输出3的内存尺寸,则从输出1回收的地址可以用于输出3的内存分配,从而达到复用输出1地址的目的。 - -![内存生命周期图](../img/ch05/combine_memory_reuse_and_no_reuse.png) -:width:`800px` -:label:`combine_memory_reuse_and_no_reuse` - -为了更好地描述内存复用问题,通过内存生命周期图来辅助理解。如 :numref:`combine_memory_reuse_and_no_reuse`所示,图中横坐标表示张量的生命周期,图中纵坐标表示内存大小。在生命周期内,某一个张量将一直占用某块设备内存,直至生命周期结束才会释放相应内存块。通过张量生命周期和内存大小可以构造出矩形块,而内存分配要求解的目标是在内存生命周期图中容纳更多的矩形块,问题的约束是矩形块之间无碰撞。 :numref:`combine_memory_reuse_and_no_reuse`左边是在未使用任何内存复用策略的情况下的内存生命周期图,此时内存同时只能容纳T0、T1、T2、T3四个张量。 - -内存复用策略的求解是一个NP完全的问题。许多深度学习框架通常采用贪心的策略去分配内存,例如采用BestFit算法,每次直接从内存池中选取可以满足条件的最小内存块,然而这种贪心的策略往往会陷入局部最优解,而无法求得全局最优解。为了更好地逼近内存分配策略全局最优解,MindSpore框架提出了一种新的内存分配算法 -SOMAS(Safe Optimized Memory Allocation Solver,安全优化的内存分配求解器)。SOMAS将计算图并行流与数据依赖进行聚合分析,得到算子间祖先关系,构建张量全局生命周期互斥约束,使用多种启发式算法求解最优的内存静态规划,实现逼近理论极限的内存复用,从而提升支持的内存大小。 - -由 
:numref:`combine_memory_reuse_and_no_reuse`右边所示,经过SOMAS求解之后,同样的内存大小,可支持的Tensor数量达到了7个。 - -### 常见的内存分配优化手段 - -#### 内存融合 - -上述内存分配的方式,都是以单个张量的维度去分配的,每个张量分配到的设备地址往往是离散的。但是对于某些特殊的算子,如AllReduce通信算子,需要为它们分配连续的内存。通信算子的执行包含通信等待、数据搬移、计算等步骤,而在大规模分布式集群的场景下,通信的耗时往往是性能瓶颈。针对这种场景,如 :numref:`memory_fusion`所示,可以将多个通信算子融合成一个,为通信算子的输入分配连续的内存,从而减少通信的次数。 -又比如分布式训练中的神经网络权重初始化,通常将一个训练进程中的权重初始化,然后将该权重广播到其他进程中。当一个网络有较多权重的时候,需要多次进行广播。通常可以为所有权重分配连续的内存地址,然后广播一次,节省大量通信的耗时。 - -![通信算子内存融合](../img/ch05/memory_fusion.png) -:width:`800px` -:label:`memory_fusion` - -#### In-Place算子 - -在内存分配流程中,会为每个算子的输入和输出都分配不同的内存。然而对很多算子而言,为其分配不同的输入和输出地址,会浪费内存并且影响计算性能。例如优化器算子,其计算的目的就是更新神经网络的权重;例如Python语法中的 += 和 *= 操作符,将计算结果更新到符号左边的变量中;例如 a[0]=b 语法,将 a[0] 的值更新为 b。诸如此类计算有一个特点,都是为了更新输入的值。下面以张量的 a[0]=b 操作为例介绍In-Place的优点。 - :numref:`inplace-op`左边是非In-Place操作的实现,step1将张量a拷贝到张量a',step2将张量b赋值给张量a',step3将张量a'拷贝到张量a。 :numref:`inplace-op`右边是算子In-Place操作的实现,仅用一个步骤将张量b拷贝到张量a对应的位置上。对比两种实现,可以发现In-Place操作节省了两次拷贝的耗时,并且省去了张量a'内存的申请。 - -![In-Place算子内存分配](../img/ch05/inplace-op.png) -:width:`800px` -:label:`inplace-op` - -这节简单介绍了设备内存的概念,内存分配的流程,和一些优化内存分配的方法。内存分配是编译器后端的最重要部分之一,内存的合理分配,不仅关系到相同内存容量下能否支持更大的网络模型,也关系到模型在硬件上的执行效率。 +## 内存分配 +:label:`ch05-sec-memory_pool` + +内存在传统计算机存储器层次结构中有着重要的地位,它是连接高速缓存和磁盘之间的桥梁,有着比高速缓存更大的空间,比磁盘更快的访问速度。随着深度学习的发展,深度神经网络的模型越来越复杂,AI芯片上的内存很可能无法容纳一个大型网络模型。因此,对内存进行复用是一个重要的优化手段。此外,通过连续内存分配和 In-Place内存分配还可以提高某些算子的执行效率。 + +### Device内存概念 + +在深度学习体系结构中,通常将与硬件加速器(如GPU、AI芯片等)相邻的内存称之为设备(Device)内存,而与CPU相邻的内存称之为主机(Host)内存。如 :numref:`host-device-memory`所示,CPU可以合法地访问主机上的内存,而无法直接访问设备上的内存;同理,AI芯片可以访问设备上的内存,却无法访问主机上的内存。因此,在网络训练过程中,往往需要从磁盘加载数据到主机内存中,然后在主机内存中做数据处理,再从主机内存拷贝到设备内存中,最后设备才能合法地访问数据。算子全部计算完成后,用户要获取训练结果,又需要把数据从设备内存拷贝到主机内存中。 + +![主机内存和设备内存](../img/ch05/host-device-memory.png) +:width:`800px` +:label:`host-device-memory` + +### 内存分配 {#内存分配-1} + +内存分配模块主要负责给图中算子的输入、输出分配Device内存。用户的前端脚本经过编译器前端处理后得到中间表达,后端根据中间表达进行算子选择和相关优化,可以得到算子最终的输入输出张量的形状、数据类型(Data Type)、格式(Format)等信息,根据这些信息可以计算出算子输入、输出张量的尺寸大小。基本的计算方法如式$$size=\left 
(\prod_{i=0}^{dimension}shape_i\right ) * sizeof\left ( data type \right )$$所示。得到张量的尺寸大小后,往往还需要对内存大小进行对齐操作。内存通常以4字节、8字节或16字节为一组进行访问,如果被搬运的内存大小不是这些值的倍数,内存后面会填充相应数量的空数据以使得内存长度达到这些值的倍数。因此,访问非对齐的内存可能会更加耗时。 + +![内存分配示例](../img/ch05/memory_allocate.png) +:width:`800px` +:label:`memory_allocate` + +下面以 :numref:`memory_allocate`为例介绍内存分配的大致流程。首先给输入张量、Conv2D的权重和Conv2D的输出分配内存地址。然后为BatchNorm的输入分配地址时,发现BatchNorm的输入就是Conv2D算子的输出,而该张量的地址已经在之前分配过了,因此只需要将Conv2D算子的输出地址共享给BatchNorm的输入,就可以避免内存的重复申请以及内存的冗余拷贝。以此类推,可以发现整个过程中可以将待分配的内存分成三种类型:一是整张图的输入张量,二是算子的权重或者属性,三是算子的输出张量,三种类型在训练过程中的生命周期有所不同。 + +在CPU上常常使用malloc函数直接申请内存,这种方式申请内存好处是随时申请随时释放,简单易用。然而在许多对性能要求严苛的计算场景中,由于所申请内存块的大小不定,频繁申请释放会降低性能。通常会使用内存池的方式去管理内存,先申请一定数量的内存块留作备用,当程序有内存申请需求时,直接从内存池中的内存块中申请。当程序释放该内存块时,内存池会进行回收并用作后续程序内存申请时使用。 +在深度学习框架中,设备内存的申请也是非常频繁的,往往也是通过内存池的方式去管理设备内存,并让设备内存的生命周期与张量的生命周期保持一致。不同的深度学习框架在内存池的设计上大同小异,以:numref:`device_malloc`的MindSpore框架内存申请为例,进程会从设备上申请足够大的内存,然后通过双游标从两端偏移为张量分配内存。首先从申请的首地址开始进行偏移,为算子权重的张量分配内存,这部分张量生命周期较长,往往持续整个训练过程。然后从申请设备地址的末尾开始偏移,为算子的输出张量分配内存,这部分内存的生命周期较短,往往在该算子计算结束并且后续计算过程中无需再次使用该算子的输出的情况下,其生命周期就可以结束。通过这种方式,只需要从设备上申请一次足够大的内存,后续算子的内存分配都是通过指针偏移进行分配,减少了直接从设备申请内存的耗时。 + + +![双游标法分配内存](../img/ch05/device_malloc.png) +:width:`800px` +:label:`device_malloc` + +### 内存复用 + +在机器学习系统中,内存复用是指分析张量的生命周期,将生命周期结束的张量的设备内存释放回内存池并用于后续张量的内存分配。内存复用的目的是提高内存的利用率,让有限的设备内存容纳更大的模型。 +以 :numref:`memory_allocate`为例,当BatchNorm算子计算结束后,输出1不再被任何算子使用,则该张量的设备内存可以被回收,并且如果输出1的内存尺寸大于等于输出3的内存尺寸,则从输出1回收的地址可以用于输出3的内存分配,从而达到复用输出1地址的目的。 + +![内存生命周期图](../img/ch05/combine_memory_reuse_and_no_reuse.png) +:width:`800px` +:label:`combine_memory_reuse_and_no_reuse` + +为了更好地描述内存复用问题,通过内存生命周期图来辅助理解。如 :numref:`combine_memory_reuse_and_no_reuse`所示,图中横坐标表示张量的生命周期,图中纵坐标表示内存大小。在生命周期内,某一个张量将一直占用某块设备内存,直至生命周期结束才会释放相应内存块。通过张量生命周期和内存大小可以构造出矩形块,而内存分配要求解的目标是在内存生命周期图中容纳更多的矩形块,问题的约束是矩形块之间无碰撞。 :numref:`combine_memory_reuse_and_no_reuse`左边是在未使用任何内存复用策略的情况下的内存生命周期图,此时内存同时只能容纳T0、T1、T2、T3四个张量。 + 
+内存复用策略的求解是一个NP完全的问题。许多深度学习框架通常采用贪心的策略去分配内存,例如采用BestFit算法,每次直接从内存池中选取可以满足条件的最小内存块,然而这种贪心的策略往往会陷入局部最优解,而无法求得全局最优解。为了更好地逼近内存分配策略全局最优解,MindSpore框架提出了一种新的内存分配算法 +SOMAS(Safe Optimized Memory Allocation Solver,安全优化的内存分配求解器)。SOMAS将计算图并行流与数据依赖进行聚合分析,得到算子间祖先关系,构建张量全局生命周期互斥约束,使用多种启发式算法求解最优的内存静态规划,实现逼近理论极限的内存复用,从而提升支持的内存大小。 + +由 :numref:`combine_memory_reuse_and_no_reuse`右边所示,经过SOMAS求解之后,同样的内存大小,可支持的Tensor数量达到了7个。 + +### 常见的内存分配优化手段 + +#### 内存融合 + +上述内存分配的方式,都是以单个张量的维度去分配的,每个张量分配到的设备地址往往是离散的。但是对于某些特殊的算子,如AllReduce通信算子,需要为它们分配连续的内存。通信算子的执行包含通信等待、数据搬移、计算等步骤,而在大规模分布式集群的场景下,通信的耗时往往是性能瓶颈。针对这种场景,如 :numref:`memory_fusion`所示,可以将多个通信算子融合成一个,为通信算子的输入分配连续的内存,从而减少通信的次数。 +又比如分布式训练中的神经网络权重初始化,通常将一个训练进程中的权重初始化,然后将该权重广播到其他进程中。当一个网络有较多权重的时候,需要多次进行广播。通常可以为所有权重分配连续的内存地址,然后广播一次,节省大量通信的耗时。 + +![通信算子内存融合](../img/ch05/memory_fusion.png) +:width:`800px` +:label:`memory_fusion` + +#### In-Place算子 + +在内存分配流程中,会为每个算子的输入和输出都分配不同的内存。然而对很多算子而言,为其分配不同的输入和输出地址,会浪费内存并且影响计算性能。例如优化器算子,其计算的目的就是更新神经网络的权重;例如Python语法中的 += 和 *= 操作符,将计算结果更新到符号左边的变量中;例如 a[0]=b 语法,将 a[0] 的值更新为 b。诸如此类计算有一个特点,都是为了更新输入的值。下面以张量的 a[0]=b 操作为例介绍In-Place的优点。 + :numref:`inplace-op`左边是非In-Place操作的实现,step1将张量a拷贝到张量a',step2将张量b赋值给张量a',step3将张量a'拷贝到张量a。 :numref:`inplace-op`右边是算子In-Place操作的实现,仅用一个步骤将张量b拷贝到张量a对应的位置上。对比两种实现,可以发现In-Place操作节省了两次拷贝的耗时,并且省去了张量a'内存的申请。 + +![In-Place算子内存分配](../img/ch05/inplace-op.png) +:width:`800px` +:label:`inplace-op` + +这节简单介绍了设备内存的概念,内存分配的流程,和一些优化内存分配的方法。内存分配是编译器后端的最重要部分之一,内存的合理分配,不仅关系到相同内存容量下能否支持更大的网络模型,也关系到模型在硬件上的执行效率。 diff --git a/chapter_backend_and_runtime/op_compiler.md b/zh_chapters/chapter_backend_and_runtime/op_compiler.md similarity index 100% rename from chapter_backend_and_runtime/op_compiler.md rename to zh_chapters/chapter_backend_and_runtime/op_compiler.md diff --git a/chapter_backend_and_runtime/overview.md b/zh_chapters/chapter_backend_and_runtime/overview.md similarity index 99% rename from chapter_backend_and_runtime/overview.md rename to zh_chapters/chapter_backend_and_runtime/overview.md index 
0ab166a..a56d7eb 100644 --- a/chapter_backend_and_runtime/overview.md +++ b/zh_chapters/chapter_backend_and_runtime/overview.md @@ -1,29 +1,29 @@ -## 概述 - -编译器前端主要将用户代码进行解析翻译得到计算图IR,并对其进行设备信息无关的优化,此时的优化并不考虑程序执行的底层硬件信息。编译器后端的主要职责是对前端下发的IR做进一步的计算图优化,让其更加贴合硬件,并为IR中的计算节点选择在硬件上执行的算子,然后为每个算子的输入输出分配硬件内存,最终生成一个可以在硬件上执行的任务序列。 - -如 :numref:`compiler-backend-architecture`所示,编译器后端处于前端和硬件驱动层中间,主要负责计算图优化、算子选择和内存分配的任务。首先,需要根据硬件设备的特性将IR图进行等价图变换,以便在硬件上能够找到对应的执行算子,该过程是计算图优化的重要步骤之一。前端IR是通过解析用户代码生成的,属于一个较高的抽象层次,隐藏一些底层运行的细节信息,此时无法直接对应硬件上的算子(算子是设备上的基本计算序列,例如MatMul、Convolution、ReLU等),需要将细节信息进行展开后,才能映射到目标硬件上的算子。对于某些前端IR的子集来说,一个算子便能够执行对应的功能,此时可以将这些IR节点合并成为一个计算节点,该过程称之为算子融合;对于一些复杂计算,后端并没有直接与之对应的算子,但是可以通过几个基本运算的算子组合达到同样的计算效果,此时可以将前端IR节点拆分成多个小算子。在完成计算图优化之后,就要进行算子选择过程,为每个计算节点选择执行算子。算子选择是在得到优化的IR图后选取最合适的目标设备算子的过程。针对用户代码所产生的IR往往可以映射成多种不同的硬件算子,但是这些不同硬件算子的执行效率往往有很大差别,如何根据前端IR选择出最高效的算子,是算子选择的核心问题。算子选择本质上是一个模式匹配问题。其最简单的方法就是每一个IR节点对应一个目标硬件的算子,但是这种方法往往对目标硬件的资源利用比较差。现有的编译器一般都对每一个IR节点提供了多个候选的算子,算子选择目标就是从中选择最优的一个算子作为最终执行在设备上的算子。总的来说,在机器学习系统中,对前端生成的IR图上的各个节点进行拆分和融合,让前端所表示的高层次IR逐步转换为可以在硬件设备上执行的低层次IR。得到了这种更加贴合硬件的IR后,对于每个单节点的IR可能仍然有很多种不同的选择,例如可以选择不同的输入输出格式和数据类型,需要对IR图上每个节点选择出最为合适的算子,算子选择过程可以认为是针对IR图的细粒度优化过程,最终生成完整的算子序列。最后,遍历算子序列,为每个算子分配相应的输入输出内存,然后将算子加载到设备上执行计算。 - -![编译器后端总体架构简图](../img/ch05/compiler-backend-architecture.png) -:width:`800px` -:label:`compiler-backend-architecture` - -### 计算图优化 - -计算图优化是在不影响模型的数值特性的基础上,通过图变换达到简化计算、减少资源开销、适配硬件的执行能力、提升执行性能的目的。 - -### 算子选择 - -算子选择是将IR图上的每个计算节点映射到设备上可执行算子的过程,一个IR图上的计算节点往往可以对应多个设备上的算子,这个过程中需要考虑算子的规格,算子的执行效率等问题,算子选择目标就是从中选择最优的一个算子。 - -### 内存分配 - -经过计算图优化和算子选择之后,我们可以得到IR图中每个算子的输入输出的形状(Shape)、数据类型、存储格式。根据这些信息,计算输入输出数据的大小,并为输入输出分配设备上的内存,然后将算子加载到设备上才能真正执行计算。此外,为了更充分地例用设备内存资源,可以对内存进行复用,提高内存利用率。 - -### 计算调度与执行 - -经过算子选择与内存分配之后,计算任务可以通过运行时完成计算的调度与在硬件上的执行。根据是否将算子编译为计算图,计算的调度可以分为单算子调度与计算图调度两种方式。而根据硬件提供的能力差异,计算图的执行方式又可以分为逐算子下发执行的交互式执行以及将整个计算图或者部分子图一次性下发到硬件的下沉式执行两种模式。 - -### 算子编译器 - 
-作为AI编译器中一个重要组成部分,算子编译器把单个简单或复杂的算子经过表达和优化后编译为一个单独的可执行文件。目前业界面对算子编译器仍有许多有趣的问题尚未得出明确结论,相关的处理逻辑与方法也尚未收敛。本小节希望将这些问题简单抛出,并给出业界比较典型的几种处理方式。若能对业界朋友们和同学们有所启发甚至若能对这些问题起到促进收敛的作用,那真是再好不过!目前尚待收敛的问题包括而不限于:如何通过算子编译器进行性能优化?算子编译器如何兼容不同体系结构特点的芯片?面对输入Python代码的灵活性以及神经网络训练时动态性的情况,该如何充分将这些完美表达出来? +## 概述 + +编译器前端主要将用户代码进行解析翻译得到计算图IR,并对其进行设备信息无关的优化,此时的优化并不考虑程序执行的底层硬件信息。编译器后端的主要职责是对前端下发的IR做进一步的计算图优化,让其更加贴合硬件,并为IR中的计算节点选择在硬件上执行的算子,然后为每个算子的输入输出分配硬件内存,最终生成一个可以在硬件上执行的任务序列。 + +如 :numref:`compiler-backend-architecture`所示,编译器后端处于前端和硬件驱动层中间,主要负责计算图优化、算子选择和内存分配的任务。首先,需要根据硬件设备的特性将IR图进行等价图变换,以便在硬件上能够找到对应的执行算子,该过程是计算图优化的重要步骤之一。前端IR是通过解析用户代码生成的,属于一个较高的抽象层次,隐藏一些底层运行的细节信息,此时无法直接对应硬件上的算子(算子是设备上的基本计算序列,例如MatMul、Convolution、ReLU等),需要将细节信息进行展开后,才能映射到目标硬件上的算子。对于某些前端IR的子集来说,一个算子便能够执行对应的功能,此时可以将这些IR节点合并成为一个计算节点,该过程称之为算子融合;对于一些复杂计算,后端并没有直接与之对应的算子,但是可以通过几个基本运算的算子组合达到同样的计算效果,此时可以将前端IR节点拆分成多个小算子。在完成计算图优化之后,就要进行算子选择过程,为每个计算节点选择执行算子。算子选择是在得到优化的IR图后选取最合适的目标设备算子的过程。针对用户代码所产生的IR往往可以映射成多种不同的硬件算子,但是这些不同硬件算子的执行效率往往有很大差别,如何根据前端IR选择出最高效的算子,是算子选择的核心问题。算子选择本质上是一个模式匹配问题。其最简单的方法就是每一个IR节点对应一个目标硬件的算子,但是这种方法往往对目标硬件的资源利用比较差。现有的编译器一般都对每一个IR节点提供了多个候选的算子,算子选择目标就是从中选择最优的一个算子作为最终执行在设备上的算子。总的来说,在机器学习系统中,对前端生成的IR图上的各个节点进行拆分和融合,让前端所表示的高层次IR逐步转换为可以在硬件设备上执行的低层次IR。得到了这种更加贴合硬件的IR后,对于每个单节点的IR可能仍然有很多种不同的选择,例如可以选择不同的输入输出格式和数据类型,需要对IR图上每个节点选择出最为合适的算子,算子选择过程可以认为是针对IR图的细粒度优化过程,最终生成完整的算子序列。最后,遍历算子序列,为每个算子分配相应的输入输出内存,然后将算子加载到设备上执行计算。 + +![编译器后端总体架构简图](../img/ch05/compiler-backend-architecture.png) +:width:`800px` +:label:`compiler-backend-architecture` + +### 计算图优化 + +计算图优化是在不影响模型的数值特性的基础上,通过图变换达到简化计算、减少资源开销、适配硬件的执行能力、提升执行性能的目的。 + +### 算子选择 + +算子选择是将IR图上的每个计算节点映射到设备上可执行算子的过程,一个IR图上的计算节点往往可以对应多个设备上的算子,这个过程中需要考虑算子的规格,算子的执行效率等问题,算子选择目标就是从中选择最优的一个算子。 + +### 内存分配 + +经过计算图优化和算子选择之后,我们可以得到IR图中每个算子的输入输出的形状(Shape)、数据类型、存储格式。根据这些信息,计算输入输出数据的大小,并为输入输出分配设备上的内存,然后将算子加载到设备上才能真正执行计算。此外,为了更充分地利用设备内存资源,可以对内存进行复用,提高内存利用率。 + +### 计算调度与执行 + 
+经过算子选择与内存分配之后,计算任务可以通过运行时完成计算的调度与在硬件上的执行。根据是否将算子编译为计算图,计算的调度可以分为单算子调度与计算图调度两种方式。而根据硬件提供的能力差异,计算图的执行方式又可以分为逐算子下发执行的交互式执行以及将整个计算图或者部分子图一次性下发到硬件的下沉式执行两种模式。 + +### 算子编译器 + +作为AI编译器中一个重要组成部分,算子编译器把单个简单或复杂的算子经过表达和优化后编译为一个单独的可执行文件。目前业界面对算子编译器仍有许多有趣的问题尚未得出明确结论,相关的处理逻辑与方法也尚未收敛。本小节希望将这些问题简单抛出,并给出业界比较典型的几种处理方式。若能对业界朋友们和同学们有所启发甚至若能对这些问题起到促进收敛的作用,那真是再好不过!目前尚待收敛的问题包括而不限于:如何通过算子编译器进行性能优化?算子编译器如何兼容不同体系结构特点的芯片?面对输入Python代码的灵活性以及神经网络训练时动态性的情况,该如何充分将这些完美表达出来? diff --git a/chapter_backend_and_runtime/summary.md b/zh_chapters/chapter_backend_and_runtime/summary.md similarity index 99% rename from chapter_backend_and_runtime/summary.md rename to zh_chapters/chapter_backend_and_runtime/summary.md index b91662e..139d78e 100644 --- a/chapter_backend_and_runtime/summary.md +++ b/zh_chapters/chapter_backend_and_runtime/summary.md @@ -1,27 +1,27 @@ -## 总结 - -- 编译器后端主要负责计算图优化、算子选择、内存分配这三个任务。 - -- 计算图优化是在不影响模型的数值特性的基础上,通过图变换达到减少资源开销、适配硬件的执行能力、提升执行性能的目的。 - -- 计算图优化主要分为硬件通用优化和特定硬件优化,例如与硬件无关的算子内存IO优化和为了适配特定硬件指令限制而做的子图变换。 - -- 算子选择是为IR图中的每个计算节点选择一个最适合在设备上执行的算子。 - -- 数据存在多种存储格式和计算精度,不同的存储格式和计算精度在不同场景下对算子计算性能有较大的影响,所以算子选择需要综合考虑各方面影响选择最优的算子。 - -- 经过计算图优化和算子选择之后,得到了最终的IR。基于最终的IR,需要为算子的输入输出Tensor分配内存,然后加载算子到硬件上执行。 - -- 内存复用是一个重要的内存分配优化手段,可以让设备上容纳更大的网络模型。 - -- 将通信算子的内存进行融合,可以提高通信的效率;合理分配In-Place算子的内存,可以节省内存使用并且提高计算效率。 - -- 运行时对于算子的执行可以分为单算子调度和计算图调度两种模式,而在计算图调度模式中,根据具体硬件的能力又可以分为交互式执行和下沉式执行两种方式,交互式执行具备更多的灵活性,下沉执行可以获得更好的计算性能。 - -- 算子编译器是优化硬件性能的关键组件。其中,调度策略的优化和基于多面体模型算法的优化是两个关键技术。 - -## 扩展阅读 - -- 内存分配作为机器学习后端的重要部分,建议阅读 [Sublinear Memory Cost](https://arxiv.org/abs/1604.06174)、 [Dynamic Tensor Rematerialization](https://arxiv.org/abs/2006.09616)。 -- 对于运行时的调度以及执行,建议阅读 [A Lightweight Parallel and Heterogeneous Task Graph Computing System](https://arxiv.org/abs/2004.10908)、 [Dynamic Control Flow in Large-Scale Machine Learning](https://arxiv.org/abs/1805.01772)、[DEEP LEARNING WITH DYNAMIC COMPUTATION GRAPHS](https://arxiv.org/abs/1702.02181)。 -- 算子编译器是本书的扩展部分,建议阅读提出计算与调度分离的论文: [Halide: A Language and Compiler 
 for Optimizing Parallelism, Locality, and Recomputation in Image Processing Pipelines](https://dl.acm.org/doi/abs/10.1145/2499370.2462176),以及介绍调度空间优化的论文 [Ansor: Generating High-Performance Tensor Programs for Deep Learning](https://arxiv.org/abs/2006.06762)和 [olly - Polyhedral optimization in LLVM](https://arxiv.org/abs/2105.04555) +## 总结 + +- 编译器后端主要负责计算图优化、算子选择、内存分配这三个任务。 + +- 计算图优化是在不影响模型的数值特性的基础上,通过图变换达到减少资源开销、适配硬件的执行能力、提升执行性能的目的。 + +- 计算图优化主要分为硬件通用优化和特定硬件优化,例如与硬件无关的算子内存IO优化和为了适配特定硬件指令限制而做的子图变换。 + +- 算子选择是为IR图中的每个计算节点选择一个最适合在设备上执行的算子。 + +- 数据存在多种存储格式和计算精度,不同的存储格式和计算精度在不同场景下对算子计算性能有较大的影响,所以算子选择需要综合考虑各方面影响选择最优的算子。 + +- 经过计算图优化和算子选择之后,得到了最终的IR。基于最终的IR,需要为算子的输入输出Tensor分配内存,然后加载算子到硬件上执行。 + +- 内存复用是一个重要的内存分配优化手段,可以让设备上容纳更大的网络模型。 + +- 将通信算子的内存进行融合,可以提高通信的效率;合理分配In-Place算子的内存,可以节省内存使用并且提高计算效率。 + +- 运行时对于算子的执行可以分为单算子调度和计算图调度两种模式,而在计算图调度模式中,根据具体硬件的能力又可以分为交互式执行和下沉式执行两种方式,交互式执行具备更多的灵活性,下沉执行可以获得更好的计算性能。 + +- 算子编译器是优化硬件性能的关键组件。其中,调度策略的优化和基于多面体模型算法的优化是两个关键技术。 + +## 扩展阅读 + +- 内存分配作为机器学习后端的重要部分,建议阅读 [Sublinear Memory Cost](https://arxiv.org/abs/1604.06174)、 [Dynamic Tensor Rematerialization](https://arxiv.org/abs/2006.09616)。 +- 对于运行时的调度以及执行,建议阅读 [A Lightweight Parallel and Heterogeneous Task Graph Computing System](https://arxiv.org/abs/2004.10908)、 [Dynamic Control Flow in Large-Scale Machine Learning](https://arxiv.org/abs/1805.01772)、[DEEP LEARNING WITH DYNAMIC COMPUTATION GRAPHS](https://arxiv.org/abs/1702.02181)。 +- 算子编译器是本书的扩展部分,建议阅读提出计算与调度分离的论文: [Halide: A Language and Compiler for Optimizing Parallelism, Locality, and Recomputation in Image Processing Pipelines](https://dl.acm.org/doi/abs/10.1145/2499370.2462176),以及介绍调度空间优化的论文 [Ansor: Generating High-Performance Tensor Programs for Deep Learning](https://arxiv.org/abs/2006.06762)和 [Polly - Polyhedral optimization in LLVM](https://arxiv.org/abs/2105.04555) diff --git a/chapter_computational_graph/background_and_functionality.md b/zh_chapters/chapter_computational_graph/background_and_functionality.md similarity index 
100% rename from chapter_computational_graph/background_and_functionality.md rename to zh_chapters/chapter_computational_graph/background_and_functionality.md diff --git a/chapter_computational_graph/components_of_computational_graph.md b/zh_chapters/chapter_computational_graph/components_of_computational_graph.md similarity index 100% rename from chapter_computational_graph/components_of_computational_graph.md rename to zh_chapters/chapter_computational_graph/components_of_computational_graph.md diff --git a/chapter_computational_graph/generation_of_computational_graph.md b/zh_chapters/chapter_computational_graph/generation_of_computational_graph.md similarity index 100% rename from chapter_computational_graph/generation_of_computational_graph.md rename to zh_chapters/chapter_computational_graph/generation_of_computational_graph.md diff --git a/chapter_computational_graph/index.md b/zh_chapters/chapter_computational_graph/index.md similarity index 100% rename from chapter_computational_graph/index.md rename to zh_chapters/chapter_computational_graph/index.md diff --git a/chapter_computational_graph/schedule_of_computational_graph.md b/zh_chapters/chapter_computational_graph/schedule_of_computational_graph.md similarity index 100% rename from chapter_computational_graph/schedule_of_computational_graph.md rename to zh_chapters/chapter_computational_graph/schedule_of_computational_graph.md diff --git a/chapter_computational_graph/summary.md b/zh_chapters/chapter_computational_graph/summary.md similarity index 100% rename from chapter_computational_graph/summary.md rename to zh_chapters/chapter_computational_graph/summary.md diff --git a/chapter_data_processing/data_order.md b/zh_chapters/chapter_data_processing/data_order.md similarity index 100% rename from chapter_data_processing/data_order.md rename to zh_chapters/chapter_data_processing/data_order.md diff --git a/chapter_data_processing/extension.md b/zh_chapters/chapter_data_processing/extension.md similarity 
index 100% rename from chapter_data_processing/extension.md rename to zh_chapters/chapter_data_processing/extension.md diff --git a/chapter_data_processing/index.md b/zh_chapters/chapter_data_processing/index.md similarity index 100% rename from chapter_data_processing/index.md rename to zh_chapters/chapter_data_processing/index.md diff --git a/chapter_data_processing/performance.md b/zh_chapters/chapter_data_processing/performance.md similarity index 100% rename from chapter_data_processing/performance.md rename to zh_chapters/chapter_data_processing/performance.md diff --git a/chapter_data_processing/program_model.md b/zh_chapters/chapter_data_processing/program_model.md similarity index 100% rename from chapter_data_processing/program_model.md rename to zh_chapters/chapter_data_processing/program_model.md diff --git a/chapter_data_processing/requirements.md b/zh_chapters/chapter_data_processing/requirements.md similarity index 100% rename from chapter_data_processing/requirements.md rename to zh_chapters/chapter_data_processing/requirements.md diff --git a/chapter_data_processing/summary.md b/zh_chapters/chapter_data_processing/summary.md similarity index 100% rename from chapter_data_processing/summary.md rename to zh_chapters/chapter_data_processing/summary.md diff --git a/chapter_distributed_training/cluster.md b/zh_chapters/chapter_distributed_training/cluster.md similarity index 100% rename from chapter_distributed_training/cluster.md rename to zh_chapters/chapter_distributed_training/cluster.md diff --git a/chapter_distributed_training/collective.md b/zh_chapters/chapter_distributed_training/collective.md similarity index 100% rename from chapter_distributed_training/collective.md rename to zh_chapters/chapter_distributed_training/collective.md diff --git a/chapter_distributed_training/index.md b/zh_chapters/chapter_distributed_training/index.md similarity index 100% rename from chapter_distributed_training/index.md rename to 
zh_chapters/chapter_distributed_training/index.md diff --git a/chapter_distributed_training/methods.md b/zh_chapters/chapter_distributed_training/methods.md similarity index 100% rename from chapter_distributed_training/methods.md rename to zh_chapters/chapter_distributed_training/methods.md diff --git a/chapter_distributed_training/overview.md b/zh_chapters/chapter_distributed_training/overview.md similarity index 100% rename from chapter_distributed_training/overview.md rename to zh_chapters/chapter_distributed_training/overview.md diff --git a/chapter_distributed_training/parameter_servers.md b/zh_chapters/chapter_distributed_training/parameter_servers.md similarity index 100% rename from chapter_distributed_training/parameter_servers.md rename to zh_chapters/chapter_distributed_training/parameter_servers.md diff --git a/chapter_distributed_training/summary.md b/zh_chapters/chapter_distributed_training/summary.md similarity index 100% rename from chapter_distributed_training/summary.md rename to zh_chapters/chapter_distributed_training/summary.md diff --git a/chapter_explainable_AI/explainable_ai.md b/zh_chapters/chapter_explainable_AI/explainable_ai.md similarity index 100% rename from chapter_explainable_AI/explainable_ai.md rename to zh_chapters/chapter_explainable_AI/explainable_ai.md diff --git a/chapter_explainable_AI/index.md b/zh_chapters/chapter_explainable_AI/index.md similarity index 100% rename from chapter_explainable_AI/index.md rename to zh_chapters/chapter_explainable_AI/index.md diff --git a/chapter_federated_learning/horizontal_fl.md b/zh_chapters/chapter_federated_learning/horizontal_fl.md similarity index 98% rename from chapter_federated_learning/horizontal_fl.md rename to zh_chapters/chapter_federated_learning/horizontal_fl.md index 37388dc..63e7f0b 100644 --- a/chapter_federated_learning/horizontal_fl.md +++ b/zh_chapters/chapter_federated_learning/horizontal_fl.md @@ -1,57 +1,57 @@ -## 横向联邦学习 - -### 云云场景中的横向联邦 - 
-在横向联邦学习系统中,具有相同数据结构的多个参与者通过云服务器协同建立机器学习模型。一个典型的假设是参与者是诚实的,而服务器是诚实但好奇的,因此不允许任何参与者向服务器泄漏原始的梯度信息。这种系统的训练过程通常包括以下四个步骤: - -①:参与者在本地计算训练梯度,使用加密、差分隐私或秘密共享技术掩码所选梯度,并将掩码后的结果发送到服务器。 - -②:服务器执行安全聚合,不了解任何参与者的梯度信息。 - -③:服务器将汇总后的结果发送给参与者。 - -④:参与者用解密的梯度更新他们各自的模型。 - -和传统分布式学习相比,联邦学习存在训练结点不稳定和通信代价大的难点。这些难点导致了联邦学习无法和传统分布式学习一样:在每次单步训练之后,同步不同训练结点上的权重。为了提高计算通信比并降低频繁通信带来的高能耗,谷歌公司在2017年 :cite:`fedavg`提出了联邦平均算法(Federated Averaging,FedAvg)。 :numfef:`ch10-federated-learning-fedavg`展示了FedAvg的整体流程。在每轮训练过程中,客户端进行了多次单步训练。然后服务端聚合多个客户端的权重,并取加权平均。 - -![联邦平均算法](../img/ch10/ch10-federated-learning-fedavg.png) -:width:`800px` -:label:`ch10-federated-learning-fedavg` - -### 端云场景中的横向联邦 - -端云联邦学习的总体流程和云云联邦学习一样,但端云联邦学习面临的难点还包括以下三个方面: - -1.高昂的通信代价。和云云联邦学习不同之处,端云联邦学习的通信开销主要在于单次的通信量,而云云联邦学习的开销主要在于通信的频率。在端云联邦学习场景中,通常的通信网络可能是WLAN或移动数据,网络通信速度可能比本地计算慢许多个数量级,这就造成高昂的通信代价成为了联邦学习的关键瓶颈。 - -2.系统异质性。由于客户端设备硬件条件(CPU、内存)、网络连接(3G、4G、5G、WIFI)和电源(电池电量)的变化,联邦学习网络中每个设备的存储、计算和通信能力都有可能不同。网络和设备本身的限制可能导致某一时间仅有一部分设备处于活动状态。此外,设备还会出现没电、网络无法接入等突发状况,导致瞬时无法连通。这种异质性的系统架构影响了联邦学习整体策略的制定。 - -3.隐私问题。由于端云联邦学习的客户端无法参与每一轮迭代,因此在数据隐私保护上的难度高于其他的分布式学习方法。而且,在联邦学习过程中,端云传递模型的更新信息还存在向第三方或中央服务器暴露敏感信息的风险。隐私保护成为端云联邦学习需要重点考虑的问题。 - -为了解决端云联邦学习带来的挑战,MindSpore Federated设计了分布式FL-Server架构。系统由调度器模块、服务器模块和客户端模块三个部分组成,其系统架构如 :numref:`ch10-federated-learning-architecture`所示。各个模块的功能说明: - -- 联邦学习调度器: - - 联邦学习调度器(FL-Scheduler)协助集群组网,并负责管理面任务的下发。 - -- 联邦学习服务器: - - 联邦学习服务器(FL-Server)提供客户端选择、限时通信、分布式联邦聚合功能。FL-Server需要具备支持端云千万台设备的能力以及支持边缘服务器的接入和安全处理的逻辑。 - -- 联邦学习客户端: - - 联邦学习客户端(FL-Client)负责本地数据训练,并在和FL-Server进行通信时,对上传权重进行安全加密。 - -![联邦学习系统架构图](../img/ch10/ch10-federated-learning-architecture.svg) - -:label:`ch10-federated-learning-architecture` - -此外,MindSpore Federated针对端云联邦学习设计了出四大特性: - -1.限时通信:在FL-Server和FL-Client建立连接后,启动全局的计时器和计数器。当预先设定的时间窗口内的FL-Server接收到FL-Client训练后的模型参数满足初始接入的所有FL-Client的一定比例后,就可以进行聚合。若时间窗内没有达到比例阈值,则进入下一轮迭代。保证即使有海量FL-Client接入的情况下,也不会由于个别FL-Client训练时间过长或掉线导致的整个联邦学习过程卡死。 - 
-2.松耦合组网:使用FL-Server集群。每个FL-Server接收和下发权重给部分FL-Client,减少单个FL-Server的带宽压力。此外,支持FL-Client以松散的方式接入。任意FL-Client的中途退出都不会影响全局任务,并且FL-Client在任意时刻访问任意FL-Server都能获得训练所需的全量数据。 - -3.加密模块:MindSpore Federated为了防止模型梯度的泄露,部署了多种加密算法:本地差分隐私(Local Differential Privacy,LDP)、基于多方安全计算(MPC)的安全聚合算法和华为自研的基于符号的维度选择差分隐私算法(Sign-based Dimension Selection,SignDS)。 - -4.通信压缩模块:MindSpore Federated分别在FL-Server下发模型参数和FL-Client上传模型参数时,使用量化和稀疏等手段将权重压缩编码成较小的数据格式,并在对端将压缩编码后的数据解码为原始的数据。 +## 横向联邦学习 + +### 云云场景中的横向联邦 + +在横向联邦学习系统中,具有相同数据结构的多个参与者通过云服务器协同建立机器学习模型。一个典型的假设是参与者是诚实的,而服务器是诚实但好奇的,因此不允许任何参与者向服务器泄漏原始的梯度信息。这种系统的训练过程通常包括以下四个步骤: + +①:参与者在本地计算训练梯度,使用加密、差分隐私或秘密共享技术掩码所选梯度,并将掩码后的结果发送到服务器。 + +②:服务器执行安全聚合,不了解任何参与者的梯度信息。 + +③:服务器将汇总后的结果发送给参与者。 + +④:参与者用解密的梯度更新他们各自的模型。 + +和传统分布式学习相比,联邦学习存在训练结点不稳定和通信代价大的难点。这些难点导致了联邦学习无法和传统分布式学习一样:在每次单步训练之后,同步不同训练结点上的权重。为了提高计算通信比并降低频繁通信带来的高能耗,谷歌公司在2017年 :cite:`fedavg`提出了联邦平均算法(Federated Averaging,FedAvg)。 :numref:`ch10-federated-learning-fedavg`展示了FedAvg的整体流程。在每轮训练过程中,客户端进行了多次单步训练。然后服务端聚合多个客户端的权重,并取加权平均。 + +![联邦平均算法](../img/ch10/ch10-federated-learning-fedavg.png) +:width:`800px` +:label:`ch10-federated-learning-fedavg` + +### 端云场景中的横向联邦 + +端云联邦学习的总体流程和云云联邦学习一样,但端云联邦学习面临的难点还包括以下三个方面: + +1.高昂的通信代价。和云云联邦学习不同之处,端云联邦学习的通信开销主要在于单次的通信量,而云云联邦学习的开销主要在于通信的频率。在端云联邦学习场景中,通常的通信网络可能是WLAN或移动数据,网络通信速度可能比本地计算慢许多个数量级,这就造成高昂的通信代价成为了联邦学习的关键瓶颈。 + +2.系统异质性。由于客户端设备硬件条件(CPU、内存)、网络连接(3G、4G、5G、WIFI)和电源(电池电量)的变化,联邦学习网络中每个设备的存储、计算和通信能力都有可能不同。网络和设备本身的限制可能导致某一时间仅有一部分设备处于活动状态。此外,设备还会出现没电、网络无法接入等突发状况,导致瞬时无法连通。这种异质性的系统架构影响了联邦学习整体策略的制定。 + +3.隐私问题。由于端云联邦学习的客户端无法参与每一轮迭代,因此在数据隐私保护上的难度高于其他的分布式学习方法。而且,在联邦学习过程中,端云传递模型的更新信息还存在向第三方或中央服务器暴露敏感信息的风险。隐私保护成为端云联邦学习需要重点考虑的问题。 + +为了解决端云联邦学习带来的挑战,MindSpore Federated设计了分布式FL-Server架构。系统由调度器模块、服务器模块和客户端模块三个部分组成,其系统架构如 :numref:`ch10-federated-learning-architecture`所示。各个模块的功能说明: + +- 联邦学习调度器: + + 联邦学习调度器(FL-Scheduler)协助集群组网,并负责管理面任务的下发。 + +- 联邦学习服务器: + + 联邦学习服务器(FL-Server)提供客户端选择、限时通信、分布式联邦聚合功能。FL-Server需要具备支持端云千万台设备的能力以及支持边缘服务器的接入和安全处理的逻辑。 + +- 联邦学习客户端: + + 
联邦学习客户端(FL-Client)负责本地数据训练,并在和FL-Server进行通信时,对上传权重进行安全加密。 + +![联邦学习系统架构图](../img/ch10/ch10-federated-learning-architecture.svg) + +:label:`ch10-federated-learning-architecture` + +此外,MindSpore Federated针对端云联邦学习设计了出四大特性: + +1.限时通信:在FL-Server和FL-Client建立连接后,启动全局的计时器和计数器。当预先设定的时间窗口内的FL-Server接收到FL-Client训练后的模型参数满足初始接入的所有FL-Client的一定比例后,就可以进行聚合。若时间窗内没有达到比例阈值,则进入下一轮迭代。保证即使有海量FL-Client接入的情况下,也不会由于个别FL-Client训练时间过长或掉线导致的整个联邦学习过程卡死。 + +2.松耦合组网:使用FL-Server集群。每个FL-Server接收和下发权重给部分FL-Client,减少单个FL-Server的带宽压力。此外,支持FL-Client以松散的方式接入。任意FL-Client的中途退出都不会影响全局任务,并且FL-Client在任意时刻访问任意FL-Server都能获得训练所需的全量数据。 + +3.加密模块:MindSpore Federated为了防止模型梯度的泄露,部署了多种加密算法:本地差分隐私(Local Differential Privacy,LDP)、基于多方安全计算(MPC)的安全聚合算法和华为自研的基于符号的维度选择差分隐私算法(Sign-based Dimension Selection,SignDS)。 + +4.通信压缩模块:MindSpore Federated分别在FL-Server下发模型参数和FL-Client上传模型参数时,使用量化和稀疏等手段将权重压缩编码成较小的数据格式,并在对端将压缩编码后的数据解码为原始的数据。 diff --git a/chapter_federated_learning/index.md b/zh_chapters/chapter_federated_learning/index.md similarity index 100% rename from chapter_federated_learning/index.md rename to zh_chapters/chapter_federated_learning/index.md diff --git a/chapter_federated_learning/outlook.md b/zh_chapters/chapter_federated_learning/outlook.md similarity index 99% rename from chapter_federated_learning/outlook.md rename to zh_chapters/chapter_federated_learning/outlook.md index e500bdf..263933d 100644 --- a/chapter_federated_learning/outlook.md +++ b/zh_chapters/chapter_federated_learning/outlook.md @@ -1,35 +1,35 @@ -## 展望 - -为了实现联邦学习的大规模商用,我们仍然需要做许多的研究工作。比如我们无法查看联邦学习的分布式化的数据,那就很难选择模型的超参数以及设定优化器,只能采用一些基于模拟的方案来调测模型;比如用于移动设备时,单用户的标签数据很少,甚至无法获取数据的标签信息,联邦学习如何用于无监督学习;比如由于参与方的数据分布不一致,训练同一个全局模型,很难评价模型对于每个参与方的好坏;比如数据一直是公司的核心资产,不同的公司一直在致力于收集数据和创造数据孤岛,如何有效地激励公司或者机构参与联邦学习的系统中来。下面将介绍一些MindSpore Federated在进行的一些尝试和业界的相关工作。 - -**异构场景下的联邦学习** - -之前探讨的横向联邦学习和纵向联邦学习都是让不同的参与方共同建立一个共享的机器学习模型。然而,企业级联邦学习框架往往需要适应多种异构场景,如数据异构(不同客户端数据规模以及分布不一致),设备异构(不同客户端设备计算能力,通信效率不一致),以及模型异构(不同本地客户端模型学到的特征不一致)。 - -比较主流的两种联邦学习异构场景下的工作: - 
-1)对异构数据具有高度鲁棒性的本地模型个性化联邦学习策略: - -联邦学习训练的是一个全局模型,基于所有数据得到一个全局最优解,但是不同参与方的数据量和分布都是不同的,很多场景下全局模型无法在把握整体的同时又照顾到这种差异。当某一方的数据和整体偏离比较大时,联邦学习的效果确实有可能不如本地训练的效果。那么如何在所有参与方总体的收益最大化的同时,让个体的收益也能够最大化,这就是个性化联邦学习。 - -个性化联邦学习并不要求所有参与方最终使用的模型必须是一样的,比如允许每个参与方在参与联邦学习之后,根据自己的数据对模型进行微调,从而生成本方独特的个性化模型。在进行个性化微调之后,往往模型在本地测试集上的效果会更好。在这种方式下,不同参与方的模型结构是一样的,但是模型参数会有所不同。还有一些方案,是让所有的参与方拥有同样的特征提取层,但是任务分类层不同。还有的思路是将知识蒸馏引入联邦学习中,将联邦学习的全局模型作为教师模型,将个性化模型作为学生模型,可以缓解个性化过程中的过拟合问题。 - -2)对于异构模型进行模型聚合的策略研究: - -一般在FedAvg的联邦聚合范式下,本地迭代训练次数越少、聚合地越频繁,模型收敛精度会越好,尤其是在不同参与客户端的数据是非IID情况下。但是聚合会带来通信成本开销,联邦学习存在通信成本与模型精度的Trade-Off。因此很多研究者聚焦于如何设计自适应聚合方案,要求在给定训练时间开销的前提下,找到本地更新和全局通信之间的最佳平衡,令全局模型的泛化误差最小。 - - **通信效率提升** - -在联邦学习流程中,每一个全局训练轮次里,每个参与方都需要给服务端发送完整的参数。然后服务端将聚合后的参数下发。现代的深度学习网络动辄有数百万甚至更大量级的参数,如此多的参数传输将会带来巨大的通信开销。为了降低通信开销,MindSpore Federated采取了一些改善通信效率的方法: - -1)智能调频策略:通过改变全局模型聚合的轮次来提高联邦学习效率,减少训练任务达到收敛的通信开销。一种直觉是在联邦学习流程的初期,不同参与方的参数变化较为一致,因此设置较小的聚合频率,可以减少通信成本;在联邦学习流程的后期,不同参与方的参数变化较为不一致,因此设置较大的聚合频率,可以使得模型快速收敛。 - -2)通信压缩方案:对权重差进行量化以及稀疏化操作,即每次通信仅上传一小部分量化后的权重差值。之所以选择权重差做量化和稀疏,是因为它比权重值的分布更易拟合,而且稀疏性更高。量化就是将float32的数据类型映射到int8甚至更低比特表示的数值上,一方面降低存储和通信开销,另一方面可以更好地采用一些压缩编码方式进行传输(如哈夫曼编码、有限状态熵编码等)。比较常用的稀疏化方法有Top-K稀疏,即按梯度的绝对值从小到大排序,每轮只上传前k个参数。通信压缩方案一般是精度有损的,如何选取合适的k是一个有挑战性的问题。 - -**联邦生态** - -在前面的章节中,我们介绍了面向隐私保护的联邦学习领域的一些技术与实践,然而随着探索地更加深入,联邦学习领域也变得更具包容性,它涵盖了机器学习、模型压缩部署、信息安全、加密算法、博弈论等等。随着越来越多的公司、高校和机构参与进来,现在的联邦学习已经不仅仅是一种技术解决方案,还是一个隐私保护的生态系统,比如不同的参与方希望以可持续的方式加入联邦学习流程,如何设计激励机制以确保利润可以相对公平地被各参与方共享,同时对于恶意的实施攻击或者破坏行为的参与方进行有效遏制。 - -另外,随着用户数据隐私保护和合理使用的法律法规越来越多的被推出,制定联邦学习的技术标准显得愈加重要,这一标准能够在法律监管部门和技术开发人员之间建立一座桥梁,让企业知道采用何种技术,能够在合乎法规的同时更好地进行信息的共享。 - -2020年底正式出版推行了由IEEE 标准委员会通过的联邦学习国际标准(IEEE P3652.1),该标准旨在提供一个搭建联邦学习的体系架构和应用的指导方针,主要内容包括:联邦学习的描述和定义、场景需求分类和安全测评、联邦学习个性指标的评估如何量化、联合管控的需求。这也是国际上首个针对人工智能协同技术框架订立的标准,标志着联邦学习开启大规模工业化应用的新篇章。 +## 展望 + 
+为了实现联邦学习的大规模商用,我们仍然需要做许多的研究工作。比如我们无法查看联邦学习的分布式化的数据,那就很难选择模型的超参数以及设定优化器,只能采用一些基于模拟的方案来调测模型;比如用于移动设备时,单用户的标签数据很少,甚至无法获取数据的标签信息,联邦学习如何用于无监督学习;比如由于参与方的数据分布不一致,训练同一个全局模型,很难评价模型对于每个参与方的好坏;比如数据一直是公司的核心资产,不同的公司一直在致力于收集数据和创造数据孤岛,如何有效地激励公司或者机构参与联邦学习的系统中来。下面将介绍一些MindSpore Federated在进行的一些尝试和业界的相关工作。 + +**异构场景下的联邦学习** + +之前探讨的横向联邦学习和纵向联邦学习都是让不同的参与方共同建立一个共享的机器学习模型。然而,企业级联邦学习框架往往需要适应多种异构场景,如数据异构(不同客户端数据规模以及分布不一致),设备异构(不同客户端设备计算能力,通信效率不一致),以及模型异构(不同本地客户端模型学到的特征不一致)。 + +比较主流的两种联邦学习异构场景下的工作: + +1)对异构数据具有高度鲁棒性的本地模型个性化联邦学习策略: + +联邦学习训练的是一个全局模型,基于所有数据得到一个全局最优解,但是不同参与方的数据量和分布都是不同的,很多场景下全局模型无法在把握整体的同时又照顾到这种差异。当某一方的数据和整体偏离比较大时,联邦学习的效果确实有可能不如本地训练的效果。那么如何在所有参与方总体的收益最大化的同时,让个体的收益也能够最大化,这就是个性化联邦学习。 + +个性化联邦学习并不要求所有参与方最终使用的模型必须是一样的,比如允许每个参与方在参与联邦学习之后,根据自己的数据对模型进行微调,从而生成本方独特的个性化模型。在进行个性化微调之后,往往模型在本地测试集上的效果会更好。在这种方式下,不同参与方的模型结构是一样的,但是模型参数会有所不同。还有一些方案,是让所有的参与方拥有同样的特征提取层,但是任务分类层不同。还有的思路是将知识蒸馏引入联邦学习中,将联邦学习的全局模型作为教师模型,将个性化模型作为学生模型,可以缓解个性化过程中的过拟合问题。 + +2)对于异构模型进行模型聚合的策略研究: + +一般在FedAvg的联邦聚合范式下,本地迭代训练次数越少、聚合地越频繁,模型收敛精度会越好,尤其是在不同参与客户端的数据是非IID情况下。但是聚合会带来通信成本开销,联邦学习存在通信成本与模型精度的Trade-Off。因此很多研究者聚焦于如何设计自适应聚合方案,要求在给定训练时间开销的前提下,找到本地更新和全局通信之间的最佳平衡,令全局模型的泛化误差最小。 + + **通信效率提升** + +在联邦学习流程中,每一个全局训练轮次里,每个参与方都需要给服务端发送完整的参数。然后服务端将聚合后的参数下发。现代的深度学习网络动辄有数百万甚至更大量级的参数,如此多的参数传输将会带来巨大的通信开销。为了降低通信开销,MindSpore Federated采取了一些改善通信效率的方法: + +1)智能调频策略:通过改变全局模型聚合的轮次来提高联邦学习效率,减少训练任务达到收敛的通信开销。一种直觉是在联邦学习流程的初期,不同参与方的参数变化较为一致,因此设置较小的聚合频率,可以减少通信成本;在联邦学习流程的后期,不同参与方的参数变化较为不一致,因此设置较大的聚合频率,可以使得模型快速收敛。 + +2)通信压缩方案:对权重差进行量化以及稀疏化操作,即每次通信仅上传一小部分量化后的权重差值。之所以选择权重差做量化和稀疏,是因为它比权重值的分布更易拟合,而且稀疏性更高。量化就是将float32的数据类型映射到int8甚至更低比特表示的数值上,一方面降低存储和通信开销,另一方面可以更好地采用一些压缩编码方式进行传输(如哈夫曼编码、有限状态熵编码等)。比较常用的稀疏化方法有Top-K稀疏,即按梯度的绝对值从小到大排序,每轮只上传前k个参数。通信压缩方案一般是精度有损的,如何选取合适的k是一个有挑战性的问题。 + +**联邦生态** + +在前面的章节中,我们介绍了面向隐私保护的联邦学习领域的一些技术与实践,然而随着探索地更加深入,联邦学习领域也变得更具包容性,它涵盖了机器学习、模型压缩部署、信息安全、加密算法、博弈论等等。随着越来越多的公司、高校和机构参与进来,现在的联邦学习已经不仅仅是一种技术解决方案,还是一个隐私保护的生态系统,比如不同的参与方希望以可持续的方式加入联邦学习流程,如何设计激励机制以确保利润可以相对公平地被各参与方共享,同时对于恶意的实施攻击或者破坏行为的参与方进行有效遏制。 + 
+另外,随着用户数据隐私保护和合理使用的法律法规越来越多的被推出,制定联邦学习的技术标准显得愈加重要,这一标准能够在法律监管部门和技术开发人员之间建立一座桥梁,让企业知道采用何种技术,能够在合乎法规的同时更好地进行信息的共享。 + +2020年底正式出版推行了由IEEE 标准委员会通过的联邦学习国际标准(IEEE P3652.1),该标准旨在提供一个搭建联邦学习的体系架构和应用的指导方针,主要内容包括:联邦学习的描述和定义、场景需求分类和安全测评、联邦学习个性指标的评估如何量化、联合管控的需求。这也是国际上首个针对人工智能协同技术框架订立的标准,标志着联邦学习开启大规模工业化应用的新篇章。 diff --git a/chapter_federated_learning/overview.md b/zh_chapters/chapter_federated_learning/overview.md similarity index 100% rename from chapter_federated_learning/overview.md rename to zh_chapters/chapter_federated_learning/overview.md diff --git a/chapter_federated_learning/privacy_encryption_algorithm.md b/zh_chapters/chapter_federated_learning/privacy_encryption_algorithm.md similarity index 97% rename from chapter_federated_learning/privacy_encryption_algorithm.md rename to zh_chapters/chapter_federated_learning/privacy_encryption_algorithm.md index 6c9daf0..641eb07 100644 --- a/chapter_federated_learning/privacy_encryption_algorithm.md +++ b/zh_chapters/chapter_federated_learning/privacy_encryption_algorithm.md @@ -1,140 +1,140 @@ -## 隐私加密算法 - -联邦学习过程中,用户数据仅用于本地设备训练,不需要上传至中央FL-Server。这样可以避免用户个人数据的直接泄露。然而联邦学习框架中,模型的权重以明文形式上云仍然存在间接泄露用户隐私的风险。敌手获取到用户上传的明文权重后,可以通过重构、模型逆向等攻击恢复用户的个人训练数据,导致用户隐私泄露。 - -MindSpore Federated框架,提供了基于本地差分隐私(LDP)、基于多方安全计算(MPC)的安全聚合算法和华为自研的基于符号的维度选择差分隐私算法(SignDS),在本地模型的权重上云前对其进行加噪或加扰。在保证模型可用性的前提下,解决联邦学习中的隐私泄露问题。 - -### 基于LDP的安全聚合 - -差分隐私(differential privacy)是一种保护用户数据隐私的机制。差分隐私定义为: -$$ -Pr[\mathcal{K}(D)\in S] \le e^{\epsilon} Pr[\mathcal{K}(D’) \in S]+\delta -$$ - -对于两个差别只有一条记录的数据集$D$和$D’$,通过随机算法$\mathcal{K}$,输出结果为集合$S$子集的概率满足上面公式。$\epsilon$为差分隐私预算,$\delta$为扰动,$\epsilon$和$\delta$越小,说明$\mathcal{K}$在$D$和$D’$上输出的数据分布越接近。 - -在联邦学习中,假设FL-Client本地训练之后的模型权重矩阵是$W$,由于模型在训练过程中会“记住”训练集的特征,所以敌手可以借助$W$还原出用户的训练数据集。 - -MindSpore Federated提供基于本地差分隐私的安全聚合算法,防止本地模型的权重上云时泄露隐私数据。 - -FL-Client会生成一个与本地模型权重矩阵$W$相同维度的差分噪声矩阵$G$,然后将二者相加,得到一个满足差分隐私定义的权重矩阵$W_p$: - -$$ -W_p=W+G -$$ - 
-FL-Client将加噪后的模型权重矩阵$W_p$上传至云侧FL-Server进行联邦聚合。噪声矩阵$G$相当于给原模型加上了一层掩码,在降低模型泄露敏感数据风险的同时,也会影响模型训练的收敛性。如何在模型隐私性和可用性之间取得更好的平衡,仍然是一个值得研究的问题。实验表明,当参与方的数量$n$足够大时(一般指1000以上),大部分噪声能够相互抵消,本地差分隐私机制对聚合模型的精度和收敛性没有明显影响。 - -### 基于MPC的安全聚合 - -尽管差分隐私技术可以适当保护用户数据隐私,但是当参与FL-Client数量比较少或者高斯噪声幅值较大时,模型精度会受较大影响。为了同时满足模型保护和模型收敛这两个要求,MindSpore Federated提供了基于MPC的安全聚合方案。 - -尽管差分隐私技术可以适当保护用户数据隐私,但是当参与FL-Client数量比较少或者高斯噪声幅值较大时,模型精度会受较大影响。为了同时满足模型保护和模型收敛这两个要求,MindSpore Federated提供了基于MPC的安全聚合方案。 - -在这种训练模式下,假设参与的FL-Client集合为$U$,对于任意FL-Client $u$和$v$,它们会两两协商出一对随机扰动$p_{uv}$、$p_{vu}$,满足 - -$$ -\label{puv} - p_{uv}= - \begin{cases} - -p_{vu}, &u{\neq}v\\ - 0, &u=v - \end{cases} -$$ -于是每个FL-Client $u$ 在上传模型权重至FL-Server前,会在原模型权重$x_u$加上它与其它用户协商的扰动: - -$$ -x_{encrypt}=x_u+\sum\limits_{v{\in}U}p_{uv} -$$ - -从而FL-Server聚合结果$\overline{x}$为: -$$ -\label{eq:juhejieguo} -\overline{x}=\sum\limits_{u{\in}U}(x_{u}+\sum\limits_{v{\in}U}p_{uv})=\sum\limits_{u{\in}U}x_{u}+\sum\limits_{u{\in}U}\sum\limits_{v{\in}U}p_{uv}=\sum\limits_{u{\in}U}x_{u} -$$ -上面的过程只是介绍了聚合算法的主要思想,基于MPC的聚合方案是精度无损的,代价是通讯轮次的增加。 - -### 基于LDP-SignDS算法的安全聚合 - -对于先前的基于维度加噪的LDP算法,添加到每个维度的噪声规模基本上与模型参数的数量成正比。因此,对于高维模型,可能需要非常多的参与方来减轻噪音对模型收敛的影响。为了解决上述“维度依赖”问题,MindSpore Federated 进一步提供了基于维度选择的**Sign-based Dimension Selection (SignDS)** :cite:`jiang2022signds`算法。 - -SignDS算法的主要思想是,对于每一条真实的本地更新$\Delta\in\mathbb{R}^{d}$,FL-Client首先选择一小部分更新最明显的维度构建Top-K集合$S_k$,并以此选择一个维度集合$J$返回给FL-Server。FL-Server根据维度集合$J$构建一条对应的稀疏更新$\Delta^\prime$,并聚合所有稀疏更新用于更新全局模型。由于本地模型更新与本地数据信息相关联,直接选取真实的最大更新维度可能导致隐私泄露。对此,SignDS算法在两方面实现了隐私安全保证。一方面,算法使用了一种基数机制(Exponential Mechanism, EM :cite:`mcsherry2007mechanism`)的维度选择算法**EM-MDS**,使得所选维度集满足严格的$\epsilon$-LDP保证;另一方面,在构建稀疏更新时,对所选维度分配一个常量值而不直接使用实际更新值,以保证稀疏更新和本地数据不再直接关联。由于维度选择满足$\epsilon$-LDP,且分配给所选维度的更新值与本地数据无关,根据差分隐私的传递性 :cite:`dwork2014algorithmic`,所构建的稀疏更新同样满足$\epsilon$-LDP保证。**相较于之前基于维度加噪的LDP算法,SignDS算法可以显著提升高维模型的训练精度。同时,由于FL-Client只需上传一小部分的维度值而不是所有的模型权重,因此联邦学习的上行通信量也被大大降低。** - -下面,我们分别对Top-K集合$S_k$的构建和EM-MDS维度选择算法进行详细介绍。 - 
-首先,由于实际更新值有正负,直接给所有选定的维度分配相同的常量值可能会明显改变模型更新方向,影响模型收敛。为了解决这个问题,SignDS提出了一种基于符号的Top-K集合构建策略。具体来讲,算法引入了一个额外的符号变量$s\in\\{-1,1\\}$。该变量由FL-Client以等概率随机采样,用于确定本地更新$\Delta$的Top-K集合$S_k$。如果$s=1$,我们将$\Delta$按**真实更新值**排序,并将**最大**的$k$个更新维度记为$S_k$。我们进一步从$S_k$中随机选择一部分维度,并将$s=1$作为这些维度的更新值用以构建稀疏更新。直觉上,$S_k$中维度的更新值很可能大于零。因此,将$s=1$分配给选定的维度不会导致模型更新方向的太大差异,从而减轻了对模型精度的影响。类似的,当$s=-1$时,我们选取**最小**的$k$个更新维度记为$S_k$,并将$s=-1$分配给所选维度。 - -下面,我们进一步介绍用于维度选择的EM-MDS算法。简单来说,EM-MDS算法的目的是从输出维度域$\mathcal{J}$中以一定概率$\mathcal{P}$随机选择一个维度集合$J\in\mathcal{J}$,不同维度集合对应的概率不同。我们假设$J$总共包含$h$个维度,其中有$\nu$个维度属于Top-K集合(即$|S_k \cap J|=\nu$,且$\nu\in[0,h]$),另外$h-\nu$个维度属于非Top-K集合。直观上,$\nu$越大,$J$中包含的Top-K维度越多,模型收敛越好。因此,我们希望给$\nu$较大的维度集合分配更高的概率。基于这个想法,我们将评分函数定义为: -$$ -u(S_{k}, J) = 𝟙(|S_k\cap J| \geq \nu_{th}) = 𝟙(\nu \geq \nu_{th}) -$$ -:eqlabel:`score_function` - -$u(S_{k}, J)$用来衡量输出维度集合$J$中包含的Top-K维度的数量是否超过某一阈值$\nu_{th}$($\nu_{th}\in[1,h]$),超过则为1,否则为0。进一步,$u(S_{k}, J)$的敏感度可计算为: - -$$ -\phi = \max_{J\in\mathcal{J}} ||u(S_{k}, J) - u(S^\prime_{k}, J)||= 1 - 0 = 1 -$$ -:eqlabel:`sensitivity` - -注意 :eqref:`sensitivity`对于任意一对不同的Top-K集合$S_k$和$S_k^\prime$均成立。 - -根据以上定义,EM-MDS算法描述如下: - -*给定真实本地更新$\Delta\in\mathbb{R}^{d}$的Top-K集合$S_k$和隐私预算$\epsilon$,输出维度集合$J\in\mathcal{J}$的采样概率为:* - -$$ - \mathcal{P}=\frac{\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S_{k}, J))}{\sum_{J^\prime\in\mathcal{J}}\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S_{k}, J^\prime))} - = - \frac{\mathrm{exp}(\epsilon\cdot 𝟙(\nu \geq \nu_{th}))}{\sum_{\tau=0}^{\tau=h}\omega_{\tau}\cdot \mathrm{exp}(\epsilon\cdot 𝟙(\tau\geq\nu_{th}))} - = - \frac{\mathrm{exp}(\epsilon\cdot 𝟙(\nu \geq \nu_{th}))}{\sum_{\tau=0}^{\tau=\nu_{th}-1}\omega_{\tau} + \sum_{\tau=\nu_{th}}^{\tau=h}\omega_{\tau}\cdot \mathrm{exp}(\epsilon)} -$$ -:eqlabel:`emmds` - -*其中,$\nu$是$J$中包含的Top-K维度数量,$\nu_{th}$是评分函数的阈值,$J^\prime$是任意一输出维度集合,$\omega_{\tau}=\binom{k}{\tau}\binom{d-k}{h-\tau}$是所有包含$\tau$个Top-K维度的集合数。* - -我们进一步提供了EM-MDS算法的隐私证明: - 
-对于每个FL-Client,给定随机采样的符号值$x$,任意两个本地更新$\Delta$,$\Delta^\prime$的Top-K集合记为$S_k$和$S_k^\prime$,对于任意输出维度集合$J\in\mathcal{J}$,令$\nu=|S_k \cap J|$, $\nu^\prime=|S_k^\prime \cap J|$为$J$与两组Top-K维度集的交集数量。根据 :eqref:`emmds`,以下不等式成立: - -$$ -\frac{\mathrm{Pr}\[J|\Delta\]}{\mathrm{Pr}\[J|\Delta^\prime\]} = \frac{\mathrm{Pr}\[J|S_{k}\]}{\mathrm{Pr}\[J|S^\prime_{k}\]} = \frac{\frac{\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S_{k}, J))}{\sum_{J^\prime\in\mathcal{J}}\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S_{k}, J^\prime))}}{\frac{\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S^\prime_{k}, J))}{\sum_{J^\prime\in\mathcal{J}}\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S^\prime_{k}, J^\prime))}} - = \frac{\frac{\mathrm{exp}(\epsilon\cdot 𝟙(\nu \geq \nu_{th}))}{\sum_{\tau=0}^{\tau=h}\omega_{\tau}\cdot \mathrm{exp}(\epsilon\cdot 𝟙(\tau\geq\nu_{th}))}}{\frac{ - \mathrm{exp}(\epsilon\cdot 𝟙(\nu^\prime \geq \nu_{th}))}{\sum_{\tau=0}^{\tau=h}\omega_{\tau}\cdot \mathrm{exp}(\epsilon\cdot 𝟙(\tau\geq\nu_{th}))}} \\ - = \frac{\mathrm{exp}(\epsilon\cdot 𝟙(\nu \geq \nu_{th}))}{ - \mathrm{exp}(\epsilon\cdot 𝟙(\nu^\prime \geq \nu_{th}))} - \leq \frac{\mathrm{exp}(\epsilon\cdot 1)}{\mathrm{exp}(\epsilon\cdot 0)} = \mathrm{exp}(\epsilon) -$$ - -*证明EM-MDS算法满足$\epsilon$-LDP保证。* - -值得注意的是,计算 :eqref:`emmds`需要先确定Top-K维度数的阈值$\nu_{th}$。为此,我们首先推导在给定阈值$\nu_{th}$时,任意一组输出维度集合$J$包含的Top-K维度的概率分布和期望: - -$$ -\mathrm{Pr}(\nu=\tau|\nu_{th})= - \begin{cases} - \omega_{\tau} / \Omega \quad \quad \quad \quad \quad \mathrm{ } &if \quad \tau\in\[0,\nu_{th}\) \\ - \omega_{\tau}\cdot\mathrm{exp}(\epsilon) / \Omega \quad \quad &if \quad \tau\in\[\nu_{th},h\] - \end{cases} -$$ -:eqlabel:`discrete-prob` - -$$ - \mathbb{E}\[\nu|\nu_{th}\] = \sum_{\tau=0}^{\tau=h}\tau\cdot \mathrm{Pr}(\nu=\tau|\nu_{th}) -$$ -:eqlabel:`expectation` - -这里,$\Omega$为 :eqref:`emmds`中$\mathcal{P}$的分母部分。直觉上,$\mathbb{E}\[\nu|\nu_{th}\]$越高,随机采样的$J$集合中包含的Top-K维度的概率就越大,从而模型效用就越好。因此,我们将$\mathbb{E}\[\nu|\nu_{th}\]$最高时的阈值确定为目标阈值$\nu_{th}^{\*}$,即: - -$$ -\nu_{th}^{\*} 
= \underset{\nu_{th}\in\[1, h\]}{\operatorname{argmax}} \mathbb{E}\[\nu|\nu_{th}\] -$$ -:eqlabel:`threshold` - -最后,我们在 :numref:`signds_workflow`中描述了SignDS算法的详细流程。给定本地模型更新$\Delta$,我们首先随机采样一个符号值$s$并构建Top-K集合$S_k$。接下来,我们根据 :eqref:`threshold`确定阈值$\nu_{th}^{\*}$并遵循 :eqref:`emmds`定义的概率选择输出集合$J$。考虑到输出域$\mathcal{J}$包含$\binom{d}{k}$个可能的维度集合,以一定概率直接从$\mathcal{J}$中随机采样一个组合需要很大的计算成本和空间成本。因此,我们采用了逆采样算法以提升计算效率。具体来说,我们首先从标准均匀分布中采样一个随机值$\beta\sim U(0,1)$,并根据 :eqref:`discrete-prob`中$p(\nu=\tau|\nu_{th})$的累计概率分布$CDF_{\tau}$确定输出维度集合中包含的Top-K维度数$\nu$。最后,我们从Top-K集合$S_k$中随机选取$\nu$个维度,从非Top-K集合中随机采样$h-\nu$个维度,以构建最终的输出维度集合$J$。 - -![SignDS工作流程](../img/ch10/ch10-federated-learning-signds.PNG) -:width:`800px` -:label:`signds_workflow` - +## 隐私加密算法 + +联邦学习过程中,用户数据仅用于本地设备训练,不需要上传至中央FL-Server。这样可以避免用户个人数据的直接泄露。然而联邦学习框架中,模型的权重以明文形式上云仍然存在间接泄露用户隐私的风险。敌手获取到用户上传的明文权重后,可以通过重构、模型逆向等攻击恢复用户的个人训练数据,导致用户隐私泄露。 + +MindSpore Federated框架,提供了基于本地差分隐私(LDP)、基于多方安全计算(MPC)的安全聚合算法和华为自研的基于符号的维度选择差分隐私算法(SignDS),在本地模型的权重上云前对其进行加噪或加扰。在保证模型可用性的前提下,解决联邦学习中的隐私泄露问题。 + +### 基于LDP的安全聚合 + +差分隐私(differential privacy)是一种保护用户数据隐私的机制。差分隐私定义为: +$$ +Pr[\mathcal{K}(D)\in S] \le e^{\epsilon} Pr[\mathcal{K}(D’) \in S]+\delta +$$ + +对于两个差别只有一条记录的数据集$D$和$D’$,通过随机算法$\mathcal{K}$,输出结果为集合$S$子集的概率满足上面公式。$\epsilon$为差分隐私预算,$\delta$为扰动,$\epsilon$和$\delta$越小,说明$\mathcal{K}$在$D$和$D’$上输出的数据分布越接近。 + +在联邦学习中,假设FL-Client本地训练之后的模型权重矩阵是$W$,由于模型在训练过程中会“记住”训练集的特征,所以敌手可以借助$W$还原出用户的训练数据集。 + +MindSpore Federated提供基于本地差分隐私的安全聚合算法,防止本地模型的权重上云时泄露隐私数据。 + +FL-Client会生成一个与本地模型权重矩阵$W$相同维度的差分噪声矩阵$G$,然后将二者相加,得到一个满足差分隐私定义的权重矩阵$W_p$: + +$$ +W_p=W+G +$$ + +FL-Client将加噪后的模型权重矩阵$W_p$上传至云侧FL-Server进行联邦聚合。噪声矩阵$G$相当于给原模型加上了一层掩码,在降低模型泄露敏感数据风险的同时,也会影响模型训练的收敛性。如何在模型隐私性和可用性之间取得更好的平衡,仍然是一个值得研究的问题。实验表明,当参与方的数量$n$足够大时(一般指1000以上),大部分噪声能够相互抵消,本地差分隐私机制对聚合模型的精度和收敛性没有明显影响。 + +### 基于MPC的安全聚合 + +尽管差分隐私技术可以适当保护用户数据隐私,但是当参与FL-Client数量比较少或者高斯噪声幅值较大时,模型精度会受较大影响。为了同时满足模型保护和模型收敛这两个要求,MindSpore Federated提供了基于MPC的安全聚合方案。 + 
+尽管差分隐私技术可以适当保护用户数据隐私,但是当参与FL-Client数量比较少或者高斯噪声幅值较大时,模型精度会受较大影响。为了同时满足模型保护和模型收敛这两个要求,MindSpore Federated提供了基于MPC的安全聚合方案。 + +在这种训练模式下,假设参与的FL-Client集合为$U$,对于任意FL-Client $u$和$v$,它们会两两协商出一对随机扰动$p_{uv}$、$p_{vu}$,满足 + +$$ +\label{puv} + p_{uv}= + \begin{cases} + -p_{vu}, &u{\neq}v\\ + 0, &u=v + \end{cases} +$$ +于是每个FL-Client $u$ 在上传模型权重至FL-Server前,会在原模型权重$x_u$加上它与其它用户协商的扰动: + +$$ +x_{encrypt}=x_u+\sum\limits_{v{\in}U}p_{uv} +$$ + +从而FL-Server聚合结果$\overline{x}$为: +$$ +\label{eq:juhejieguo} +\overline{x}=\sum\limits_{u{\in}U}(x_{u}+\sum\limits_{v{\in}U}p_{uv})=\sum\limits_{u{\in}U}x_{u}+\sum\limits_{u{\in}U}\sum\limits_{v{\in}U}p_{uv}=\sum\limits_{u{\in}U}x_{u} +$$ +上面的过程只是介绍了聚合算法的主要思想,基于MPC的聚合方案是精度无损的,代价是通讯轮次的增加。 + +### 基于LDP-SignDS算法的安全聚合 + +对于先前的基于维度加噪的LDP算法,添加到每个维度的噪声规模基本上与模型参数的数量成正比。因此,对于高维模型,可能需要非常多的参与方来减轻噪音对模型收敛的影响。为了解决上述“维度依赖”问题,MindSpore Federated 进一步提供了基于维度选择的**Sign-based Dimension Selection (SignDS)** :cite:`jiang2022signds`算法。 + +SignDS算法的主要思想是,对于每一条真实的本地更新$\Delta\in\mathbb{R}^{d}$,FL-Client首先选择一小部分更新最明显的维度构建Top-K集合$S_k$,并以此选择一个维度集合$J$返回给FL-Server。FL-Server根据维度集合$J$构建一条对应的稀疏更新$\Delta^\prime$,并聚合所有稀疏更新用于更新全局模型。由于本地模型更新与本地数据信息相关联,直接选取真实的最大更新维度可能导致隐私泄露。对此,SignDS算法在两方面实现了隐私安全保证。一方面,算法使用了一种基数机制(Exponential Mechanism, EM :cite:`mcsherry2007mechanism`)的维度选择算法**EM-MDS**,使得所选维度集满足严格的$\epsilon$-LDP保证;另一方面,在构建稀疏更新时,对所选维度分配一个常量值而不直接使用实际更新值,以保证稀疏更新和本地数据不再直接关联。由于维度选择满足$\epsilon$-LDP,且分配给所选维度的更新值与本地数据无关,根据差分隐私的传递性 :cite:`dwork2014algorithmic`,所构建的稀疏更新同样满足$\epsilon$-LDP保证。**相较于之前基于维度加噪的LDP算法,SignDS算法可以显著提升高维模型的训练精度。同时,由于FL-Client只需上传一小部分的维度值而不是所有的模型权重,因此联邦学习的上行通信量也被大大降低。** + +下面,我们分别对Top-K集合$S_k$的构建和EM-MDS维度选择算法进行详细介绍。 + 
+首先,由于实际更新值有正负,直接给所有选定的维度分配相同的常量值可能会明显改变模型更新方向,影响模型收敛。为了解决这个问题,SignDS提出了一种基于符号的Top-K集合构建策略。具体来讲,算法引入了一个额外的符号变量$s\in\\{-1,1\\}$。该变量由FL-Client以等概率随机采样,用于确定本地更新$\Delta$的Top-K集合$S_k$。如果$s=1$,我们将$\Delta$按**真实更新值**排序,并将**最大**的$k$个更新维度记为$S_k$。我们进一步从$S_k$中随机选择一部分维度,并将$s=1$作为这些维度的更新值用以构建稀疏更新。直觉上,$S_k$中维度的更新值很可能大于零。因此,将$s=1$分配给选定的维度不会导致模型更新方向的太大差异,从而减轻了对模型精度的影响。类似的,当$s=-1$时,我们选取**最小**的$k$个更新维度记为$S_k$,并将$s=-1$分配给所选维度。 + +下面,我们进一步介绍用于维度选择的EM-MDS算法。简单来说,EM-MDS算法的目的是从输出维度域$\mathcal{J}$中以一定概率$\mathcal{P}$随机选择一个维度集合$J\in\mathcal{J}$,不同维度集合对应的概率不同。我们假设$J$总共包含$h$个维度,其中有$\nu$个维度属于Top-K集合(即$|S_k \cap J|=\nu$,且$\nu\in[0,h]$),另外$h-\nu$个维度属于非Top-K集合。直观上,$\nu$越大,$J$中包含的Top-K维度越多,模型收敛越好。因此,我们希望给$\nu$较大的维度集合分配更高的概率。基于这个想法,我们将评分函数定义为: +$$ +u(S_{k}, J) = 𝟙(|S_k\cap J| \geq \nu_{th}) = 𝟙(\nu \geq \nu_{th}) +$$ +:eqlabel:`score_function` + +$u(S_{k}, J)$用来衡量输出维度集合$J$中包含的Top-K维度的数量是否超过某一阈值$\nu_{th}$($\nu_{th}\in[1,h]$),超过则为1,否则为0。进一步,$u(S_{k}, J)$的敏感度可计算为: + +$$ +\phi = \max_{J\in\mathcal{J}} ||u(S_{k}, J) - u(S^\prime_{k}, J)||= 1 - 0 = 1 +$$ +:eqlabel:`sensitivity` + +注意 :eqref:`sensitivity`对于任意一对不同的Top-K集合$S_k$和$S_k^\prime$均成立。 + +根据以上定义,EM-MDS算法描述如下: + +*给定真实本地更新$\Delta\in\mathbb{R}^{d}$的Top-K集合$S_k$和隐私预算$\epsilon$,输出维度集合$J\in\mathcal{J}$的采样概率为:* + +$$ + \mathcal{P}=\frac{\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S_{k}, J))}{\sum_{J^\prime\in\mathcal{J}}\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S_{k}, J^\prime))} + = + \frac{\mathrm{exp}(\epsilon\cdot 𝟙(\nu \geq \nu_{th}))}{\sum_{\tau=0}^{\tau=h}\omega_{\tau}\cdot \mathrm{exp}(\epsilon\cdot 𝟙(\tau\geq\nu_{th}))} + = + \frac{\mathrm{exp}(\epsilon\cdot 𝟙(\nu \geq \nu_{th}))}{\sum_{\tau=0}^{\tau=\nu_{th}-1}\omega_{\tau} + \sum_{\tau=\nu_{th}}^{\tau=h}\omega_{\tau}\cdot \mathrm{exp}(\epsilon)} +$$ +:eqlabel:`emmds` + +*其中,$\nu$是$J$中包含的Top-K维度数量,$\nu_{th}$是评分函数的阈值,$J^\prime$是任意一输出维度集合,$\omega_{\tau}=\binom{k}{\tau}\binom{d-k}{h-\tau}$是所有包含$\tau$个Top-K维度的集合数。* + +我们进一步提供了EM-MDS算法的隐私证明: + 
+对于每个FL-Client,给定随机采样的符号值$s$,任意两个本地更新$\Delta$,$\Delta^\prime$的Top-K集合记为$S_k$和$S_k^\prime$,对于任意输出维度集合$J\in\mathcal{J}$,令$\nu=|S_k \cap J|$, $\nu^\prime=|S_k^\prime \cap J|$为$J$与两组Top-K维度集的交集数量。根据 :eqref:`emmds`,以下不等式成立:
+
+$$
+\frac{\mathrm{Pr}\[J|\Delta\]}{\mathrm{Pr}\[J|\Delta^\prime\]} = \frac{\mathrm{Pr}\[J|S_{k}\]}{\mathrm{Pr}\[J|S^\prime_{k}\]} = \frac{\frac{\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S_{k}, J))}{\sum_{J^\prime\in\mathcal{J}}\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S_{k}, J^\prime))}}{\frac{\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S^\prime_{k}, J))}{\sum_{J^\prime\in\mathcal{J}}\mathrm{exp}(\frac{\epsilon}{\phi}\cdot u(S^\prime_{k}, J^\prime))}}
+ = \frac{\frac{\mathrm{exp}(\epsilon\cdot 𝟙(\nu \geq \nu_{th}))}{\sum_{\tau=0}^{\tau=h}\omega_{\tau}\cdot \mathrm{exp}(\epsilon\cdot 𝟙(\tau\geq\nu_{th}))}}{\frac{
+ \mathrm{exp}(\epsilon\cdot 𝟙(\nu^\prime \geq \nu_{th}))}{\sum_{\tau=0}^{\tau=h}\omega_{\tau}\cdot \mathrm{exp}(\epsilon\cdot 𝟙(\tau\geq\nu_{th}))}} \\
+ = \frac{\mathrm{exp}(\epsilon\cdot 𝟙(\nu \geq \nu_{th}))}{
+ \mathrm{exp}(\epsilon\cdot 𝟙(\nu^\prime \geq \nu_{th}))}
+ \leq \frac{\mathrm{exp}(\epsilon\cdot 1)}{\mathrm{exp}(\epsilon\cdot 0)} = \mathrm{exp}(\epsilon)
+$$
+
+*证明EM-MDS算法满足$\epsilon$-LDP保证。*
+
+值得注意的是,计算 :eqref:`emmds`需要先确定Top-K维度数的阈值$\nu_{th}$。为此,我们首先推导在给定阈值$\nu_{th}$时,任意一组输出维度集合$J$包含的Top-K维度的概率分布和期望:
+
+$$
+\mathrm{Pr}(\nu=\tau|\nu_{th})=
+  \begin{cases}
+   \omega_{\tau} / \Omega \quad \quad \quad \quad \quad \mathrm{ } &if \quad \tau\in\[0,\nu_{th}\) \\
+   \omega_{\tau}\cdot\mathrm{exp}(\epsilon) / \Omega \quad \quad &if \quad \tau\in\[\nu_{th},h\]
+  \end{cases}
+$$
+:eqlabel:`discrete-prob`
+
+$$
+ \mathbb{E}\[\nu|\nu_{th}\] = \sum_{\tau=0}^{\tau=h}\tau\cdot \mathrm{Pr}(\nu=\tau|\nu_{th})
+$$
+:eqlabel:`expectation`
+
+这里,$\Omega$为 :eqref:`emmds`中$\mathcal{P}$的分母部分。直觉上,$\mathbb{E}\[\nu|\nu_{th}\]$越高,随机采样的$J$集合中包含的Top-K维度的概率就越大,从而模型效用就越好。因此,我们将$\mathbb{E}\[\nu|\nu_{th}\]$最高时的阈值确定为目标阈值$\nu_{th}^{\*}$,即:
+
+$$
+\nu_{th}^{\*} 
= \underset{\nu_{th}\in\[1, h\]}{\operatorname{argmax}} \mathbb{E}\[\nu|\nu_{th}\] +$$ +:eqlabel:`threshold` + +最后,我们在 :numref:`signds_workflow`中描述了SignDS算法的详细流程。给定本地模型更新$\Delta$,我们首先随机采样一个符号值$s$并构建Top-K集合$S_k$。接下来,我们根据 :eqref:`threshold`确定阈值$\nu_{th}^{\*}$并遵循 :eqref:`emmds`定义的概率选择输出集合$J$。考虑到输出域$\mathcal{J}$包含$\binom{d}{k}$个可能的维度集合,以一定概率直接从$\mathcal{J}$中随机采样一个组合需要很大的计算成本和空间成本。因此,我们采用了逆采样算法以提升计算效率。具体来说,我们首先从标准均匀分布中采样一个随机值$\beta\sim U(0,1)$,并根据 :eqref:`discrete-prob`中$p(\nu=\tau|\nu_{th})$的累计概率分布$CDF_{\tau}$确定输出维度集合中包含的Top-K维度数$\nu$。最后,我们从Top-K集合$S_k$中随机选取$\nu$个维度,从非Top-K集合中随机采样$h-\nu$个维度,以构建最终的输出维度集合$J$。 + +![SignDS工作流程](../img/ch10/ch10-federated-learning-signds.PNG) +:width:`800px` +:label:`signds_workflow` + diff --git a/chapter_federated_learning/summary.md b/zh_chapters/chapter_federated_learning/summary.md similarity index 99% rename from chapter_federated_learning/summary.md rename to zh_chapters/chapter_federated_learning/summary.md index db8187c..f1195c1 100644 --- a/chapter_federated_learning/summary.md +++ b/zh_chapters/chapter_federated_learning/summary.md @@ -1,3 +1,3 @@ -## 小结 - -在这一章,我们简单介绍了联邦学习的背景、系统架构、联邦平均算法、隐私加密算法以及实际部署时的挑战。联邦学习是一个新起步的人工智能算法,可以在“数据保护”与“数据孤岛”这两大约束条件下,建立有效的机器学习模型。此外,由于联邦学习场景的特殊性(端侧数据不上传、安全隐私要求高和数据非独立同分布等特点),使得系统和算法的开发难度更高:如何平衡计算和通讯的开销,如何保证模型不会泄露隐私,算法如何在非独立同分布场景下收敛。这些难点都需要开发人员对实际的联邦学习场景有更深刻的认识。 +## 小结 + +在这一章,我们简单介绍了联邦学习的背景、系统架构、联邦平均算法、隐私加密算法以及实际部署时的挑战。联邦学习是一个新起步的人工智能算法,可以在“数据保护”与“数据孤岛”这两大约束条件下,建立有效的机器学习模型。此外,由于联邦学习场景的特殊性(端侧数据不上传、安全隐私要求高和数据非独立同分布等特点),使得系统和算法的开发难度更高:如何平衡计算和通讯的开销,如何保证模型不会泄露隐私,算法如何在非独立同分布场景下收敛。这些难点都需要开发人员对实际的联邦学习场景有更深刻的认识。 diff --git a/chapter_federated_learning/vertical_fl.md b/zh_chapters/chapter_federated_learning/vertical_fl.md similarity index 99% rename from chapter_federated_learning/vertical_fl.md rename to zh_chapters/chapter_federated_learning/vertical_fl.md index e32c065..bb18b03 100644 --- a/chapter_federated_learning/vertical_fl.md +++ b/zh_chapters/chapter_federated_learning/vertical_fl.md @@ -1,61 +1,61 @@ 
-## 纵向联邦学习 - -现在我们介绍另一种联邦学习算法:纵向联邦学习(Vertical Federated Learning)。纵向联邦学习的参与方拥有相同样本空间、不同特征空间的数据,通过共有样本数据进行安全联合建模,在金融、广告等领域拥有广泛的应用场景。和横向联邦学习相比,纵向联邦学习的参与方之间需要协同完成数据求交集、模型联合训练和模型联合推理。并且,参与方越多,纵向联邦学习系统的复杂度就越高。 - -下面以企业A和企业B两方为例来介绍纵向联邦学习的基本架构和流程。假设企业A有特征数据和标签数据,可以独立建模;企业B有特征数据,缺乏标签数据,因此无法独立建模。由于隐私法规和行业规范等原因,两个企业之间的数据无法直接互通。企业A和企业B可采用纵向联邦学习解决方案进行合作,数据不出本地,使用双方共同样本数据进行联合建模和训练。最终双方都能获得一个更强大的模型。 - -### 纵向联邦架构 - -![纵向联邦两方架构](../img/ch10/ch10-federated-learning-vfl-arch.svg) -:width:`800px` -:label:`federated-learning-vfl-arch` - -纵向联邦学习系统中的模型训练一般分为如下阶段: -- 样本对齐:首先对齐企业A和企业B中具有相同ID(Identification)的样本数据。在数据对齐阶段,系统会采用加密算法对数据进行保护,确保任何一方的用户数据不会暴露。 -- 联合训练:在确定企业A和企业B共有用户数据后,可以使用这些共有的数据来协同训练一个业务模型。模型训练过程中,模型参数信息以加密方式进行传递。已训练好的联邦学习模型可以部署在联邦学习系统的各参与方。 - -### 样本对齐 - -隐私集合求交(Private Set Intersection,PSI)技术是纵向联邦学习中数据样本对齐的常用解决方案。业界PSI实现方案有多种:基于电路、基于公钥加密、基于不经意传输协议和基于全同态加密等。不同PSI方案各有优劣势。例如,基于公钥加密方案不需要辅助服务器运行,但公钥加密的计算开销大;而基于不经意传输方案计算性能高,但通信开销较大。因此在具体应用时,要根据实际场景来选择功能、性能和安全之间的最佳平衡方案。 - -基于RSA盲签名是一种基于公钥加密的经典PSI方法,也是当前业界纵向联邦学习系统中广泛应用的技术之一。下面以企业A和企业B为例描述RSA盲签名算法的基本流程。 - -![纵向联邦样本对齐](../img/ch10/ch10-federated-learning-vfl-data.png) -:width:`600px` -:label:`federated-learning-vfl-data` - -企业A作为服务端,拥有一个包含了标签数据+样本ID的集合。企业B则作为客户端,拥有样本ID集合。首先,企业A利用RSA算法生成私钥和公钥。其中,私钥保留在服务端,公钥则发送给企业B。 - -服务端利用RSA算法计算出参与样本对齐的ID的签名: -$$t_j=H^{'}(K_{a:j})$$ -其中,$K_{a:j}=(H(a_j))^d \ mod \ n$,是采用私钥$d$加密的对$H(a_j)$的RSA加密的结果。$H()$和$H^{'}()$是哈希函数。 - -_同样,在客户端侧对样本ID进行公钥加密,并乘以一个随机数$R_{b,i}$用于加盲扰动: -$$y_i=H(b_i)\cdot(R_{b,i})^e \ mod \ n$$ -客户端侧将上述计算出来的$\{y_1,...,y_v\}$值传输给服务端侧。服务端侧收到$y_i$值后,使用私钥$d$进行签名并计算: -$$y_i^{'}=y_i^d \ mod \ n$$ -然后将计算出的$\{y_1^{'},...,y_v^{'}\}$和$\{t_1,...,t_w\}$发送给客户端侧。 -而客户端侧收到$y_i^{'}$和$t_j$后,首先完成去盲操作: -$$K_{b:i}={y_i}^{'}/R_{b,i}$$ -并将自己的ID签名与服务端发过来的ID签名进行样本对齐,得到加密和哈希组合状态下的ID交集$I$, -$${t_i}^{'}=H^{'}(K_{b:i}) \\I=\{t_1,...,t_w\}\cap \{{t_1}^{'},...,{t_v}^{'}\}$$ - -最后,将对齐后的样本ID交集$I$发送给服务端,服务端利用自身的映射表单独求取明文结果。这样企业A和企业B在加密状态下完成了求取相交的用户集合,并且在整个过程中双方非重叠样本ID都不会对外暴露。 - -### 联合训练 - 
-在样本ID对齐后,开发人员就可以使用这些公共的数据来建立机器学习模型。 - -目前,线性回归、决策树和神经网络等模型已经被广泛应用到纵向联邦学习系统中。在纵向联邦学习的模型训练过程中,一般会引入第三方协作者C来实现中心服务器功能,并且假设这个第三方协作者C是可信的,不会与其他参与方合谋。中心服务器在训练过程中作为中立方,产生和分发密钥,并对加密数据进行解密和计算。但中心服务器角色是非必须的,例如在两方联邦学习的场景下,不需要第三方协作者C来协调双方的训练任务,可以由具有标签数据的企业A来充当中心服务器的角色。不失一般性,下面继续以包含第三方协作者C的方案来描述纵向联邦学习模型联合训练过程。 - -![纵向联邦联合建模](../img/ch10/ch10-federated-learning-vfl-train.svg) -:width:`800px` -:label:`federated-learning-vfl-train` - -- 第一步:由第三方协作者C创建密钥对,将公钥发送给企业A和B。 -- 第二步:在企业A和B侧分别计算梯度和损失计算需要的中间结果,并进行加密和交换。 -- 第三步:企业A和B分别计算加密梯度和添加掩码。同时企业A还将计算加密损失值。计算完成后,企业A和B向第三方协作者C发送加密后的值。 -- 第四步:第三方协作者C对梯度和损失值解密,然后将结果发送回企业A和B。 -- 第五步:企业A和B将收到的值首先去除梯度上的掩码,然后更新本地模型参数。 - -在整个训练过程中,企业A和B之间的任何敏感数据都是经过加密算法加密之后再发出的各自的信任域。同态加密(Homomorphic Encryption,HE)是业界联邦学习框架常用的算法之一。同态加密是指加密过后的两份数据进行某些运算之后直接解密,可以得到真实数据经过相同运算的结果。当这种运算是加法时,就称为加法同态加密。将加密函数记为$[[\cdot]]$。 +## 纵向联邦学习 + +现在我们介绍另一种联邦学习算法:纵向联邦学习(Vertical Federated Learning)。纵向联邦学习的参与方拥有相同样本空间、不同特征空间的数据,通过共有样本数据进行安全联合建模,在金融、广告等领域拥有广泛的应用场景。和横向联邦学习相比,纵向联邦学习的参与方之间需要协同完成数据求交集、模型联合训练和模型联合推理。并且,参与方越多,纵向联邦学习系统的复杂度就越高。 + +下面以企业A和企业B两方为例来介绍纵向联邦学习的基本架构和流程。假设企业A有特征数据和标签数据,可以独立建模;企业B有特征数据,缺乏标签数据,因此无法独立建模。由于隐私法规和行业规范等原因,两个企业之间的数据无法直接互通。企业A和企业B可采用纵向联邦学习解决方案进行合作,数据不出本地,使用双方共同样本数据进行联合建模和训练。最终双方都能获得一个更强大的模型。 + +### 纵向联邦架构 + +![纵向联邦两方架构](../img/ch10/ch10-federated-learning-vfl-arch.svg) +:width:`800px` +:label:`federated-learning-vfl-arch` + +纵向联邦学习系统中的模型训练一般分为如下阶段: +- 样本对齐:首先对齐企业A和企业B中具有相同ID(Identification)的样本数据。在数据对齐阶段,系统会采用加密算法对数据进行保护,确保任何一方的用户数据不会暴露。 +- 联合训练:在确定企业A和企业B共有用户数据后,可以使用这些共有的数据来协同训练一个业务模型。模型训练过程中,模型参数信息以加密方式进行传递。已训练好的联邦学习模型可以部署在联邦学习系统的各参与方。 + +### 样本对齐 + +隐私集合求交(Private Set Intersection,PSI)技术是纵向联邦学习中数据样本对齐的常用解决方案。业界PSI实现方案有多种:基于电路、基于公钥加密、基于不经意传输协议和基于全同态加密等。不同PSI方案各有优劣势。例如,基于公钥加密方案不需要辅助服务器运行,但公钥加密的计算开销大;而基于不经意传输方案计算性能高,但通信开销较大。因此在具体应用时,要根据实际场景来选择功能、性能和安全之间的最佳平衡方案。 + +基于RSA盲签名是一种基于公钥加密的经典PSI方法,也是当前业界纵向联邦学习系统中广泛应用的技术之一。下面以企业A和企业B为例描述RSA盲签名算法的基本流程。 + +![纵向联邦样本对齐](../img/ch10/ch10-federated-learning-vfl-data.png) +:width:`600px` 
+:label:`federated-learning-vfl-data`
+
+企业A作为服务端,拥有一个包含了标签数据+样本ID的集合。企业B则作为客户端,拥有样本ID集合。首先,企业A利用RSA算法生成私钥和公钥。其中,私钥保留在服务端,公钥则发送给企业B。
+
+服务端利用RSA算法计算出参与样本对齐的ID的签名:
+$$t_j=H^{'}(K_{a:j})$$
+其中,$K_{a:j}=(H(a_j))^d \ mod \ n$,是采用私钥$d$对$H(a_j)$进行RSA加密的结果。$H()$和$H^{'}()$是哈希函数。
+
+同样,在客户端侧对样本ID进行公钥加密,并乘以一个随机数$R_{b,i}$用于加盲扰动:
+$$y_i=H(b_i)\cdot(R_{b,i})^e \ mod \ n$$
+客户端侧将上述计算出来的$\{y_1,...,y_v\}$值传输给服务端侧。服务端侧收到$y_i$值后,使用私钥$d$进行签名并计算:
+$$y_i^{'}=y_i^d \ mod \ n$$
+然后将计算出的$\{y_1^{'},...,y_v^{'}\}$和$\{t_1,...,t_w\}$发送给客户端侧。
+而客户端侧收到$y_i^{'}$和$t_j$后,首先完成去盲操作:
+$$K_{b:i}={y_i}^{'}/R_{b,i}$$
+并将自己的ID签名与服务端发过来的ID签名进行样本对齐,得到加密和哈希组合状态下的ID交集$I$,
+$${t_i}^{'}=H^{'}(K_{b:i}) \\I=\{t_1,...,t_w\}\cap \{{t_1}^{'},...,{t_v}^{'}\}$$
+
+最后,将对齐后的样本ID交集$I$发送给服务端,服务端利用自身的映射表单独求取明文结果。这样企业A和企业B在加密状态下完成了求取相交的用户集合,并且在整个过程中双方非重叠样本ID都不会对外暴露。
+
+### 联合训练
+
+在样本ID对齐后,开发人员就可以使用这些公共的数据来建立机器学习模型。
+
+目前,线性回归、决策树和神经网络等模型已经被广泛应用到纵向联邦学习系统中。在纵向联邦学习的模型训练过程中,一般会引入第三方协作者C来实现中心服务器功能,并且假设这个第三方协作者C是可信的,不会与其他参与方合谋。中心服务器在训练过程中作为中立方,产生和分发密钥,并对加密数据进行解密和计算。但中心服务器角色是非必须的,例如在两方联邦学习的场景下,不需要第三方协作者C来协调双方的训练任务,可以由具有标签数据的企业A来充当中心服务器的角色。不失一般性,下面继续以包含第三方协作者C的方案来描述纵向联邦学习模型联合训练过程。
+
+![纵向联邦联合建模](../img/ch10/ch10-federated-learning-vfl-train.svg)
+:width:`800px`
+:label:`federated-learning-vfl-train`
+
+- 第一步:由第三方协作者C创建密钥对,将公钥发送给企业A和B。
+- 第二步:在企业A和B侧分别计算梯度和损失计算需要的中间结果,并进行加密和交换。
+- 第三步:企业A和B分别计算加密梯度和添加掩码。同时企业A还将计算加密损失值。计算完成后,企业A和B向第三方协作者C发送加密后的值。
+- 第四步:第三方协作者C对梯度和损失值解密,然后将结果发送回企业A和B。
+- 第五步:企业A和B将收到的值首先去除梯度上的掩码,然后更新本地模型参数。
+
+在整个训练过程中,企业A和B之间的任何敏感数据都是经过加密算法加密之后再发出的各自的信任域。同态加密(Homomorphic Encryption,HE)是业界联邦学习框架常用的算法之一。同态加密是指加密过后的两份数据进行某些运算之后直接解密,可以得到真实数据经过相同运算的结果。当这种运算是加法时,就称为加法同态加密。将加密函数记为$[[\cdot]]$。
diff --git a/chapter_frontend_and_ir/ad.md b/zh_chapters/chapter_frontend_and_ir/ad.md
similarity index 100%
rename from chapter_frontend_and_ir/ad.md
rename to zh_chapters/chapter_frontend_and_ir/ad.md
diff --git a/chapter_frontend_and_ir/ai_compiler_design_principle.md
b/zh_chapters/chapter_frontend_and_ir/ai_compiler_design_principle.md similarity index 100% rename from chapter_frontend_and_ir/ai_compiler_design_principle.md rename to zh_chapters/chapter_frontend_and_ir/ai_compiler_design_principle.md diff --git a/chapter_frontend_and_ir/common_frontend_optimization_pass.md b/zh_chapters/chapter_frontend_and_ir/common_frontend_optimization_pass.md similarity index 100% rename from chapter_frontend_and_ir/common_frontend_optimization_pass.md rename to zh_chapters/chapter_frontend_and_ir/common_frontend_optimization_pass.md diff --git a/chapter_frontend_and_ir/index.md b/zh_chapters/chapter_frontend_and_ir/index.md similarity index 100% rename from chapter_frontend_and_ir/index.md rename to zh_chapters/chapter_frontend_and_ir/index.md diff --git a/chapter_frontend_and_ir/intermediate_representation.md b/zh_chapters/chapter_frontend_and_ir/intermediate_representation.md similarity index 100% rename from chapter_frontend_and_ir/intermediate_representation.md rename to zh_chapters/chapter_frontend_and_ir/intermediate_representation.md diff --git a/chapter_frontend_and_ir/overview_of_frontend.md b/zh_chapters/chapter_frontend_and_ir/overview_of_frontend.md similarity index 100% rename from chapter_frontend_and_ir/overview_of_frontend.md rename to zh_chapters/chapter_frontend_and_ir/overview_of_frontend.md diff --git a/chapter_frontend_and_ir/summary.md b/zh_chapters/chapter_frontend_and_ir/summary.md similarity index 100% rename from chapter_frontend_and_ir/summary.md rename to zh_chapters/chapter_frontend_and_ir/summary.md diff --git a/chapter_frontend_and_ir/type_system_and_static_analysis.md b/zh_chapters/chapter_frontend_and_ir/type_system_and_static_analysis.md similarity index 100% rename from chapter_frontend_and_ir/type_system_and_static_analysis.md rename to zh_chapters/chapter_frontend_and_ir/type_system_and_static_analysis.md diff --git a/chapter_introduction/applications.md 
b/zh_chapters/chapter_introduction/applications.md similarity index 100% rename from chapter_introduction/applications.md rename to zh_chapters/chapter_introduction/applications.md diff --git a/chapter_introduction/architecture.md b/zh_chapters/chapter_introduction/architecture.md similarity index 100% rename from chapter_introduction/architecture.md rename to zh_chapters/chapter_introduction/architecture.md diff --git a/chapter_introduction/design.md b/zh_chapters/chapter_introduction/design.md similarity index 100% rename from chapter_introduction/design.md rename to zh_chapters/chapter_introduction/design.md diff --git a/chapter_introduction/ecosystem.md b/zh_chapters/chapter_introduction/ecosystem.md similarity index 100% rename from chapter_introduction/ecosystem.md rename to zh_chapters/chapter_introduction/ecosystem.md diff --git a/chapter_introduction/index.md b/zh_chapters/chapter_introduction/index.md similarity index 100% rename from chapter_introduction/index.md rename to zh_chapters/chapter_introduction/index.md diff --git a/chapter_introduction/readers.md b/zh_chapters/chapter_introduction/readers.md similarity index 100% rename from chapter_introduction/readers.md rename to zh_chapters/chapter_introduction/readers.md diff --git a/chapter_model_deployment/index.md b/zh_chapters/chapter_model_deployment/index.md similarity index 100% rename from chapter_model_deployment/index.md rename to zh_chapters/chapter_model_deployment/index.md diff --git a/chapter_model_deployment/model_compression.md b/zh_chapters/chapter_model_deployment/model_compression.md similarity index 100% rename from chapter_model_deployment/model_compression.md rename to zh_chapters/chapter_model_deployment/model_compression.md diff --git a/chapter_model_deployment/model_converter_and_optimizer.md b/zh_chapters/chapter_model_deployment/model_converter_and_optimizer.md similarity index 100% rename from chapter_model_deployment/model_converter_and_optimizer.md rename to 
zh_chapters/chapter_model_deployment/model_converter_and_optimizer.md diff --git a/chapter_model_deployment/model_deployment_introduction.md b/zh_chapters/chapter_model_deployment/model_deployment_introduction.md similarity index 100% rename from chapter_model_deployment/model_deployment_introduction.md rename to zh_chapters/chapter_model_deployment/model_deployment_introduction.md diff --git a/chapter_model_deployment/model_inference.md b/zh_chapters/chapter_model_deployment/model_inference.md similarity index 100% rename from chapter_model_deployment/model_inference.md rename to zh_chapters/chapter_model_deployment/model_inference.md diff --git a/chapter_model_deployment/model_security.md b/zh_chapters/chapter_model_deployment/model_security.md similarity index 100% rename from chapter_model_deployment/model_security.md rename to zh_chapters/chapter_model_deployment/model_security.md diff --git a/chapter_model_deployment/summary.md b/zh_chapters/chapter_model_deployment/summary.md similarity index 100% rename from chapter_model_deployment/summary.md rename to zh_chapters/chapter_model_deployment/summary.md diff --git a/chapter_preface/index.md b/zh_chapters/chapter_preface/index.md similarity index 100% rename from chapter_preface/index.md rename to zh_chapters/chapter_preface/index.md diff --git a/chapter_preface_advanced/index.md b/zh_chapters/chapter_preface_advanced/index.md similarity index 100% rename from chapter_preface_advanced/index.md rename to zh_chapters/chapter_preface_advanced/index.md diff --git a/chapter_preface_extension/index.md b/zh_chapters/chapter_preface_extension/index.md similarity index 100% rename from chapter_preface_extension/index.md rename to zh_chapters/chapter_preface_extension/index.md diff --git a/chapter_programming_interface/c_python_interaction.md b/zh_chapters/chapter_programming_interface/c_python_interaction.md similarity index 100% rename from chapter_programming_interface/c_python_interaction.md rename to 
zh_chapters/chapter_programming_interface/c_python_interaction.md diff --git a/chapter_programming_interface/development_history.md b/zh_chapters/chapter_programming_interface/development_history.md similarity index 100% rename from chapter_programming_interface/development_history.md rename to zh_chapters/chapter_programming_interface/development_history.md diff --git a/chapter_programming_interface/index.md b/zh_chapters/chapter_programming_interface/index.md similarity index 100% rename from chapter_programming_interface/index.md rename to zh_chapters/chapter_programming_interface/index.md diff --git a/chapter_programming_interface/ml_programming_paradigm.md b/zh_chapters/chapter_programming_interface/ml_programming_paradigm.md similarity index 100% rename from chapter_programming_interface/ml_programming_paradigm.md rename to zh_chapters/chapter_programming_interface/ml_programming_paradigm.md diff --git a/chapter_programming_interface/ml_workflow.md b/zh_chapters/chapter_programming_interface/ml_workflow.md similarity index 100% rename from chapter_programming_interface/ml_workflow.md rename to zh_chapters/chapter_programming_interface/ml_workflow.md diff --git a/chapter_programming_interface/neural_network_layer.md b/zh_chapters/chapter_programming_interface/neural_network_layer.md similarity index 100% rename from chapter_programming_interface/neural_network_layer.md rename to zh_chapters/chapter_programming_interface/neural_network_layer.md diff --git a/chapter_programming_interface/summary.md b/zh_chapters/chapter_programming_interface/summary.md similarity index 100% rename from chapter_programming_interface/summary.md rename to zh_chapters/chapter_programming_interface/summary.md diff --git a/chapter_recommender_system/case_study.md b/zh_chapters/chapter_recommender_system/case_study.md similarity index 100% rename from chapter_recommender_system/case_study.md rename to zh_chapters/chapter_recommender_system/case_study.md diff --git 
a/chapter_recommender_system/index.md b/zh_chapters/chapter_recommender_system/index.md similarity index 100% rename from chapter_recommender_system/index.md rename to zh_chapters/chapter_recommender_system/index.md diff --git a/chapter_recommender_system/model_update.md b/zh_chapters/chapter_recommender_system/model_update.md similarity index 100% rename from chapter_recommender_system/model_update.md rename to zh_chapters/chapter_recommender_system/model_update.md diff --git a/chapter_recommender_system/multi_stage_recommender_system.md b/zh_chapters/chapter_recommender_system/multi_stage_recommender_system.md similarity index 100% rename from chapter_recommender_system/multi_stage_recommender_system.md rename to zh_chapters/chapter_recommender_system/multi_stage_recommender_system.md diff --git a/chapter_recommender_system/summary.md b/zh_chapters/chapter_recommender_system/summary.md similarity index 100% rename from chapter_recommender_system/summary.md rename to zh_chapters/chapter_recommender_system/summary.md diff --git a/chapter_recommender_system/system_architecture.md b/zh_chapters/chapter_recommender_system/system_architecture.md similarity index 100% rename from chapter_recommender_system/system_architecture.md rename to zh_chapters/chapter_recommender_system/system_architecture.md diff --git a/chapter_reinforcement_learning/index.md b/zh_chapters/chapter_reinforcement_learning/index.md similarity index 100% rename from chapter_reinforcement_learning/index.md rename to zh_chapters/chapter_reinforcement_learning/index.md diff --git a/chapter_reinforcement_learning/marl.md b/zh_chapters/chapter_reinforcement_learning/marl.md similarity index 99% rename from chapter_reinforcement_learning/marl.md rename to zh_chapters/chapter_reinforcement_learning/marl.md index d230b5a..d274171 100644 --- a/chapter_reinforcement_learning/marl.md +++ b/zh_chapters/chapter_reinforcement_learning/marl.md @@ -1,41 +1,41 @@ -## 多智能体强化学习 - 
-以上所讲述的强化学习内容都为单智能体强化学习,而在近来的强化学习研究中,多智能体强化学习越来越受到研究人员关注。回想在本小节初介绍的单智能体强化学习框架 :numref:`ch12/ch12-rl-framework`,其中我们只有单个智能体产生的单个动作对环境产生影响,环境也返回单个奖励值给智能体。这里我们把单智能体强化学习扩展到多智能体强化学习,可以得到至少两种可能的多智能体强化学习框架,如 :numref:`ch12/ch12-marl`所示。 :numref:`ch12/ch12-marl`(a)为多智能体同时执行动作的情况,他们相互之间观察不到彼此的动作,他们的动作一同对环境产生影响,并各自接受自己动作所产生的奖励。 :numref:`ch12/ch12-marl`(b)为多智能体顺序执行动作的情况,后续智能体可能观察到前序智能体的动作,他们的动作一同对环境产生影响,并接受到各自的奖励值或共同的奖励值。除此之外,还有许多其他可能的多智能体框架,如更复杂的智能体间观察机制、智能体间通讯机制、多智能体合作与竞争等等。同时,这里假设多个智能体对环境的观察量都为环境的状态,这是最简单的一种,也是现实中最不可能出现的一种,实际情况下的多智能体往往对环境有各自不同的观察量。![两种可能的多智能体强化学习框架:(a)同步式多智能体决策;(b)异步式多智能体决策。](../img/ch12/ch12-marl.png) - -:width:`800px` - -:label:`ch12/ch12-marl` - -这里我们可以根据前面对单智能体强化学习过程的马尔可夫决策过程描述,给出多智能体强化学习的马尔可夫决策过程,它可以用一个数组$(\mathcal{S}, N, \boldsymbol{\mathcal{A}}, \mathbf{R}, \mathcal{T}, \gamma)$来表示。$N$是智能体个数,$\mathcal{S}$和$\boldsymbol{\mathcal{A}}=(\mathcal{A}_1, \mathcal{A}_2, ..., \mathcal{A}_N)$分别是环境状态空间和多智能体动作空间,其中$A_i$是第$i$个智能体的动作空间,$\mathbf{R}=(R_1, R_2, ..., R_N)$是多智能体奖励函数,$\mathbf{R}(s,\mathbf{a})$: $\mathcal{S}\times \boldsymbol{\mathcal{A}}\rightarrow \mathbb{R}^N$为对于当前状态$s\in\mathcal{S}$和当前多智能体动作$\mathbf{a}\in\boldsymbol{\mathcal{A}}$的奖励向量值,其中$R_i$是对第$i$个智能体的奖励值。从当前状态和动作到下一个状态的状态转移概率定义为$\mathcal{T}(s^\prime|s,\mathbf{a})$: $\mathcal{S}\times\boldsymbol{\mathcal{A}}\times\mathcal{S}\rightarrow \mathbb{R}_+$。$\gamma\in(0,1)$是奖励折扣因子(假设多个智能体采用相同的奖励折扣因子)。不同于单智能体强化学习,多智能体强化学习的目标除了常见的最大化每个智能体各自的期望累计奖励值$\mathbb{E}[\sum_t \gamma^t r^i_t], i\in[N]$之外,还有许多其他可能的学习目标,如达到纳什均衡、最大化团队奖励等等。 - -由上述介绍和定义可以发现,多智能体强化学习是一个比单智能体强化学习更加复杂的问题。而实际上,多个智能体的存在,对于每个智能体的决策而言,绝对不是简单的把每个单智能体决策累加的难度,实际情况要比单智能体决策问题复杂很多。多智能体系统的研究实际上是门古老的学科,它与博弈论(Game Theory)密切相关,在深度强化学习盛行以前早已有大量研究和许多理论上未解的难题。其中一个典型的问题是纳什均衡在双人非零和博弈下没有多项式时间内可解的方法(实际上,这是一个PPAD(Polynomial Parity Argument, Directed version)类的问题。(见论文Settling the Complexity of Computing Two-Player Nash Equilibria. 
Xi Chen, et al.)由于篇幅限制,我们这里无法对多智能体问题做深入探讨,我们可以用一个简单例子来介绍为什么多智能体强化学习问题无法简单地用单智能体强化学习算法来解。 - -:剪刀-石头-布的奖励值表 - -| 奖励值 | 剪刀 | 石头 | 布 | -| --- | ------- | ------- | ------- | -| **剪刀** | (0,0) | (-1,+1) | (+1,-1) | -| **石头** | (+1,-1) | (0,0) | (-1,+1) | -| **布** | (-1,+1) | (+1,-1) | (0,0) | -|:label:`tab_ch12_ch12_marl`|||| - -我们考虑一个大家都熟悉的游戏, 剪刀-石头-布,考虑两个玩家玩这个游戏的输赢情况,我们知道有这样的输赢关系:剪刀<石头<布<剪刀...这里的“<”即前一个纯策略被后一个纯策略完全压制,我们给予奖励值-1、+1到这两个玩家,当他们选择相同的纯策略时,奖励值均为0。于是我们得到一个奖励值表如 :numref:`tab_ch12_ch12_marl`所示,横轴为玩家1,纵轴为玩家2,表内的数组为玩家1和玩家2各自在相应动作下得到的奖励值。 - -由于这个矩阵的反对称性,这个问题的纳什均衡策略对两个玩家相同,均为$(\frac{1}{3}, \frac{1}{3}, \frac{1}{3})$的策略分布,即有各$\frac{1}{3}$的概率出剪刀、石头或布。如果我们把得到这个纳什均衡策略作为多智能体学习的目标,那么我们可以简单分析得到这个均衡策略无法通过简单的单智能体算法得到。考虑我们随机初始化两个玩家为任意两个纯策略,比如玩家1出剪刀,玩家2出石头。这时假设玩家2策略固定,可以把玩家2看做固定环境的一部分,于是可以使用任意单智能体强化学习算法对玩家1进行训练,使其最大化自己的奖励值。于是,玩家1会收敛到布的纯策略。这时再把玩家1固定,训练玩家2,玩家2又收敛到剪刀的纯策略。于是循环往复,整个训练过程始终无法收敛,玩家1和2各自在3个策略中循环却无法得到正确的纳什均衡策略。 - - -![自学习算法示意图。](../img/ch12/ch12-marl-sp.png) - -:width:`600px` - -:label:`ch12/ch12-marl-sp` - -我们在上面这个例子中采用的学习方法其实是多智能体强化学习中最基础的一种,叫自学习(Selfplay),如 :numref:`ch12/ch12-marl-sp`所示。自学习的方法即固定当前玩家 1 的策略,按照单智能体优化的方法最大化一侧智能体的表现,所得策略称为最佳反应策略(Best Response Strategy)。之后再将这一最佳反应策略作为玩家 2 的固定策略,再来优化另一边的智能体策略,如此循环。我们可以看到自学习在特定的任务设置下可能无法收敛到我们想要的最终目标。正是由于多智能体学习过程中有类似循环结构的出现,我们需要更复杂的训练方法,和专门针对多智能体的学习方式来达到我们想要的目标。 - -一般来讲,多智能体强化学习是比单智能体强化学习更复杂的一类,对于自学习的方法而言,单智能体强化学习的过程可以看做一个多智能体强化学习的子任务。从前面这一小游戏的角度来理解,当玩家 1 策略固定时,玩家 1 加游戏环境构成玩家 2 的实际学习环境,由于这个环境是固定的,玩家 2 可以通过单智能体强化学习来达到自身奖励值最大化;这时再固定玩家 2 的策略,玩家 1 又可以进行单智能体强化学习...... 
这样,单智能体强化学习是多智能体任务的子任务。如 :numref:`ch12/ch12-marl-fsp`,其他算法如虚构自学习(Fictitious Self-play),需要在每个单智能体强化学习的步骤中,对对手历史策略的平均策略求得最优应对策略,而对手的训练也是如此,进行循环,能够在上面剪刀-石头-布一类的游戏中保证收敛到纳什均衡策略。 - -![虚构自学习算法示意图。](../img/ch12/ch12-marl-fsp.png) - -:width:`600px` - +## 多智能体强化学习 + +以上所讲述的强化学习内容都为单智能体强化学习,而在近来的强化学习研究中,多智能体强化学习越来越受到研究人员关注。回想在本小节初介绍的单智能体强化学习框架 :numref:`ch12/ch12-rl-framework`,其中我们只有单个智能体产生的单个动作对环境产生影响,环境也返回单个奖励值给智能体。这里我们把单智能体强化学习扩展到多智能体强化学习,可以得到至少两种可能的多智能体强化学习框架,如 :numref:`ch12/ch12-marl`所示。 :numref:`ch12/ch12-marl`(a)为多智能体同时执行动作的情况,他们相互之间观察不到彼此的动作,他们的动作一同对环境产生影响,并各自接受自己动作所产生的奖励。 :numref:`ch12/ch12-marl`(b)为多智能体顺序执行动作的情况,后续智能体可能观察到前序智能体的动作,他们的动作一同对环境产生影响,并接受到各自的奖励值或共同的奖励值。除此之外,还有许多其他可能的多智能体框架,如更复杂的智能体间观察机制、智能体间通讯机制、多智能体合作与竞争等等。同时,这里假设多个智能体对环境的观察量都为环境的状态,这是最简单的一种,也是现实中最不可能出现的一种,实际情况下的多智能体往往对环境有各自不同的观察量。![两种可能的多智能体强化学习框架:(a)同步式多智能体决策;(b)异步式多智能体决策。](../img/ch12/ch12-marl.png) + +:width:`800px` + +:label:`ch12/ch12-marl` + +这里我们可以根据前面对单智能体强化学习过程的马尔可夫决策过程描述,给出多智能体强化学习的马尔可夫决策过程,它可以用一个数组$(\mathcal{S}, N, \boldsymbol{\mathcal{A}}, \mathbf{R}, \mathcal{T}, \gamma)$来表示。$N$是智能体个数,$\mathcal{S}$和$\boldsymbol{\mathcal{A}}=(\mathcal{A}_1, \mathcal{A}_2, ..., \mathcal{A}_N)$分别是环境状态空间和多智能体动作空间,其中$A_i$是第$i$个智能体的动作空间,$\mathbf{R}=(R_1, R_2, ..., R_N)$是多智能体奖励函数,$\mathbf{R}(s,\mathbf{a})$: $\mathcal{S}\times \boldsymbol{\mathcal{A}}\rightarrow \mathbb{R}^N$为对于当前状态$s\in\mathcal{S}$和当前多智能体动作$\mathbf{a}\in\boldsymbol{\mathcal{A}}$的奖励向量值,其中$R_i$是对第$i$个智能体的奖励值。从当前状态和动作到下一个状态的状态转移概率定义为$\mathcal{T}(s^\prime|s,\mathbf{a})$: $\mathcal{S}\times\boldsymbol{\mathcal{A}}\times\mathcal{S}\rightarrow \mathbb{R}_+$。$\gamma\in(0,1)$是奖励折扣因子(假设多个智能体采用相同的奖励折扣因子)。不同于单智能体强化学习,多智能体强化学习的目标除了常见的最大化每个智能体各自的期望累计奖励值$\mathbb{E}[\sum_t \gamma^t r^i_t], i\in[N]$之外,还有许多其他可能的学习目标,如达到纳什均衡、最大化团队奖励等等。 + +由上述介绍和定义可以发现,多智能体强化学习是一个比单智能体强化学习更加复杂的问题。而实际上,多个智能体的存在,对于每个智能体的决策而言,绝对不是简单的把每个单智能体决策累加的难度,实际情况要比单智能体决策问题复杂很多。多智能体系统的研究实际上是门古老的学科,它与博弈论(Game 
Theory)密切相关,在深度强化学习盛行以前早已有大量研究和许多理论上未解的难题。其中一个典型的问题是纳什均衡在双人非零和博弈下没有多项式时间内可解的方法(实际上,这是一个PPAD(Polynomial Parity Argument, Directed version)类的问题。(见论文Settling the Complexity of Computing Two-Player Nash Equilibria. Xi Chen, et al.)由于篇幅限制,我们这里无法对多智能体问题做深入探讨,我们可以用一个简单例子来介绍为什么多智能体强化学习问题无法简单地用单智能体强化学习算法来解。 + +:剪刀-石头-布的奖励值表 + +| 奖励值 | 剪刀 | 石头 | 布 | +| --- | ------- | ------- | ------- | +| **剪刀** | (0,0) | (-1,+1) | (+1,-1) | +| **石头** | (+1,-1) | (0,0) | (-1,+1) | +| **布** | (-1,+1) | (+1,-1) | (0,0) | +|:label:`tab_ch12_ch12_marl`|||| + +我们考虑一个大家都熟悉的游戏, 剪刀-石头-布,考虑两个玩家玩这个游戏的输赢情况,我们知道有这样的输赢关系:剪刀<石头<布<剪刀...这里的“<”即前一个纯策略被后一个纯策略完全压制,我们给予奖励值-1、+1到这两个玩家,当他们选择相同的纯策略时,奖励值均为0。于是我们得到一个奖励值表如 :numref:`tab_ch12_ch12_marl`所示,横轴为玩家1,纵轴为玩家2,表内的数组为玩家1和玩家2各自在相应动作下得到的奖励值。 + +由于这个矩阵的反对称性,这个问题的纳什均衡策略对两个玩家相同,均为$(\frac{1}{3}, \frac{1}{3}, \frac{1}{3})$的策略分布,即有各$\frac{1}{3}$的概率出剪刀、石头或布。如果我们把得到这个纳什均衡策略作为多智能体学习的目标,那么我们可以简单分析得到这个均衡策略无法通过简单的单智能体算法得到。考虑我们随机初始化两个玩家为任意两个纯策略,比如玩家1出剪刀,玩家2出石头。这时假设玩家2策略固定,可以把玩家2看做固定环境的一部分,于是可以使用任意单智能体强化学习算法对玩家1进行训练,使其最大化自己的奖励值。于是,玩家1会收敛到布的纯策略。这时再把玩家1固定,训练玩家2,玩家2又收敛到剪刀的纯策略。于是循环往复,整个训练过程始终无法收敛,玩家1和2各自在3个策略中循环却无法得到正确的纳什均衡策略。 + + +![自学习算法示意图。](../img/ch12/ch12-marl-sp.png) + +:width:`600px` + +:label:`ch12/ch12-marl-sp` + +我们在上面这个例子中采用的学习方法其实是多智能体强化学习中最基础的一种,叫自学习(Selfplay),如 :numref:`ch12/ch12-marl-sp`所示。自学习的方法即固定当前玩家 1 的策略,按照单智能体优化的方法最大化一侧智能体的表现,所得策略称为最佳反应策略(Best Response Strategy)。之后再将这一最佳反应策略作为玩家 2 的固定策略,再来优化另一边的智能体策略,如此循环。我们可以看到自学习在特定的任务设置下可能无法收敛到我们想要的最终目标。正是由于多智能体学习过程中有类似循环结构的出现,我们需要更复杂的训练方法,和专门针对多智能体的学习方式来达到我们想要的目标。 + +一般来讲,多智能体强化学习是比单智能体强化学习更复杂的一类,对于自学习的方法而言,单智能体强化学习的过程可以看做一个多智能体强化学习的子任务。从前面这一小游戏的角度来理解,当玩家 1 策略固定时,玩家 1 加游戏环境构成玩家 2 的实际学习环境,由于这个环境是固定的,玩家 2 可以通过单智能体强化学习来达到自身奖励值最大化;这时再固定玩家 2 的策略,玩家 1 又可以进行单智能体强化学习...... 
这样,单智能体强化学习是多智能体任务的子任务。如 :numref:`ch12/ch12-marl-fsp`,其他算法如虚构自学习(Fictitious Self-play),需要在每个单智能体强化学习的步骤中,对对手历史策略的平均策略求得最优应对策略,而对手的训练也是如此,进行循环,能够在上面剪刀-石头-布一类的游戏中保证收敛到纳什均衡策略。 + +![虚构自学习算法示意图。](../img/ch12/ch12-marl-fsp.png) + +:width:`600px` + :label:`ch12/ch12-marl-fsp` \ No newline at end of file diff --git a/chapter_reinforcement_learning/marl_sys.md b/zh_chapters/chapter_reinforcement_learning/marl_sys.md similarity index 99% rename from chapter_reinforcement_learning/marl_sys.md rename to zh_chapters/chapter_reinforcement_learning/marl_sys.md index aceb638..7814445 100644 --- a/chapter_reinforcement_learning/marl_sys.md +++ b/zh_chapters/chapter_reinforcement_learning/marl_sys.md @@ -1,41 +1,41 @@ -## 多智能体强化学习系统 - -上述的简单例子只是为了帮助读者理解强化学习在多智能体问题里的角色,而如今前沿的多智能体强化学习算法已经能够解决相当大规模的复杂多智能体问题,如星际争霸(StarCraft II)、Dota 2等游戏,已相继被DeepMind、OpenAI等公司所研究的智能体AlphaStar :cite:`vinyals2019grandmaster`和OpenAI Five :cite:`berner2019dota`攻克,达到超越人类顶级玩家的水平。国内公司如腾讯、启元世界等也提出了星际争霸游戏的多智能体强化学习解决方案TStarBot-X :cite:`han2020tstarbot`和SCC :cite:`wang2021scc`。对于这类高度复杂的游戏环境,整个训练过程对分布式计算系统的要求更高,而整个训练过程可能需要分为多个阶段。以 AlphaStar 为例,它训练的智能体采用了监督学习与强化学习结合的方式。在训练早期,往往先采用大量的人类专业玩家标定数据进行有监督的学习,从而使智能体快速获得较好的能力,随后,训练会切换到强化学习过程,使用前面介绍的虚构自学习的算法进行训练,即自我博弈。为了得到一个表现最好的智能体,算法需要充分探索整个策略空间,从而在训练中不止对一个策略进行训练,而是对一个策略集群(League)进行训练,并通过类似演化算法的方式对策略集群进行筛选,得到大量策略中表现最好的策略。如 :numref:`ch12/ch12-marl_train`所示,在训练过程中每个智能体往往需要 -和其他智能体以及剥削者(Exploiter)进行博弈,剥削者是专门针对某一个智能体策略的最佳对手策略,与之对抗可以提高策略自身的防剥削能力。通过对大量智能体策略进行训练并筛选的这类方法称为集群式训练(Population-based Training/League Training),是一种通过分布式训练提高策略种群多样性进而提升模型表现的方式。可见,在实践中这类方法自然需要分布式系统支持,来实现多个智能体的训练和相互博弈,这很好地体现了多智能体强化学习对分布式计算的依赖性。 - -![集群式多智能体强化学习训练示意图](../img/ch12/ch12-marl-train.svg) - -:width:`800px` - -:label:`ch12/ch12-marl_train` - -我们将对构建多智能体强化学习系统中的困难分为以下几点进行讨论: - -* **智能体个数带来的复杂度**:从单智能体系统到多智能体系统最直接的变化,就是智能体个数从1变为大于1个。对于一个各个智能体独立的$N$智能体系统而言,这种变化带来的策略空间表示复杂度是指数增加的,即$\tilde{O}(e^N)$。举个简单的例子,对于一个离散空间的单智能体系统,假设其状态空间大小为$S$, 
动作空间大小为$A$,游戏步长为$H$,那么这个离散策略空间的大小为$O(HSA)$;而直接将该游戏扩展为$N$玩家游戏后,在最一般的情况下,即所有玩家有对称的动作空间动作空间大小为$A$且不共享任何结构信息,所有玩家策略的联合分布空间大小为$O(HSA^N)$。这是因为每个独立玩家的策略空间构成联合策略空间是乘积关系$\mathcal{A}=\mathcal{A}_1\times\dots\mathcal{A}_N$。而这将直接导致算法搜索复杂度提升。 - - 在这种情况下,原先的单智能体系统,需要扩展为对多智能体策略进行优化的系统,这意味着单智能体分布式系统内的每个并行化的模块现在需要相应扩展到多智能体系统中的每个智能体上。而在复杂的情况下,还需要考虑智能体之间通信过程、智能体之间的异质性等,甚至不同智能体可能需要采用不完全对称模型进行表示,以及采用不同的算法进行优化等等。 - -* **游戏类型带来的复杂度**:从博弈论的角度,多智能系统所产生的游戏类型是复杂的。从最直接的分类角度,有竞争型、合作型、混合型。在竞争型游戏中,最典型的研究模型是二人零和博弈,如前一小节中提到的剪刀-石头-布的游戏。这类游戏中的纳什均衡策略一般为混合型策略,即无法通过单一纯策略达到均衡条件。纯策略纳什均衡存在于少数零和游戏中。合作型游戏即多个智能体需要通过合作来提升整体奖励。在这类问题研究中一般采用基于值分解的思路,将所有智能体得到的奖励值分配到单个智能体作为其奖励值。这一类的算法有VDN :cite:`sunehag2017value`, COMA :cite:`foerster2018counterfactual`, QMIX :cite:`rashid2018qmix`等。 - - 在混合型游戏中,部分智能体之间为合作关系,部分智能体或智能体的集合间为竞争关系。一般的非零和博弈且非纯合作型游戏为混合型游戏,举个简单的例子如囚徒困境(Prisoner's Dilemma), 其奖励值表如 :numref:`tab_ch12_ch12_marl_prison`所示。囚徒困境的两个玩家各有两个动作,沉默和背叛。可以用警察审查两名罪犯来理解,奖励值的绝对值即他们将被判处的年数。纯所有玩家的奖励值之和非常数,故其为非零和博弈型游戏。因此这一游戏不能被认为是纯竞争型或纯合作型游戏,因为当他们中的一方选择沉默一方选择背叛时,二者没有有效合作,而一方拿到了 0 的奖励,另一方为-3。而两者都选择沉默时是一种合作策略,各自拿到-1 的奖励值。尽管这一策略看起来优于其他策略,但是这并不是这个游戏的纳什均衡策略,因为纳什均衡策略假设玩家间策略需要单独制定,无法形成联合策略分布。这实际上切断了玩家间的信息沟通和潜在合作的可能。因此,囚徒困境的纳什均衡策略是两个玩家都选择背叛对方。 - 诸如此类的博弈论游戏类型,导致单智能体强化学习不能被直接用来优化多智能体系统中的各个智能体的策略。单智能体强化学习一般是找极值的过程,而多智能体系统求解纳什均衡策略往往是找极大-极小值即鞍点的过程,从优化的角度看这也是不同的。复杂的关系需要更普适的系统进行表达,这也对多智能体系统的构建提出了挑战。多智能体游戏类型也有许多其他的分类角度,如单轮进行的游戏、多轮进行的游戏、多智能体同时决策的、多智能体序贯决策等等,每一类不同的游戏都有相应不同的算法。而现有的多智能体系统往往针对单一类型游戏或者单一算法,缺少普适性多智能体强化学习系统,尤其是分布式的系统。 - -:囚徒困境奖励值 - -| 奖励值 | 沉默 | 背叛 | -| --- | ------- | ------- | -| **沉默** | (-1,-1) | (-3,0) | -| **背叛** | (0,-3) | (-2,-2) | -|:label:`tab_ch12_ch12_marl_prison`||| - - -* **算法的异构**:从前面介绍的几个简单的多智能体算法,如自学习、虚构自学习等可以看出,多智能体算法有时由许多轮单智能体强化学习过程组成。而对不同的游戏类型,算法的类型也不相同。比如,对合作型游戏,许多算法是基于功劳分配(Credit Assignment)的思想,如何将多个智能体获得的共同奖励合理分配给单个智能体是这类算法的核心。而这里面按照具体算法执行方式,也可以分为集成训练统一执行的(Centralized Training Centralized Execution)、集成训练分别执行的(Centralized Training Decentralized Execution)、分别训练并分别执行(Decentralized Training Decentralized 
Execution)的几类,来描述不同智能体训练过程和执行过程的统一性。对于竞争型游戏,往往采用各种计算纳什均衡的近似方法,如前面提到的虚构自学习、Double Oracle、Mirror Descent 等等,将获取单个最优策略的单智能体强化学习过程看做一个“动作”,而对这些“动作”组成的元问题上进行纳什均衡近似。现有的算法在类似问题上有很大的差异性,使得构建一个统一的多智能体强化学习系统比较困难。 -* **学习方法组合**:在前面提到的AlphaStar :cite:`vinyals2019grandmaster`等工作中,多智能体系统中优化得到一个好的策略往往不只需要强化学习算法,还需要其他学习方法如模仿学习等的辅助。比如从一些顶级人类玩家的游戏记录中形成有标签的训练样本,来预训练智能体。由于这些大规模游戏的复杂性,这往往是一个在训练前期快速提升智能体表现的有效方式。而对于整个学习系统而言,这就需要对不同学习范式进行结合,如合理地在模仿学习和强化学习之间进行切换等。这也使得大规模多智能体系统不单一是构建强化学习系统的问题,而需要许多其他学习机制和协调机制的配合实现。 - -如 :numref:`ch12/ch12_marl_sys`所示,为一个分布式多智能体强化学习系统。图中的两个智能体可以类似扩展到多个智能体。每个智能体包含多个行动者(Actor)用于采样和学习者(Learner)用于更新模型,这些行动者和学习者可以并行处理来加速训练过程,具体方法可以参考单智能体分布式系统章节介绍的A3C和IMPALA架构。训练好的模型被统一存储和管理在模型存储器中,是否对各个智能体的模型统一存储取决于各个智能体是否对称——如果不对称,需要将模型分别存储。存储器中的模型可以被模型评估器用来打分,从而为下一步模型选择器做准备。模型选择器根据模型评估器或者元学习者(如PSRO算法 :cite:`lanctot2017unified`)以及均衡求解器等进行模型选择,并将选出的模型分发到各个智能体的行动者上。这一处理过程,我们称为联盟型管理(League-based Management)。对于与环境交互的部分,分布式系统可以通过一个推理服务器(Inference Server)对各个并行进程中的模型进行集中推理,将基于观察量(Observation)的动作(Action)发送给环境。环境部分也可以是并行的,对推理服务器传递来的动作进行并行处理后,返回观察量。推理服务器将采集到的交互轨迹发送给各个智能体进行模型训练。以上为一个分布式多智能体系统的例子,实际中根据不同的游戏类型和算法结构可能会有不同的设计。 - -![分布式多智能体强化学习系统](../img/ch12/ch12-marl-sys.png) - -:width:`800px` - +## 多智能体强化学习系统 + +上述的简单例子只是为了帮助读者理解强化学习在多智能体问题里的角色,而如今前沿的多智能体强化学习算法已经能够解决相当大规模的复杂多智能体问题,如星际争霸(StarCraft II)、Dota 2等游戏,已相继被DeepMind、OpenAI等公司所研究的智能体AlphaStar :cite:`vinyals2019grandmaster`和OpenAI Five :cite:`berner2019dota`攻克,达到超越人类顶级玩家的水平。国内公司如腾讯、启元世界等也提出了星际争霸游戏的多智能体强化学习解决方案TStarBot-X :cite:`han2020tstarbot`和SCC :cite:`wang2021scc`。对于这类高度复杂的游戏环境,整个训练过程对分布式计算系统的要求更高,而整个训练过程可能需要分为多个阶段。以 AlphaStar 为例,它训练的智能体采用了监督学习与强化学习结合的方式。在训练早期,往往先采用大量的人类专业玩家标定数据进行有监督的学习,从而使智能体快速获得较好的能力,随后,训练会切换到强化学习过程,使用前面介绍的虚构自学习的算法进行训练,即自我博弈。为了得到一个表现最好的智能体,算法需要充分探索整个策略空间,从而在训练中不止对一个策略进行训练,而是对一个策略集群(League)进行训练,并通过类似演化算法的方式对策略集群进行筛选,得到大量策略中表现最好的策略。如 :numref:`ch12/ch12-marl_train`所示,在训练过程中每个智能体往往需要 +和其他智能体以及剥削者(Exploiter)进行博弈,剥削者是专门针对某一个智能体策略的最佳对手策略,与之对抗可以提高策略自身的防剥削能力。通过对大量智能体策略进行训练并筛选的这类方法称为集群式训练(Population-based Training/League 
Training),是一种通过分布式训练提高策略种群多样性进而提升模型表现的方式。可见,在实践中这类方法自然需要分布式系统支持,来实现多个智能体的训练和相互博弈,这很好地体现了多智能体强化学习对分布式计算的依赖性。 + +![集群式多智能体强化学习训练示意图](../img/ch12/ch12-marl-train.svg) + +:width:`800px` + +:label:`ch12/ch12-marl_train` + +我们将对构建多智能体强化学习系统中的困难分为以下几点进行讨论: + +* **智能体个数带来的复杂度**:从单智能体系统到多智能体系统最直接的变化,就是智能体个数从1变为大于1个。对于一个各个智能体独立的$N$智能体系统而言,这种变化带来的策略空间表示复杂度是指数增加的,即$\tilde{O}(e^N)$。举个简单的例子,对于一个离散空间的单智能体系统,假设其状态空间大小为$S$, 动作空间大小为$A$,游戏步长为$H$,那么这个离散策略空间的大小为$O(HSA)$;而直接将该游戏扩展为$N$玩家游戏后,在最一般的情况下,即所有玩家有对称的动作空间动作空间大小为$A$且不共享任何结构信息,所有玩家策略的联合分布空间大小为$O(HSA^N)$。这是因为每个独立玩家的策略空间构成联合策略空间是乘积关系$\mathcal{A}=\mathcal{A}_1\times\dots\mathcal{A}_N$。而这将直接导致算法搜索复杂度提升。 + + 在这种情况下,原先的单智能体系统,需要扩展为对多智能体策略进行优化的系统,这意味着单智能体分布式系统内的每个并行化的模块现在需要相应扩展到多智能体系统中的每个智能体上。而在复杂的情况下,还需要考虑智能体之间通信过程、智能体之间的异质性等,甚至不同智能体可能需要采用不完全对称模型进行表示,以及采用不同的算法进行优化等等。 + +* **游戏类型带来的复杂度**:从博弈论的角度,多智能系统所产生的游戏类型是复杂的。从最直接的分类角度,有竞争型、合作型、混合型。在竞争型游戏中,最典型的研究模型是二人零和博弈,如前一小节中提到的剪刀-石头-布的游戏。这类游戏中的纳什均衡策略一般为混合型策略,即无法通过单一纯策略达到均衡条件。纯策略纳什均衡存在于少数零和游戏中。合作型游戏即多个智能体需要通过合作来提升整体奖励。在这类问题研究中一般采用基于值分解的思路,将所有智能体得到的奖励值分配到单个智能体作为其奖励值。这一类的算法有VDN :cite:`sunehag2017value`, COMA :cite:`foerster2018counterfactual`, QMIX :cite:`rashid2018qmix`等。 + + 在混合型游戏中,部分智能体之间为合作关系,部分智能体或智能体的集合间为竞争关系。一般的非零和博弈且非纯合作型游戏为混合型游戏,举个简单的例子如囚徒困境(Prisoner's Dilemma), 其奖励值表如 :numref:`tab_ch12_ch12_marl_prison`所示。囚徒困境的两个玩家各有两个动作,沉默和背叛。可以用警察审查两名罪犯来理解,奖励值的绝对值即他们将被判处的年数。纯所有玩家的奖励值之和非常数,故其为非零和博弈型游戏。因此这一游戏不能被认为是纯竞争型或纯合作型游戏,因为当他们中的一方选择沉默一方选择背叛时,二者没有有效合作,而一方拿到了 0 的奖励,另一方为-3。而两者都选择沉默时是一种合作策略,各自拿到-1 的奖励值。尽管这一策略看起来优于其他策略,但是这并不是这个游戏的纳什均衡策略,因为纳什均衡策略假设玩家间策略需要单独制定,无法形成联合策略分布。这实际上切断了玩家间的信息沟通和潜在合作的可能。因此,囚徒困境的纳什均衡策略是两个玩家都选择背叛对方。 + 诸如此类的博弈论游戏类型,导致单智能体强化学习不能被直接用来优化多智能体系统中的各个智能体的策略。单智能体强化学习一般是找极值的过程,而多智能体系统求解纳什均衡策略往往是找极大-极小值即鞍点的过程,从优化的角度看这也是不同的。复杂的关系需要更普适的系统进行表达,这也对多智能体系统的构建提出了挑战。多智能体游戏类型也有许多其他的分类角度,如单轮进行的游戏、多轮进行的游戏、多智能体同时决策的、多智能体序贯决策等等,每一类不同的游戏都有相应不同的算法。而现有的多智能体系统往往针对单一类型游戏或者单一算法,缺少普适性多智能体强化学习系统,尤其是分布式的系统。 + +:囚徒困境奖励值 + +| 奖励值 | 沉默 | 背叛 | +| --- | ------- | ------- | +| **沉默** | (-1,-1) | (-3,0) | +| **背叛** | (0,-3) 
| (-2,-2) | +|:label:`tab_ch12_ch12_marl_prison`||| + + +* **算法的异构**:从前面介绍的几个简单的多智能体算法,如自学习、虚构自学习等可以看出,多智能体算法有时由许多轮单智能体强化学习过程组成。而对不同的游戏类型,算法的类型也不相同。比如,对合作型游戏,许多算法是基于功劳分配(Credit Assignment)的思想,如何将多个智能体获得的共同奖励合理分配给单个智能体是这类算法的核心。而这里面按照具体算法执行方式,也可以分为集成训练统一执行的(Centralized Training Centralized Execution)、集成训练分别执行的(Centralized Training Decentralized Execution)、分别训练并分别执行(Decentralized Training Decentralized Execution)的几类,来描述不同智能体训练过程和执行过程的统一性。对于竞争型游戏,往往采用各种计算纳什均衡的近似方法,如前面提到的虚构自学习、Double Oracle、Mirror Descent 等等,将获取单个最优策略的单智能体强化学习过程看做一个“动作”,而对这些“动作”组成的元问题上进行纳什均衡近似。现有的算法在类似问题上有很大的差异性,使得构建一个统一的多智能体强化学习系统比较困难。 +* **学习方法组合**:在前面提到的AlphaStar :cite:`vinyals2019grandmaster`等工作中,多智能体系统中优化得到一个好的策略往往不只需要强化学习算法,还需要其他学习方法如模仿学习等的辅助。比如从一些顶级人类玩家的游戏记录中形成有标签的训练样本,来预训练智能体。由于这些大规模游戏的复杂性,这往往是一个在训练前期快速提升智能体表现的有效方式。而对于整个学习系统而言,这就需要对不同学习范式进行结合,如合理地在模仿学习和强化学习之间进行切换等。这也使得大规模多智能体系统不单一是构建强化学习系统的问题,而需要许多其他学习机制和协调机制的配合实现。 + +如 :numref:`ch12/ch12_marl_sys`所示,为一个分布式多智能体强化学习系统。图中的两个智能体可以类似扩展到多个智能体。每个智能体包含多个行动者(Actor)用于采样和学习者(Learner)用于更新模型,这些行动者和学习者可以并行处理来加速训练过程,具体方法可以参考单智能体分布式系统章节介绍的A3C和IMPALA架构。训练好的模型被统一存储和管理在模型存储器中,是否对各个智能体的模型统一存储取决于各个智能体是否对称——如果不对称,需要将模型分别存储。存储器中的模型可以被模型评估器用来打分,从而为下一步模型选择器做准备。模型选择器根据模型评估器或者元学习者(如PSRO算法 :cite:`lanctot2017unified`)以及均衡求解器等进行模型选择,并将选出的模型分发到各个智能体的行动者上。这一处理过程,我们称为联盟型管理(League-based Management)。对于与环境交互的部分,分布式系统可以通过一个推理服务器(Inference Server)对各个并行进程中的模型进行集中推理,将基于观察量(Observation)的动作(Action)发送给环境。环境部分也可以是并行的,对推理服务器传递来的动作进行并行处理后,返回观察量。推理服务器将采集到的交互轨迹发送给各个智能体进行模型训练。以上为一个分布式多智能体系统的例子,实际中根据不同的游戏类型和算法结构可能会有不同的设计。 + +![分布式多智能体强化学习系统](../img/ch12/ch12-marl-sys.png) + +:width:`800px` + :label:`ch12/ch12_marl_sys` \ No newline at end of file diff --git a/chapter_reinforcement_learning/rl_introduction.md b/zh_chapters/chapter_reinforcement_learning/rl_introduction.md similarity index 99% rename from chapter_reinforcement_learning/rl_introduction.md rename to zh_chapters/chapter_reinforcement_learning/rl_introduction.md index cf8b446..7199213 100644 --- 
a/chapter_reinforcement_learning/rl_introduction.md +++ b/zh_chapters/chapter_reinforcement_learning/rl_introduction.md @@ -1,27 +1,27 @@ -## 强化学习介绍 - -近年来,强化学习作为机器学习的一个分支受到越来越多的关注。2013 年 DeepMind 公司的研究人员提出了深度 Q 学习 :cite:`mnih2013playing`(Deep Q-learning),成功让 AI 从图像中学习玩电子游戏。自此以后,以 DeepMind 为首的科研机构推出了像 AlphaGo 围棋 AI 这类的引人瞩目的强化学习成果,并在 2016 年与世界顶级围棋高手李世石的对战中取得了胜利。自那以后,强化学习领域连续取得了一系列成就,如星际争霸游戏智能体 AlphaStar、Dota 2 游戏智能体 OpenAI Five、多人零和博弈德州扑克的 Pluribus、机器狗运动控制算法等。在这一系列科研成就的背后,是整个强化学习领域算法在这些年内快速迭代进步的结果,基于模拟器产生的大量数据使得对数据“饥饿”(Data Hungry)的深度神经网络能够表现出很好的拟合效果,从而将强化学习算法的能力充分发挥出来,在以上领域中达到或者超过人类专家的学习表现。目前,强化学习已经从电子游戏逐步走向更广阔的应用场景,如机器人控制、机械手灵巧操作、能源系统调度、网络负载分配、股票期货交易等一系列更加现实和富有意义的领域,对传统控制方法和启发式决策理论发起冲击。 - -![强化学习框架](../img/ch12/ch12-rl.png) - -:width:`400px` - -:label:`ch12/ch12-rl-framework` -强化学习的核心是不断地与环境交互来优化策略从而提升奖励的过程,主要表现为基于某个**状态**(State)下的**动作**(Action)的选择。进行这一决策的对象我们常称为**智能体**(Agent),而这一决策的影响将在**环境**(Environment)中体现。更具体地,不同的决策会影响环境的**状态转移**(State Transition)和**奖励**(Reward)。以上状态转移是环境从当前状态转移到下一状态的函数,它可以是确定性也可以是随机性的。奖励是环境对智能体动作的反馈,通常是一个标量。以上过程可以抽象为 :numref:`ch12/ch12-rl-framework`所示,这是文献中最常见的强化学习的模型描述。 - -举例来说,当人在玩某个电子游戏的时候,需要逐渐熟悉游戏的操作以取得更好的游戏结果,那么人从刚接触到这个游戏到逐步掌握游戏技巧的这个过程为一个类似于强化学习的过程。该游戏从开始后的任一时刻,会处于一个特定的状态,而人通过观察这个状态会获得一个**观察量**(Observation)(如观察游戏机显示屏的图像),并基于这个观察量做出一个操作动作(如发射子弹),这一动作将改变这个游戏下一时刻的状态,使其转移到下一个状态(如把怪物打败了),并且玩家可以知道当前动作的效果(如产生了一个正或负的分数,怪物打败了则获得正分数)。这时玩家再基于下一个状态的观察量做出新的动作选择,周而复始,直到游戏结束。通过反复的操作和观察,人能够逐步掌握这个游戏的技巧,一个强化学习智能体也是如此。 - -这里注意,有几个比较关键的问题:一是观察量未必等于状态,而通常观察量是状态的函数,从状态到观察量的映射可能有一定的信息损失。对于观察量等于状态或者根据观察量能够完全恢复环境状态的情况,我们称为**完全可观测**(Fully Observable),否则我们称为**部分可观测**(Partially Observable)环境;二是玩家的每个动作未必会产生立即反馈,某个动作可能在许多步之后才产生效果,强化学习模型允许这种延迟反馈的存在;三是这种反馈对人的学习过程而言未必是个数字,但是我们对强化学习智能体所得到的反馈进行数学抽象,将其转变为一个数字,称为奖励值。奖励值可以是状态的函数,也可以是状态和动作的函数,依具体问题而定。奖励值的存在是强化学习问题的一个基本假设,也是现有强化学习与监督式学习的一个主要区别 - -强化学习的决策过程通常由一个马尔可夫决策过程(Markov Decision Process,MDP)(马尔可夫决策过程即一个后续状态只依赖当前状态和动作而不依赖于历史状态的函数)描述,可以用一个数组$(\mathcal{S}, \mathcal{A}, R, \mathcal{T}, 
\gamma)$来表示。$\mathcal{S}$和$\mathcal{A}$分别是状态空间和动作空间,$R$是奖励函数,$R(s,a)$: $\mathcal{S}\times \mathcal{A}\rightarrow \mathbb{R}$为对于当前状态$s\in\mathcal{S}$和当前动作$a\in\mathcal{A}$的奖励值。从当前状态和动作到下一个状态的状态转移概率定义为$\mathcal{T}(s^\prime|s,a)$: $\mathcal{S}\times\mathcal{A}\times\mathcal{S}\rightarrow \mathbb{R}_+$。$\gamma\in(0,1)$是奖励折扣因子(折扣因子可以乘到每个后续奖励值上,从而使无穷长序列有有限的奖励值之和)。强化学习的目标是最大化智能体的期望累计奖励值$\mathbb{E}[\sum_t \gamma^t r_t]$。 - -马尔可夫决策过程中的马尔可夫性质由以下定义 - -$$ -\mathcal{T}(s_{t+1}|s_t) = \mathcal{T}(s_{t+1}|s_0, s_1, s_2, \dots, s_t) -$$ - -即当前状态转移只依赖于上一时刻状态,而不依赖于整个历史。这里的状态转移函数$\mathcal{T}$中省略了动作$a$,马尔可夫性质是环境转移过程的属性,其独立于产生动作的决策过程。 - -基于马尔可夫性质,可以进一步推导出在某一时刻最优策略不依赖于整个决策历史,而只依赖于当前最新状态的结论。这一结论在强化学习算法设计中有着重要意义,它简化了最优策略的求解过程。 - +## 强化学习介绍 + +近年来,强化学习作为机器学习的一个分支受到越来越多的关注。2013 年 DeepMind 公司的研究人员提出了深度 Q 学习 :cite:`mnih2013playing`(Deep Q-learning),成功让 AI 从图像中学习玩电子游戏。自此以后,以 DeepMind 为首的科研机构推出了像 AlphaGo 围棋 AI 这类的引人瞩目的强化学习成果,并在 2016 年与世界顶级围棋高手李世石的对战中取得了胜利。自那以后,强化学习领域连续取得了一系列成就,如星际争霸游戏智能体 AlphaStar、Dota 2 游戏智能体 OpenAI Five、多人零和博弈德州扑克的 Pluribus、机器狗运动控制算法等。在这一系列科研成就的背后,是整个强化学习领域算法在这些年内快速迭代进步的结果,基于模拟器产生的大量数据使得对数据“饥饿”(Data Hungry)的深度神经网络能够表现出很好的拟合效果,从而将强化学习算法的能力充分发挥出来,在以上领域中达到或者超过人类专家的学习表现。目前,强化学习已经从电子游戏逐步走向更广阔的应用场景,如机器人控制、机械手灵巧操作、能源系统调度、网络负载分配、股票期货交易等一系列更加现实和富有意义的领域,对传统控制方法和启发式决策理论发起冲击。 + +![强化学习框架](../img/ch12/ch12-rl.png) + +:width:`400px` + +:label:`ch12/ch12-rl-framework` +强化学习的核心是不断地与环境交互来优化策略从而提升奖励的过程,主要表现为基于某个**状态**(State)下的**动作**(Action)的选择。进行这一决策的对象我们常称为**智能体**(Agent),而这一决策的影响将在**环境**(Environment)中体现。更具体地,不同的决策会影响环境的**状态转移**(State Transition)和**奖励**(Reward)。以上状态转移是环境从当前状态转移到下一状态的函数,它可以是确定性也可以是随机性的。奖励是环境对智能体动作的反馈,通常是一个标量。以上过程可以抽象为 :numref:`ch12/ch12-rl-framework`所示,这是文献中最常见的强化学习的模型描述。 + 
+举例来说,当人在玩某个电子游戏的时候,需要逐渐熟悉游戏的操作以取得更好的游戏结果,那么人从刚接触到这个游戏到逐步掌握游戏技巧的这个过程为一个类似于强化学习的过程。该游戏从开始后的任一时刻,会处于一个特定的状态,而人通过观察这个状态会获得一个**观察量**(Observation)(如观察游戏机显示屏的图像),并基于这个观察量做出一个操作动作(如发射子弹),这一动作将改变这个游戏下一时刻的状态,使其转移到下一个状态(如把怪物打败了),并且玩家可以知道当前动作的效果(如产生了一个正或负的分数,怪物打败了则获得正分数)。这时玩家再基于下一个状态的观察量做出新的动作选择,周而复始,直到游戏结束。通过反复的操作和观察,人能够逐步掌握这个游戏的技巧,一个强化学习智能体也是如此。 + +这里注意,有几个比较关键的问题:一是观察量未必等于状态,而通常观察量是状态的函数,从状态到观察量的映射可能有一定的信息损失。对于观察量等于状态或者根据观察量能够完全恢复环境状态的情况,我们称为**完全可观测**(Fully Observable),否则我们称为**部分可观测**(Partially Observable)环境;二是玩家的每个动作未必会产生立即反馈,某个动作可能在许多步之后才产生效果,强化学习模型允许这种延迟反馈的存在;三是这种反馈对人的学习过程而言未必是个数字,但是我们对强化学习智能体所得到的反馈进行数学抽象,将其转变为一个数字,称为奖励值。奖励值可以是状态的函数,也可以是状态和动作的函数,依具体问题而定。奖励值的存在是强化学习问题的一个基本假设,也是现有强化学习与监督式学习的一个主要区别 + +强化学习的决策过程通常由一个马尔可夫决策过程(Markov Decision Process,MDP)(马尔可夫决策过程即一个后续状态只依赖当前状态和动作而不依赖于历史状态的函数)描述,可以用一个数组$(\mathcal{S}, \mathcal{A}, R, \mathcal{T}, \gamma)$来表示。$\mathcal{S}$和$\mathcal{A}$分别是状态空间和动作空间,$R$是奖励函数,$R(s,a)$: $\mathcal{S}\times \mathcal{A}\rightarrow \mathbb{R}$为对于当前状态$s\in\mathcal{S}$和当前动作$a\in\mathcal{A}$的奖励值。从当前状态和动作到下一个状态的状态转移概率定义为$\mathcal{T}(s^\prime|s,a)$: $\mathcal{S}\times\mathcal{A}\times\mathcal{S}\rightarrow \mathbb{R}_+$。$\gamma\in(0,1)$是奖励折扣因子(折扣因子可以乘到每个后续奖励值上,从而使无穷长序列有有限的奖励值之和)。强化学习的目标是最大化智能体的期望累计奖励值$\mathbb{E}[\sum_t \gamma^t r_t]$。 + +马尔可夫决策过程中的马尔可夫性质由以下定义 + +$$ +\mathcal{T}(s_{t+1}|s_t) = \mathcal{T}(s_{t+1}|s_0, s_1, s_2, \dots, s_t) +$$ + +即当前状态转移只依赖于上一时刻状态,而不依赖于整个历史。这里的状态转移函数$\mathcal{T}$中省略了动作$a$,马尔可夫性质是环境转移过程的属性,其独立于产生动作的决策过程。 + +基于马尔可夫性质,可以进一步推导出在某一时刻最优策略不依赖于整个决策历史,而只依赖于当前最新状态的结论。这一结论在强化学习算法设计中有着重要意义,它简化了最优策略的求解过程。 + diff --git a/chapter_reinforcement_learning/single_node_rl.md b/zh_chapters/chapter_reinforcement_learning/single_node_rl.md similarity index 99% rename from chapter_reinforcement_learning/single_node_rl.md rename to zh_chapters/chapter_reinforcement_learning/single_node_rl.md index 3d2a207..e14f796 100644 --- a/chapter_reinforcement_learning/single_node_rl.md +++ b/zh_chapters/chapter_reinforcement_learning/single_node_rl.md @@ 
-1,21 +1,21 @@ -## 单节点强化学习系统 - -前面介绍了强化学习的基本知识,这里我们介绍常见的单智能体强化学习系统中较为简单的一类,即单节点强化学习系统,这里的节点是指一个用于模型更新的计算单元。我们按照是否对模型更新的过程做并行化处理,将强化学习系统分为单节点和分布式强化学习系统。其中,单节点强化学习系统可以理解为只实例化一个类对象作为智能体,与环境交互进行采样和利用所采得的样本进行更新的过程分别视为这个类内的不同函数。除此之外的更为复杂的强化学习框架都可视为分布式强化学习系统。 - -分布式强化学习系统的具体形式有很多,系统的形式也往往依赖于所实现的算法。从最简单的情况考虑,假设我们仍在同一个计算单元上实现算法,但是将强化学习的采样过程和更新过程实现为两个并行的进程,甚至各自实现为多个进程,以满足不同计算资源间的平衡。这时就需要进程间通信来协调采样和更新过程,这是一个最基础的分布式强化学习框架。更为复杂的情况是,整个算法的运行在多个计算设备上进行(如一个多机的计算集群),智能体的函数可能需要跨机跨进程间的通信来实现。对于多智能体系统,还需要同时对多个智能体的模型进行更新,则需要更为复杂的计算系统设计。我们将逐步介绍这些不同的系统内的实现机制。 - -我们先对单节点强化学习系统进行介绍。在这里,我们以RLzoo :cite:`ding2020efficient`为例,讲解一个单节点强化学习系统构建所需要的基本模块。如 :numref:`ch12/ch12-rlzoo`所示,是RLzoo算法库中采用的一个典型的单节点强化学习系统,它包括几个基本的组成部分:神经网络、适配器、策略网络和价值网络、环境实例、模型学习器、经验回放缓存(Experience Replay Buffer)等。 - -我们先对前三个,神经网络、适配器、策略网络和价值网络进行介绍。神经网络即一般深度学习中的神经网络,用于实现基于数据的函数拟合,我们在图中简单列出常见的三类神经网络:全连接网络,卷积网络和循环网络。策略网络和价值网络是一般深度强化学习的常见组成部分,策略网络即一个由深度神经网络参数化的策略表示,而价值网络为神经网络表示的状态价值(State-Value)或状态-动作价值(State-Action Value)函数。这里我们不妨称前三类神经网络为一般神经网络,策略网络和价值网络为强化学习特定网络,前者往往是后者的重要组成部分。在RLzoo中,适配器则是为实现强化学习特定网络而选配一般神经网络的功能模块。首先,根据不同的观察量类型,强化学习智能体所用的神经网络头部会有不同的结构,这一选择可以由一个基于观察量的适配器来实现;其次,根据所采用的强化学习算法类型,相应的策略网络尾部需要有不同的输出类型,包括确定性策略和随机性策略,RLzoo 中使用一个策略适配器来进行选择;最后,根据不同的动作输出,如离散型、连续型、类别型等,需要使用一个动作适配器来选择。:numref:`ch12/ch12-rlzoo`中我们统称这三个不类型的适配器为适配器。 - -介绍完这些,我们已经有了可用的策略网络和价值网络,这构成了强化学习智能体核心学习模块。除此之外,还需要一个学习器(Learner)来更新这些学习模块,更新的规则就是强化学习算法给出的损失函数。而要想实现学习模块的更新,最重要的是输入的学习数据,即智能体跟环境交互过程中所采集的样本。对于**离线**(Off-Policy)强化学习,这些样本通常被存储于一个称为经验回放缓存的地方,学习器在需要更新模型时从该缓存中采得一些样本来进行更新。这里说到的离线强化学习是强化学习算法中的一类,强化学习算法可以分为在线(On-Policy)强化学习和离线(Off-Policy)强化学习两类,按照某个特定判据。这个判据是,用于更新的模型和用于采样的模型是否为同一个,如果是,则称在线强化学习算法,否则为离线强化学习算法。因而,离线强化学习通常允许与环境交互所采集的样本被存储于一个较大的缓存内,从而允许在许久之后再从这个缓存中抽取样本对模型进行更新。而对于在线强化学习,这个“缓存”有时其实也是存在的,只不过它所存储的是非常近期内采集的数据,从而被更新模型和用于采样的模型可以近似认为是同一个。从而,这里我们简单表示 RLzoo 的强化学习系统统一包括这个经验回放缓存模块。有了以上策略和价值网络、经验回放缓存、适配器、学习器,我们就得到了 RLzoo 中一个单节点的强化学习智能体,将这个智能体与环境实例交互,并采集数据进行模型更新,我们就得到了一个完整的单节点强化学习系统。这里的环境实例化我们允许多个环境并行采样。 - 
-![RLzoo算法库中使用的强化学习系统](../img/ch12/ch12-rlzoo.png) - -:width:`800px` - -:label:`ch12/ch12-rlzoo` -近来研究人员发现,强化学习算法领域的发展瓶颈,可能不仅在于算法本身,而在于让智能体在其中采集数据的模拟器的模拟速度。Isaac Gym :cite:`makoviychuk2021isaac`是Nvidia公司于2021年推出的基于GPU(Graphics Processing Unit)的模拟引擎,在单GPU上实现2-3倍于之前基于CPU(Central Processing Unit)的模拟器的运行速度。关于 GPU上运行加速我们已经在章节 5 中有所介绍。之所以 GPU 模拟能够对强化学习任务实现显著的加 -速效果,除了 GPU 本身多核心的并行运算能力之外,还在于这省却了 CPU 与 GPU 之间的数据传输和通信时间。传统的强化学习环境,如 OpenAI Gym(这是一个常用的强化学习基准测试环境)等,都是基于 CPU 进行的模拟计算,而深度学习方法的神经网络训练通常是在 GPU 或TPU(Tensor Processing Unit) 上进行的。 - +## 单节点强化学习系统 + +前面介绍了强化学习的基本知识,这里我们介绍常见的单智能体强化学习系统中较为简单的一类,即单节点强化学习系统,这里的节点是指一个用于模型更新的计算单元。我们按照是否对模型更新的过程做并行化处理,将强化学习系统分为单节点和分布式强化学习系统。其中,单节点强化学习系统可以理解为只实例化一个类对象作为智能体,与环境交互进行采样和利用所采得的样本进行更新的过程分别视为这个类内的不同函数。除此之外的更为复杂的强化学习框架都可视为分布式强化学习系统。 + +分布式强化学习系统的具体形式有很多,系统的形式也往往依赖于所实现的算法。从最简单的情况考虑,假设我们仍在同一个计算单元上实现算法,但是将强化学习的采样过程和更新过程实现为两个并行的进程,甚至各自实现为多个进程,以满足不同计算资源间的平衡。这时就需要进程间通信来协调采样和更新过程,这是一个最基础的分布式强化学习框架。更为复杂的情况是,整个算法的运行在多个计算设备上进行(如一个多机的计算集群),智能体的函数可能需要跨机跨进程间的通信来实现。对于多智能体系统,还需要同时对多个智能体的模型进行更新,则需要更为复杂的计算系统设计。我们将逐步介绍这些不同的系统内的实现机制。 + +我们先对单节点强化学习系统进行介绍。在这里,我们以RLzoo :cite:`ding2020efficient`为例,讲解一个单节点强化学习系统构建所需要的基本模块。如 :numref:`ch12/ch12-rlzoo`所示,是RLzoo算法库中采用的一个典型的单节点强化学习系统,它包括几个基本的组成部分:神经网络、适配器、策略网络和价值网络、环境实例、模型学习器、经验回放缓存(Experience Replay Buffer)等。 + +我们先对前三个,神经网络、适配器、策略网络和价值网络进行介绍。神经网络即一般深度学习中的神经网络,用于实现基于数据的函数拟合,我们在图中简单列出常见的三类神经网络:全连接网络,卷积网络和循环网络。策略网络和价值网络是一般深度强化学习的常见组成部分,策略网络即一个由深度神经网络参数化的策略表示,而价值网络为神经网络表示的状态价值(State-Value)或状态-动作价值(State-Action Value)函数。这里我们不妨称前三类神经网络为一般神经网络,策略网络和价值网络为强化学习特定网络,前者往往是后者的重要组成部分。在RLzoo中,适配器则是为实现强化学习特定网络而选配一般神经网络的功能模块。首先,根据不同的观察量类型,强化学习智能体所用的神经网络头部会有不同的结构,这一选择可以由一个基于观察量的适配器来实现;其次,根据所采用的强化学习算法类型,相应的策略网络尾部需要有不同的输出类型,包括确定性策略和随机性策略,RLzoo 中使用一个策略适配器来进行选择;最后,根据不同的动作输出,如离散型、连续型、类别型等,需要使用一个动作适配器来选择。:numref:`ch12/ch12-rlzoo`中我们统称这三个不类型的适配器为适配器。 + 
+介绍完这些,我们已经有了可用的策略网络和价值网络,这构成了强化学习智能体核心学习模块。除此之外,还需要一个学习器(Learner)来更新这些学习模块,更新的规则就是强化学习算法给出的损失函数。而要想实现学习模块的更新,最重要的是输入的学习数据,即智能体跟环境交互过程中所采集的样本。对于**离线**(Off-Policy)强化学习,这些样本通常被存储于一个称为经验回放缓存的地方,学习器在需要更新模型时从该缓存中采得一些样本来进行更新。这里说到的离线强化学习是强化学习算法中的一类,强化学习算法可以分为在线(On-Policy)强化学习和离线(Off-Policy)强化学习两类,按照某个特定判据。这个判据是,用于更新的模型和用于采样的模型是否为同一个,如果是,则称在线强化学习算法,否则为离线强化学习算法。因而,离线强化学习通常允许与环境交互所采集的样本被存储于一个较大的缓存内,从而允许在许久之后再从这个缓存中抽取样本对模型进行更新。而对于在线强化学习,这个“缓存”有时其实也是存在的,只不过它所存储的是非常近期内采集的数据,从而被更新模型和用于采样的模型可以近似认为是同一个。从而,这里我们简单表示 RLzoo 的强化学习系统统一包括这个经验回放缓存模块。有了以上策略和价值网络、经验回放缓存、适配器、学习器,我们就得到了 RLzoo 中一个单节点的强化学习智能体,将这个智能体与环境实例交互,并采集数据进行模型更新,我们就得到了一个完整的单节点强化学习系统。这里的环境实例化我们允许多个环境并行采样。 + +![RLzoo算法库中使用的强化学习系统](../img/ch12/ch12-rlzoo.png) + +:width:`800px` + +:label:`ch12/ch12-rlzoo` +近来研究人员发现,强化学习算法领域的发展瓶颈,可能不仅在于算法本身,而在于让智能体在其中采集数据的模拟器的模拟速度。Isaac Gym :cite:`makoviychuk2021isaac`是Nvidia公司于2021年推出的基于GPU(Graphics Processing Unit)的模拟引擎,在单GPU上实现2-3倍于之前基于CPU(Central Processing Unit)的模拟器的运行速度。关于 GPU上运行加速我们已经在章节 5 中有所介绍。之所以 GPU 模拟能够对强化学习任务实现显著的加 +速效果,除了 GPU 本身多核心的并行运算能力之外,还在于这省却了 CPU 与 GPU 之间的数据传输和通信时间。传统的强化学习环境,如 OpenAI Gym(这是一个常用的强化学习基准测试环境)等,都是基于 CPU 进行的模拟计算,而深度学习方法的神经网络训练通常是在 GPU 或TPU(Tensor Processing Unit) 上进行的。 + 从智能体与 CPU 上实例化的模拟环境交互过程所收集的数据样本,通常先暂时以 CPU 的数据格式存储,在使用的时候被转移到 GPU 上成为具有 GPU 数据类型的数据(如使用 PyTorch 时可通过tensor.to(device)的函数实现,只需将device设为“cuda”即可将一个类型为torch.Tensor的tensor转移到GPU上),然后来进行模型训练。同时,由于模型参数是以 GPU 上数据的类型存储的,调用模型进行前向传递的过程中也需要先将输入数据从 CPU 转移到 GPU 上,并且可能需要将模型输出的 GPU 数据再转移回 CPU 类型。这一系列冗余的数据转换操作都会显著增长模型学习的时间,并且也增加了算法实际使用过程中的工程量。Isaac Gym 模拟器的设计从模拟器下层运行硬件上解决了这一困难,由于模拟器和模型双双实现在 GPU 上,他们之间的数据通信不再需要通过 CPU 来实现,从而绕过了 CPU 与 GPU 数据双向传输这一问题,实现了对强化学习任务中模拟过程的特定加速。 \ No newline at end of file diff --git a/chapter_reinforcement_learning/summary.md b/zh_chapters/chapter_reinforcement_learning/summary.md similarity index 99% rename from chapter_reinforcement_learning/summary.md rename to zh_chapters/chapter_reinforcement_learning/summary.md index 7973d43..f26f538 100644 --- 
a/chapter_reinforcement_learning/summary.md +++ b/zh_chapters/chapter_reinforcement_learning/summary.md @@ -1,7 +1,7 @@ -## 小结 - -在这一章,我们简单介绍了强化学习的基本概念,包括单智能体和多智能体强化学习算法、单节点和分布式强化学习系统等,给读者对强化学习问题的基本认识。当前,强化学习是一个快速发展的深度学习分支,许多实际问题都有可能通过强化学习算法的进一步发展得到解决。另一方面,由于强化学习问题设置的特殊性(如需要与环境交互进行采样等),也使得相应算法对计算系统的要求更高:如何更好地平衡样本采集和策略训练过程?如何均衡 CPU 和 GPU 等不同计算硬件的能力?如何在大规模分布式系统上有效部署强化学习智能体?都需要对计算机系统的设计和使用有更好的理解。 - -## 参考文献 - +## 小结 + +在这一章,我们简单介绍了强化学习的基本概念,包括单智能体和多智能体强化学习算法、单节点和分布式强化学习系统等,给读者对强化学习问题的基本认识。当前,强化学习是一个快速发展的深度学习分支,许多实际问题都有可能通过强化学习算法的进一步发展得到解决。另一方面,由于强化学习问题设置的特殊性(如需要与环境交互进行采样等),也使得相应算法对计算系统的要求更高:如何更好地平衡样本采集和策略训练过程?如何均衡 CPU 和 GPU 等不同计算硬件的能力?如何在大规模分布式系统上有效部署强化学习智能体?都需要对计算机系统的设计和使用有更好的理解。 + +## 参考文献 + :bibliography:`../references/reinforcement.bib` \ No newline at end of file diff --git a/chapter_rl_sys/control.md b/zh_chapters/chapter_rl_sys/control.md similarity index 100% rename from chapter_rl_sys/control.md rename to zh_chapters/chapter_rl_sys/control.md diff --git a/chapter_rl_sys/control_code_ex.md b/zh_chapters/chapter_rl_sys/control_code_ex.md similarity index 100% rename from chapter_rl_sys/control_code_ex.md rename to zh_chapters/chapter_rl_sys/control_code_ex.md diff --git a/chapter_rl_sys/index.md b/zh_chapters/chapter_rl_sys/index.md similarity index 100% rename from chapter_rl_sys/index.md rename to zh_chapters/chapter_rl_sys/index.md diff --git a/chapter_rl_sys/perception.md b/zh_chapters/chapter_rl_sys/perception.md similarity index 100% rename from chapter_rl_sys/perception.md rename to zh_chapters/chapter_rl_sys/perception.md diff --git a/chapter_rl_sys/perception_code_ex.md b/zh_chapters/chapter_rl_sys/perception_code_ex.md similarity index 100% rename from chapter_rl_sys/perception_code_ex.md rename to zh_chapters/chapter_rl_sys/perception_code_ex.md diff --git a/chapter_rl_sys/planning.md b/zh_chapters/chapter_rl_sys/planning.md similarity index 100% rename from chapter_rl_sys/planning.md rename to zh_chapters/chapter_rl_sys/planning.md diff --git 
a/chapter_rl_sys/planning_code_ex.md b/zh_chapters/chapter_rl_sys/planning_code_ex.md similarity index 100% rename from chapter_rl_sys/planning_code_ex.md rename to zh_chapters/chapter_rl_sys/planning_code_ex.md diff --git a/chapter_rl_sys/rl_sys_intro.md b/zh_chapters/chapter_rl_sys/rl_sys_intro.md similarity index 100% rename from chapter_rl_sys/rl_sys_intro.md rename to zh_chapters/chapter_rl_sys/rl_sys_intro.md diff --git a/chapter_rl_sys/robot_learning.md b/zh_chapters/chapter_rl_sys/robot_learning.md similarity index 100% rename from chapter_rl_sys/robot_learning.md rename to zh_chapters/chapter_rl_sys/robot_learning.md diff --git a/chapter_rl_sys/robot_safety.md b/zh_chapters/chapter_rl_sys/robot_safety.md similarity index 100% rename from chapter_rl_sys/robot_safety.md rename to zh_chapters/chapter_rl_sys/robot_safety.md diff --git a/chapter_rl_sys/ros.md b/zh_chapters/chapter_rl_sys/ros.md similarity index 100% rename from chapter_rl_sys/ros.md rename to zh_chapters/chapter_rl_sys/ros.md diff --git a/chapter_rl_sys/ros_code_ex.md b/zh_chapters/chapter_rl_sys/ros_code_ex.md similarity index 100% rename from chapter_rl_sys/ros_code_ex.md rename to zh_chapters/chapter_rl_sys/ros_code_ex.md diff --git a/chapter_rl_sys/summary.md b/zh_chapters/chapter_rl_sys/summary.md similarity index 100% rename from chapter_rl_sys/summary.md rename to zh_chapters/chapter_rl_sys/summary.md diff --git a/zh_chapters/config.ini b/zh_chapters/config.ini new file mode 100644 index 0000000..47c2556 --- /dev/null +++ b/zh_chapters/config.ini @@ -0,0 +1,79 @@ +[project] + +name = machine learning system + +title = 机器学习系统:设计和实现 + +author = Luo Mai, Hao Dong + +copyright = 2022, All authors. 
+ +release = 1.0.0 + +lang = zh + +[build] + +notebooks = *.md */*.md + +# Resources are provided via symlinks created by build_html_zh.sh +resources = img/ references/ + +# Exclude nothing in zh_chapters — all .md files here are real content +exclusions = */*_origin.md + +eval_notebook = True + +tabs = mindspore, pytorch, tensorflow + +sphinx_configs = numfig_format = {'figure': '图%%s', 'table': '表%%s', 'code-block': '列表%%s', 'section': '%%s节'} + latex_elements = { + 'utf8extra' : '', + 'inputenc' : '', + 'babel' : r'''\usepackage[english]{babel}''', + 'preamble' : r''' + \usepackage{ctex} + \setmainfont{Source Serif Pro} + \setsansfont{Source Sans Pro} + \setmonofont{Source Code Pro} + \setCJKmainfont[BoldFont=Source Han Serif SC SemiBold]{Source Han Serif SC} + \setCJKsansfont[BoldFont=Source Han Sans SC Medium]{Source Han Sans SC Normal} + \setCJKmonofont{Source Han Sans SC Normal} + \addto\captionsenglish{\renewcommand{\chaptername}{}} + \addto\captionsenglish{\renewcommand{\contentsname}{目录}} + \usepackage[draft]{minted} + \fvset{breaklines=true, breakanywhere=true} + \setlength{\headheight}{13.6pt} + \makeatletter + \fancypagestyle{normal}{ + \fancyhf{} + \fancyfoot[LE,RO]{{\py@HeaderFamily\thepage}} + \fancyfoot[LO]{{\py@HeaderFamily\nouppercase{\rightmark}}} + \fancyfoot[RE]{{\py@HeaderFamily\nouppercase{\leftmark}}} + \fancyhead[LE,RO]{{\py@HeaderFamily }} + } + \makeatother + \CJKsetecglue{} + \usepackage{zhnumber} + ''', + 'pointsize': '10pt', + 'figure_align': 'H', + 'fncychap': '\\usepackage[Sonny]{fncychap}', + } + bibtex_bibfiles = ['references/accelerator.bib', 'references/appendix.bib', 'references/backend.bib', 'references/data.bib', 'references/explainable.bib', 'references/extension.bib', 'references/federated.bib', 'references/frontend.bib', 'references/graph.bib', 'references/interface.bib', 'references/introduction.bib', 'references/model.bib', 'references/model_deployment.bib', 'references/recommender.bib', 'references/reinforcement.bib', 
'references/rlsys.bib', 'references/training.bib'] + + + +[html] + +header_links = GitHub, https://github.com/openmlsys/openmlsys-zh, fab fa-github, + English, https://openmlsys.github.io/, fas fa-language + +favicon = static/favicon.png + +html_logo = static/logo-with-text.png + + +[pdf] + +latex_logo = static/logo.png diff --git a/zh_chapters/img b/zh_chapters/img new file mode 120000 index 0000000..0af1dd5 --- /dev/null +++ b/zh_chapters/img @@ -0,0 +1 @@ +../img \ No newline at end of file diff --git a/index.md b/zh_chapters/index.md similarity index 100% rename from index.md rename to zh_chapters/index.md diff --git a/zh_chapters/mlsys.bib b/zh_chapters/mlsys.bib new file mode 100644 index 0000000..9ed4432 --- /dev/null +++ b/zh_chapters/mlsys.bib @@ -0,0 +1,1307 @@ +@article{rosenblatt1958perceptron, + title={The perceptron: a probabilistic model for information storage and organization in the brain.}, + author={Rosenblatt, Frank}, + journal={Psychological Review}, + volume={65}, + number={6}, + pages={386}, + year={1958}, + publisher={American Psychological Association} +} + +@article{lecun1989backpropagation, + title={Backpropagation applied to handwritten zip code recognition}, + author={LeCun, Yann and Boser, Bernhard and Denker, John S and Henderson, Donnie and Howard, Richard E and Hubbard, Wayne and Jackel, Lawrence D}, + journal={Neural computation}, + volume={1}, + number={4}, + pages={541--551}, + year={1989}, + publisher={MIT Press} +} + +@article{lanctot2017unified, + title={A unified game-theoretic approach to multiagent reinforcement learning}, + author={Lanctot, Marc and Zambaldi, Vinicius and Gruslys, Audrunas and Lazaridou, Angeliki and Tuyls, Karl and P{\'e}rolat, Julien and Silver, David and Graepel, Thore}, + journal={Advances in neural information processing systems}, + volume={30}, + year={2017} +} + + +@article{mnih2013playing, + title={Playing atari with deep reinforcement learning}, + 
author={Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin}, + journal={arXiv preprint arXiv:1312.5602}, + year={2013} +} + +@article{sunehag2017value, + title={Value-decomposition networks for cooperative multi-agent learning}, + author={Sunehag, Peter and Lever, Guy and Gruslys, Audrunas and Czarnecki, Wojciech Marian and Zambaldi, Vinicius and Jaderberg, Max and Lanctot, Marc and Sonnerat, Nicolas and Leibo, Joel Z and Tuyls, Karl and others}, + journal={arXiv preprint arXiv:1706.05296}, + year={2017} +} + + +@inproceedings{rashid2018qmix, + title={Qmix: Monotonic value function factorisation for deep multi-agent reinforcement learning}, + author={Rashid, Tabish and Samvelyan, Mikayel and Schroeder, Christian and Farquhar, Gregory and Foerster, Jakob and Whiteson, Shimon}, + booktitle={International Conference on Machine Learning}, + pages={4295--4304}, + year={2018}, + organization={PMLR} +} + +@inproceedings{foerster2018counterfactual, + title={Counterfactual multi-agent policy gradients}, + author={Foerster, Jakob and Farquhar, Gregory and Afouras, Triantafyllos and Nardelli, Nantas and Whiteson, Shimon}, + booktitle={Proceedings of the AAAI conference on artificial intelligence}, + volume={32}, + number={1}, + year={2018} +} + + +@inproceedings{krizhevsky2012imagenet, + title={Imagenet classification with deep convolutional neural networks}, + author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + booktitle={Advances in Neural Information Processing Systems}, + pages={1097--1105}, + year={2012} +} + +@inproceedings{he2016deep, + title={{Deep Residual Learning for Image Recognition}}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2016} +} + +@article{rumelhart1986learning, + title={Learning representations by 
back-propagating errors}, + author={Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J}, + journal={Nature}, + volume={323}, + number={6088}, + pages={533}, + year={1986}, + publisher={Nature Publishing Group} +} + +@article{Hochreiter1997lstm, + author = {Hochreiter, Sepp and Hochreiter, S and Schmidhuber, J{\"{u}}rgen and Schmidhuber, J}, + isbn = {08997667 (ISSN)}, + issn = {0899-7667}, + journal = {Neural Computation}, + number = {8}, + pages = {1735--80}, + pmid = {9377276}, + title = {{Long Short-Term Memory.}}, + volume = {9}, + year = {1997} +} + +@inproceedings{vaswani2017attention, + title={Attention is all you need}, + author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, + booktitle={Advances in Neural Information Processing Systems}, + pages={5998--6008}, + year={2017} +} + +@article{lecun2015deep, + title={Deep learning}, + author={LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey}, + journal={Nature}, + volume={521}, + number={7553}, + pages={436}, + year={2015}, + publisher={Nature Publishing Group} +} + +@inproceedings{KingmaAdam2014, + title = {{Adam}: A Method for Stochastic Optimization}, + author = {Kingma, Diederik and Ba, Jimmy}, + booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)}, + year = {2014} +} + +@techreport{tieleman2012rmsprop, + title={Divide the gradient by a running average of its recent magnitude. 
COURSERA: Neural networks for machine learning}, + author={Tieleman, T and Hinton, G}, + year={2017}, + institution={Technical Report} +} + +@article{duchi2011adagrad, + title={Adaptive subgradient methods for online learning and stochastic optimization}, + author={Duchi, John and Hazan, Elad and Singer, Yoram}, + journal={Journal of Machine Learning Research (JMLR)}, + volume={12}, + number={Jul}, + pages={2121--2159}, + year={2011} +} + +@inproceedings{meijer2006linq, + title={Linq: reconciling object, relations and xml in the. net framework}, + author={Meijer, Erik and Beckman, Brian and Bierman, Gavin}, + booktitle={Proceedings of the 2006 ACM SIGMOD international conference on Management of data}, + pages={706--706}, + year={2006} +} + +@inproceedings{murray2013naiad, + title={Naiad: a timely dataflow system}, + author={Murray, Derek G and McSherry, Frank and Isaacs, Rebecca and Isard, Michael and Barham, Paul and Abadi, Mart{\'\i}n}, + booktitle={Proceedings of the Twenty-Fourth ACM Symposium on Operating Systems Principles}, + pages={439--455}, + year={2013} +} + +@inproceedings{mnih2016asynchronous, + title={Asynchronous methods for deep reinforcement learning}, + author={Mnih, Volodymyr and Badia, Adria Puigdomenech and Mirza, Mehdi and Graves, Alex and Lillicrap, Timothy and Harley, Tim and Silver, David and Kavukcuoglu, Koray}, + booktitle={International Conference on Machine Learning (ICML)}, + pages={1928--1937}, + year={2016} +} + +@article{espeholt2018impala, + title={Impala: Scalable distributed deep-rl with importance weighted actor-learner architectures}, + author={Espeholt, Lasse and Soyer, Hubert and Munos, Remi and Simonyan, Karen and Mnih, Volodymir and Ward, Tom and Doron, Yotam and Firoiu, Vlad and Harley, Tim and Dunning, Iain and others}, + journal={arXiv preprint arXiv:1802.01561}, + year={2018} +} + +@article{espeholt2019seed, + title={Seed rl: Scalable and efficient deep-rl with accelerated central inference}, + author={Espeholt, Lasse 
and Marinier, Rapha{\"e}l and Stanczyk, Piotr and Wang, Ke and Michalski, Marcin}, + journal={arXiv preprint arXiv:1910.06591}, + year={2019} +} + +@misc{horgan2018distributed, + title={Distributed Prioritized Experience Replay}, + author={Dan Horgan and John Quan and David Budden and Gabriel Barth-Maron and Matteo Hessel and Hado van Hasselt and David Silver}, + year={2018}, + eprint={1803.00933}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +@inproceedings{moritz2018ray, + title={Ray: A distributed framework for emerging $\{$AI$\}$ applications}, + author={Moritz, Philipp and Nishihara, Robert and Wang, Stephanie and Tumanov, Alexey and Liaw, Richard and Liang, Eric and Elibol, Melih and Yang, Zongheng and Paul, William and Jordan, Michael I and others}, + booktitle={13th $\{$USENIX$\}$ Symposium on Operating Systems Design and Implementation ($\{$OSDI$\}$ 18)}, + pages={561--577}, + year={2018} +} + +@inproceedings{zaharia2010spark, + title={Spark: Cluster computing with working sets}, + author={Zaharia, Matei and Chowdhury, Mosharaf and Franklin, Michael J and Shenker, Scott and Stoica, Ion}, + booktitle={2nd USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 10)}, + year={2010} +} + +@article{fetterly2009dryadlinq, + title={DryadLINQ: A system for general-purpose distributed data-parallel computing using a high-level language}, + author={Fetterly, Yuan Yu Michael Isard Dennis and Budiu, Mihai and Erlingsson, {\'U}lfar and Currey, Pradeep Kumar Gunda Jon}, + journal={Proc. LSDS-IR}, + volume={8}, + year={2009} +} + +@article{murray2021tf, + title={tf. 
data: A machine learning data processing framework}, + author={Murray, Derek G and Simsa, Jiri and Klimovic, Ana and Indyk, Ihor}, + journal={arXiv preprint arXiv:2101.12127}, + year={2021} +} + +@article{mohan2020analyzing, + title={Analyzing and mitigating data stalls in dnn training}, + author={Mohan, Jayashree and Phanishayee, Amar and Raniwala, Ashish and Chidambaram, Vijay}, + journal={arXiv preprint arXiv:2007.06775}, + year={2020} +} + +@misc{rmpygil, + author = "Sam Gross", + title = "Multithreaded Python without the GIL", + howpublished = "Website", + year = {2021}, + note = {\url{https://docs.google.com/document/d/18CXhDb1ygxg-YXNBJNzfzZsDFosB5e6BfnXLlejd9l0/edit#heading=h.kcngwrty1lv}} +} + +@misc{nvidia_dali, + author = "NVIDIA", + title = "DALI", + howpublished = "Website", + year = {2018}, + note = {\url{https://github.com/NVIDIA/DALI}} +} + +@misc{minddata, + author = "HuaWei", + title = "Dataset Plugin", + howpublished = "Website", + year = {2020}, + note = {\url{https://gitee.com/mindspore/dataset-plugin}} +} + +@article{liang2017ray, + title={Ray rllib: A composable and scalable reinforcement learning library}, + author={Liang, Eric and Liaw, Richard and Nishihara, Robert and Moritz, Philipp and Fox, Roy and Gonzalez, Joseph and Goldberg, Ken and Stoica, Ion}, + journal={arXiv preprint arXiv:1712.09381}, + pages={85}, + year={2017} +} + +@article{cassirer2021reverb, + title={Reverb: A Framework For Experience Replay}, + author={Cassirer, Albin and Barth-Maron, Gabriel and Brevdo, Eugene and Ramos, Sabela and Boyd, Toby and Sottiaux, Thibault and Kroiss, Manuel}, + journal={arXiv preprint arXiv:2102.04736}, + year={2021} +} + +@article{hoffman2020acme, + title={Acme: A research framework for distributed reinforcement learning}, + author={Hoffman, Matt and Shahriari, Bobak and Aslanides, John and Barth-Maron, Gabriel and Behbahani, Feryal and Norman, Tamara and Abdolmaleki, Abbas and Cassirer, Albin and Yang, Fan and Baumli, Kate and others}, + 
journal={arXiv preprint arXiv:2006.00979}, + year={2020} +} + +@article{ding2020efficient, + title={Efficient Reinforcement Learning Development with RLzoo}, + author={Ding, Zihan and Yu, Tianyang and Huang, Yanhua and Zhang, Hongming and Li, Guo and Guo, Quancheng and Mai, Luo and Dong, Hao}, + journal={arXiv preprint arXiv:2009.08644}, + year={2020} +} + +@article{makoviychuk2021isaac, + title={Isaac Gym: High Performance GPU-Based Physics Simulation For Robot Learning}, + author={Makoviychuk, Viktor and Wawrzyniak, Lukasz and Guo, Yunrong and Lu, Michelle and Storey, Kier and Macklin, Miles and Hoeller, David and Rudin, Nikita and Allshire, Arthur and Handa, Ankur and others}, + journal={arXiv preprint arXiv:2108.10470}, + year={2021} +} + +@article{vinyals2019grandmaster, + title={Grandmaster level in StarCraft II using multi-agent reinforcement learning}, + author={Vinyals, Oriol and Babuschkin, Igor and Czarnecki, Wojciech M and Mathieu, Micha{\"e}l and Dudzik, Andrew and Chung, Junyoung and Choi, David H and Powell, Richard and Ewalds, Timo and Georgiev, Petko and others}, + journal={Nature}, + volume={575}, + number={7782}, + pages={350--354}, + year={2019}, + publisher={Nature Publishing Group} +} + +@article{berner2019dota, + title={Dota 2 with large scale deep reinforcement learning}, + author={Berner, Christopher and Brockman, Greg and Chan, Brooke and Cheung, Vicki and D{\k{e}}biak, Przemys{\l}aw and Dennison, Christy and Farhi, David and Fischer, Quirin and Hashme, Shariq and Hesse, Chris and others}, + journal={arXiv preprint arXiv:1912.06680}, + year={2019} +} + +@article{han2020tstarbot, + title={Tstarbot-x: An open-sourced and comprehensive study for efficient league training in starcraft ii full game}, + author={Han, Lei and Xiong, Jiechao and Sun, Peng and Sun, Xinghai and Fang, Meng and Guo, Qingwei and Chen, Qiaobo and Shi, Tengfei and Yu, Hongsheng and Wu, Xipeng and others}, + journal={arXiv preprint arXiv:2011.13729}, + year={2020} +} + 
+@inproceedings{wang2021scc, + title={SCC: an efficient deep reinforcement learning agent mastering the game of StarCraft II}, + author={Wang, Xiangjun and Song, Junxiao and Qi, Penghui and Peng, Peng and Tang, Zhenkun and Zhang, Wei and Li, Weimin and Pi, Xiongjun and He, Jujie and Gao, Chao and others}, + booktitle={International Conference on Machine Learning}, + pages={10905--10915}, + year={2021}, + organization={PMLR} +} + +@inproceedings{MLSYS2021_979d472a, + author = {Yin, Chunxing and Acun, Bilge and Wu, Carole-Jean and Liu, Xing}, + booktitle = {Proceedings of Machine Learning and Systems}, + editor = {A. Smola and A. Dimakis and I. Stoica}, + pages = {448--462}, + title = {TT-Rec: Tensor Train Compression for Deep Learning Recommendation Models}, + url = {https://proceedings.mlsys.org/paper/2021/file/979d472a84804b9f647bc185a877a8b5-Paper.pdf}, + volume = {3}, + year = {2021} +} + +@inproceedings{MLSYS2020_f7e6c855, + author = {Zhao, Weijie and Xie, Deping and Jia, Ronglai and Qian, Yulei and Ding, Ruiquan and Sun, Mingming and Li, Ping}, + booktitle = {Proceedings of Machine Learning and Systems}, + editor = {I. Dhillon and D. Papailiopoulos and V. 
Sze}, + pages = {412--428}, + title = {Distributed Hierarchical GPU Parameter Server for Massive Scale Deep Learning Ads Systems}, + url = {https://proceedings.mlsys.org/paper/2020/file/f7e6c85504ce6e82442c770f7c8606f0-Paper.pdf}, + volume = {2}, + year = {2020} +} + +@article{zionex, + title={Software-Hardware Co-design for Fast and Scalable Training of Deep Learning Recommendation Models}, + author={Mudigere, Dheevatsa and Hao, Yuchen and Huang, Jianyu and Jia, Zhihao and Tulloch, Andrew and Sridharan, Srinivas and Liu, Xing and Ozdal, Mustafa and Nie, Jade and Park, Jongsoo and others}, + journal={arXiv preprint arXiv:2104.05158}, + year={2021} +} + +@inproceedings{gong2020edgerec, + title={EdgeRec: Recommender System on Edge in Mobile Taobao}, + author={Gong, Yu and Jiang, Ziwen and Feng, Yufei and Hu, Binbin and Zhao, Kaiqi and Liu, Qingwen and Ou, Wenwu}, + booktitle={Proceedings of the 29th ACM International Conference on Information \& Knowledge Management}, + pages={2477--2484}, + year={2020} +} + +@inproceedings{NEURIPS2020_a1d4c20b, + author = {He, Chaoyang and Annavaram, Murali and Avestimehr, Salman}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin}, + pages = {14068--14080}, + publisher = {Curran Associates, Inc.}, + title = {Group Knowledge Transfer: Federated Learning of Large CNNs at the Edge}, + url = {https://proceedings.neurips.cc/paper/2020/file/a1d4c20b182ad7137ab3606f0e3fc8a4-Paper.pdf}, + volume = {33}, + year = {2020} +} + +@INPROCEEDINGS{9355295, + author={Xie, Minhui and Ren, Kai and Lu, Youyou and Yang, Guangxu and Xu, Qingxing and Wu, Bihai and Lin, Jiazhen and Ao, Hongbo and Xu, Wanhong and Shu, Jiwu}, + booktitle={SC20: International Conference for High Performance Computing, Networking, Storage and Analysis}, + title={Kraken: Memory-Efficient Continual Learning for Large-Scale Real-Time Recommendations}, + year={2020}, + volume={}, + number={}, + pages={1-17}, + doi={10.1109/SC41405.2020.00025} +} + +@inproceedings{MLSYS2021_ec895663, + author = {Jiang, Wenqi and He, Zhenhao and Zhang, Shuai and Preu\ss er, Thomas B. and Zeng, Kai and Feng, Liang and Zhang, Jiansong and Liu, Tongxuan and Li , Yong and Zhou, Jingren and Zhang, Ce and Alonso, Gustavo}, + booktitle = {Proceedings of Machine Learning and Systems}, + editor = {A. Smola and A. Dimakis and I. 
Stoica}, + pages = {845--859}, + title = {MicroRec: Efficient Recommendation Inference by Hardware and Data Structure Solutions}, + url = {https://proceedings.mlsys.org/paper/2021/file/ec8956637a99787bd197eacd77acce5e-Paper.pdf}, + volume = {3}, + year = {2021} +} + +@inproceedings{10.1145/3394486.3403059, +author = {Shi, Hao-Jun Michael and Mudigere, Dheevatsa and Naumov, Maxim and Yang, Jiyan}, +title = {Compositional Embeddings Using Complementary Partitions for Memory-Efficient Recommendation Systems}, +year = {2020}, +isbn = {9781450379984}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3394486.3403059}, +doi = {10.1145/3394486.3403059}, +abstract = {}, +booktitle = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining}, +pages = {165–175}, +numpages = {11}, +keywords = {model compression, recommendation systems, embeddings}, +location = {Virtual Event, CA, USA}, +series = {KDD '20} +} + +@misc{ginart2021mixed, + title={Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation Systems}, + author={Antonio Ginart and Maxim Naumov and Dheevatsa Mudigere and Jiyan Yang and James Zou}, + year={2021}, + eprint={1909.11810}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +@inproceedings{10.1145/2020408.2020444, +author = {Chu, Wei and Zinkevich, Martin and Li, Lihong and Thomas, Achint and Tseng, Belle}, +title = {Unbiased Online Active Learning in Data Streams}, +year = {2011}, +isbn = {9781450308137}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/2020408.2020444}, +doi = {10.1145/2020408.2020444}, +abstract = {Unlabeled samples can be intelligently selected for labeling to minimize classification error. 
In many real-world applications, a large number of unlabeled samples arrive in a streaming manner, making it impossible to maintain all the data in a candidate pool. In this work, we focus on binary classification problems and study selective labeling in data streams where a decision is required on each sample sequentially. We consider the unbiasedness property in the sampling process, and design optimal instrumental distributions to minimize the variance in the stochastic process. Meanwhile, Bayesian linear classifiers with weighted maximum likelihood are optimized online to estimate parameters. In empirical evaluation, we collect a data stream of user-generated comments on a commercial news portal in 30 consecutive days, and carry out offline evaluation to compare various sampling strategies, including unbiased active learning, biased variants, and random sampling. Experimental results verify the usefulness of online active learning, especially in the non-stationary situation with concept drift.}, +booktitle = {Proceedings of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, +pages = {195–203}, +numpages = {9}, +keywords = {unbiasedness, bayesian online learning, active learning, data streaming, adaptive importance sampling}, +location = {San Diego, California, USA}, +series = {KDD '11} +} + +@inproceedings{10.1145/3267809.3267817, +author = {Tian, Huangshi and Yu, Minchen and Wang, Wei}, +title = {Continuum: A Platform for Cost-Aware, Low-Latency Continual Learning}, +year = {2018}, +isbn = {9781450360111}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3267809.3267817}, +doi = {10.1145/3267809.3267817}, +abstract = {Many machine learning applications operate in dynamic environments that change over time, in which models must be continually updated to capture the recent trend in data. 
However, most of today's learning frameworks perform training offline, without a system support for continual model updating.In this paper, we design and implement Continuum, a general-purpose platform that streamlines the implementation and deployment of continual model updating across existing learning frameworks. In pursuit of fast data incorporation, we further propose two update policies, cost-aware and best-effort, that judiciously determine when to perform model updating, with and without accounting for the training cost (machine-time), respectively. Theoretical analysis shows that cost-aware policy is 2-competitive. We implement both polices in Continuum, and evaluate their performance through EC2 deployment and trace-driven simulations. The evaluation shows that Continuum results in reduced data incorporation latency, lower training cost, and improved model quality in a number of popular online learning applications that span multiple application domains, programming languages, and frameworks.}, +booktitle = {Proceedings of the ACM Symposium on Cloud Computing}, +pages = {26–40}, +numpages = {15}, +keywords = {Competitive Analysis, Continual Learning System, Online Algorithm}, +location = {Carlsbad, CA, USA}, +series = {SoCC '18} +} + +@inproceedings{10.1145/2648584.2648589, +author = {He, Xinran and Pan, Junfeng and Jin, Ou and Xu, Tianbing and Liu, Bo and Xu, Tao and Shi, Yanxin and Atallah, Antoine and Herbrich, Ralf and Bowers, Stuart and Candela, Joaquin Qui\~{n}onero}, +title = {Practical Lessons from Predicting Clicks on Ads at Facebook}, +year = {2014}, +isbn = {9781450329996}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/2648584.2648589}, +doi = {10.1145/2648584.2648589}, +abstract = {Online advertising allows advertisers to only bid and pay for measurable user responses, such as clicks on ads. 
As a consequence, click prediction systems are central to most online advertising systems. With over 750 million daily active users and over 1 million active advertisers, predicting clicks on Facebook ads is a challenging machine learning task. In this paper we introduce a model which combines decision trees with logistic regression, outperforming either of these methods on its own by over 3%, an improvement with significant impact to the overall system performance. We then explore how a number of fundamental parameters impact the final prediction performance of our system. Not surprisingly, the most important thing is to have the right features: those capturing historical information about the user or ad dominate other types of features. Once we have the right features and the right model (decisions trees plus logistic regression), other factors play small roles (though even small improvements are important at scale). Picking the optimal handling for data freshness, learning rate schema and data sampling improve the model slightly, though much less than adding a high-value feature, or picking the right model to begin with.}, +booktitle = {Proceedings of the Eighth International Workshop on Data Mining for Online Advertising}, +pages = {1–9}, +numpages = {9}, +location = {New York, NY, USA}, +series = {ADKDD'14} +} + +@misc{2017NVIDIA, + author={NVIDIA}, + title={NVIDIA Tesla V100 GPU Architecture: The World's Most Advanced Datacenter GPU}, + year={2017}, + howpublished = "Website", + note = {\url{http://www.nvidia.com/object/volta-architecture-whitepaper.html}} +} + +@inproceedings{2021Ascend, + title={Ascend: a Scalable and Unified Architecture for Ubiquitous Deep Neural Network Computing : Industry Track Paper}, + author={Liao, Heng and Tu, Jiajin and Xia, Jing and Liu, Hu and Zhou, Xiping and Yuan, Honghui and Hu, Yuxing}, + booktitle={2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA)}, + year={2021}, + pages = {789–801}, + doi 
= {10.1109/HPCA51647.2021.00071}, +} + +@article{2018Modeling, + title={Modeling Deep Learning Accelerator Enabled GPUs}, + author={Raihan, M. A. and Goli, N. and Aamodt, T.}, + journal={arXiv e-prints arXiv:1811.08309}, + year={2018} +} + +@book{2007Engineering, + title={Engineering a Compiler}, + author={ Cooper, Keith D. and Torczon, Linda }, + publisher={Engineering A Compiler}, + year={2007}, +} + +@article{ragan2013halide, + title={Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines}, + author={Ragan-Kelley, Jonathan and Barnes, Connelly and Adams, Andrew and Paris, Sylvain and Durand, Fr{\'e}do and Amarasinghe, Saman}, + journal={Acm Sigplan Notices}, + volume={48}, + number={6}, + pages={519--530}, + year={2013}, + publisher={ACM New York, NY, USA} +} + +@inproceedings{verdoolaege2010isl, + title={isl: An integer set library for the polyhedral model}, + author={Verdoolaege, Sven}, + booktitle={International Congress on Mathematical Software}, + pages={299--302}, + year={2010}, + organization={Springer} +} + +@article{chen2018tvm, + title={TVM: end-to-end optimization stack for deep learning}, + author={Chen, Tianqi and Moreau, Thierry and Jiang, Ziheng and Shen, Haichen and Yan, Eddie Q and Wang, Leyuan and Hu, Yuwei and Ceze, Luis and Guestrin, Carlos and Krishnamurthy, Arvind}, + journal={arXiv preprint arXiv:1802.04799}, + volume={11}, + pages={20}, + year={2018}, + publisher={CoRR} +} + +@inproceedings{zheng2020ansor, + title={Ansor: Generating $\{$High-Performance$\}$ Tensor Programs for Deep Learning}, + author={Zheng, Lianmin and Jia, Chengfan and Sun, Minmin and Wu, Zhao and Yu, Cody Hao and Haj-Ali, Ameer and Wang, Yida and Yang, Jun and Zhuo, Danyang and Sen, Koushik and others}, + booktitle={14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)}, + pages={863--879}, + year={2020} +} + +@inproceedings{zhao2021akg, + title={AKG: automatic kernel generation 
for neural processing units using polyhedral transformations}, + author={Zhao, Jie and Li, Bojie and Nie, Wang and Geng, Zhen and Zhang, Renwei and Gao, Xiong and Cheng, Bin and Wu, Chen and Cheng, Yun and Li, Zheng and others}, + booktitle={Proceedings of the 42nd ACM SIGPLAN International Conference on Programming Language Design and Implementation}, + pages={1233--1248}, + year={2021} +} + +@article{lattner2020mlir, + title={MLIR: A compiler infrastructure for the end of Moore's law}, + author={Lattner, Chris and Amini, Mehdi and Bondhugula, Uday and Cohen, Albert and Davis, Andy and Pienaar, Jacques and Riddle, River and Shpeisman, Tatiana and Vasilache, Nicolas and Zinenko, Oleksandr}, + journal={arXiv preprint arXiv:2002.11054}, + year={2020} +} + +@article{vasilache2022composable, + title={Composable and Modular Code Generation in MLIR: A Structured and Retargetable Approach to Tensor Compiler Construction}, + author={Vasilache, Nicolas and Zinenko, Oleksandr and Bik, Aart JC and Ravishankar, Mahesh and Raoux, Thomas and Belyaev, Alexander and Springer, Matthias and Gysi, Tobias and Caballero, Diego and Herhut, Stephan and others}, + journal={arXiv preprint arXiv:2202.03293}, + year={2022} +} + +@inproceedings{bastoul2004code, + title={Code generation in the polyhedral model is easier than you think}, + author={Bastoul, C{\'e}dric}, + booktitle={Proceedings. 13th International Conference on Parallel Architecture and Compilation Techniques, 2004. 
PACT 2004.}, + pages={7--16}, + year={2004}, + organization={IEEE} +} + +@ARTICLE{2020tkde_li, + author={Li, Xiao-Hui and Cao, Caleb Chen and Shi, Yuhan and Bai, Wei and Gao, Han and Qiu, Luyu and Wang, Cong and Gao, Yuanyuan and Zhang, Shenjia and Xue, Xun and Chen, Lei}, + journal={IEEE Transactions on Knowledge and Data Engineering}, + title={A Survey of Data-driven and Knowledge-aware eXplainable AI}, + year={2020}, + volume={}, + number={}, + pages={1-1}, + doi={10.1109/TKDE.2020.2983930} +} + +@article{erhan2009visualizing, + title={Visualizing higher-layer features of a deep network}, + author={Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Vincent, Pascal}, + journal={University of Montreal}, + volume={1341}, + number={3}, + pages={1}, + year={2009} +} + +@misc{kim2018interpretability, + title={Interpretability Beyond Feature Attribution: Quantitative Testing with Concept Activation Vectors (TCAV)}, + author={Been Kim and Martin Wattenberg and Justin Gilmer and Carrie Cai and James Wexler and Fernanda Viegas and Rory Sayres}, + year={2018}, + eprint={1711.11279}, + archivePrefix={arXiv}, + primaryClass={stat.ML} +} + +@article{riedl2019human, + title={Human-centered artificial intelligence and machine learning}, + author={Riedl, Mark O.}, + journal={Human Behavior and Emerging Technologies}, + volume={1}, + number={1}, + pages={33--36}, + year={2019}, + publisher={Wiley Online Library} + +} + +@inproceedings{10.1145/3460231.3474255, +author = {de Souza Pereira Moreira, Gabriel and Rabhi, Sara and Lee, Jeong Min and Ak, Ronay and Oldridge, Even}, +title = {Transformers4Rec: Bridging the Gap between NLP and Sequential / Session-Based Recommendation}, +year = {2021}, +isbn = {9781450384582}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3460231.3474255}, +doi = {10.1145/3460231.3474255}, +abstract = {}, +booktitle = {Fifteenth ACM Conference on Recommender Systems}, +pages = 
{143–153}, +numpages = {11}, +location = {Amsterdam, Netherlands}, +series = {RecSys '21} +} + +@inproceedings{10.1145/3124749.3124754, +author = {Wang, Ruoxi and Fu, Bin and Fu, Gang and Wang, Mingliang}, +title = {Deep & Cross Network for Ad Click Predictions}, +year = {2017}, +isbn = {9781450351942}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3124749.3124754}, +doi = {10.1145/3124749.3124754}, +abstract = {Feature engineering has been the key to the success of many prediction models. However, the process is nontrivial and often requires manual feature engineering or exhaustive searching. DNNs are able to automatically learn feature interactions; however, they generate all the interactions implicitly, and are not necessarily efficient in learning all types of cross features. In this paper, we propose the Deep & Cross Network (DCN) which keeps the benefits of a DNN model, and beyond that, it introduces a novel cross network that is more efficient in learning certain bounded-degree feature interactions. In particular, DCN explicitly applies feature crossing at each layer, requires no manual feature engineering, and adds negligible extra complexity to the DNN model. 
Our experimental results have demonstrated its superiority over the state-of-art algorithms on the CTR prediction dataset and dense classification dataset, in terms of both model accuracy and memory usage.}, +booktitle = {Proceedings of the ADKDD'17}, +articleno = {12}, +numpages = {7}, +keywords = {CTR Prediction, Deep Learning, Neural Networks, Feature Crossing}, +location = {Halifax, NS, Canada}, +series = {ADKDD'17} +} + +@inproceedings{ijcai2017-239, + author = {Huifeng Guo and Ruiming TANG and Yunming Ye and Zhenguo Li and Xiuqiang He}, + title = {DeepFM: A Factorization-Machine based Neural Network for CTR Prediction}, + booktitle = {Proceedings of the Twenty-Sixth International Joint Conference on + Artificial Intelligence, {IJCAI-17}}, + pages = {1725--1731}, + year = {2017}, + doi = {10.24963/ijcai.2017/239}, + url = {https://doi.org/10.24963/ijcai.2017/239}, +} + +@article{naumov2019deep, + title={Deep learning recommendation model for personalization and recommendation systems}, + author={Naumov, Maxim and Mudigere, Dheevatsa and Shi, Hao-Jun Michael and Huang, Jianyu and Sundaraman, Narayanan and Park, Jongsoo and Wang, Xiaodong and Gupta, Udit and Wu, Carole-Jean and Azzolini, Alisson G and others}, + journal={arXiv preprint arXiv:1906.00091}, + year={2019} +} + +@inproceedings{NIPS2015_86df7dcf, + author = {Sculley, D. and Holt, Gary and Golovin, Daniel and Davydov, Eugene and Phillips, Todd and Ebner, Dietmar and Chaudhary, Vinay and Young, Michael and Crespo, Jean-Fran\c{c}ois and Dennison, Dan}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. 
Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Hidden Technical Debt in Machine Learning Systems}, + url = {https://proceedings.neurips.cc/paper/2015/file/86df7dcfd896fcaf2674f757a2463eba-Paper.pdf}, + volume = {28}, + year = {2015} +} + +@misc{Merlin, + note={Accessed on 2022-03-24}, + author = {NVIDIA}, + year = {2022}, + title = {{{NVIDIA Merlin}}}, + howpublished = {\url{https://github.com/NVIDIA-Merlin/Merlin}}, +} + +@misc{NVTabular, + note={Accessed on 2022-03-24}, + author = {NVIDIA}, + year = {2022}, + title = {{{NVIDIA NVTabular}}}, + howpublished = {\url{https://github.com/NVIDIA-Merlin/NVTabular}}, +} + +@misc{HugeCTR, + note={Accessed on 2022-03-24}, + author = {NVIDIA}, + year = {2022}, + title = {{{NVIDIA HugeCTR}}}, + howpublished = {\url{https://github.com/NVIDIA-Merlin/HugeCTR}}, +} + +@misc{Triton, + note={Accessed on 2022-03-24}, + author = {NVIDIA}, + year = {2022}, + title = {{{NVIDIA Triton}}}, + howpublished = {\url{https://github.com/triton-inference-server/server}}, +} + +@inproceedings{10.1145/3437801.3441578, +author = {Fang, Jiarui and Yu, Yang and Zhao, Chengduo and Zhou, Jie}, +title = {TurboTransformers: An Efficient GPU Serving System for Transformer Models}, +year = {2021}, +isbn = {9781450382946}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3437801.3441578}, +doi = {10.1145/3437801.3441578}, +abstract = {The transformer is the most critical algorithm innovation of the Nature Language Processing (NLP) field in recent years. Unlike the Recurrent Neural Network (RNN) models, transformers are able to process on dimensions of sequence lengths in parallel, therefore leads to better accuracy on long sequences. However, efficient deployments of them for online services in data centers equipped with GPUs are not easy. 
First, more computation introduced by transformer structures makes it more challenging to meet the latency and throughput constraints of serving. Second, NLP tasks take in sentences of variable length. The variability of input dimensions brings a severe problem to efficient memory management and serving optimization.To solve the above challenges, this paper designed a transformer serving system called TurboTransformers, which consists of a computing runtime and a serving framework. Three innovative features make it stand out from other similar works. An efficient parallel algorithm is proposed for GPU-based batch reduction operations, like Softmax and LayerNorm, which are major hot spots besides BLAS routines. A memory allocation algorithm, which better balances the memory footprint and allocation/free efficiency, is designed for variable-length input situations. A serving framework equipped with a new batch scheduler using dynamic programming achieves the optimal throughput on variable-length requests. 
The system can achieve the state-of-the-art transformer model serving performance on GPU platforms and can be seamlessly integrated into your PyTorch code with a few lines of code.}, +booktitle = {Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, +pages = {389–402}, +numpages = {14}, +keywords = {serving system, deep learning runtime, GPU, transformers}, +location = {Virtual Event, Republic of Korea}, +series = {PPoPP '21} +} + +@inproceedings{wang-etal-2021-lightseq, + title = "{L}ight{S}eq: A High Performance Inference Library for Transformers", + author = "Wang, Xiaohui and + Xiong, Ying and + Wei, Yang and + Wang, Mingxuan and + Li, Lei", + booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers", + month = jun, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.naacl-industry.15", + doi = "10.18653/v1/2021.naacl-industry.15", + pages = "113--120", + abstract = "Transformer and its variants have achieved great success in natural language processing. Since Transformer models are huge in size, serving these models is a challenge for real industrial applications. In this paper, we propose , a highly efficient inference library for models in the Transformer family. includes a series of GPU optimization techniques to both streamline the computation of Transformer layers and reduce memory footprint. supports models trained using PyTorch and Tensorflow. Experimental results on standard machine translation benchmarks show that achieves up to 14x speedup compared with TensorFlow and 1.4x speedup compared with , a concurrent CUDA implementation. 
The code will be released publicly after the review.", +} + +@inproceedings{quigley2009ros, + title={ROS: an open-source Robot Operating System}, + author={Quigley, Morgan and Conley, Ken and Gerkey, Brian and Faust, Josh and Foote, Tully and Leibs, Jeremy and Wheeler, Rob and Ng, Andrew Y and others}, + booktitle={ICRA workshop on open source software}, + volume={3}, + number={3.2}, + pages={5}, + year={2009}, + organization={Kobe, Japan} +} + +@inproceedings{maruyama2016exploring, + title={Exploring the performance of ROS2}, + author={Maruyama, Yuya and Kato, Shinpei and Azumi, Takuya}, + booktitle={Proceedings of the 13th ACM SIGBED International Conference on Embedded Software (EMSOFT)}, + pages={1--10}, + year={2016} +} + +@inproceedings{ding2019camnet, + title={CamNet: Coarse-to-fine retrieval for camera re-localization}, + author={Ding, Mingyu and Wang, Zhe and Sun, Jiankai and Shi, Jianping and Luo, Ping}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={2871--2880}, + year={2019} +} + +@inproceedings{yi2020segvoxelnet, + title={Segvoxelnet: Exploring semantic context and depth-aware features for 3d vehicle detection from point cloud}, + author={Yi, Hongwei and Shi, Shaoshuai and Ding, Mingyu and Sun, Jiankai and Xu, Kui and Zhou, Hui and Wang, Zhe and Li, Sheng and Wang, Guoping}, + booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)}, + pages={2274--2280}, + year={2020}, + organization={IEEE} +} + +@ARTICLE{9712373, author={Sun, Jiankai and Huang, De-An and Lu, Bo and Liu, Yun-Hui and Zhou, Bolei and Garg, Animesh}, journal={IEEE Robotics and Automation Letters}, title={PlaTe: Visually-Grounded Planning With Transformers in Procedural Tasks}, year={2022}, volume={7}, number={2}, pages={4924-4930}, doi={10.1109/LRA.2022.3150855}} + +@inproceedings{li2018undeepvo, + title={Undeepvo: Monocular visual odometry through unsupervised deep learning}, + author={Li, Ruihao and Wang, Sen and 
Long, Zhiqiang and Gu, Dongbing}, + booktitle={2018 IEEE international conference on robotics and automation (ICRA)}, + pages={7286--7291}, + year={2018}, + organization={IEEE} +} + +@inproceedings{quintero2021motion, + title={Motion planning via bayesian learning in the dark}, + author={Quintero-Pena, Carlos and Chamzas, Constantinos and Unhelkar, Vaibhav and Kavraki, Lydia E}, + booktitle={ICRA: Workshop on Machine Learning for Motion Planning}, + year={2021} +} + +@MISC{ML4KP, +author = {Edgar Granados and Aravind Sivaramakrishnan and Troy McMahon and Zakary Littlefield and Kostas E. Bekris}, +title = {Machine Learning for Kinodynamic Planning (ML4KP)}, +howpublished = {\url{https://github.com/PRX-Kinodynamic/ML4KP}}, +year = {2021} +} + + + +@article{aradi2020survey, + title={Survey of deep reinforcement learning for motion planning of autonomous vehicles}, + author={Aradi, Szil{\'a}rd}, + journal={IEEE Transactions on Intelligent Transportation Systems}, + year={2020}, + publisher={IEEE} +} + +@article{vianna2021neural, + title={Neural Network Based Model Predictive Control for an Autonomous Vehicle}, + author={Vianna, Maria Luiza Costa and Goubault, Eric and Putot, Sylvie}, + journal={arXiv preprint arXiv:2107.14573}, + year={2021} +} + +@article{qiu2021egocentric, + title={Egocentric Human Trajectory Forecasting with a Wearable Camera and Multi-Modal Fusion}, + author={Qiu, Jianing and Chen, Lipeng and Gu, Xiao and Lo, Frank P-W and Tsai, Ya-Yen and Sun, Jiankai and Liu, Jiaqi and Lo, Benny}, + journal={arXiv preprint arXiv:2111.00993}, + year={2021} +} + +@InProceedings{pmlr-v155-huang21a, + title = {Learning a Decision Module by Imitating Driver's Control Behaviors}, + author = {Huang, Junning and Xie, Sirui and Sun, Jiankai and Ma, Qiurui and Liu, Chunxiao and Lin, Dahua and Zhou, Bolei}, + booktitle = {Proceedings of the 2020 Conference on Robot Learning}, + pages = {1--10}, + year = {2021}, + editor = {Kober, Jens and Ramos, Fabio and Tomlin, 
Claire}, + volume = {155}, + series = {Proceedings of Machine Learning Research}, + month = {16--18 Nov}, + publisher = {PMLR}, + pdf = {https://proceedings.mlr.press/v155/huang21a/huang21a.pdf}, + url = {https://proceedings.mlr.press/v155/huang21a.html}, + abstract = {Autonomous driving systems have a pipeline of perception, decision, planning, and control. The decision module processes information from the perception module and directs the execution of downstream planning and control modules. On the other hand, the recent success of deep learning suggests that this pipeline could be replaced by end-to-end neural control policies, however, safety cannot be well guaranteed for the data-driven neural networks. In this work, we propose a hybrid framework to learn neural decisions in the classical modular pipeline through end-to-end imitation learning. This hybrid framework can preserve the merits of the classical pipeline such as the strict enforcement of physical and logical constraints while learning complex driving decisions from data. To circumvent the ambiguous annotation of human driving decisions, our method learns high-level driving decisions by imitating low-level control behaviors. We show in the simulation experiments that our modular driving agent can generalize its driving decision and control to various complex scenarios where the rule-based programs fail. It can also generate smoother and safer driving trajectories than end-to-end neural policies. 
Demo and code are available at https://decisionforce.github.io/modulardecision/.} +} + + +@InProceedings{pmlr-v155-sun21a, + title = {Neuro-Symbolic Program Search for Autonomous Driving Decision Module Design}, + author = {Sun, Jiankai and Sun, Hao and Han, Tian and Zhou, Bolei}, + booktitle = {Proceedings of the 2020 Conference on Robot Learning}, + pages = {21--30}, + year = {2021}, + editor = {Kober, Jens and Ramos, Fabio and Tomlin, Claire}, + volume = {155}, + series = {Proceedings of Machine Learning Research}, + month = {16--18 Nov}, + publisher = {PMLR}, + pdf = {https://proceedings.mlr.press/v155/sun21a/sun21a.pdf}, + url = {https://proceedings.mlr.press/v155/sun21a.html}, + abstract = {As a promising topic in cognitive robotics, neuro-symbolic modeling integrates symbolic reasoning and neural representation altogether. However, previous neuro-symbolic models usually wire their structures and the connections manually, making the underlying parameters sub-optimal. In this work, we propose the Neuro-Symbolic Program Search (NSPS) to improve the autonomous driving system design. NSPS is a novel automated search method that synthesizes the Neuro-Symbolic Programs. It can produce robust and expressive Neuro-Symbolic Programs and automatically tune the hyper-parameters. We validate NSPS in the CARLA driving simulation environment. The resulting Neuro-Symbolic Decision Programs successfully handle multiple traffic scenarios. 
Compared with previous neural-network-based driving and rule-based methods, our neuro-symbolic driving pipeline achieves more stable and safer behaviors in complex driving scenarios while maintaining an interpretable symbolic decision-making process.} +} + +@ARTICLE{9491826, author={Lu, Sidi and Shi, Weisong}, journal={IEEE Internet Computing}, title={The Emergence of Vehicle Computing}, year={2021}, volume={25}, number={3}, pages={18-22}, doi={10.1109/MIC.2021.3066076}} + +@article{benekohal1988carsim, + title={CARSIM: Car-following model for simulation of traffic in normal and stop-and-go conditions}, + author={Benekohal, Rahim F and Treiterer, Joseph}, + journal={Transportation research record}, + volume={1194}, + pages={99--111}, + year={1988}, + publisher={SAGE Publishing} +} + +@book{buehler2009darpa, + title={The DARPA urban challenge: autonomous vehicles in city traffic}, + author={Buehler, Martin and Iagnemma, Karl and Singh, Sanjiv}, + volume={56}, + year={2009}, + publisher={springer} +} + + +@InProceedings{pmlr-v100-bansal20a, + title = {Combining Optimal Control and Learning for Visual Navigation in Novel Environments}, + author = {Bansal, Somil and Tolani, Varun and Gupta, Saurabh and Malik, Jitendra and Tomlin, Claire}, + booktitle = {Proceedings of the Conference on Robot Learning}, + pages = {420--429}, + year = {2020}, + editor = {Kaelbling, Leslie Pack and Kragic, Danica and Sugiura, Komei}, + volume = {100}, + series = {Proceedings of Machine Learning Research}, + month = {30 Oct--01 Nov}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v100/bansal20a/bansal20a.pdf}, + url = {https://proceedings.mlr.press/v100/bansal20a.html}, + abstract = {Model-based control is a popular paradigm for robot navigation because it can leverage a known dynamics model to efficiently plan robust robot trajectories. 
However, it is challenging to use model-based methods in settings where the environment is a priori unknown and can only be observed partially through onboard sensors on the robot. In this work, we address this short-coming by coupling model-based control with learning-based perception. The learning-based perception module produces a series of waypoints that guide the robot to the goal via a collision-free path. These waypoints are used by a model-based planner to generate a smooth and dynamically feasible trajectory that is executed on the physical system using feedback control. Our experiments in simulated real-world cluttered environments and on an actual ground vehicle demonstrate that the proposed approach can reach goal locations more reliably and efficiently in novel environments as compared to purely geometric mapping-based or end-to-end learning-based alternatives. Our approach does not rely on detailed explicit 3D maps of the environment, works well with low frame rates, and generalizes well from simulation to the real world. 
Videos describing our approach and experiments are available on the project website4.} +} + +@article{levine2018learning, + title={Learning hand-eye coordination for robotic grasping with deep learning and large-scale data collection}, + author={Levine, Sergey and Pastor, Peter and Krizhevsky, Alex and Ibarz, Julian and Quillen, Deirdre}, + journal={The International journal of robotics research}, + volume={37}, + number={4-5}, + pages={421--436}, + year={2018}, + publisher={SAGE Publications Sage UK: London, England} +} + +@incollection{peters2016robot, + title={Robot learning}, + author={Peters, Jan and Lee, Daniel D and Kober, Jens and Nguyen-Tuong, Duy and Bagnell, J Andrew and Schaal, Stefan}, + booktitle={Springer Handbook of Robotics}, + pages={357--398}, + year={2016}, + publisher={Springer} +} + +@article{saxena2014robobrain, + title={Robobrain: Large-scale knowledge engine for robots}, + author={Saxena, Ashutosh and Jain, Ashesh and Sener, Ozan and Jami, Aditya and Misra, Dipendra K and Koppula, Hema S}, + journal={arXiv preprint arXiv:1412.0691}, + year={2014} +} + +@inproceedings{zhu2017target, + title={Target-driven visual navigation in indoor scenes using deep reinforcement learning}, + author={Zhu, Yuke and Mottaghi, Roozbeh and Kolve, Eric and Lim, Joseph J and Gupta, Abhinav and Fei-Fei, Li and Farhadi, Ali}, + booktitle={2017 IEEE international conference on robotics and automation (ICRA)}, + pages={3357--3364}, + year={2017}, + organization={IEEE} +} + +@ARTICLE{9123682, author={Pan, Bowen and Sun, Jiankai and Leung, Ho Yin Tiga and Andonian, Alex and Zhou, Bolei}, journal={IEEE Robotics and Automation Letters}, title={Cross-View Semantic Segmentation for Sensing Surroundings}, year={2020}, volume={5}, number={3}, pages={4867-4873}, doi={10.1109/LRA.2020.3004325}} + +@article{tang2018ba, + title={Ba-net: Dense bundle adjustment network}, + author={Tang, Chengzhou and Tan, Ping}, + journal={arXiv preprint arXiv:1806.04807}, + year={2018} +} + 
+@inproceedings{tanaka2021learning, + title={Learning To Bundle-Adjust: A Graph Network Approach to Faster Optimization of Bundle Adjustment for Vehicular SLAM}, + author={Tanaka, Tetsuya and Sasagawa, Yukihiro and Okatani, Takayuki}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={6250--6259}, + year={2021} +} + +@inproceedings{tobin2017domain, + title={Domain randomization for transferring deep neural networks from simulation to the real world}, + author={Tobin, Josh and Fong, Rachel and Ray, Alex and Schneider, Jonas and Zaremba, Wojciech and Abbeel, Pieter}, + booktitle={2017 IEEE/RSJ international conference on intelligent robots and systems (IROS)}, + pages={23--30}, + year={2017}, + organization={IEEE} +} + +@inproceedings{finn2017deep, + title={Deep visual foresight for planning robot motion}, + author={Finn, Chelsea and Levine, Sergey}, + booktitle={2017 IEEE International Conference on Robotics and Automation (ICRA)}, + pages={2786--2793}, + year={2017}, + organization={IEEE} +} + +@article{duan2017one, + title={One-shot imitation learning}, + author={Duan, Yan and Andrychowicz, Marcin and Stadie, Bradly and Jonathan Ho, OpenAI and Schneider, Jonas and Sutskever, Ilya and Abbeel, Pieter and Zaremba, Wojciech}, + journal={Advances in neural information processing systems}, + volume={30}, + year={2017} +} + +@book{koubaa2017robot, + title={Robot Operating System (ROS).}, + author={Koub{\^a}a, Anis and others}, + volume={1}, + year={2017}, + publisher={Springer} +} + +@article{coleman2014reducing, + title={Reducing the barrier to entry of complex robotic software: a moveit! 
case study}, + author={Coleman, David and Sucan, Ioan and Chitta, Sachin and Correll, Nikolaus}, + journal={arXiv preprint arXiv:1404.3785}, + year={2014} +} + +@inproceedings{salzmann2020trajectron++, + title={Trajectron++: Dynamically-feasible trajectory forecasting with heterogeneous data}, + author={Salzmann, Tim and Ivanovic, Boris and Chakravarty, Punarjay and Pavone, Marco}, + booktitle={European Conference on Computer Vision}, + pages={683--700}, + year={2020}, + organization={Springer} +} + +@inproceedings{gog2021pylot, + title={Pylot: A modular platform for exploring latency-accuracy tradeoffs in autonomous vehicles}, + author={Gog, Ionel and Kalra, Sukrit and Schafhalter, Peter and Wright, Matthew A and Gonzalez, Joseph E and Stoica, Ion}, + booktitle={2021 IEEE International Conference on Robotics and Automation (ICRA)}, + pages={8806--8813}, + year={2021}, + organization={IEEE} +} + +@inproceedings{Dosovitskiy17, + title = { {CARLA}: {An} Open Urban Driving Simulator}, + author = {Alexey Dosovitskiy and German Ros and Felipe Codevilla and Antonio Lopez and Vladlen Koltun}, + booktitle = {Proceedings of the 1st Annual Conference on Robot Learning}, + pages = {1--16}, + year = {2017} +} + +@inproceedings{10.1145/3492321.3519576, +author = {Gog, Ionel and Kalra, Sukrit and Schafhalter, Peter and Gonzalez, Joseph E. and Stoica, Ion}, +title = {D3: A Dynamic Deadline-Driven Approach for Building Autonomous Vehicles}, +year = {2022}, +isbn = {9781450391627}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3492321.3519576}, +doi = {10.1145/3492321.3519576}, +abstract = {Autonomous vehicles (AVs) must drive across a variety of challenging environments that impose continuously-varying deadlines and runtime-accuracy tradeoffs on their software pipelines. 
A deadline-driven execution of such AV pipelines requires a new class of systems that enable the computation to maximize accuracy under dynamically-varying deadlines. Designing these systems presents interesting challenges that arise from combining ease-of-development of AV pipelines with deadline specification and enforcement mechanisms.Our work addresses these challenges through D3 (Dynamic Deadline-Driven), a novel execution model that centralizes the deadline management, and allows applications to adjust their computation by modeling missed deadlines as exceptions. Further, we design and implement ERDOS, an open-source realization of D3 for AV pipelines that exposes finegrained execution events to applications, and provides mechanisms to speculatively execute computation and enforce deadlines between an arbitrary set of events. Finally, we address the crucial lack of AV benchmarks through our state-of-the-art open-source AV pipeline, Pylot, that works seamlessly across simulators and real AVs. 
We evaluate the efficacy of D3 and ERDOS by driving Pylot across challenging driving scenarios spanning 50km, and observe a 68% reduction in collisions as compared to prior execution models.}, +booktitle = {Proceedings of the Seventeenth European Conference on Computer Systems}, +pages = {453--471}, +numpages = {19}, +location = {Rennes, France}, +series = {EuroSys '22} +} + +@article{li2021metadrive, + author = {Li, Quanyi and Peng, Zhenghao and Xue, Zhenghai and Zhang, Qihang and Zhou, Bolei}, + journal = {ArXiv preprint}, + title = {Metadrive: Composing diverse driving scenarios for generalizable reinforcement learning}, + url = {https://arxiv.org/abs/2109.12674}, + volume = {abs/2109.12674}, + year = {2021} +} + +@article{peng2021learning, + author = {Peng, Zhenghao and Li, Quanyi and Hui, Ka Ming and Liu, Chunxiao and Zhou, Bolei}, + journal = {Advances in Neural Information Processing Systems}, + title = {Learning to Simulate Self-Driven Particles System with Coordinated Policy Optimization}, + volume = {34}, + year = {2021} +} + + +@inproceedings{peng2021safe, + author = {Peng, Zhenghao and Li, Quanyi and Liu, Chunxiao and Zhou, Bolei}, + booktitle = {5th Annual Conference on Robot Learning}, + title = {Safe Driving via Expert Guided Policy Optimization}, + year = {2021} +} + +@ARTICLE{8421746, author={Qin, Tong and Li, Peiliang and Shen, Shaojie}, journal={IEEE Transactions on Robotics}, title={VINS-Mono: A Robust and Versatile Monocular Visual-Inertial State Estimator}, year={2018}, volume={34}, number={4}, pages={1004-1020}, doi={10.1109/TRO.2018.2853729}} + +@article{campos2021orb, + title={Orb-slam3: An accurate open-source library for visual, visual--inertial, and multimap slam}, + author={Campos, Carlos and Elvira, Richard and Rodr{\'\i}guez, Juan J G{\'o}mez and Montiel, Jos{\'e} MM and Tard{\'o}s, Juan D}, + journal={IEEE Transactions on Robotics}, + volume={37}, + number={6}, + pages={1874--1890}, + year={2021}, + publisher={IEEE} +} + 
+@inproceedings{li2021efficient, + author = {Li, Quanyi and Peng, Zhenghao and Zhou, Bolei}, + booktitle = {International Conference on Learning Representations}, + title = {Efficient Learning of Safe Driving Policy via Human-AI Copilot Optimization}, + year = {2021} +} + +@article{chaplot2020learning, + title={Learning to explore using active neural slam}, + author={Chaplot, Devendra Singh and Gandhi, Dhiraj and Gupta, Saurabh and Gupta, Abhinav and Salakhutdinov, Ruslan}, + journal={arXiv preprint arXiv:2004.05155}, + year={2020} +} + +@article{teed2021droid, + title={Droid-slam: Deep visual slam for monocular, stereo, and rgb-d cameras}, + author={Teed, Zachary and Deng, Jia}, + journal={Advances in Neural Information Processing Systems}, + volume={34}, + year={2021} +} + +@article{brunke2021safe, + title={Safe learning in robotics: From learning-based control to safe reinforcement learning}, + author={Brunke, Lukas and Greeff, Melissa and Hall, Adam W and Yuan, Zhaocong and Zhou, Siqi and Panerati, Jacopo and Schoellig, Angela P}, + journal={Annual Review of Control, Robotics, and Autonomous Systems}, + volume={5}, + year={2021}, + publisher={Annual Reviews} +} + + +@InProceedings{pmlr-v144-gama21a, + title = {Graph Neural Networks for Distributed Linear-Quadratic Control}, + author = {Gama, Fernando and Sojoudi, Somayeh}, + booktitle = {Proceedings of the 3rd Conference on Learning for Dynamics and Control}, + pages = {111--124}, + year = {2021}, + editor = {Jadbabaie, Ali and Lygeros, John and Pappas, George J. and A. Parrilo, Pablo and Recht, Benjamin and Tomlin, Claire J. and Zeilinger, Melanie N.}, + volume = {144}, + series = {Proceedings of Machine Learning Research}, + month = {07 -- 08 June}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v144/gama21a/gama21a.pdf}, + url = {https://proceedings.mlr.press/v144/gama21a.html}, + abstract = {The linear-quadratic controller is one of the fundamental problems in control theory. 
The optimal solution is a linear controller that requires access to the state of the entire system at any given time. When considering a network system, this renders the optimal controller a centralized one. The interconnected nature of a network system often demands a distributed controller, where different components of the system are controlled based only on local information. Unlike the classical centralized case, obtaining the optimal distributed controller is usually an intractable problem. Thus, we adopt a graph neural network (GNN) as a parametrization of distributed controllers. GNNs are naturally local and have distributed architectures, making them well suited for learning nonlinear distributed controllers. By casting the linear-quadratic problem as a self-supervised learning problem, we are able to find the best GNN-based distributed controller. We also derive sufficient conditions for the resulting closed-loop system to be stable. We run extensive simulations to study the performance of GNN-based distributed controllers and showcase that they are a computationally efficient parametrization with scalability and transferability capabilities.} +} + + +@InProceedings{pmlr-v144-mehrjou21a, + title = {Neural Lyapunov Redesign}, + author = {Mehrjou, Arash and Ghavamzadeh, Mohammad and Sch\"olkopf, Bernhard}, + booktitle = {Proceedings of the 3rd Conference on Learning for Dynamics and Control}, + pages = {459--470}, + year = {2021}, + editor = {Jadbabaie, Ali and Lygeros, John and Pappas, George J. and A. Parrilo, Pablo and Recht, Benjamin and Tomlin, Claire J. 
and Zeilinger, Melanie N.}, + volume = {144}, + series = {Proceedings of Machine Learning Research}, + month = {07 -- 08 June}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v144/mehrjou21a/mehrjou21a.pdf}, + url = {https://proceedings.mlr.press/v144/mehrjou21a.html}, + abstract = {Learning controllers merely based on a performance metric has been proven effective in many physical and non-physical tasks in both control theory and reinforcement learning. However, in practice, the controller must guarantee some notion of safety to ensure that it does not harm either the agent or the environment. Stability is a crucial notion of safety, whose violation can certainly cause unsafe behaviors. Lyapunov functions are effective tools to assess stability in nonlinear dynamical systems. In this paper, we combine an improving Lyapunov function with automatic controller synthesis in an iterative fashion to obtain control policies with large safe regions. We propose a two-player collaborative algorithm that alternates between estimating a Lyapunov function and deriving a controller that gradually enlarges the stability region of the closed-loop system. We provide theoretical results on the class of systems that can be treated with the proposed algorithm and empirically evaluate the effectiveness of our method using an exemplary dynamical system.} +} + + +@InProceedings{pmlr-v144-zhang21b, + title = {{LEOC}: A Principled Method in Integrating Reinforcement Learning and Classical Control Theory}, + author = {Zhang, Naifu and Capel, Nicholas}, + booktitle = {Proceedings of the 3rd Conference on Learning for Dynamics and Control}, + pages = {689--701}, + year = {2021}, + editor = {Jadbabaie, Ali and Lygeros, John and Pappas, George J. and A. Parrilo, Pablo and Recht, Benjamin and Tomlin, Claire J. 
and Zeilinger, Melanie N.}, + volume = {144}, + series = {Proceedings of Machine Learning Research}, + month = {07 -- 08 June}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v144/zhang21b/zhang21b.pdf}, + url = {https://proceedings.mlr.press/v144/zhang21b.html}, + abstract = {There have been attempts in reinforcement learning to exploit a priori knowledge about the structure of the system. This paper proposes a hybrid reinforcement learning controller which dynamically interpolates a model-based linear controller and an arbitrary differentiable policy. The linear controller is designed based on local linearised model knowledge, and stabilises the system in a neighbourhood about an operating point. The coefficients of interpolation between the two controllers are determined by a scaled distance function measuring the distance between the current state and the operating point. The overall hybrid controller is proven to maintain the stability guarantee around the neighborhood of the operating point and still possess the universal function approximation property of the arbitrary non-linear policy. Learning has been done on both model-based (PILCO) and model-free (DDPG) frameworks. Simulation experiments performed in OpenAI gym demonstrate stability and robustness of the proposed hybrid controller. This paper thus introduces a principled method allowing for the direct importing of control methodology into reinforcement learning.} +} + + +@InProceedings{pmlr-v144-rafailov21a, + title = {Offline Reinforcement Learning from Images with Latent Space Models}, + author = {Rafailov, Rafael and Yu, Tianhe and Rajeswaran, Aravind and Finn, Chelsea}, + booktitle = {Proceedings of the 3rd Conference on Learning for Dynamics and Control}, + pages = {1154--1168}, + year = {2021}, + editor = {Jadbabaie, Ali and Lygeros, John and Pappas, George J. and A. Parrilo, Pablo and Recht, Benjamin and Tomlin, Claire J. 
and Zeilinger, Melanie N.}, + volume = {144}, + series = {Proceedings of Machine Learning Research}, + month = {07 -- 08 June}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v144/rafailov21a/rafailov21a.pdf}, + url = {https://proceedings.mlr.press/v144/rafailov21a.html}, + abstract = {Offline reinforcement learning (RL) refers to the task of learning policies from a static dataset of environment interactions. Offline RL enables extensive utilization and re-use of historical datasets, while also alleviating safety concerns associated with online exploration, thereby expanding the real-world applicability of RL. Most prior work in offline RL has focused on tasks with compact state representations. However, the ability to learn directly from rich observation spaces like images is critical for real-world applications like robotics. In this work, we build on recent advances in model-based algorithms for offline RL, and extend them to high-dimensional visual observation spaces. Model-based offline RL algorithms have achieved state of the art results in state based tasks and are minimax optimal. However, they rely crucially on the ability to quantify uncertainty in the model predictions. This is particularly challenging with image observations. To overcome this challenge, we propose to learn a latent-state dynamics model, and represent the uncertainty in the latent space. Our approach is both tractable in practice and corresponds to maximizing a lower bound of the ELBO in the unknown POMDP. Through experiments on a range of challenging image-based locomotion and robotic manipulation tasks, we find that our algorithm significantly outperforms previous offline model-free RL methods as well as state-of-the-art online visual model-based RL methods. Moreover, we also find that our approach excels on an image-based drawer closing task on a real robot using a pre-existing dataset. 
All results including videos can be found online at \url{https://sites.google.com/view/lompo/}.} +} + +@inproceedings{chen2020transferable, + title={Transferable active grasping and real embodied dataset}, + author={Chen, Xiangyu and Ye, Zelin and Sun, Jiankai and Fan, Yuda and Hu, Fang and Wang, Chenxi and Lu, Cewu}, + booktitle={2020 IEEE International Conference on Robotics and Automation (ICRA)}, + pages={3611--3618}, + year={2020}, + organization={IEEE} +} + +@article{sun2021adversarial, + title={Adversarial inverse reinforcement learning with self-attention dynamics model}, + author={Sun, Jiankai and Yu, Lantao and Dong, Pinqian and Lu, Bo and Zhou, Bolei}, + journal={IEEE Robotics and Automation Letters}, + volume={6}, + number={2}, + pages={1880--1886}, + year={2021}, + publisher={IEEE} +} + +@article{huang2018navigationnet, + title={NavigationNet: A large-scale interactive indoor navigation dataset}, + author={Huang, He and Shen, Yujing and Sun, Jiankai and Lu, Cewu}, + journal={arXiv preprint arXiv:1808.08374}, + year={2018} +} + +@inproceedings{xu2019depth, + title={Depth completion from sparse lidar data with depth-normal constraints}, + author={Xu, Yan and Zhu, Xinge and Shi, Jianping and Zhang, Guofeng and Bao, Hujun and Li, Hongsheng}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={2811--2820}, + year={2019} +} + +@inproceedings{zhu2020ssn, + title={Ssn: Shape signature networks for multi-class object detection from point clouds}, + author={Zhu, Xinge and Ma, Yuexin and Wang, Tai and Xu, Yan and Shi, Jianping and Lin, Dahua}, + booktitle={European Conference on Computer Vision}, + pages={581--597}, + year={2020}, + organization={Springer} +} + +@inproceedings{huang2019prior, + title={Prior guided dropout for robust visual localization in dynamic environments}, + author={Huang, Zhaoyang and Xu, Yan and Shi, Jianping and Zhou, Xiaowei and Bao, Hujun and Zhang, Guofeng}, + booktitle={Proceedings of the 
IEEE/CVF International Conference on Computer Vision}, + pages={2791--2800}, + year={2019} +} + +@article{xu2020selfvoxelo, + title={Selfvoxelo: Self-supervised lidar odometry with voxel-based deep neural networks}, + author={Xu, Yan and Huang, Zhaoyang and Lin, Kwan-Yee and Zhu, Xinge and Shi, Jianping and Bao, Hujun and Zhang, Guofeng and Li, Hongsheng}, + journal={arXiv preprint arXiv:2010.09343}, + year={2020} +} + +@article{huang2021life, + title={LIFE: Lighting Invariant Flow Estimation}, + author={Huang, Zhaoyang and Pan, Xiaokun and Xu, Runsen and Xu, Yan and Zhang, Guofeng and Li, Hongsheng and others}, + journal={arXiv preprint arXiv:2104.03097}, + year={2021} +} + +@inproceedings{huang2021vs, + title={VS-Net: Voting with Segmentation for Visual Localization}, + author={Huang, Zhaoyang and Zhou, Han and Li, Yijin and Yang, Bangbang and Xu, Yan and Zhou, Xiaowei and Bao, Hujun and Zhang, Guofeng and Li, Hongsheng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={6101--6111}, + year={2021} +} + +@article{yang2021pdnet, + title={PDNet: Towards Better One-stage Object Detection with Prediction Decoupling}, + author={Yang, Li and Xu, Yan and Wang, Shaoru and Yuan, Chunfeng and Zhang, Ziqi and Li, Bing and Hu, Weiming}, + journal={arXiv preprint arXiv:2104.13876}, + year={2021} +} + +@article{xu2022robust, + title={Robust Self-supervised LiDAR Odometry via Representative Structure Discovery and 3D Inherent Error Modeling}, + author={Xu, Yan and Lin, Junyi and Shi, Jianping and Zhang, Guofeng and Wang, Xiaogang and Li, Hongsheng}, + journal={IEEE Robotics and Automation Letters}, + year={2022}, + publisher={IEEE} +} + +@article{xu2022rnnpose, + title={RNNPose: Recurrent 6-DoF Object Pose Refinement with Robust Correspondence Field Estimation and Pose Optimization}, + author={Xu, Yan and Lin, Junyi and Zhang, Guofeng and Wang, Xiaogang and Li, Hongsheng}, + journal={arXiv preprint arXiv:2203.12870}, + 
year={2022} +} + +@article{Sun2022SelfSupervisedTA, + title={Self-Supervised Traffic Advisors: Distributed, Multi-view Traffic Prediction for Smart Cities}, + author={Jiankai Sun and Shreyas Kousik and David Fridovich-Keil and Mac Schwager}, + journal={arXiv preprint}, + year={2022} +} + +@ARTICLE{9813561, author={Qiu, Jianing and Chen, Lipeng and Gu, Xiao and Lo, Frank P.-W. and Tsai, Ya-Yen and Sun, Jiankai and Liu, Jiaqi and Lo, Benny}, journal={IEEE Robotics and Automation Letters}, title={Egocentric Human Trajectory Forecasting with a Wearable Camera and Multi-Modal Fusion}, year={2022}, volume={}, number={}, pages={1-8}, doi={10.1109/LRA.2022.3188101}} + +@article{MegBA, + title={MegBA: A High-Performance and Distributed Library for Large-Scale Bundle Adjustment}, + author={Ren, Jie and Liang, Wenteng and Yan, Ran and Mai, Luo and Liu, Shiwen and Liu, Xiao}, + journal={European Conference on Computer Vision}, + year={2022} +} + +@inproceedings{li2023behavior, + title={Behavior-1k: A benchmark for embodied ai with 1,000 everyday activities and realistic simulation}, + author={Li, Chengshu and Zhang, Ruohan and Wong, Josiah and Gokmen, Cem and Srivastava, Sanjana and Mart{\'\i}n-Mart{\'\i}n, Roberto and Wang, Chen and Levine, Gabrael and Lingelbach, Michael and Sun, Jiankai and others}, + booktitle={Conference on Robot Learning}, + pages={80--93}, + year={2023}, + organization={PMLR} +} + +@article{wang2023mimicplay, + title={MimicPlay: Long-Horizon Imitation Learning by Watching Human Play}, + author={Wang, Chen and Fan, Linxi and Sun, Jiankai and Zhang, Ruohan and Fei-Fei, Li and Xu, Danfei and Zhu, Yuke and Anandkumar, Anima}, + journal={arXiv preprint arXiv:2302.12422}, + year={2023} +} diff --git a/zh_chapters/references b/zh_chapters/references new file mode 120000 index 0000000..543a78f --- /dev/null +++ b/zh_chapters/references @@ -0,0 +1 @@ +/chivier-disk/hyq-home/Projects/openmlsys-zh/references \ No newline at end of file diff --git 
a/zh_chapters/static b/zh_chapters/static new file mode 120000 index 0000000..1ca9b6a --- /dev/null +++ b/zh_chapters/static @@ -0,0 +1 @@ +/chivier-disk/hyq-home/Projects/openmlsys-zh/static \ No newline at end of file