mirror of
https://github.com/eunomia-bpf/bpf-developer-tutorial.git
synced 2026-03-22 04:40:45 +08:00
add 45
This commit is contained in:
2
src/45-scx-nest/.config
Normal file
2
src/45-scx-nest/.config
Normal file
@@ -0,0 +1,2 @@
|
||||
level=Depth
|
||||
type=Scheduler
|
||||
3
src/45-scx-nest/.gitignore
vendored
Normal file
3
src/45-scx-nest/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
scx_simple
|
||||
scx_nest
|
||||
.output
|
||||
144
src/45-scx-nest/Makefile
Normal file
144
src/45-scx-nest/Makefile
Normal file
@@ -0,0 +1,144 @@
|
||||
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

# All build artifacts (objects, skeletons, static libs) land under OUTPUT.
OUTPUT := .output
CLANG ?= clang

# Vendored dependency locations (relative to this tutorial directory).
LIBBPF_SRC := $(abspath ../third_party/libbpf/src)
BPFTOOL_SRC := $(abspath ../third_party/bpftool/src)
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
LIBBLAZESYM_SRC := $(abspath ../third_party/blazesym/)
LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a)
LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h)

# Map `uname -m` onto the arch names used by the vendored vmlinux.h tree.
# Evaluate once with := — a plain `ARCH ?= $(shell ...)` keeps the recursive
# flavour and re-runs the whole pipeline on every expansion. The origin guard
# preserves ?= semantics: ARCH given on the command line or in the
# environment still wins.
ifeq ($(origin ARCH),undefined)
ARCH := $(shell uname -m | sed -e 's/x86_64/x86/' \
			       -e 's/arm.*/arm/' \
			       -e 's/aarch64/arm64/' \
			       -e 's/ppc64le/powerpc/' \
			       -e 's/mips.*/mips/' \
			       -e 's/riscv64/riscv/' \
			       -e 's/loongarch64/loongarch/')
endif
VMLINUX := ../third_party/vmlinux/$(ARCH)/vmlinux.h

# Use our own libbpf API headers and Linux UAPI headers distributed with
# libbpf to avoid dependency on system-wide headers, which could be missing or
# outdated
INCLUDES := -I$(OUTPUT) -I../third_party/libbpf/include/uapi -Iinclude/ -I$(dir $(VMLINUX))
CFLAGS := -g -Wall
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
|
||||
|
||||
# User-space applications to build. BZS_APPS additionally require cargo
# (they would link against the Rust blazesym symbolization library).
APPS = scx_nest

CARGO ?= $(shell which cargo)
ifeq ($(strip $(CARGO)),)
# No cargo on this host: skip the blazesym-dependent apps entirely.
BZS_APPS :=
else
# NOTE(review): both branches currently leave BZS_APPS empty; the list is a
# placeholder kept so blazesym-based tools can be added without reshaping
# this conditional.
BZS_APPS :=
APPS += $(BZS_APPS)
# Required by libblazesym
ALL_LDFLAGS += -lrt -ldl -lpthread -lm
endif
|
||||
|
||||
# Get Clang's default includes on this system. We'll explicitly add these dirs
# to the includes list when compiling with `-target bpf` because otherwise some
# architecture-specific dirs will be "missing" on some architectures/distros -
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
# sys/cdefs.h etc. might be missing.
#
# Use '-idirafter': Don't interfere with include mechanics except where the
# build would have failed anyways.
#
# Evaluate once (:= behind an origin guard): with plain `?=` the recursive
# flavour would fork `$(CLANG) -v -E` again for every .bpf.o compiled.
ifeq ($(origin CLANG_BPF_SYS_INCLUDES),undefined)
CLANG_BPF_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \
	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
endif

# V=1 makes recipes verbose; otherwise print terse " ACTION target" lines.
ifeq ($(V),1)
Q =
msg =
else
Q = @
msg = @printf ' %-8s %s%s\n' \
	"$(1)" \
	"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
	"$(if $(3), $(3))";
MAKEFLAGS += --no-print-directory
endif
|
||||
|
||||
# Set variable $(1) to $(2) unless the user already supplied it via the
# environment or the command line. A plain `?=` would not work here because
# CC and LD have built-in defaults (origin "default"), so `?=` would never
# assign them.
define allow-override
  $(if $(or $(findstring environment,$(origin $(1))),\
            $(findstring command line,$(origin $(1)))),,\
    $(eval $(1) = $(2)))
endef

# Honour CROSS_COMPILE for the user-space toolchain.
$(call allow-override,CC,$(CROSS_COMPILE)cc)
$(call allow-override,LD,$(CROSS_COMPILE)ld)
|
||||
|
||||
.PHONY: all
all: $(APPS)

.PHONY: clean
clean:
	$(call msg,CLEAN)
	$(Q)rm -rf $(OUTPUT) $(APPS)

# Output directories. They appear only as order-only prerequisites (after |)
# elsewhere, so their ever-changing mtimes never force rebuilds.
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
	$(call msg,MKDIR,$@)
	$(Q)mkdir -p $@

# Build libbpf as a static archive and install its headers into $(OUTPUT),
# so INCLUDES can resolve them without touching system-wide locations.
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
	$(call msg,LIB,$@)
	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
		    INCLUDEDIR= LIBDIR= UAPIDIR= \
		    install

# Build bpftool (bootstrap variant; ARCH/CROSS_COMPILE cleared so the host
# tool is always built natively).
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
	$(call msg,BPFTOOL,$@)
	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap

# Double-colon rule with no prerequisites: always invoke cargo and let cargo
# itself decide whether anything is out of date.
$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
	$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release

$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
	$(call msg,LIB,$@)
	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@

$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
	$(call msg,LIB,$@)
	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
|
||||
|
||||
# Build BPF code
# NOTE(review): $(wildcard %.h) never expands ('%' is not a glob character),
# so it contributes no prerequisites; kept for parity with the user-space rule.
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
	$(call msg,BPF,$@)
	$(Q)$(CLANG) -mlittle-endian -g -O2 -mcpu=v3 -target bpf -D__TARGET_ARCH_$(ARCH) \
		     $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
		     -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)

# Generate BPF skeletons
# The three link passes iteratively resolve extern/global data; the skeleton
# must be generated from the final linked3.o (the original generated it from
# the unlinked $< and discarded all three link outputs). The skeleton name is
# derived from the object's stem rather than $(APPS), which would emit a
# broken header as soon as APPS contained more than one entry.
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o)
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o)
	$(call msg,GEN-SKEL,$@)
	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(notdir $(patsubst %.bpf.o,%,$<)) > $@
|
||||
|
||||
# Build user-space code
# Each app's object depends on its generated skeleton header.
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h

$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
	$(call msg,CC,$@)
	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@

# blazesym-based apps additionally need the generated header and archive.
$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)

$(BZS_APPS): $(LIBBLAZESYM_OBJ)

# Build application binary
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
	$(call msg,BINARY,$@)
	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@

# delete failed targets
.DELETE_ON_ERROR:

# keep intermediate (.skel.h, .bpf.o, etc) targets
.SECONDARY:
|
||||
927
src/45-scx-nest/README.md
Normal file
927
src/45-scx-nest/README.md
Normal file
@@ -0,0 +1,927 @@
|
||||
# eBPF Tutorial by Example: Implementing the `scx_nest` Scheduler
|
||||
|
||||
In the ever-evolving landscape of system performance optimization, the ability to customize and extend kernel behavior is invaluable. One of the most powerful tools for achieving this is eBPF (extended Berkeley Packet Filter). In this tutorial, we'll explore the implementation of the `scx_nest` scheduler, an advanced eBPF program that leverages the `sched_ext` scheduler class introduced in Linux kernel version `6.12`. By the end of this guide, you'll understand how to build a sophisticated scheduler that dynamically adjusts task placement based on CPU core frequencies and utilization.
|
||||
|
||||
## Introduction to `sched_ext`
|
||||
|
||||
The `sched_ext` scheduler class marks a significant advancement in Linux kernel scheduling capabilities. Unlike traditional schedulers, `sched_ext` allows its behavior to be defined dynamically through a set of BPF (Berkeley Packet Filter) programs. This flexibility enables developers to implement custom scheduling algorithms tailored to specific workloads and system requirements.
|
||||
|
||||
## Understanding the `scx_nest` Scheduler
|
||||
|
||||
### Overview
|
||||
|
||||
The `scx_nest` scheduler is inspired by the Inria Paris paper titled "[OS Scheduling with Nest: Keeping Tasks Close Together on Warm Cores](https://hal.inria.fr/hal-03612592/file/paper.pdf)." Developed by Meta Platforms, Inc., `scx_nest` focuses on encouraging task placement on CPU cores that are likely to run at higher frequencies based on recent usage patterns. This approach aims to optimize performance by ensuring that tasks execute on the most efficient cores available.
|
||||
|
||||
The scheduler operates as a global weighted virtual time (vtime) scheduler, similar to the Completely Fair Scheduler (CFS), while utilizing the Nest algorithm to select idle cores during task wakeup. This dual strategy ensures that tasks are not only fairly distributed but also placed on cores that can execute them most effectively.
|
||||
|
||||
`scx_nest` is designed to optimize workloads with relatively low CPU utilization that can benefit from running on a subset of cores. By concentrating tasks on fewer cores, the scheduler helps maintain high frequencies on those cores, enhancing performance. However, for workloads that perform better when distributed across many cores to avoid cache thrashing, `scx_nest` may not be the ideal choice. Evaluating the suitability of `scx_nest` for a specific workload often requires experimentation.
|
||||
|
||||
Given its design, `scx_nest` is suitable for production environments, provided the hardware constraints are met. It performs optimally on single CCX (Core Complex) or single-socket hosts with a uniform L3 cache topology. While preemption is not implemented in the current version, the shared scheduling queue across all CPUs ensures that tasks at the front of the queue are executed promptly, provided there are enough CPUs available.
|
||||
|
||||
## High-Level Code Analysis
|
||||
|
||||
The `scx_nest` scheduler's implementation is intricate, involving various data structures, maps, and functions that work in harmony to manage task placement and CPU core utilization. The complete source code is available in the [eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) repository. Below, we'll dissect the core components of the scheduler, explaining each part in detail.
|
||||
|
||||
### Core Data Structures and Maps
|
||||
|
||||
#### Task Context (`task_ctx`)
|
||||
|
||||
Each task in the system has an associated context that maintains scheduling-related information. This context is crucial for making informed scheduling decisions based on the task's history and current state.
|
||||
|
||||
```c
|
||||
/* Per-task scheduling context */
|
||||
struct task_ctx {
|
||||
/*
|
||||
* A temporary cpumask for calculating a task's primary and reserve
|
||||
* mask.
|
||||
*/
|
||||
struct bpf_cpumask __kptr *tmp_mask;
|
||||
|
||||
/*
|
||||
* The number of times that a task observes that its previous core is
|
||||
* not idle. If this occurs r_impatient times in a row, a core is
|
||||
* attempted to be retrieved from either the reserve nest, or the
|
||||
* fallback nest.
|
||||
*/
|
||||
u32 prev_misses;
|
||||
|
||||
/*
|
||||
* A core that the task is "attached" to, meaning the last core that it
|
||||
* executed on at least twice in a row, and the core that it first
|
||||
* tries to migrate to on wakeup. The task only migrates to the
|
||||
* attached core if it is idle and in the primary nest.
|
||||
*/
|
||||
s32 attached_core;
|
||||
|
||||
/*
|
||||
* The last core that the task executed on. This is used to determine
|
||||
* if the task should attach to the core that it will execute on next.
|
||||
*/
|
||||
s32 prev_cpu;
|
||||
};
|
||||
```
|
||||
|
||||
The `task_ctx` structure holds a temporary CPU mask (`tmp_mask`) used for calculating the task's primary and reserve CPU sets. The `prev_misses` counter tracks how often the task's preferred core was not idle, influencing decisions to migrate the task to different cores. The `attached_core` indicates the core the task is currently bound to, ensuring it runs on a high-frequency core when possible. Lastly, `prev_cpu` records the last core the task executed on, aiding in maintaining task-core affinity.
|
||||
|
||||
#### Per-CPU Context (`pcpu_ctx`)
|
||||
|
||||
Each CPU has an associated context that manages timers and compaction state. This context helps in determining when a core should be demoted from the primary nest due to inactivity.
|
||||
|
||||
```c
|
||||
struct pcpu_ctx {
|
||||
/* The timer used to compact the core from the primary nest. */
|
||||
struct bpf_timer timer;
|
||||
|
||||
/* Whether the current core has been scheduled for compaction. */
|
||||
bool scheduled_compaction;
|
||||
};
|
||||
```
|
||||
|
||||
The `pcpu_ctx` structure contains a `bpf_timer` used to schedule compaction events and a boolean flag `scheduled_compaction` indicating whether a compaction has been scheduled for the core.
|
||||
|
||||
#### Maps
|
||||
|
||||
Several BPF maps are utilized to store contexts and manage timers:
|
||||
|
||||
```c
|
||||
/* Task storage map */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
__type(key, int);
|
||||
__type(value, struct task_ctx);
|
||||
} task_ctx_stor SEC(".maps");
|
||||
|
||||
/* Per-CPU contexts */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__uint(max_entries, 1024);
|
||||
__type(key, s32);
|
||||
__type(value, struct pcpu_ctx);
|
||||
} pcpu_ctxs SEC(".maps");
|
||||
|
||||
/* Statistics timer */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__uint(max_entries, 1);
|
||||
__type(key, u32);
|
||||
__type(value, struct stats_timer);
|
||||
} stats_timer SEC(".maps");
|
||||
```
|
||||
|
||||
- **`task_ctx_stor`:** This map stores the scheduling context for each task, enabling the scheduler to access and modify task-specific information.
|
||||
- **`pcpu_ctxs`:** An array map that holds the per-CPU contexts, allowing the scheduler to manage timers and compaction states for each CPU.
|
||||
- **`stats_timer`:** A single-entry array map used to manage a central timer for collecting scheduling statistics.
|
||||
|
||||
Additionally, the scheduler maintains masks for primary, reserved, other, and idle CPUs, as well as a statistics map to track various scheduler metrics.
|
||||
|
||||
### Core Functions
|
||||
|
||||
#### `stat_inc`
|
||||
|
||||
A helper function to increment scheduler statistics:
|
||||
|
||||
```c
|
||||
static __always_inline void stat_inc(u32 idx)
|
||||
{
|
||||
u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
|
||||
if (cnt_p)
|
||||
(*cnt_p)++;
|
||||
}
|
||||
```
|
||||
|
||||
This function looks up a counter in the `stats` map and increments it if the counter exists. It's used throughout the scheduler to track various events and states.
|
||||
|
||||
#### `vtime_before`
|
||||
|
||||
A utility function to compare virtual times:
|
||||
|
||||
```c
|
||||
static inline bool vtime_before(u64 a, u64 b)
|
||||
{
|
||||
return (s64)(a - b) < 0;
|
||||
}
|
||||
```
|
||||
|
||||
This function determines if virtual time `a` is before `b`, facilitating time-based scheduling decisions.
|
||||
|
||||
#### `try_make_core_reserved`
|
||||
|
||||
Attempts to promote a core to the reserved nest:
|
||||
|
||||
```c
|
||||
static __always_inline void
|
||||
try_make_core_reserved(s32 cpu, struct bpf_cpumask * reserved, bool promotion)
|
||||
{
|
||||
s32 tmp_nr_reserved;
|
||||
|
||||
/*
|
||||
* This check is racy, but that's OK. If we incorrectly fail to promote
|
||||
* a core to reserve, it's because another context added or removed a
|
||||
* core from reserved in this small window. It will balance out over
|
||||
* subsequent wakeups.
|
||||
*/
|
||||
tmp_nr_reserved = nr_reserved;
|
||||
if (tmp_nr_reserved < r_max) {
|
||||
/*
|
||||
* It's possible that we could exceed r_max for a time here,
|
||||
* but that should balance out as more cores are either demoted
|
||||
* or fail to be promoted into the reserve nest.
|
||||
*/
|
||||
__sync_fetch_and_add(&nr_reserved, 1);
|
||||
bpf_cpumask_set_cpu(cpu, reserved);
|
||||
if (promotion)
|
||||
stat_inc(NEST_STAT(PROMOTED_TO_RESERVED));
|
||||
else
|
||||
stat_inc(NEST_STAT(DEMOTED_TO_RESERVED));
|
||||
} else {
|
||||
bpf_cpumask_clear_cpu(cpu, reserved);
|
||||
stat_inc(NEST_STAT(RESERVED_AT_CAPACITY));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `try_make_core_reserved` function attempts to add a CPU core to the reserved mask. It first checks if the number of reserved cores (`nr_reserved`) is below the maximum allowed (`r_max`). If so, it increments the `nr_reserved` counter and adds the core to the reserved mask. Depending on whether the core is being promoted or demoted, it increments the corresponding statistic. If the reserved capacity is full, it clears the core from the reserved mask and updates the relevant statistic.
|
||||
|
||||
#### `update_attached`
|
||||
|
||||
Updates the task's attached core based on recent execution:
|
||||
|
||||
```c
|
||||
static void update_attached(struct task_ctx *tctx, s32 prev_cpu, s32 new_cpu)
|
||||
{
|
||||
if (tctx->prev_cpu == new_cpu)
|
||||
tctx->attached_core = new_cpu;
|
||||
tctx->prev_cpu = prev_cpu;
|
||||
}
|
||||
```
|
||||
|
||||
This function updates the `attached_core` for a task. If the task has executed on the same core consecutively, it attaches the task to that core. It then updates the `prev_cpu` to reflect the latest core the task ran on.
|
||||
|
||||
#### `compact_primary_core`
|
||||
|
||||
Handles the compaction of a primary core by demoting it to the reserve nest:
|
||||
|
||||
```c
|
||||
static int compact_primary_core(void *map, int *key, struct bpf_timer *timer)
|
||||
{
|
||||
struct bpf_cpumask *primary, *reserve;
|
||||
s32 cpu = bpf_get_smp_processor_id();
|
||||
struct pcpu_ctx *pcpu_ctx;
|
||||
|
||||
stat_inc(NEST_STAT(CALLBACK_COMPACTED));
|
||||
|
||||
/*
|
||||
* If we made it to this callback, it means that the timer callback was
|
||||
* never cancelled, and so the core needs to be demoted from the
|
||||
* primary nest.
|
||||
*/
|
||||
pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu);
|
||||
if (!pcpu_ctx) {
|
||||
scx_bpf_error("Couldn't lookup pcpu ctx");
|
||||
return 0;
|
||||
}
|
||||
bpf_rcu_read_lock();
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!primary || !reserve) {
|
||||
scx_bpf_error("Couldn't find primary or reserve");
|
||||
bpf_rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
||||
bpf_cpumask_clear_cpu(cpu, primary);
|
||||
try_make_core_reserved(cpu, reserve, false);
|
||||
bpf_rcu_read_unlock();
|
||||
pcpu_ctx->scheduled_compaction = false;
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
When the compaction timer expires, `compact_primary_core` is invoked. It demotes the current CPU core from the primary nest to the reserve nest by clearing it from the primary mask and attempting to add it to the reserve mask using `try_make_core_reserved`. This ensures that inactive cores are efficiently managed, maintaining a balance between performance and resource utilization.
|
||||
|
||||
#### `nest_select_cpu`
|
||||
|
||||
Determines the appropriate CPU for a task upon waking up:
|
||||
|
||||
```c
|
||||
s32 BPF_STRUCT_OPS(nest_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
|
||||
{
|
||||
struct bpf_cpumask *p_mask, *primary, *reserve;
|
||||
s32 cpu;
|
||||
struct task_ctx *tctx;
|
||||
struct pcpu_ctx *pcpu_ctx;
|
||||
bool direct_to_primary = false, reset_impatient = true;
|
||||
|
||||
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
|
||||
if (!tctx)
|
||||
return -ENOENT;
|
||||
|
||||
bpf_rcu_read_lock();
|
||||
p_mask = tctx->tmp_mask;
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!p_mask || !primary || !reserve) {
|
||||
bpf_rcu_read_unlock();
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
tctx->prev_cpu = prev_cpu;
|
||||
|
||||
bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));
|
||||
|
||||
/* First try to wake the task on its attached core. */
|
||||
if (bpf_cpumask_test_cpu(tctx->attached_core, cast_mask(p_mask)) &&
|
||||
scx_bpf_test_and_clear_cpu_idle(tctx->attached_core)) {
|
||||
cpu = tctx->attached_core;
|
||||
stat_inc(NEST_STAT(WAKEUP_ATTACHED));
|
||||
goto migrate_primary;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to stay on the previous core if it's in the primary set, and
|
||||
* there's no hypertwin. If the previous core is the core the task is
|
||||
* attached to, don't bother as we already just tried that above.
|
||||
*/
|
||||
if (prev_cpu != tctx->attached_core &&
|
||||
bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) &&
|
||||
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
|
||||
cpu = prev_cpu;
|
||||
stat_inc(NEST_STAT(WAKEUP_PREV_PRIMARY));
|
||||
goto migrate_primary;
|
||||
}
|
||||
|
||||
if (find_fully_idle) {
|
||||
/* Then try any fully idle core in primary. */
|
||||
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask),
|
||||
SCX_PICK_IDLE_CORE);
|
||||
if (cpu >= 0) {
|
||||
stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_PRIMARY));
|
||||
goto migrate_primary;
|
||||
}
|
||||
}
|
||||
|
||||
/* Then try _any_ idle core in primary, even if its hypertwin is active. */
|
||||
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
|
||||
if (cpu >= 0) {
|
||||
stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_PRIMARY));
|
||||
goto migrate_primary;
|
||||
}
|
||||
|
||||
if (r_impatient > 0 && ++tctx->prev_misses >= r_impatient) {
|
||||
direct_to_primary = true;
|
||||
tctx->prev_misses = 0;
|
||||
stat_inc(NEST_STAT(TASK_IMPATIENT));
|
||||
}
|
||||
|
||||
reset_impatient = false;
|
||||
|
||||
/* Then try any fully idle core in reserve. */
|
||||
bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(reserve));
|
||||
if (find_fully_idle) {
|
||||
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask),
|
||||
SCX_PICK_IDLE_CORE);
|
||||
if (cpu >= 0) {
|
||||
stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_RESERVE));
|
||||
goto promote_to_primary;
|
||||
}
|
||||
}
|
||||
|
||||
/* Then try _any_ idle core in reserve, even if its hypertwin is active. */
|
||||
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
|
||||
if (cpu >= 0) {
|
||||
stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_RESERVE));
|
||||
goto promote_to_primary;
|
||||
}
|
||||
|
||||
/* Then try _any_ idle core in the task's cpumask. */
|
||||
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
|
||||
if (cpu >= 0) {
|
||||
/*
|
||||
* We found a core that (we didn't _think_) is in any nest.
|
||||
* This means that we need to either promote the core to the
|
||||
* reserve nest, or if we're going direct to primary due to
|
||||
* r_impatient being exceeded, promote directly to primary.
|
||||
*
|
||||
* We have to do one final check here to see if the core is in
|
||||
* the primary or reserved cpumask because we could potentially
|
||||
* race with the core changing states between AND'ing the
|
||||
* primary and reserve masks with p->cpus_ptr above, and
|
||||
* atomically reserving it from the idle mask with
|
||||
* scx_bpf_pick_idle_cpu(). This is also technically true of
|
||||
* the checks above, but in all of those cases we just put the
|
||||
* core directly into the primary mask so it's not really that
|
||||
* big of a problem. Here, we want to make sure that we don't
|
||||
* accidentally put a core into the reserve nest that was e.g.
|
||||
* already in the primary nest. This is unlikely, but we check
|
||||
* for it on what should be a relatively cold path regardless.
|
||||
*/
|
||||
stat_inc(NEST_STAT(WAKEUP_IDLE_OTHER));
|
||||
if (bpf_cpumask_test_cpu(cpu, cast_mask(primary)))
|
||||
goto migrate_primary;
|
||||
else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve)))
|
||||
goto promote_to_primary;
|
||||
else if (direct_to_primary)
|
||||
goto promote_to_primary;
|
||||
else
|
||||
try_make_core_reserved(cpu, reserve, true);
|
||||
bpf_rcu_read_unlock();
|
||||
return cpu;
|
||||
}
|
||||
|
||||
bpf_rcu_read_unlock();
|
||||
return prev_cpu;
|
||||
|
||||
promote_to_primary:
|
||||
stat_inc(NEST_STAT(PROMOTED_TO_PRIMARY));
|
||||
migrate_primary:
|
||||
if (reset_impatient)
|
||||
tctx->prev_misses = 0;
|
||||
pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu);
|
||||
if (pcpu_ctx) {
|
||||
if (pcpu_ctx->scheduled_compaction) {
|
||||
if (bpf_timer_cancel(&pcpu_ctx->timer) < 0)
|
||||
scx_bpf_error("Failed to cancel pcpu timer");
|
||||
if (bpf_timer_set_callback(&pcpu_ctx->timer, compact_primary_core))
|
||||
scx_bpf_error("Failed to re-arm pcpu timer");
|
||||
pcpu_ctx->scheduled_compaction = false;
|
||||
stat_inc(NEST_STAT(CANCELLED_COMPACTION));
|
||||
}
|
||||
} else {
|
||||
scx_bpf_error("Failed to lookup pcpu ctx");
|
||||
}
|
||||
bpf_cpumask_set_cpu(cpu, primary);
|
||||
/*
|
||||
* Check to see whether the CPU is in the reserved nest. This can
|
||||
* happen if the core is compacted concurrently with us trying to place
|
||||
* the currently-waking task onto it. Similarly, this is the expected
|
||||
* state of the core if we found the core in the reserve nest and are
|
||||
* promoting it.
|
||||
*
|
||||
* We don't have to worry about racing with any other waking task here
|
||||
* because we've atomically reserved the core with (some variant of)
|
||||
* scx_bpf_pick_idle_cpu().
|
||||
*/
|
||||
if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) {
|
||||
__sync_sub_and_fetch(&nr_reserved, 1);
|
||||
bpf_cpumask_clear_cpu(cpu, reserve);
|
||||
}
|
||||
bpf_rcu_read_unlock();
|
||||
update_attached(tctx, prev_cpu, cpu);
|
||||
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
|
||||
return cpu;
|
||||
}
|
||||
```
|
||||
|
||||
The `nest_select_cpu` function is the heart of the `scx_nest` scheduler. When a task wakes up, this function determines the most suitable CPU core for its execution. The function follows a series of checks to ensure that tasks are placed on high-frequency, idle cores, promoting efficiency and performance.
|
||||
|
||||
Initially, it retrieves the task's context from the `task_ctx_stor` map. It then locks the read-copy-update (RCU) lock to safely access the primary and reserve CPU masks. The scheduler first attempts to place the task on its attached core, ensuring core affinity. If the attached core is not idle, it tries the previous core. Depending on various conditions, including the task's impatience (`r_impatient`) and the availability of idle cores in the primary and reserve nests, the scheduler decides whether to migrate the task, promote a core to the primary nest, or demote a core to the reserve nest.
|
||||
|
||||
Throughout the process, the scheduler updates relevant statistics to provide insights into its operations. The use of RCU locks ensures that the scheduler's decisions are made safely without interfering with other concurrent operations.
|
||||
|
||||
#### `nest_enqueue`
|
||||
|
||||
Handles the enqueuing of tasks into the scheduling queue:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_enqueue, struct task_struct *p, u64 enq_flags)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
u64 vtime = p->scx.dsq_vtime;
|
||||
|
||||
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
|
||||
if (!tctx) {
|
||||
scx_bpf_error("Unable to find task ctx");
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Limit the amount of budget that an idling task can accumulate
|
||||
* to one slice.
|
||||
*/
|
||||
if (vtime_before(vtime, vtime_now - slice_ns))
|
||||
vtime = vtime_now - slice_ns;
|
||||
|
||||
scx_bpf_dispatch_vtime(p, FALLBACK_DSQ_ID, slice_ns, vtime, enq_flags);
|
||||
}
|
||||
```
|
||||
|
||||
The `nest_enqueue` function manages the queuing of tasks, adjusting their virtual time (`vtime`) to ensure fairness and prevent tasks from accumulating excessive execution budget while idling. If a task's `vtime` falls below a certain threshold, it's adjusted to maintain balance within the scheduler.
|
||||
|
||||
#### `nest_dispatch`
|
||||
|
||||
Manages the dispatching of tasks to CPU cores:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_dispatch, s32 cpu, struct task_struct *prev)
|
||||
{
|
||||
struct pcpu_ctx *pcpu_ctx;
|
||||
struct bpf_cpumask *primary, *reserve;
|
||||
s32 key = cpu;
|
||||
bool in_primary;
|
||||
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!primary || !reserve) {
|
||||
scx_bpf_error("No primary or reserve cpumask");
|
||||
return;
|
||||
}
|
||||
|
||||
pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key);
|
||||
if (!pcpu_ctx) {
|
||||
scx_bpf_error("Failed to lookup pcpu ctx");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!scx_bpf_consume(FALLBACK_DSQ_ID)) {
|
||||
in_primary = bpf_cpumask_test_cpu(cpu, cast_mask(primary));
|
||||
|
||||
if (prev && (prev->scx.flags & SCX_TASK_QUEUED) && in_primary) {
|
||||
scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, slice_ns, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
stat_inc(NEST_STAT(NOT_CONSUMED));
|
||||
if (in_primary) {
|
||||
/*
|
||||
* Immediately demote a primary core if the previous
|
||||
* task on it is dying
|
||||
*
|
||||
* Note that we elect to not compact the "first" CPU in
|
||||
* the mask so as to encourage at least one core to
|
||||
* remain in the nest. It would be better to check for
|
||||
* whether there is only one core remaining in the
|
||||
* nest, but BPF doesn't yet have a kfunc for querying
|
||||
* cpumask weight.
|
||||
*/
|
||||
if ((prev && prev->__state == TASK_DEAD) &&
|
||||
(cpu != bpf_cpumask_first(cast_mask(primary)))) {
|
||||
stat_inc(NEST_STAT(EAGERLY_COMPACTED));
|
||||
bpf_cpumask_clear_cpu(cpu, primary);
|
||||
try_make_core_reserved(cpu, reserve, false);
|
||||
} else {
|
||||
pcpu_ctx->scheduled_compaction = true;
|
||||
/*
|
||||
* The core isn't being used anymore. Set a
|
||||
* timer to remove the core from the nest in
|
||||
* p_remove if it's still unused by that point.
|
||||
*/
|
||||
bpf_timer_start(&pcpu_ctx->timer, p_remove_ns,
|
||||
BPF_F_TIMER_CPU_PIN);
|
||||
stat_inc(NEST_STAT(SCHEDULED_COMPACTION));
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
stat_inc(NEST_STAT(CONSUMED));
|
||||
}
|
||||
```
|
||||
|
||||
The `nest_dispatch` function is responsible for dispatching tasks to CPU cores. It first tries to consume a task from the fallback dispatch queue (`FALLBACK_DSQ_ID`). If nothing is consumed but the previous task is still runnable (`SCX_TASK_QUEUED`) and the CPU belongs to the primary nest, the previous task is simply dispatched again for another slice. Otherwise, if the previous task on the CPU is dead and the CPU is not the first one in the primary mask, the scheduler immediately demotes the core to the reserve nest; if not, it arms a CPU-pinned compaction timer so the core can be demoted after `p_remove_ns` if it is still unused by then. If a task is successfully consumed from the fallback queue, it increments the corresponding statistic.
|
||||
|
||||
#### `nest_running`
|
||||
|
||||
Updates the global virtual time when a task starts running:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_running, struct task_struct *p)
|
||||
{
|
||||
/*
|
||||
* Global vtime always progresses forward as tasks start executing. The
|
||||
* test and update can be performed concurrently from multiple CPUs and
|
||||
* thus racy. Any error should be contained and temporary. Let's just
|
||||
* live with it.
|
||||
*/
|
||||
if (vtime_before(vtime_now, p->scx.dsq_vtime))
|
||||
vtime_now = p->scx.dsq_vtime;
|
||||
}
|
||||
```
|
||||
|
||||
The `nest_running` function ensures that the global virtual time (`vtime_now`) progresses forward as tasks start executing. This mechanism helps maintain fairness and temporal consistency across the scheduler's operations.
|
||||
|
||||
#### `nest_stopping`
|
||||
|
||||
Handles the stopping of a task, adjusting its virtual time:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_stopping, struct task_struct *p, bool runnable)
|
||||
{
|
||||
/* scale the execution time by the inverse of the weight and charge */
|
||||
p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight;
|
||||
}
|
||||
```
|
||||
|
||||
When a task stops running, `nest_stopping` adjusts its virtual time based on its execution slice and weight. This adjustment ensures that tasks are fairly accounted for in the scheduler's virtual time calculations, maintaining balance and preventing any single task from monopolizing CPU resources.
|
||||
|
||||
#### `nest_init_task`
|
||||
|
||||
Initializes a new task's context:
|
||||
|
||||
```c
|
||||
s32 BPF_STRUCT_OPS(nest_init_task, struct task_struct *p,
|
||||
struct scx_init_task_args *args)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
struct bpf_cpumask *cpumask;
|
||||
|
||||
/*
|
||||
* @p is new. Let's ensure that its task_ctx is available. We can sleep
|
||||
* in this function and the following will automatically use GFP_KERNEL.
|
||||
*/
|
||||
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
|
||||
BPF_LOCAL_STORAGE_GET_F_CREATE);
|
||||
if (!tctx)
|
||||
return -ENOMEM;
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
|
||||
cpumask = bpf_kptr_xchg(&tctx->tmp_mask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
tctx->attached_core = -1;
|
||||
tctx->prev_cpu = -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
The `nest_init_task` function initializes the scheduling context for a new task. It ensures that the task's context is available by retrieving it from the `task_ctx_stor` map, creating a new `bpf_cpumask` for temporary calculations, and setting initial values for `attached_core` and `prev_cpu`.
|
||||
|
||||
#### `nest_enable`
|
||||
|
||||
Enables scheduling for a task by setting its virtual time:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_enable, struct task_struct *p)
|
||||
{
|
||||
p->scx.dsq_vtime = vtime_now;
|
||||
}
|
||||
```
|
||||
|
||||
The `nest_enable` function activates scheduling for a task by initializing its virtual time (`dsq_vtime`) to the current global virtual time (`vtime_now`). This ensures that the task's scheduling state is synchronized with the scheduler's virtual time.
|
||||
|
||||
#### `stats_timerfn`
|
||||
|
||||
Handles periodic statistics collection:
|
||||
|
||||
```c
|
||||
static int stats_timerfn(void *map, int *key, struct bpf_timer *timer)
|
||||
{
|
||||
s32 cpu;
|
||||
struct bpf_cpumask *primary, *reserve;
|
||||
const struct cpumask *idle;
|
||||
stats_primary_mask = 0;
|
||||
stats_reserved_mask = 0;
|
||||
stats_other_mask = 0;
|
||||
stats_idle_mask = 0;
|
||||
long err;
|
||||
|
||||
bpf_rcu_read_lock();
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!primary || !reserve) {
|
||||
bpf_rcu_read_unlock();
|
||||
scx_bpf_error("Failed to lookup primary or reserve");
|
||||
return 0;
|
||||
}
|
||||
|
||||
idle = scx_bpf_get_idle_cpumask();
|
||||
bpf_for(cpu, 0, nr_cpus) {
|
||||
if (bpf_cpumask_test_cpu(cpu, cast_mask(primary)))
|
||||
stats_primary_mask |= (1ULL << cpu);
|
||||
else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve)))
|
||||
stats_reserved_mask |= (1ULL << cpu);
|
||||
else
|
||||
stats_other_mask |= (1ULL << cpu);
|
||||
|
||||
if (bpf_cpumask_test_cpu(cpu, idle))
|
||||
stats_idle_mask |= (1ULL << cpu);
|
||||
}
|
||||
bpf_rcu_read_unlock();
|
||||
scx_bpf_put_idle_cpumask(idle);
|
||||
|
||||
err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0);
|
||||
if (err)
|
||||
scx_bpf_error("Failed to arm stats timer");
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
The `stats_timerfn` function is invoked periodically by a central timer to collect and update scheduler statistics. It captures the current state of CPU cores, categorizing them into primary, reserve, other, and idle masks. This information provides insights into how the scheduler is managing CPU resources and task placement over time. After collecting the statistics, the function re-arms the timer to ensure continuous monitoring.
|
||||
|
||||
#### `nest_init`
|
||||
|
||||
Initializes the `scx_nest` scheduler:
|
||||
|
||||
```c
|
||||
s32 BPF_STRUCT_OPS_SLEEPABLE(nest_init)
|
||||
{
|
||||
struct bpf_cpumask *cpumask;
|
||||
s32 cpu;
|
||||
int err;
|
||||
struct bpf_timer *timer;
|
||||
u32 key = 0;
|
||||
|
||||
err = scx_bpf_create_dsq(FALLBACK_DSQ_ID, NUMA_NO_NODE);
|
||||
if (err) {
|
||||
scx_bpf_error("Failed to create fallback DSQ");
|
||||
return err;
|
||||
}
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
bpf_cpumask_clear(cpumask);
|
||||
cpumask = bpf_kptr_xchg(&primary_cpumask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
|
||||
bpf_cpumask_clear(cpumask);
|
||||
cpumask = bpf_kptr_xchg(&reserve_cpumask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
bpf_for(cpu, 0, nr_cpus) {
|
||||
s32 key = cpu;
|
||||
struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key);
|
||||
|
||||
if (!ctx) {
|
||||
scx_bpf_error("Failed to lookup pcpu_ctx");
|
||||
return -ENOENT;
|
||||
}
|
||||
ctx->scheduled_compaction = false;
|
||||
if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME)) {
|
||||
scx_bpf_error("Failed to initialize pcpu timer");
|
||||
return -EINVAL;
|
||||
}
|
||||
err = bpf_timer_set_callback(&ctx->timer, compact_primary_core);
|
||||
if (err) {
|
||||
scx_bpf_error("Failed to set pcpu timer callback");
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
timer = bpf_map_lookup_elem(&stats_timer, &key);
|
||||
if (!timer) {
|
||||
scx_bpf_error("Failed to lookup central timer");
|
||||
return -ESRCH;
|
||||
}
|
||||
bpf_timer_init(timer, &stats_timer, CLOCK_BOOTTIME);
|
||||
bpf_timer_set_callback(timer, stats_timerfn);
|
||||
err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0);
|
||||
if (err)
|
||||
scx_bpf_error("Failed to arm stats timer");
|
||||
|
||||
return err;
|
||||
}
|
||||
```
|
||||
|
||||
The `nest_init` function sets up the `scx_nest` scheduler during system initialization. It creates a fallback dispatch queue (`FALLBACK_DSQ_ID`) and initializes the primary and reserve CPU masks. For each CPU, it retrieves the per-CPU context from the `pcpu_ctxs` map, initializes a timer for core compaction, and sets the callback to `compact_primary_core`. Additionally, it initializes and starts the central statistics timer (`stats_timer`) with the callback function `stats_timerfn`, ensuring that scheduler statistics are continuously monitored.
|
||||
|
||||
#### `nest_exit`
|
||||
|
||||
Handles cleanup when the scheduler exits:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei)
|
||||
{
|
||||
UEI_RECORD(uei, ei);
|
||||
}
|
||||
```
|
||||
|
||||
The `nest_exit` function records exit information and performs any necessary cleanup when the scheduler is being removed or the system is shutting down. This ensures that all resources are properly released and that the system remains stable.
|
||||
|
||||
#### `SCX_OPS_DEFINE`
|
||||
|
||||
Defines the operations structure for the `scx_nest` scheduler:
|
||||
|
||||
```c
|
||||
SCX_OPS_DEFINE(nest_ops,
|
||||
.select_cpu = (void *)nest_select_cpu,
|
||||
.enqueue = (void *)nest_enqueue,
|
||||
.dispatch = (void *)nest_dispatch,
|
||||
.running = (void *)nest_running,
|
||||
.stopping = (void *)nest_stopping,
|
||||
.init_task = (void *)nest_init_task,
|
||||
.enable = (void *)nest_enable,
|
||||
.init = (void *)nest_init,
|
||||
.exit = (void *)nest_exit,
|
||||
.flags = 0,
|
||||
.name = "nest");
|
||||
```
|
||||
|
||||
The `SCX_OPS_DEFINE` macro binds all the scheduler's functions to the `nest_ops` structure, which the `sched_ext` framework uses to interface with the scheduler. This structure ensures that the scheduler's operations are correctly mapped and invoked by the kernel during task scheduling events.
|
||||
|
||||
### Initialization and Cleanup
|
||||
|
||||
Proper initialization and cleanup are crucial for the scheduler's stability and performance.
|
||||
|
||||
#### `nest_init` Function
|
||||
|
||||
The `nest_init` function is responsible for setting up the scheduler during system initialization. Here's how it operates:
|
||||
|
||||
1. **Create Fallback Dispatch Queue:**
|
||||
- It calls `scx_bpf_create_dsq` to create a fallback dispatch queue (`FALLBACK_DSQ_ID`). If this fails, it logs an error and exits.
|
||||
|
||||
2. **Initialize Primary and Reserve CPU Masks:**
|
||||
- It creates and clears a new `bpf_cpumask` for the primary mask.
|
||||
- It exchanges the newly created mask with the existing `primary_cpumask`. If an old mask exists, it releases it.
|
||||
- The same process is repeated for the reserve mask.
|
||||
|
||||
3. **Initialize Per-CPU Contexts:**
|
||||
- For each CPU, it retrieves the per-CPU context from the `pcpu_ctxs` map.
|
||||
- It initializes the `scheduled_compaction` flag to `false`.
|
||||
- It initializes the timer using `bpf_timer_init` and sets the callback to `compact_primary_core` using `bpf_timer_set_callback`.
|
||||
- If any of these steps fail, it logs an error and exits.
|
||||
|
||||
4. **Initialize and Start Statistics Timer:**
|
||||
- It retrieves the central statistics timer from the `stats_timer` map.
|
||||
- It initializes the timer and sets its callback to `stats_timerfn`.
|
||||
- It starts the timer with a delay of `sampling_cadence_ns - 5000` nanoseconds.
|
||||
- If starting the timer fails, it logs an error.
|
||||
|
||||
5. **Return:**
|
||||
- The function returns the result of the timer initialization, indicating success or failure.
|
||||
|
||||
This initialization process ensures that all necessary components of the scheduler are correctly set up, including CPU masks, timers, and dispatch queues.
|
||||
|
||||
#### `nest_exit` Function
|
||||
|
||||
The `nest_exit` function handles cleanup when the scheduler is being removed or the system is shutting down:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei)
|
||||
{
|
||||
UEI_RECORD(uei, ei);
|
||||
}
|
||||
```
|
||||
|
||||
This function records exit information through the `UEI_RECORD` macro, ensuring that any necessary cleanup actions are performed. Proper cleanup is essential to maintain system stability and prevent resource leaks.
|
||||
|
||||
### Final Scheduler Definition
|
||||
|
||||
The `SCX_OPS_DEFINE` macro binds all the scheduler's functions into a single structure used by the `sched_ext` framework:
|
||||
|
||||
```c
|
||||
SCX_OPS_DEFINE(nest_ops,
|
||||
.select_cpu = (void *)nest_select_cpu,
|
||||
.enqueue = (void *)nest_enqueue,
|
||||
.dispatch = (void *)nest_dispatch,
|
||||
.running = (void *)nest_running,
|
||||
.stopping = (void *)nest_stopping,
|
||||
.init_task = (void *)nest_init_task,
|
||||
.enable = (void *)nest_enable,
|
||||
.init = (void *)nest_init,
|
||||
.exit = (void *)nest_exit,
|
||||
.flags = 0,
|
||||
.name = "nest");
|
||||
```
|
||||
|
||||
This structure, `nest_ops`, effectively registers the scheduler's operations with the `sched_ext` framework, ensuring that the scheduler responds appropriately to various scheduling events and system states.
|
||||
|
||||
## Compilation and Execution
|
||||
|
||||
To compile and run the `scx_nest` scheduler, follow these steps:
|
||||
|
||||
**Compile the Code:**
|
||||
|
||||
Use `make` to build the scheduler. Ensure that you have the necessary build tools and kernel headers installed.
|
||||
|
||||
```bash
|
||||
make
|
||||
```
|
||||
|
||||
**Run the Scheduler:**
|
||||
|
||||
Execute the compiled scheduler binary. Depending on your system's configuration and permissions, you might need to run this command with elevated privileges.
|
||||
|
||||
```bash
|
||||
./scx_nest
|
||||
```
|
||||
|
||||
### Sample Output
|
||||
|
||||
Upon running the scheduler, you should observe output similar to the following:
|
||||
|
||||
```
|
||||
# ./scx_nest
|
||||
|
||||
Wakeup stats
|
||||
------------
|
||||
WAKEUP_ATTACHED=150
|
||||
WAKEUP_PREV_PRIMARY=61
|
||||
WAKEUP_FULLY_IDLE_PRIMARY=0
|
||||
WAKEUP_ANY_IDLE_PRIMARY=103
|
||||
WAKEUP_FULLY_IDLE_RESERVE=0
|
||||
WAKEUP_ANY_IDLE_RESERVE=216
|
||||
WAKEUP_IDLE_OTHER=11
|
||||
|
||||
|
||||
Nest stats
|
||||
----------
|
||||
TASK_IMPATIENT=67
|
||||
PROMOTED_TO_PRIMARY=217
|
||||
PROMOTED_TO_RESERVED=8
|
||||
DEMOTED_TO_RESERVED=212
|
||||
RESERVED_AT_CAPACITY=6
|
||||
SCHEDULED_COMPACTION=525
|
||||
CANCELLED_COMPACTION=314
|
||||
EAGERLY_COMPACTED=8
|
||||
CALLBACK_COMPACTED=208
|
||||
|
||||
|
||||
Consume stats
|
||||
-------------
|
||||
CONSUMED=166
|
||||
NOT_CONSUMED=667
|
||||
|
||||
|
||||
|
||||
Masks
|
||||
-----
|
||||
PRIMARY ( 0): | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
RESERVED (10): | ***-*--*--------------------------------------------------------***-*--*-------------------------------------------------------- |
|
||||
OTHER (128): | ******************************************************************************************************************************** |
|
||||
IDLE (16): | ********--------------------------------------------------------********-------------------------------------------------------- |
|
||||
|
||||
|
||||
^CEXIT: unregistered from user space
|
||||
```
|
||||
|
||||
This output provides comprehensive statistics on task wakeups, nest operations, consumption rates, and CPU mask statuses. It indicates how the scheduler is managing tasks and CPU cores, showcasing the effectiveness of the `scx_nest` algorithm in maintaining high-frequency core utilization and efficient task placement.
|
||||
|
||||
## Summary and Call to Action
|
||||
|
||||
In this tutorial, we've delved into the implementation of the `scx_nest` scheduler, an advanced eBPF program that customizes CPU scheduling to optimize performance based on core frequency and utilization. By leveraging the `sched_ext` framework, `scx_nest` demonstrates how eBPF can dynamically define scheduling behavior, offering flexibility and control beyond traditional schedulers.
|
||||
|
||||
Key takeaways include:
|
||||
|
||||
- Understanding the flexibility and power of the `sched_ext` scheduler class.
|
||||
- Exploring the intricate data structures and maps that underpin the `scx_nest` scheduler.
|
||||
- Analyzing core functions that manage task placement, core compaction, and statistics collection.
|
||||
- Learning how to compile and execute the scheduler, observing its impact through detailed statistics.
|
||||
|
||||
The `scx_nest` scheduler serves as an excellent example of how advanced eBPF programming can be utilized to implement complex system functionalities in a flexible and dynamic manner.
|
||||
|
||||
If you'd like to dive deeper into eBPF and explore more advanced examples, visit our tutorial repository at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or check out our website at [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/).
|
||||
|
||||
## References
|
||||
|
||||
The original source code for the `scx_nest` scheduler is available in the [sched-ext/scx](https://github.com/sched-ext/scx) repository.
|
||||
|
||||
Additional resources that can enhance your understanding include:
|
||||
|
||||
- **Linux Kernel Documentation:** [Scheduler Ext Documentation](https://www.kernel.org/doc/html/next/scheduler/sched-ext.html)
|
||||
- **Kernel Source Tree:** [Linux Kernel `sched_ext` Tools](https://github.com/torvalds/linux/tree/master/tools/sched_ext)
|
||||
- **eBPF Official Documentation:** [https://ebpf.io/docs/](https://ebpf.io/docs/)
|
||||
- **libbpf Documentation:** [https://github.com/libbpf/libbpf](https://github.com/libbpf/libbpf)
|
||||
|
||||
Feel free to explore these resources to expand your knowledge and continue your journey into advanced eBPF programming!
|
||||
---

<!-- 以下为中文翻译版本:src/45-scx-nest/README.zh.md(新文件,共 903 行) -->
|
||||
# eBPF 示例教程:实现 `scx_nest` 调度器
|
||||
|
||||
在系统性能优化不断发展的领域中,自定义和扩展内核行为的能力是非常宝贵的。实现这一目标的最强大工具之一是 eBPF(扩展的 Berkeley 包过滤器)。在本教程中,我们将探讨 `scx_nest` 调度器的实现,这是一个先进的 eBPF 程序,利用了在 Linux 内核版本 `6.12` 中引入的 `sched_ext` 调度器类。在本指南结束时,您将了解如何构建一个复杂的调度器,该调度器根据 CPU 核心频率和利用率动态调整任务分配。
|
||||
|
||||
## `sched_ext` 介绍
|
||||
|
||||
`sched_ext` 调度器类标志着 Linux 内核调度能力的重大进步。与传统调度器不同,`sched_ext` 允许通过一组 BPF(Berkeley 包过滤器)程序动态定义其行为。这种灵活性使开发人员能够实现针对特定工作负载和系统需求量身定制的自定义调度算法。
|
||||
|
||||
## 理解 `scx_nest` 调度器
|
||||
|
||||
### 概述
|
||||
|
||||
`scx_nest` 调度器受 Inria Paris 论文《[OS Scheduling with Nest: Keeping Tasks Close Together on Warm Cores](https://hal.inria.fr/hal-03612592/file/paper.pdf)》的启发。由 Meta Platforms, Inc. 开发,`scx_nest` 专注于鼓励将任务分配到基于最近使用模式可能以更高频率运行的 CPU 核心上。这种方法旨在通过确保任务在最有效的核心上执行来优化性能。
|
||||
|
||||
该调度器作为一个全局加权虚拟时间(vtime)调度器运行,类似于完全公平调度器(CFS),同时利用 Nest 算法在任务唤醒时选择空闲核心。这种双重策略确保任务不仅被公平分配,还被放置在能够最有效执行它们的核心上。
|
||||
|
||||
`scx_nest` 旨在优化 CPU 利用率相对较低且可以受益于在少数核心上运行的工作负载。通过将任务集中在较少的核心上,调度器有助于保持这些核心的高频率,从而提升性能。然而,对于那些在分布到多个核心以避免缓存抖动时表现更好的工作负载,`scx_nest` 可能并不是理想选择。评估 `scx_nest` 对特定工作负载的适用性通常需要实验。
|
||||
|
||||
鉴于其设计,`scx_nest` 适用于生产环境,前提是满足硬件限制。它在具有统一 L3 缓存拓扑的单个 CCX(核心复合体)或单插槽主机上表现最佳。虽然当前版本未实现抢占,但所有 CPU 共享的调度队列确保队列前端的任务能够及时执行,前提是有足够的 CPU 可用。
|
||||
|
||||
## 高级代码分析
|
||||
|
||||
`scx_nest` 调度器的实现复杂,涉及各种数据结构、映射和函数,它们协同工作以管理任务分配和 CPU 核心利用率。完整的源代码可在 [eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 仓库中找到。下面,我们将剖析调度器的核心组件,详细解释每个部分。
|
||||
|
||||
### 核心数据结构和映射
|
||||
|
||||
#### 任务上下文 (`task_ctx`)
|
||||
|
||||
系统中的每个任务都有一个关联的上下文,用于维护与调度相关的信息。这个上下文对于基于任务的历史和当前状态做出明智的调度决策至关重要。
|
||||
|
||||
```c
|
||||
/* 每个任务的调度上下文 */
|
||||
struct task_ctx {
|
||||
/*
|
||||
* 用于计算任务的主掩码和保留掩码的临时 cpumask。
|
||||
*/
|
||||
struct bpf_cpumask __kptr *tmp_mask;
|
||||
|
||||
/*
|
||||
* 任务观察到其之前的核心不为空闲的次数。如果连续发生 r_impatient 次,
|
||||
* 将尝试从保留 Nest 或回退 Nest 中获取一个核心。
|
||||
*/
|
||||
u32 prev_misses;
|
||||
|
||||
/*
|
||||
* 任务“附加”的核心,意味着它至少连续在该核心上执行了两次,
|
||||
* 并且在唤醒时首先尝试迁移到该核心。任务只有在附加核心空闲且
|
||||
* 在主 Nest 中时才会迁移到附加核心。
|
||||
*/
|
||||
s32 attached_core;
|
||||
|
||||
/*
|
||||
* 任务上次执行的核心。这用于确定任务是否应该附加到下一个
|
||||
* 执行的核心。
|
||||
*/
|
||||
s32 prev_cpu;
|
||||
};
|
||||
```
|
||||
|
||||
`task_ctx` 结构体包含一个临时 CPU 掩码 (`tmp_mask`),用于计算任务的主 CPU 集合和保留 CPU 集合。`prev_misses` 计数器跟踪任务的首选核心不为空闲的次数,影响迁移任务到不同核心的决策。`attached_core` 指示任务当前绑定的核心,确保在可能的情况下在高频率核心上运行。最后,`prev_cpu` 记录任务上次执行的核心,有助于维护任务与核心的亲和性。
|
||||
|
||||
#### 每 CPU 上下文 (`pcpu_ctx`)
|
||||
|
||||
每个 CPU 都有一个关联的上下文,用于管理定时器和压缩状态。这个上下文有助于确定何时由于不活动而将核心从主 Nest 中降级。
|
||||
|
||||
```c
|
||||
struct pcpu_ctx {
|
||||
/* 用于从主 Nest 中压缩核心的定时器。 */
|
||||
struct bpf_timer timer;
|
||||
|
||||
/* 当前核心是否已安排进行压缩。 */
|
||||
bool scheduled_compaction;
|
||||
};
|
||||
```
|
||||
|
||||
`pcpu_ctx` 结构体包含一个 `bpf_timer`,用于调度压缩事件,以及一个布尔标志 `scheduled_compaction`,指示是否已为核心安排了压缩。
|
||||
|
||||
#### 映射
|
||||
|
||||
多个 BPF 映射用于存储上下文和管理定时器:
|
||||
|
||||
```c
|
||||
/* 任务存储映射 */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
__type(key, int);
|
||||
__type(value, struct task_ctx);
|
||||
} task_ctx_stor SEC(".maps");
|
||||
|
||||
/* 每 CPU 上下文 */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__uint(max_entries, 1024);
|
||||
__type(key, s32);
|
||||
__type(value, struct pcpu_ctx);
|
||||
} pcpu_ctxs SEC(".maps");
|
||||
|
||||
/* 统计定时器 */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__uint(max_entries, 1);
|
||||
__type(key, u32);
|
||||
__type(value, struct stats_timer);
|
||||
} stats_timer SEC(".maps");
|
||||
```
|
||||
|
||||
- **`task_ctx_stor`:** 该映射存储每个任务的调度上下文,使调度器能够访问和修改特定任务的信息。
|
||||
- **`pcpu_ctxs`:** 一个数组映射,保存每个 CPU 的上下文,使调度器能够管理每个 CPU 的定时器和压缩状态。
|
||||
- **`stats_timer`:** 一个单条目的数组映射,用于管理用于收集调度统计信息的中央定时器。
|
||||
|
||||
此外,调度器维护了主 CPU 掩码、保留 CPU 掩码、其他 CPU 掩码和空闲 CPU 掩码,以及用于跟踪各种调度器指标的统计映射。
|
||||
|
||||
### 核心函数
|
||||
|
||||
#### `stat_inc`
|
||||
|
||||
一个辅助函数,用于递增调度统计数据:
|
||||
|
||||
```c
|
||||
static __always_inline void stat_inc(u32 idx)
|
||||
{
|
||||
u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
|
||||
if (cnt_p)
|
||||
(*cnt_p)++;
|
||||
}
|
||||
```
|
||||
|
||||
此函数在 `stats` 映射中查找一个计数器,并在计数器存在时递增它。调度器在各处使用它来跟踪各种事件和状态。
|
||||
|
||||
#### `vtime_before`
|
||||
|
||||
一个用于比较虚拟时间的实用函数:
|
||||
|
||||
```c
|
||||
static inline bool vtime_before(u64 a, u64 b)
|
||||
{
|
||||
return (s64)(a - b) < 0;
|
||||
}
|
||||
```
|
||||
|
||||
此函数确定虚拟时间 `a` 是否在 `b` 之前,有助于基于时间的调度决策。
|
||||
|
||||
#### `try_make_core_reserved`
|
||||
|
||||
尝试将一个核心提升为保留 Nest:
|
||||
|
||||
```c
|
||||
static __always_inline void
|
||||
try_make_core_reserved(s32 cpu, struct bpf_cpumask * reserved, bool promotion)
|
||||
{
|
||||
s32 tmp_nr_reserved;
|
||||
|
||||
/*
|
||||
* 此检查存在竞争,但没关系。如果我们错误地未能将核心提升到保留,
|
||||
* 那是因为另一个上下文在这个小窗口中添加或移除了保留中的核心。
|
||||
* 这将在随后的唤醒中平衡。
|
||||
*/
|
||||
tmp_nr_reserved = nr_reserved;
|
||||
if (tmp_nr_reserved < r_max) {
|
||||
/*
|
||||
* 这里有可能暂时超过 r_max,但随着更多核心被降级或未能
|
||||
* 被提升到保留 Nest,应该会平衡。
|
||||
*/
|
||||
__sync_fetch_and_add(&nr_reserved, 1);
|
||||
bpf_cpumask_set_cpu(cpu, reserved);
|
||||
if (promotion)
|
||||
stat_inc(NEST_STAT(PROMOTED_TO_RESERVED));
|
||||
else
|
||||
stat_inc(NEST_STAT(DEMOTED_TO_RESERVED));
|
||||
} else {
|
||||
bpf_cpumask_clear_cpu(cpu, reserved);
|
||||
stat_inc(NEST_STAT(RESERVED_AT_CAPACITY));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`try_make_core_reserved` 函数尝试将一个 CPU 核心添加到保留掩码中。首先检查保留核心的数量 (`nr_reserved`) 是否低于允许的最大值 (`r_max`)。如果是,则递增 `nr_reserved` 计数器并将核心添加到保留掩码中。根据核心是被提升还是降级,递增相应的统计数据。如果保留容量已满,则从保留掩码中清除核心并更新相关统计数据。
|
||||
|
||||
#### `update_attached`
|
||||
|
||||
根据最近的执行更新任务的附加核心:
|
||||
|
||||
```c
|
||||
static void update_attached(struct task_ctx *tctx, s32 prev_cpu, s32 new_cpu)
|
||||
{
|
||||
if (tctx->prev_cpu == new_cpu)
|
||||
tctx->attached_core = new_cpu;
|
||||
tctx->prev_cpu = prev_cpu;
|
||||
}
|
||||
```
|
||||
|
||||
此函数更新任务的 `attached_core`。如果任务连续在同一核心上执行,它会将任务附加到该核心。然后更新 `prev_cpu` 以反映任务最近运行的核心。
|
||||
|
||||
#### `compact_primary_core`
|
||||
|
||||
处理主核心的压缩,将其降级到保留 Nest:
|
||||
|
||||
```c
|
||||
static int compact_primary_core(void *map, int *key, struct bpf_timer *timer)
|
||||
{
|
||||
struct bpf_cpumask *primary, *reserve;
|
||||
s32 cpu = bpf_get_smp_processor_id();
|
||||
struct pcpu_ctx *pcpu_ctx;
|
||||
|
||||
stat_inc(NEST_STAT(CALLBACK_COMPACTED));
|
||||
|
||||
/*
|
||||
* 如果我们到达此回调,这意味着定时器回调从未被取消,
|
||||
* 因此需要将核心从主 Nest 中降级。
|
||||
*/
|
||||
pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu);
|
||||
if (!pcpu_ctx) {
|
||||
scx_bpf_error("无法查找 pcpu ctx");
|
||||
return 0;
|
||||
}
|
||||
bpf_rcu_read_lock();
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!primary || !reserve) {
|
||||
scx_bpf_error("无法找到 primary 或 reserve");
|
||||
bpf_rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
||||
bpf_cpumask_clear_cpu(cpu, primary);
|
||||
try_make_core_reserved(cpu, reserve, false);
|
||||
bpf_rcu_read_unlock();
|
||||
pcpu_ctx->scheduled_compaction = false;
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
当压缩定时器到期时,将调用 `compact_primary_core`。它通过从主掩码中清除当前 CPU 核心并尝试将其添加到保留掩码中,将当前 CPU 核心从主 Nest 降级到保留 Nest。这确保了不活动的核心得到有效管理,保持性能和资源利用之间的平衡。
|
||||
|
||||
#### `nest_select_cpu`
|
||||
|
||||
在任务唤醒时确定适当的 CPU:
|
||||
|
||||
```c
|
||||
s32 BPF_STRUCT_OPS(nest_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
|
||||
{
|
||||
struct bpf_cpumask *p_mask, *primary, *reserve;
|
||||
s32 cpu;
|
||||
struct task_ctx *tctx;
|
||||
struct pcpu_ctx *pcpu_ctx;
|
||||
bool direct_to_primary = false, reset_impatient = true;
|
||||
|
||||
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
|
||||
if (!tctx)
|
||||
return -ENOENT;
|
||||
|
||||
bpf_rcu_read_lock();
|
||||
p_mask = tctx->tmp_mask;
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!p_mask || !primary || !reserve) {
|
||||
bpf_rcu_read_unlock();
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
tctx->prev_cpu = prev_cpu;
|
||||
|
||||
bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));
|
||||
|
||||
/* 首先尝试在附加核心上唤醒任务。 */
|
||||
if (bpf_cpumask_test_cpu(tctx->attached_core, cast_mask(p_mask)) &&
|
||||
scx_bpf_test_and_clear_cpu_idle(tctx->attached_core)) {
|
||||
cpu = tctx->attached_core;
|
||||
stat_inc(NEST_STAT(WAKEUP_ATTACHED));
|
||||
goto migrate_primary;
|
||||
}
|
||||
|
||||
/*
|
||||
* 如果之前的核心在主集合中,并且没有 hypertwin,则尝试留在之前的核心。
|
||||
* 如果之前的核心是任务附加的核心,不需要再尝试,因为我们已经在上面尝试过了。
|
||||
*/
|
||||
if (prev_cpu != tctx->attached_core &&
|
||||
bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) &&
|
||||
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
|
||||
cpu = prev_cpu;
|
||||
stat_inc(NEST_STAT(WAKEUP_PREV_PRIMARY));
|
||||
goto migrate_primary;
|
||||
}
|
||||
|
||||
if (find_fully_idle) {
|
||||
/* 然后尝试在主集合中选择任何完全空闲的核心。 */
|
||||
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask),
|
||||
SCX_PICK_IDLE_CORE);
|
||||
if (cpu >= 0) {
|
||||
stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_PRIMARY));
|
||||
goto migrate_primary;
|
||||
}
|
||||
}
|
||||
|
||||
/* 然后尝试在主集合中选择任何空闲的核心,即使其 hypertwin 正在活动。 */
|
||||
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
|
||||
if (cpu >= 0) {
|
||||
stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_PRIMARY));
|
||||
goto migrate_primary;
|
||||
}
|
||||
|
||||
if (r_impatient > 0 && ++tctx->prev_misses >= r_impatient) {
|
||||
direct_to_primary = true;
|
||||
tctx->prev_misses = 0;
|
||||
stat_inc(NEST_STAT(TASK_IMPATIENT));
|
||||
}
|
||||
|
||||
reset_impatient = false;
|
||||
|
||||
/* 然后尝试在保留集合中选择任何完全空闲的核心。 */
|
||||
bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(reserve));
|
||||
if (find_fully_idle) {
|
||||
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask),
|
||||
SCX_PICK_IDLE_CORE);
|
||||
if (cpu >= 0) {
|
||||
stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_RESERVE));
|
||||
goto promote_to_primary;
|
||||
}
|
||||
}
|
||||
|
||||
/* 然后尝试在保留集合中选择任何空闲的核心,即使其 hypertwin 正在活动。 */
|
||||
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
|
||||
if (cpu >= 0) {
|
||||
stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_RESERVE));
|
||||
goto promote_to_primary;
|
||||
}
|
||||
|
||||
/* 然后尝试在任务的 cpumask 中选择任何空闲的核心。 */
|
||||
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
|
||||
if (cpu >= 0) {
|
||||
/*
|
||||
* 我们找到了一个核心(我们认为它不在任何 Nest 中)。
|
||||
* 这意味着我们需要将该核心提升到保留 Nest,或者如果由于
|
||||
* 超过 r_impatient 而直接提升到主 Nest。
|
||||
*
|
||||
* 我们必须在这里进行最后一次检查,看看核心是否在主掩码或保留掩码中,
|
||||
* 因为我们可能与核心在将主掩码和保留掩码与 p->cpus_ptr 进行 AND
|
||||
* 运算之间更改状态,并使用 scx_bpf_pick_idle_cpu() 原子性地保留它。
|
||||
* 这在上面的检查中技术上也是如此,但在那些情况下我们只是直接
|
||||
* 将核心放入主掩码中,因此问题不大。在这里,我们要确保不会
|
||||
* 意外地将已经在主掩码中的核心放入保留 Nest 中。这是不太可能的,
|
||||
* 但我们在应该相对冷路径上进行了检查。
|
||||
*/
|
||||
stat_inc(NEST_STAT(WAKEUP_IDLE_OTHER));
|
||||
if (bpf_cpumask_test_cpu(cpu, cast_mask(primary)))
|
||||
goto migrate_primary;
|
||||
else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve)))
|
||||
goto promote_to_primary;
|
||||
else if (direct_to_primary)
|
||||
goto promote_to_primary;
|
||||
else
|
||||
try_make_core_reserved(cpu, reserve, true);
|
||||
bpf_rcu_read_unlock();
|
||||
return cpu;
|
||||
}
|
||||
|
||||
bpf_rcu_read_unlock();
|
||||
return prev_cpu;
|
||||
|
||||
promote_to_primary:
|
||||
stat_inc(NEST_STAT(PROMOTED_TO_PRIMARY));
|
||||
migrate_primary:
|
||||
if (reset_impatient)
|
||||
tctx->prev_misses = 0;
|
||||
pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu);
|
||||
if (pcpu_ctx) {
|
||||
if (pcpu_ctx->scheduled_compaction) {
|
||||
if (bpf_timer_cancel(&pcpu_ctx->timer) < 0)
|
||||
scx_bpf_error("取消 pcpu 定时器失败");
|
||||
if (bpf_timer_set_callback(&pcpu_ctx->timer, compact_primary_core))
|
||||
scx_bpf_error("重新设置 pcpu 定时器回调失败");
|
||||
pcpu_ctx->scheduled_compaction = false;
|
||||
stat_inc(NEST_STAT(CANCELLED_COMPACTION));
|
||||
}
|
||||
} else {
|
||||
scx_bpf_error("查找 pcpu ctx 失败");
|
||||
}
|
||||
bpf_cpumask_set_cpu(cpu, primary);
|
||||
/*
|
||||
* 检查 CPU 是否在保留掩码中。如果是,这可能发生在核心在我们尝试
|
||||
* 将当前唤醒任务分配到其上时被并发地压缩。同样,如果我们在
|
||||
* 由于超时直接提升到主 Nest,也会发生这种情况。
|
||||
*
|
||||
* 我们不必担心与其他唤醒任务的竞争,因为我们已经通过(某种
|
||||
* 变体的)scx_bpf_pick_idle_cpu() 原子性地保留了该核心。
|
||||
*/
|
||||
if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) {
|
||||
__sync_sub_and_fetch(&nr_reserved, 1);
|
||||
bpf_cpumask_clear_cpu(cpu, reserve);
|
||||
}
|
||||
bpf_rcu_read_unlock();
|
||||
update_attached(tctx, prev_cpu, cpu);
|
||||
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
|
||||
return cpu;
|
||||
}
|
||||
```
|
||||
|
||||
`nest_select_cpu` 函数是 `scx_nest` 调度器的核心。当任务唤醒时,此函数确定其执行最合适的 CPU 核心。该函数遵循一系列检查,以确保任务被放置在高频率、空闲的核心上,从而提升效率和性能。
|
||||
|
||||
最初,它从 `task_ctx_stor` 映射中检索任务的上下文。然后,它锁定读拷贝更新(RCU)锁,以安全地访问主掩码和保留掩码。调度器首先尝试将任务放置在其附加核心上,确保核心亲和性。如果附加核心不空闲,它会尝试先前的核心。根据各种条件,包括任务的急躁程度 (`r_impatient`) 和主 Nest 及保留 Nest 中空闲核心的可用性,调度器决定是否迁移任务、将核心提升到主 Nest,或将核心降级到保留 Nest。
|
||||
|
||||
在整个过程中,调度器更新相关统计数据,以提供对其操作的见解。使用 RCU 锁确保调度器的决策是在不干扰其他并发操作的情况下安全做出的。
|
||||
|
||||
#### `nest_enqueue`
|
||||
|
||||
处理将任务入队到调度队列:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_enqueue, struct task_struct *p, u64 enq_flags)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
u64 vtime = p->scx.dsq_vtime;
|
||||
|
||||
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
|
||||
if (!tctx) {
|
||||
scx_bpf_error("无法找到任务上下文");
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* 将空闲任务的预算限制为一个切片。
|
||||
*/
|
||||
if (vtime_before(vtime, vtime_now - slice_ns))
|
||||
vtime = vtime_now - slice_ns;
|
||||
|
||||
scx_bpf_dispatch_vtime(p, FALLBACK_DSQ_ID, slice_ns, vtime, enq_flags);
|
||||
}
|
||||
```
|
||||
|
||||
`nest_enqueue` 函数管理任务的入队,调整其虚拟时间 (`vtime`) 以确保公平性并防止任务在空闲时积累过多的执行预算。如果任务的 `vtime` 低于某个阈值,它将被调整以保持调度器内部的平衡。
|
||||
|
||||
#### `nest_dispatch`
|
||||
|
||||
管理将任务分派到 CPU 核心:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_dispatch, s32 cpu, struct task_struct *prev)
|
||||
{
|
||||
struct pcpu_ctx *pcpu_ctx;
|
||||
struct bpf_cpumask *primary, *reserve;
|
||||
s32 key = cpu;
|
||||
bool in_primary;
|
||||
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!primary || !reserve) {
|
||||
scx_bpf_error("没有主或保留 cpumask");
|
||||
return;
|
||||
}
|
||||
|
||||
pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key);
|
||||
if (!pcpu_ctx) {
|
||||
scx_bpf_error("查找 pcpu ctx 失败");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!scx_bpf_consume(FALLBACK_DSQ_ID)) {
|
||||
in_primary = bpf_cpumask_test_cpu(cpu, cast_mask(primary));
|
||||
|
||||
if (prev && (prev->scx.flags & SCX_TASK_QUEUED) && in_primary) {
|
||||
scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, slice_ns, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
stat_inc(NEST_STAT(NOT_CONSUMED));
|
||||
if (in_primary) {
|
||||
/*
|
||||
* 如果主集合中的前一个任务正在死亡,立即降级主核心。
|
||||
*
|
||||
* 注意,我们选择不压缩掩码中的“第一个” CPU,以鼓励至少保留一个核心在 Nest 中。
|
||||
* 最好检查是否仅剩一个核心在 Nest 中,但 BPF 目前没有用于查询
|
||||
* cpumask 权重的内核函数。
|
||||
*/
|
||||
if ((prev && prev->__state == TASK_DEAD) &&
|
||||
(cpu != bpf_cpumask_first(cast_mask(primary)))) {
|
||||
stat_inc(NEST_STAT(EAGERLY_COMPACTED));
|
||||
bpf_cpumask_clear_cpu(cpu, primary);
|
||||
try_make_core_reserved(cpu, reserve, false);
|
||||
} else {
|
||||
pcpu_ctx->scheduled_compaction = true;
|
||||
/*
|
||||
* 核心不再被使用。设置定时器以在 p_remove 中移除核心
|
||||
* 如果在那时仍未使用。
|
||||
*/
|
||||
bpf_timer_start(&pcpu_ctx->timer, p_remove_ns,
|
||||
BPF_F_TIMER_CPU_PIN);
|
||||
stat_inc(NEST_STAT(SCHEDULED_COMPACTION));
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
stat_inc(NEST_STAT(CONSUMED));
|
||||
}
|
||||
```
|
||||
|
||||
`nest_dispatch` 函数负责将任务分派到 CPU 核心。它首先检查回退调度队列 (`FALLBACK_DSQ_ID`) 中是否有可用任务。如果没有任务被消耗,它会评估该 CPU 上的前一个任务是否已经死亡。如果是,并且该 CPU 不是主掩码中的第一个核心,调度器会将该核心降级到保留 Nest。否则,它会为该核心安排一个压缩定时器,以便在 `p_remove_ns` 指定的时长之后可能降级该核心。如果从回退队列成功消耗了一个任务,它会递增相应的统计数据。
|
||||
|
||||
#### `nest_running`
|
||||
|
||||
当任务开始运行时更新全局虚拟时间:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_running, struct task_struct *p)
|
||||
{
|
||||
/*
|
||||
* 全局虚拟时间在任务开始执行时总是向前推进。
|
||||
* 测试和更新可以从多个 CPU 同时执行,因此存在竞争。
|
||||
* 任何错误都应该是可控且暂时的。我们就这样处理。
|
||||
*/
|
||||
if (vtime_before(vtime_now, p->scx.dsq_vtime))
|
||||
vtime_now = p->scx.dsq_vtime;
|
||||
}
|
||||
```
|
||||
|
||||
`nest_running` 函数确保全局虚拟时间 (`vtime_now`) 在任务开始执行时向前推进。这一机制有助于维护调度器操作的公平性和时间一致性。
|
||||
|
||||
#### `nest_stopping`
|
||||
|
||||
处理任务停止运行,调整其虚拟时间:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_stopping, struct task_struct *p, bool runnable)
|
||||
{
|
||||
	/* 将执行时间按权重的倒数进行缩放,并计入任务的虚拟时间 */
|
||||
p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight;
|
||||
}
|
||||
```
|
||||
|
||||
当任务停止运行时,`nest_stopping` 根据其执行切片和权重调整其虚拟时间。这一调整确保任务在调度器的虚拟时间计算中得到公平考虑,保持平衡并防止任何单个任务垄断 CPU 资源。
|
||||
|
||||
#### `nest_init_task`
|
||||
|
||||
初始化新任务的上下文:
|
||||
|
||||
```c
|
||||
s32 BPF_STRUCT_OPS(nest_init_task, struct task_struct *p,
|
||||
struct scx_init_task_args *args)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
struct bpf_cpumask *cpumask;
|
||||
|
||||
/*
|
||||
* @p 是新的。确保其 task_ctx 可用。
|
||||
* 我们可以在此函数中休眠,以下内容将自动使用 GFP_KERNEL。
|
||||
*/
|
||||
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
|
||||
BPF_LOCAL_STORAGE_GET_F_CREATE);
|
||||
if (!tctx)
|
||||
return -ENOMEM;
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
|
||||
cpumask = bpf_kptr_xchg(&tctx->tmp_mask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
tctx->attached_core = -1;
|
||||
tctx->prev_cpu = -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
`nest_init_task` 函数为新任务初始化调度上下文。它通过从 `task_ctx_stor` 映射中检索任务的上下文来确保任务的上下文可用,创建一个新的 `bpf_cpumask` 进行临时计算,并为 `attached_core` 和 `prev_cpu` 设置初始值。
|
||||
|
||||
#### `nest_enable`
|
||||
|
||||
通过设置任务的虚拟时间启用调度:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_enable, struct task_struct *p)
|
||||
{
|
||||
p->scx.dsq_vtime = vtime_now;
|
||||
}
|
||||
```
|
||||
|
||||
`nest_enable` 函数通过将任务的虚拟时间 (`dsq_vtime`) 初始化为当前的全局虚拟时间 (`vtime_now`) 来激活任务的调度。这确保了任务的调度状态与调度器的虚拟时间同步。
|
||||
|
||||
#### `stats_timerfn`
|
||||
|
||||
处理定期的统计信息收集:
|
||||
|
||||
```c
|
||||
static int stats_timerfn(void *map, int *key, struct bpf_timer *timer)
|
||||
{
|
||||
s32 cpu;
|
||||
struct bpf_cpumask *primary, *reserve;
|
||||
const struct cpumask *idle;
|
||||
stats_primary_mask = 0;
|
||||
stats_reserved_mask = 0;
|
||||
stats_other_mask = 0;
|
||||
stats_idle_mask = 0;
|
||||
long err;
|
||||
|
||||
bpf_rcu_read_lock();
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!primary || !reserve) {
|
||||
bpf_rcu_read_unlock();
|
||||
scx_bpf_error("查找主或保留失败");
|
||||
return 0;
|
||||
}
|
||||
|
||||
idle = scx_bpf_get_idle_cpumask();
|
||||
bpf_for(cpu, 0, nr_cpus) {
|
||||
if (bpf_cpumask_test_cpu(cpu, cast_mask(primary)))
|
||||
stats_primary_mask |= (1ULL << cpu);
|
||||
else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve)))
|
||||
stats_reserved_mask |= (1ULL << cpu);
|
||||
else
|
||||
stats_other_mask |= (1ULL << cpu);
|
||||
|
||||
if (bpf_cpumask_test_cpu(cpu, idle))
|
||||
stats_idle_mask |= (1ULL << cpu);
|
||||
}
|
||||
bpf_rcu_read_unlock();
|
||||
scx_bpf_put_idle_cpumask(idle);
|
||||
|
||||
err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0);
|
||||
if (err)
|
||||
scx_bpf_error("启动统计定时器失败");
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
`stats_timerfn` 函数由中央定时器定期调用,用于收集和更新调度统计信息。它捕捉当前 CPU 核心的状态,将它们分类到主、保留、其他和空闲掩码中。这些信息提供了调度器如何管理 CPU 资源和任务分配的洞察。在收集统计信息后,该函数重新启动定时器以确保持续监控。
|
||||
|
||||
#### `nest_init`
|
||||
|
||||
初始化 `scx_nest` 调度器:
|
||||
|
||||
```c
|
||||
s32 BPF_STRUCT_OPS_SLEEPABLE(nest_init)
|
||||
{
|
||||
struct bpf_cpumask *cpumask;
|
||||
s32 cpu;
|
||||
int err;
|
||||
struct bpf_timer *timer;
|
||||
u32 key = 0;
|
||||
|
||||
err = scx_bpf_create_dsq(FALLBACK_DSQ_ID, NUMA_NO_NODE);
|
||||
if (err) {
|
||||
scx_bpf_error("创建回退 DSQ 失败");
|
||||
return err;
|
||||
}
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
bpf_cpumask_clear(cpumask);
|
||||
cpumask = bpf_kptr_xchg(&primary_cpumask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
|
||||
bpf_cpumask_clear(cpumask);
|
||||
cpumask = bpf_kptr_xchg(&reserve_cpumask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
bpf_for(cpu, 0, nr_cpus) {
|
||||
s32 key = cpu;
|
||||
struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key);
|
||||
|
||||
if (!ctx) {
|
||||
scx_bpf_error("查找 pcpu_ctx 失败");
|
||||
return -ENOENT;
|
||||
}
|
||||
ctx->scheduled_compaction = false;
|
||||
if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME)) {
|
||||
scx_bpf_error("初始化 pcpu 定时器失败");
|
||||
return -EINVAL;
|
||||
}
|
||||
err = bpf_timer_set_callback(&ctx->timer, compact_primary_core);
|
||||
if (err) {
|
||||
scx_bpf_error("设置 pcpu 定时器回调失败");
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
timer = bpf_map_lookup_elem(&stats_timer, &key);
|
||||
if (!timer) {
|
||||
scx_bpf_error("查找中央定时器失败");
|
||||
return -ESRCH;
|
||||
}
|
||||
bpf_timer_init(timer, &stats_timer, CLOCK_BOOTTIME);
|
||||
bpf_timer_set_callback(timer, stats_timerfn);
|
||||
err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0);
|
||||
if (err)
|
||||
scx_bpf_error("启动统计定时器失败");
|
||||
|
||||
return err;
|
||||
}
|
||||
```
|
||||
|
||||
`nest_init` 函数在系统初始化期间设置 `scx_nest` 调度器。它创建了一个回退调度队列 (`FALLBACK_DSQ_ID`) 并初始化了主掩码和保留掩码。对于每个 CPU,它从 `pcpu_ctxs` 映射中检索每 CPU 上下文,初始化压缩定时器,并将回调设置为 `compact_primary_core`。此外,它初始化并启动中央统计定时器 (`stats_timer`) 及其回调函数 `stats_timerfn`,确保调度器统计信息的持续监控。
|
||||
|
||||
#### `nest_exit`
|
||||
|
||||
在调度器退出时进行清理:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei)
|
||||
{
|
||||
UEI_RECORD(uei, ei);
|
||||
}
|
||||
```
|
||||
|
||||
`nest_exit` 函数记录退出信息并在调度器被移除或系统关闭时执行任何必要的清理操作。这确保所有资源得到适当释放,系统保持稳定。
|
||||
|
||||
#### `SCX_OPS_DEFINE`
|
||||
|
||||
为 `scx_nest` 调度器定义操作结构:
|
||||
|
||||
```c
|
||||
SCX_OPS_DEFINE(nest_ops,
|
||||
.select_cpu = (void *)nest_select_cpu,
|
||||
.enqueue = (void *)nest_enqueue,
|
||||
.dispatch = (void *)nest_dispatch,
|
||||
.running = (void *)nest_running,
|
||||
.stopping = (void *)nest_stopping,
|
||||
.init_task = (void *)nest_init_task,
|
||||
.enable = (void *)nest_enable,
|
||||
.init = (void *)nest_init,
|
||||
.exit = (void *)nest_exit,
|
||||
.flags = 0,
|
||||
.name = "nest");
|
||||
```
|
||||
|
||||
`SCX_OPS_DEFINE` 宏将调度器的所有函数绑定到 `nest_ops` 结构中,`sched_ext` 框架使用该结构与调度器进行接口。这确保调度器的操作在任务调度事件期间被正确映射和调用。
|
||||
|
||||
### 初始化和清理
|
||||
|
||||
适当的初始化和清理对于调度器的稳定性和性能至关重要。
|
||||
|
||||
#### `nest_init` 函数
|
||||
|
||||
`nest_init` 函数负责在系统初始化期间设置调度器。其操作如下:
|
||||
|
||||
1. **创建回退调度队列:**
|
||||
- 调用 `scx_bpf_create_dsq` 创建回退调度队列 (`FALLBACK_DSQ_ID`)。如果失败,记录错误并退出。
|
||||
|
||||
2. **初始化主掩码和保留掩码:**
|
||||
- 创建并清除一个新的 `bpf_cpumask` 作为主掩码。
|
||||
- 将新创建的掩码与现有的 `primary_cpumask` 交换。如果存在旧掩码,则释放它。
|
||||
- 对保留掩码重复相同的过程。
|
||||
|
||||
3. **初始化每 CPU 上下文:**
|
||||
- 对于每个 CPU,从 `pcpu_ctxs` 映射中检索每 CPU 上下文。
|
||||
- 将 `scheduled_compaction` 标志初始化为 `false`。
|
||||
- 使用 `bpf_timer_init` 初始化定时器,并使用 `bpf_timer_set_callback` 将回调设置为 `compact_primary_core`。
|
||||
- 如果任何步骤失败,记录错误并退出。
|
||||
|
||||
4. **初始化并启动统计定时器:**
|
||||
- 从 `stats_timer` 映射中检索中央统计定时器。
|
||||
- 初始化定时器并将其回调设置为 `stats_timerfn`。
|
||||
- 以 `sampling_cadence_ns - 5000` 纳秒的延迟启动定时器。
|
||||
- 如果启动定时器失败,记录错误。
|
||||
|
||||
5. **返回:**
|
||||
- 函数返回定时器初始化的结果,指示成功或失败。
|
||||
|
||||
这一初始化过程确保调度器的所有必要组件(包括 CPU 掩码、定时器和调度队列)都已正确设置。
|
||||
|
||||
#### `nest_exit` 函数
|
||||
|
||||
`nest_exit` 函数在调度器被移除或系统关闭时处理清理工作:
|
||||
|
||||
```c
|
||||
void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei)
|
||||
{
|
||||
UEI_RECORD(uei, ei);
|
||||
}
|
||||
```
|
||||
|
||||
此函数通过 `UEI_RECORD` 宏记录退出信息,确保执行任何必要的清理操作。这对于保持系统稳定性和防止资源泄漏至关重要。
|
||||
|
||||
### 最终调度器定义
|
||||
|
||||
`SCX_OPS_DEFINE` 宏将调度器的所有函数绑定到单一结构中,供 `sched_ext` 框架使用:
|
||||
|
||||
```c
|
||||
SCX_OPS_DEFINE(nest_ops,
|
||||
.select_cpu = (void *)nest_select_cpu,
|
||||
.enqueue = (void *)nest_enqueue,
|
||||
.dispatch = (void *)nest_dispatch,
|
||||
.running = (void *)nest_running,
|
||||
.stopping = (void *)nest_stopping,
|
||||
.init_task = (void *)nest_init_task,
|
||||
.enable = (void *)nest_enable,
|
||||
.init = (void *)nest_init,
|
||||
.exit = (void *)nest_exit,
|
||||
.flags = 0,
|
||||
.name = "nest");
|
||||
```
|
||||
|
||||
此结构体 `nest_ops` 有效地将调度器的操作注册到 `sched_ext` 框架,确保调度器在各种调度事件和系统状态下做出适当响应。
|
||||
|
||||
## 编译和执行
|
||||
|
||||
要编译和运行 `scx_nest` 调度器,请按照以下步骤操作:
|
||||
|
||||
**编译代码:**
|
||||
|
||||
使用 `make` 构建调度器。确保已安装必要的构建工具和内核头文件。
|
||||
|
||||
```bash
|
||||
make
|
||||
```
|
||||
|
||||
**运行调度器:**
|
||||
|
||||
执行编译后的调度器二进制文件。根据系统配置和权限,您可能需要以提升的权限运行此命令。
|
||||
|
||||
```bash
|
||||
./scx_nest
|
||||
```
|
||||
|
||||
### 示例输出
|
||||
|
||||
运行调度器后,您应该会看到类似以下的输出:
|
||||
|
||||
```
|
||||
# ./scx_nest
|
||||
|
||||
唤醒统计
|
||||
------------
|
||||
WAKEUP_ATTACHED=150
|
||||
WAKEUP_PREV_PRIMARY=61
|
||||
WAKEUP_FULLY_IDLE_PRIMARY=0
|
||||
WAKEUP_ANY_IDLE_PRIMARY=103
|
||||
WAKEUP_FULLY_IDLE_RESERVE=0
|
||||
WAKEUP_ANY_IDLE_RESERVE=216
|
||||
WAKEUP_IDLE_OTHER=11
|
||||
|
||||
|
||||
Nest 统计
|
||||
----------
|
||||
TASK_IMPATIENT=67
|
||||
PROMOTED_TO_PRIMARY=217
|
||||
PROMOTED_TO_RESERVED=8
|
||||
DEMOTED_TO_RESERVED=212
|
||||
RESERVED_AT_CAPACITY=6
|
||||
SCHEDULED_COMPACTION=525
|
||||
CANCELLED_COMPACTION=314
|
||||
EAGERLY_COMPACTED=8
|
||||
CALLBACK_COMPACTED=208
|
||||
|
||||
|
||||
消耗统计
|
||||
-------------
|
||||
CONSUMED=166
|
||||
NOT_CONSUMED=667
|
||||
|
||||
|
||||
|
||||
掩码
|
||||
-----
|
||||
PRIMARY ( 0): | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
RESERVED (10): | ***-*--*--------------------------------------------------------***-*--*-------------------------------------------------------- |
|
||||
OTHER (128): | ******************************************************************************************************************************** |
|
||||
IDLE (16): | ********--------------------------------------------------------********-------------------------------------------------------- |
|
||||
|
||||
|
||||
^C退出:已从用户空间注销
|
||||
```
|
||||
|
||||
此输出提供了有关任务唤醒、Nest 操作、消耗率和 CPU 掩码状态的全面统计信息。它显示了调度器如何管理任务和 CPU 核心,展示了 `scx_nest` 算法在保持高频率核心利用率和高效任务分配方面的有效性。
|
||||
|
||||
## 总结与行动呼吁
|
||||
|
||||
在本教程中,我们深入探讨了 `scx_nest` 调度器的实现,这是一个先进的 eBPF 程序,基于核心频率和利用率定制 CPU 调度以优化性能。通过利用 `sched_ext` 框架,`scx_nest` 展示了 eBPF 如何动态定义调度行为,提供超越传统调度器的灵活性和控制力。
|
||||
|
||||
主要收获包括:
|
||||
|
||||
- 理解 `sched_ext` 调度器类的灵活性和强大功能。
|
||||
- 探索支撑 `scx_nest` 调度器的复杂数据结构和映射。
|
||||
- 分析管理任务分配、核心压缩和统计信息收集的核心函数。
|
||||
- 学习如何编译和执行调度器,并通过详细统计信息观察其影响。
|
||||
|
||||
`scx_nest` 调度器是一个极好的例子,展示了如何利用先进的 eBPF 编程以灵活和动态的方式实现复杂的系统功能。
|
||||
|
||||
如果您想深入了解 eBPF 并探索更多高级示例,请访问我们的教程仓库 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或查看我们的网站 [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/)。
|
||||
|
||||
## 参考文献
|
||||
|
||||
`scx_nest` 调度器的原始源代码可在 [sched-ext/scx](https://github.com/sched-ext/scx) 仓库中找到。
|
||||
|
||||
可以增强您理解的其他资源包括:
|
||||
|
||||
- **Linux 内核文档:** [Scheduler Ext 文档](https://www.kernel.org/doc/html/next/scheduler/sched-ext.html)
|
||||
- **内核源树:** [Linux 内核 `sched_ext` 工具](https://github.com/torvalds/linux/tree/master/tools/sched_ext)
|
||||
- **eBPF 官方文档:** [https://ebpf.io/docs/](https://ebpf.io/docs/)
|
||||
- **libbpf 文档:** [https://github.com/libbpf/libbpf](https://github.com/libbpf/libbpf)
|
||||
|
||||
欢迎探索这些资源,扩展您的知识,继续深入学习高级 eBPF 编程的旅程。
|
||||
11
src/45-scx-nest/include/bpf-compat/gnu/stubs.h
Normal file
11
src/45-scx-nest/include/bpf-compat/gnu/stubs.h
Normal file
@@ -0,0 +1,11 @@
|
||||
/*
|
||||
* Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when
|
||||
* compiling BPF files although its content doesn't play any role. The file in
|
||||
* turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is
|
||||
* defined. When compiling a BPF source, __x86_64__ isn't set and thus
|
||||
* stubs-32.h is selected. However, the file is not there if the system doesn't
|
||||
* have 32bit glibc devel package installed leading to a build failure.
|
||||
*
|
||||
* The problem is worked around by making this file available in the include
|
||||
* search paths before the system one when building BPF.
|
||||
*/
|
||||
427
src/45-scx-nest/include/scx/common.bpf.h
Normal file
427
src/45-scx-nest/include/scx/common.bpf.h
Normal file
@@ -0,0 +1,427 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
|
||||
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
|
||||
*/
|
||||
#ifndef __SCX_COMMON_BPF_H
|
||||
#define __SCX_COMMON_BPF_H
|
||||
|
||||
#ifdef LSP
|
||||
#define __bpf__
|
||||
#include "../vmlinux/vmlinux.h"
|
||||
#else
|
||||
#include "vmlinux.h"
|
||||
#endif
|
||||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include <asm-generic/errno.h>
|
||||
#include "user_exit_info.h"
|
||||
|
||||
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
|
||||
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
|
||||
#define PF_EXITING 0x00000004
|
||||
#define CLOCK_MONOTONIC 1
|
||||
|
||||
/*
|
||||
* Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can
|
||||
* lead to really confusing misbehaviors. Let's trigger a build failure.
|
||||
*/
|
||||
static inline void ___vmlinux_h_sanity_check___(void)
|
||||
{
|
||||
_Static_assert(SCX_DSQ_FLAG_BUILTIN,
|
||||
"bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole");
|
||||
}
|
||||
|
||||
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
|
||||
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
|
||||
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
|
||||
void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
|
||||
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
|
||||
void scx_bpf_dispatch_cancel(void) __ksym;
|
||||
bool scx_bpf_consume(u64 dsq_id) __ksym;
|
||||
void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
|
||||
void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
|
||||
bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
u32 scx_bpf_reenqueue_local(void) __ksym;
|
||||
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
|
||||
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
|
||||
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
|
||||
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
|
||||
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
|
||||
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
|
||||
void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
|
||||
void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
|
||||
void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
|
||||
u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
|
||||
u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
|
||||
void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
|
||||
u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
|
||||
const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
|
||||
const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
|
||||
void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
|
||||
const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
|
||||
const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
|
||||
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
|
||||
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
|
||||
s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
|
||||
s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
|
||||
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
|
||||
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
|
||||
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
|
||||
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
|
||||
|
||||
/*
|
||||
* Use the following as @it__iter when calling
|
||||
* scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops.
|
||||
*/
|
||||
#define BPF_FOR_EACH_ITER (&___it)
|
||||
|
||||
static inline __attribute__((format(printf, 1, 2)))
|
||||
void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
|
||||
|
||||
/*
|
||||
* Helper macro for initializing the fmt and variadic argument inputs to both
|
||||
* bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
|
||||
* refer to the initialized list of inputs to the bstr kfunc.
|
||||
*/
|
||||
#define scx_bpf_bstr_preamble(fmt, args...) \
|
||||
static char ___fmt[] = fmt; \
|
||||
/* \
|
||||
* Note that __param[] must have at least one \
|
||||
* element to keep the verifier happy. \
|
||||
*/ \
|
||||
unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \
|
||||
\
|
||||
_Pragma("GCC diagnostic push") \
|
||||
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
|
||||
___bpf_fill(___param, args); \
|
||||
_Pragma("GCC diagnostic pop") \
|
||||
|
||||
/*
|
||||
* scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments
|
||||
* instead of an array of u64. Using this macro will cause the scheduler to
|
||||
* exit cleanly with the specified exit code being passed to user space.
|
||||
*/
|
||||
#define scx_bpf_exit(code, fmt, args...) \
|
||||
({ \
|
||||
scx_bpf_bstr_preamble(fmt, args) \
|
||||
scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \
|
||||
___scx_bpf_bstr_format_checker(fmt, ##args); \
|
||||
})
|
||||
|
||||
/*
|
||||
* scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
|
||||
* instead of an array of u64. Invoking this macro will cause the scheduler to
|
||||
* exit in an erroneous state, with diagnostic information being passed to the
|
||||
* user.
|
||||
*/
|
||||
#define scx_bpf_error(fmt, args...) \
|
||||
({ \
|
||||
scx_bpf_bstr_preamble(fmt, args) \
|
||||
scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \
|
||||
___scx_bpf_bstr_format_checker(fmt, ##args); \
|
||||
})
|
||||
|
||||
/*
|
||||
* scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments
|
||||
* instead of an array of u64. To be used from ops.dump() and friends.
|
||||
*/
|
||||
#define scx_bpf_dump(fmt, args...) \
|
||||
({ \
|
||||
scx_bpf_bstr_preamble(fmt, args) \
|
||||
scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \
|
||||
___scx_bpf_bstr_format_checker(fmt, ##args); \
|
||||
})
|
||||
|
||||
#define BPF_STRUCT_OPS(name, args...) \
|
||||
SEC("struct_ops/"#name) \
|
||||
BPF_PROG(name, ##args)
|
||||
|
||||
#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \
|
||||
SEC("struct_ops.s/"#name) \
|
||||
BPF_PROG(name, ##args)
|
||||
|
||||
/**
|
||||
* RESIZABLE_ARRAY - Generates annotations for an array that may be resized
|
||||
* @elfsec: the data section of the BPF program in which to place the array
|
||||
* @arr: the name of the array
|
||||
*
|
||||
* libbpf has an API for setting map value sizes. Since data sections (i.e.
|
||||
* bss, data, rodata) themselves are maps, a data section can be resized. If
|
||||
* a data section has an array as its last element, the BTF info for that
|
||||
* array will be adjusted so that the length of the array is extended to meet the
|
||||
* new length of the data section. This macro annotates an array to have an
|
||||
* element count of one with the assumption that this array can be resized
|
||||
* within the userspace program. It also annotates the section specifier so
|
||||
* this array exists in a custom sub data section which can be resized
|
||||
* independently.
|
||||
*
|
||||
* See RESIZE_ARRAY() for the userspace convenience macro for resizing an
|
||||
* array declared with RESIZABLE_ARRAY().
|
||||
*/
|
||||
#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr)
|
||||
|
||||
/**
|
||||
* MEMBER_VPTR - Obtain the verified pointer to a struct or array member
|
||||
* @base: struct or array to index
|
||||
* @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...)
|
||||
*
|
||||
* The verifier often gets confused by the instruction sequence the compiler
|
||||
* generates for indexing struct fields or arrays. This macro forces the
|
||||
* compiler to generate a code sequence which first calculates the byte offset,
|
||||
* checks it against the struct or array size and add that byte offset to
|
||||
* generate the pointer to the member to help the verifier.
|
||||
*
|
||||
* Ideally, we want to abort if the calculated offset is out-of-bounds. However,
|
||||
* BPF currently doesn't support abort, so evaluate to %NULL instead. The caller
|
||||
* must check for %NULL and take appropriate action to appease the verifier. To
|
||||
* avoid confusing the verifier, it's best to check for %NULL and dereference
|
||||
* immediately.
|
||||
*
|
||||
* vptr = MEMBER_VPTR(my_array, [i][j]);
|
||||
* if (!vptr)
|
||||
* return error;
|
||||
* *vptr = new_value;
|
||||
*
|
||||
* sizeof(@base) should encompass the memory area to be accessed and thus can't
|
||||
* be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
|
||||
* `MEMBER_VPTR(ptr, ->member)`.
|
||||
*/
|
||||
#define MEMBER_VPTR(base, member) (typeof((base) member) *) \
|
||||
({ \
|
||||
u64 __base = (u64)&(base); \
|
||||
u64 __addr = (u64)&((base) member) - __base; \
|
||||
_Static_assert(sizeof(base) >= sizeof((base) member), \
|
||||
"@base is smaller than @member, is @base a pointer?"); \
|
||||
asm volatile ( \
|
||||
"if %0 <= %[max] goto +2\n" \
|
||||
"%0 = 0\n" \
|
||||
"goto +1\n" \
|
||||
"%0 += %1\n" \
|
||||
: "+r"(__addr) \
|
||||
: "r"(__base), \
|
||||
[max]"i"(sizeof(base) - sizeof((base) member))); \
|
||||
__addr; \
|
||||
})
|
||||
|
||||
/**
|
||||
* ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
|
||||
* @arr: array to index into
|
||||
* @i: array index
|
||||
* @n: number of elements in array
|
||||
*
|
||||
* Similar to MEMBER_VPTR() but is intended for use with arrays where the
|
||||
* element count needs to be explicit.
|
||||
* It can be used in cases where a global array is defined with an initial
|
||||
* size but is intended to be resized before loading the BPF program.
|
||||
* Without this version of the macro, MEMBER_VPTR() will use the compile time
|
||||
* size of the array to compute the max, which will result in rejection by
|
||||
* the verifier.
|
||||
*/
|
||||
#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \
|
||||
({ \
|
||||
u64 __base = (u64)arr; \
|
||||
u64 __addr = (u64)&(arr[i]) - __base; \
|
||||
asm volatile ( \
|
||||
"if %0 <= %[max] goto +2\n" \
|
||||
"%0 = 0\n" \
|
||||
"goto +1\n" \
|
||||
"%0 += %1\n" \
|
||||
: "+r"(__addr) \
|
||||
: "r"(__base), \
|
||||
[max]"r"(sizeof(arr[0]) * ((n) - 1))); \
|
||||
__addr; \
|
||||
})
|
||||
|
||||
|
||||
/*
|
||||
* BPF declarations and helpers
|
||||
*/
|
||||
|
||||
/* list and rbtree */
|
||||
#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
|
||||
#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
|
||||
|
||||
void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
|
||||
void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
|
||||
|
||||
#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
|
||||
#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)
|
||||
|
||||
void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
|
||||
void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
|
||||
struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
|
||||
struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
|
||||
struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
|
||||
struct bpf_rb_node *node) __ksym;
|
||||
int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
|
||||
bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
|
||||
void *meta, __u64 off) __ksym;
|
||||
#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)
|
||||
|
||||
struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
|
||||
|
||||
void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
|
||||
#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
|
||||
|
||||
/* task */
|
||||
struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
|
||||
struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
|
||||
void bpf_task_release(struct task_struct *p) __ksym;
|
||||
|
||||
/* cgroup */
|
||||
struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
|
||||
void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
|
||||
struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
|
||||
|
||||
/* css iteration */
|
||||
struct bpf_iter_css;
|
||||
struct cgroup_subsys_state;
|
||||
extern int bpf_iter_css_new(struct bpf_iter_css *it,
|
||||
struct cgroup_subsys_state *start,
|
||||
unsigned int flags) __weak __ksym;
|
||||
extern struct cgroup_subsys_state *
|
||||
bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
|
||||
extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
|
||||
|
||||
/* cpumask */
|
||||
struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
|
||||
struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym;
|
||||
void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
|
||||
u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
|
||||
u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
|
||||
void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
|
||||
void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
|
||||
bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
|
||||
bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
|
||||
bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
|
||||
void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym;
|
||||
void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym;
|
||||
bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1,
|
||||
const struct cpumask *src2) __ksym;
|
||||
void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1,
|
||||
const struct cpumask *src2) __ksym;
|
||||
void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1,
|
||||
const struct cpumask *src2) __ksym;
|
||||
bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym;
|
||||
bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym;
|
||||
bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym;
|
||||
bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym;
|
||||
bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym;
|
||||
void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym;
|
||||
u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym;
|
||||
u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
|
||||
const struct cpumask *src2) __ksym;
|
||||
u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
|
||||
|
||||
/*
|
||||
* Access a cpumask in read-only mode (typically to check bits).
|
||||
*/
|
||||
const struct cpumask *cast_mask(struct bpf_cpumask *mask)
|
||||
{
|
||||
return (const struct cpumask *)mask;
|
||||
}
|
||||
|
||||
/* rcu */
|
||||
void bpf_rcu_read_lock(void) __ksym;
|
||||
void bpf_rcu_read_unlock(void) __ksym;
|
||||
|
||||
|
||||
/*
|
||||
* Other helpers
|
||||
*/
|
||||
|
||||
/* useful compiler attributes */
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
#define __maybe_unused __attribute__((__unused__))
|
||||
|
||||
/*
|
||||
* READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
|
||||
* prevent compiler from caching, redoing or reordering reads or writes.
|
||||
*/
|
||||
typedef __u8 __attribute__((__may_alias__)) __u8_alias_t;
|
||||
typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
|
||||
typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
|
||||
typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
|
||||
|
||||
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
|
||||
{
|
||||
switch (size) {
|
||||
case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break;
|
||||
case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
|
||||
case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
|
||||
case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
|
||||
default:
|
||||
barrier();
|
||||
__builtin_memcpy((void *)res, (const void *)p, size);
|
||||
barrier();
|
||||
}
|
||||
}
|
||||
|
||||
static __always_inline void __write_once_size(volatile void *p, void *res, int size)
|
||||
{
|
||||
switch (size) {
|
||||
case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break;
|
||||
case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
|
||||
case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
|
||||
case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
|
||||
default:
|
||||
barrier();
|
||||
__builtin_memcpy((void *)p, (const void *)res, size);
|
||||
barrier();
|
||||
}
|
||||
}
|
||||
|
||||
#define READ_ONCE(x) \
|
||||
({ \
|
||||
union { typeof(x) __val; char __c[1]; } __u = \
|
||||
{ .__c = { 0 } }; \
|
||||
__read_once_size(&(x), __u.__c, sizeof(x)); \
|
||||
__u.__val; \
|
||||
})
|
||||
|
||||
#define WRITE_ONCE(x, val) \
|
||||
({ \
|
||||
union { typeof(x) __val; char __c[1]; } __u = \
|
||||
{ .__val = (val) }; \
|
||||
__write_once_size(&(x), __u.__c, sizeof(x)); \
|
||||
__u.__val; \
|
||||
})
|
||||
|
||||
/*
|
||||
* log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
|
||||
* @v: The value for which we're computing the base 2 logarithm.
|
||||
*/
|
||||
static inline u32 log2_u32(u32 v)
|
||||
{
|
||||
u32 r;
|
||||
u32 shift;
|
||||
|
||||
r = (v > 0xFFFF) << 4; v >>= r;
|
||||
shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
|
||||
shift = (v > 0xF) << 2; v >>= shift; r |= shift;
|
||||
shift = (v > 0x3) << 1; v >>= shift; r |= shift;
|
||||
r |= (v >> 1);
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
* log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value.
|
||||
* @v: The value for which we're computing the base 2 logarithm.
|
||||
*/
|
||||
static inline u32 log2_u64(u64 v)
|
||||
{
|
||||
u32 hi = v >> 32;
|
||||
if (hi)
|
||||
return log2_u32(hi) + 32 + 1;
|
||||
else
|
||||
return log2_u32(v) + 1;
|
||||
}
|
||||
|
||||
#include "compat.bpf.h"
|
||||
|
||||
#endif /* __SCX_COMMON_BPF_H */
|
||||
75
src/45-scx-nest/include/scx/common.h
Normal file
75
src/45-scx-nest/include/scx/common.h
Normal file
@@ -0,0 +1,75 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2023 Tejun Heo <tj@kernel.org>
|
||||
* Copyright (c) 2023 David Vernet <dvernet@meta.com>
|
||||
*/
|
||||
#ifndef __SCHED_EXT_COMMON_H
|
||||
#define __SCHED_EXT_COMMON_H
|
||||
|
||||
#ifdef __KERNEL__
|
||||
#error "Should not be included by BPF programs"
|
||||
#endif
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <errno.h>
|
||||
|
||||
typedef uint8_t u8;
|
||||
typedef uint16_t u16;
|
||||
typedef uint32_t u32;
|
||||
typedef uint64_t u64;
|
||||
typedef int8_t s8;
|
||||
typedef int16_t s16;
|
||||
typedef int32_t s32;
|
||||
typedef int64_t s64;
|
||||
|
||||
/*
 * SCX_BUG - Print an unrecoverable-error message and terminate the process.
 * @__fmt: printf-style format string describing the failure.
 *
 * Prints the source location, the current errno (if set) via strerror(), and
 * the formatted message to stderr, then exit(EXIT_FAILURE). Never returns.
 * NOTE(review): strerror() is declared in <string.h>; confirm that header is
 * reachable wherever this file is included.
 */
#define SCX_BUG(__fmt, ...)							\
	do {									\
		fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__);		\
		if (errno)							\
			fprintf(stderr, " (%s)\n", strerror(errno));		\
		else								\
			fprintf(stderr, "\n");					\
		fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__);		\
		fprintf(stderr, "\n");						\
										\
		exit(EXIT_FAILURE);						\
	} while (0)
|
||||
|
||||
/*
 * SCX_BUG_ON - Abort via SCX_BUG() when @__cond is true; no-op otherwise.
 * @__cond: condition that indicates an unrecoverable error.
 * @__fmt: printf-style format string passed through to SCX_BUG().
 */
#define SCX_BUG_ON(__cond, __fmt, ...)					\
	do {								\
		if (__cond)						\
			SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__);	\
	} while (0)
|
||||
|
||||
/**
 * RESIZE_ARRAY - Convenience macro for resizing a BPF array
 * @__skel: the skeleton containing the array
 * @elfsec: the data section of the BPF program in which the array exists
 * @arr: the name of the array
 * @n: the desired array element count
 *
 * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two
 * operations. It resizes the map which corresponds to the custom data
 * section that contains the target array. As a side effect, the BTF info for
 * the array is adjusted so that the array length is sized to cover the new
 * data section size. The second operation is reassigning the skeleton pointer
 * for that custom data section so that it points to the newly memory mapped
 * region.
 *
 * Must be invoked after skeleton open but before load (value-size changes are
 * rejected once the map is created).
 */
#define RESIZE_ARRAY(__skel, elfsec, arr, n)					  \
	do {									  \
		size_t __sz;							  \
		bpf_map__set_value_size((__skel)->maps.elfsec##_##arr,		  \
					sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \
		(__skel)->elfsec##_##arr =					  \
			bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \
	} while (0)
|
||||
|
||||
#include "user_exit_info.h"
|
||||
#include "compat.h"
|
||||
|
||||
#endif /* __SCHED_EXT_COMMON_H */
|
||||
47
src/45-scx-nest/include/scx/compat.bpf.h
Normal file
47
src/45-scx-nest/include/scx/compat.bpf.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
|
||||
* Copyright (c) 2024 David Vernet <dvernet@meta.com>
|
||||
*/
|
||||
#ifndef __SCX_COMPAT_BPF_H
|
||||
#define __SCX_COMPAT_BPF_H
|
||||
|
||||
/*
 * __COMPAT_ENUM_OR_ZERO - Resolve a kernel enum value, or 0 if the running
 * kernel's BTF doesn't define it (CO-RE backward compatibility).
 * @__type: enum type name.
 * @__ent: enumerator name.
 */
#define __COMPAT_ENUM_OR_ZERO(__type, __ent)					\
({										\
	__type __ret = 0;							\
	if (bpf_core_enum_value_exists(__type, __ent))				\
		__ret = __ent;							\
	__ret;									\
})
|
||||
|
||||
/*
 * Backward-compat wrappers for sched_ext kfuncs introduced in newer kernels.
 * Each wrapper checks at load time via bpf_ksym_exists() whether the kfunc is
 * present and degrades to NULL / (void)0 / false on older kernels.
 */

/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
#define __COMPAT_scx_bpf_task_cgroup(p)						\
	(bpf_ksym_exists(scx_bpf_task_cgroup) ?					\
	 scx_bpf_task_cgroup((p)) : NULL)

/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */
#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice)			\
	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ?			\
	 scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0)
#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime)			\
	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ?			\
	 scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0)
#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags)		\
	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ?				\
	 scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags)	\
	(bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ?			\
	 scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
|
||||
|
||||
/*
 * Define sched_ext_ops. This may be expanded to define multiple variants for
 * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
 *
 * Places the struct_ops map in the ".struct_ops.link" section so it is
 * attached via a BPF link (detachable / replaceable).
 */
#define SCX_OPS_DEFINE(__name, ...)						\
	SEC(".struct_ops.link")							\
	struct sched_ext_ops __name = {						\
		__VA_ARGS__,							\
	};
|
||||
|
||||
#endif /* __SCX_COMPAT_BPF_H */
|
||||
186
src/45-scx-nest/include/scx/compat.h
Normal file
186
src/45-scx-nest/include/scx/compat.h
Normal file
@@ -0,0 +1,186 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
|
||||
* Copyright (c) 2024 David Vernet <dvernet@meta.com>
|
||||
*/
|
||||
#ifndef __SCX_COMPAT_H
|
||||
#define __SCX_COMPAT_H
|
||||
|
||||
#include <bpf/btf.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/* Lazily-loaded kernel BTF handle, shared by the feature probes below. */
struct btf *__COMPAT_vmlinux_btf __attribute__((weak));

/*
 * Load vmlinux BTF on first use and cache it in __COMPAT_vmlinux_btf.
 * Aborts via SCX_BUG_ON() if kernel BTF can't be loaded. Not thread-safe;
 * intended for single-threaded scheduler setup paths.
 */
static inline void __COMPAT_load_vmlinux_btf(void)
{
	if (!__COMPAT_vmlinux_btf) {
		__COMPAT_vmlinux_btf = btf__load_vmlinux_btf();
		SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()");
	}
}
|
||||
|
||||
/*
 * __COMPAT_read_enum - Look up an enumerator's value in kernel BTF.
 * @type: name of the enum type.
 * @name: name of the enumerator within @type.
 * @v: out parameter receiving the value on success.
 *
 * Returns true and fills @v when @type exists in vmlinux BTF and contains an
 * enumerator called @name; returns false (leaving @v untouched) otherwise.
 * Handles both 32-bit (BTF_KIND_ENUM) and 64-bit (BTF_KIND_ENUM64) enums.
 */
static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v)
{
	const struct btf_type *t;
	const char *n;
	s32 tid;
	int i;

	__COMPAT_load_vmlinux_btf();

	tid = btf__find_by_name(__COMPAT_vmlinux_btf, type);
	if (tid < 0)
		return false;

	t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
	SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);

	if (btf_is_enum(t)) {
		struct btf_enum *e = btf_enum(t);

		/* Linear scan over the enumerator array of @t. */
		for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
			n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
			SCX_BUG_ON(!n, "btf__name_by_offset()");
			if (!strcmp(n, name)) {
				*v = e[i].val;
				return true;
			}
		}
	} else if (btf_is_enum64(t)) {
		struct btf_enum64 *e = btf_enum64(t);

		/* Same scan, but values are split across hi32/lo32 words. */
		for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
			n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
			SCX_BUG_ON(!n, "btf__name_by_offset()");
			if (!strcmp(n, name)) {
				*v = btf_enum64_value(&e[i]);
				return true;
			}
		}
	}

	return false;
}
|
||||
|
||||
/*
 * __COMPAT_ENUM_OR_ZERO - Userspace counterpart of the BPF-side macro of the
 * same name: resolve a kernel enum value from BTF, or 0 when the running
 * kernel doesn't define it.
 */
#define __COMPAT_ENUM_OR_ZERO(__type, __ent)					\
({										\
	u64 __val = 0;								\
	__COMPAT_read_enum(__type, __ent, &__val);				\
	__val;									\
})
|
||||
|
||||
/*
 * __COMPAT_has_ksym - Return true iff @ksym is present in vmlinux BTF, i.e.
 * the running kernel exposes the symbol (typically a kfunc).
 */
static inline bool __COMPAT_has_ksym(const char *ksym)
{
	__COMPAT_load_vmlinux_btf();
	return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0;
}
|
||||
|
||||
/*
 * __COMPAT_struct_has_field - Return true iff kernel struct @type has a
 * member named @field, according to vmlinux BTF. Used to probe for
 * sched_ext_ops members that only exist on newer kernels.
 */
static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
{
	const struct btf_type *t;
	const struct btf_member *m;
	const char *n;
	s32 tid;
	int i;

	__COMPAT_load_vmlinux_btf();
	tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT);
	if (tid < 0)
		return false;

	t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
	SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);

	m = btf_members(t);

	/* Linear scan over the struct's member array. */
	for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
		n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off);
		SCX_BUG_ON(!n, "btf__name_by_offset()");
		if (!strcmp(n, field))
			return true;
	}

	return false;
}

/* ops flag resolved from kernel BTF; 0 when the kernel doesn't define it. */
#define SCX_OPS_SWITCH_PARTIAL							\
	__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
|
||||
|
||||
static inline long scx_hotplug_seq(void)
|
||||
{
|
||||
int fd;
|
||||
char buf[32];
|
||||
ssize_t len;
|
||||
long val;
|
||||
|
||||
fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
|
||||
if (fd < 0)
|
||||
return -ENOENT;
|
||||
|
||||
len = read(fd, buf, sizeof(buf) - 1);
|
||||
SCX_BUG_ON(len <= 0, "read failed (%ld)", len);
|
||||
buf[len] = 0;
|
||||
close(fd);
|
||||
|
||||
val = strtoul(buf, NULL, 10);
|
||||
SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
/*
 * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
 * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
 * and attach it, backward compatibility is automatically maintained where
 * reasonable.
 *
 * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
 * the current minimum required kernel version.
 */
/*
 * SCX_OPS_OPEN - Open the BPF skeleton @__scx_name and seed ops @__ops_name
 * with the current hotplug sequence so a CPU hotplug between open and attach
 * is detected. Evaluates to the opened skeleton pointer; aborts on failure
 * or when the kernel is too old (probed via the sched_ext_ops.dump member).
 */
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({					\
	struct __scx_name *__skel;						\
										\
	SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"),		\
		   "sched_ext_ops.dump() missing, kernel too old?");		\
										\
	__skel = __scx_name##__open();						\
	SCX_BUG_ON(!__skel, "Could not open " #__scx_name);			\
	__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq();		\
	__skel;									\
})
|
||||
|
||||
/*
 * SCX_OPS_LOAD - Size the user_exit_info dump buffer (see UEI_SET_SIZE())
 * and load the opened skeleton. Aborts on load failure.
 */
#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({		\
	UEI_SET_SIZE(__skel, __ops_name, __uei_name);				\
	SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel");	\
})
|
||||
|
||||
/*
 * New versions of bpftool now emit additional link placeholders for BPF maps,
 * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps
 * automatically, assuming libbpf is recent enough (v1.5+). Old libbpf will do
 * nothing with those links and won't attempt to auto-attach maps.
 *
 * To maintain compatibility with older libbpf while avoiding trying to attach
 * twice, disable the autoattach feature on newer libbpf.
 */
#if LIBBPF_MAJOR_VERSION > 1 ||							\
	(LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5)
#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name)			\
	bpf_map__set_autoattach((__skel)->maps.__ops_name, false)
#else
#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0)
#endif

/*
 * SCX_OPS_ATTACH - Attach the skeleton's programs, then attach the struct_ops
 * map @__ops_name explicitly. Evaluates to the struct_ops bpf_link; aborts on
 * any failure. Autoattach of the struct_ops map is disabled first (see above)
 * so it isn't attached twice on libbpf >= 1.5.
 */
#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({			\
	struct bpf_link *__link;						\
	__SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name);			\
	SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel");	\
	__link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name);		\
	SCX_BUG_ON(!__link, "Failed to attach struct_ops");			\
	__link;									\
})
|
||||
|
||||
#endif /* __SCX_COMPAT_H */
|
||||
115
src/45-scx-nest/include/scx/user_exit_info.h
Normal file
115
src/45-scx-nest/include/scx/user_exit_info.h
Normal file
@@ -0,0 +1,115 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Define struct user_exit_info which is shared between BPF and userspace parts
|
||||
* to communicate exit status and other information.
|
||||
*
|
||||
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
|
||||
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
|
||||
*/
|
||||
#ifndef __USER_EXIT_INFO_H
|
||||
#define __USER_EXIT_INFO_H
|
||||
|
||||
/* Buffer sizes for the fields of struct user_exit_info below. */
enum uei_sizes {
	UEI_REASON_LEN		= 128,
	UEI_MSG_LEN		= 1024,
	UEI_DUMP_DFL_LEN	= 32768,	/* default debug-dump buffer size */
};

/*
 * Exit status handed from the BPF scheduler to userspace when the scheduler
 * is unloaded. Written by UEI_RECORD() on the BPF side, consumed by
 * UEI_EXITED()/UEI_REPORT() on the userspace side.
 */
struct user_exit_info {
	int		kind;		/* scx exit kind; non-zero once exited */
	s64		exit_code;	/* see enum scx_exit_code */
	char		reason[UEI_REASON_LEN];
	char		msg[UEI_MSG_LEN];
};
|
||||
|
||||
#ifdef __bpf__
|
||||
|
||||
#ifdef LSP
|
||||
#include "../vmlinux/vmlinux.h"
|
||||
#else
|
||||
#include "vmlinux.h"
|
||||
#endif
|
||||
#include <bpf/bpf_core_read.h>
|
||||
|
||||
/*
 * UEI_DEFINE - Declare the BPF-side exit-info state: a resizable dump buffer,
 * its (userspace-set) length, and the user_exit_info struct itself. The
 * struct lives in .data so userspace can read it through the skeleton.
 */
#define UEI_DEFINE(__name)							\
	char RESIZABLE_ARRAY(data, __name##_dump);				\
	const volatile u32 __name##_dump_len;					\
	struct user_exit_info __name SEC(".data")

/*
 * UEI_RECORD - Copy the kernel's scx_exit_info @__ei into the UEI_DEFINE()'d
 * state @__uei_name. @kind is written last, with a full barrier, so userspace
 * polling UEI_EXITED() sees a fully-populated record once kind is non-zero.
 */
#define UEI_RECORD(__uei_name, __ei) ({						\
	bpf_probe_read_kernel_str(__uei_name.reason,				\
				  sizeof(__uei_name.reason), (__ei)->reason);	\
	bpf_probe_read_kernel_str(__uei_name.msg,				\
				  sizeof(__uei_name.msg), (__ei)->msg);		\
	bpf_probe_read_kernel_str(__uei_name##_dump,				\
				  __uei_name##_dump_len, (__ei)->dump);		\
	if (bpf_core_field_exists((__ei)->exit_code))				\
		__uei_name.exit_code = (__ei)->exit_code;			\
	/* use __sync to force memory barrier */				\
	__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind,		\
				    (__ei)->kind);				\
})
|
||||
|
||||
#else /* !__bpf__ */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
/*
 * UEI_SET_SIZE - Before load, size the BPF-side dump buffer to the ops'
 * requested exit_dump_len (default UEI_DUMP_DFL_LEN) and publish the chosen
 * length to the BPF program via the rodata __uei_name##_dump_len.
 */
#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({				\
	u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
	(__skel)->rodata->__uei_name##_dump_len = __len;			\
	RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len);			\
})

/*
 * UEI_EXITED - Non-zero once the BPF scheduler has recorded its exit. Pairs
 * with the barrier in UEI_RECORD() so the rest of the record is visible.
 */
#define UEI_EXITED(__skel, __uei_name) ({					\
	/* use __sync to force memory barrier */				\
	__sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1);	\
})

/*
 * UEI_REPORT - Print the debug dump (if any) and the exit reason/message to
 * stderr; evaluates to the recorded exit_code so callers can act on it.
 */
#define UEI_REPORT(__skel, __uei_name) ({					\
	struct user_exit_info *__uei = &(__skel)->data->__uei_name;		\
	char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \
	if (__uei_dump[0] != '\0') {						\
		fputs("\nDEBUG DUMP\n", stderr);				\
		fputs("================================================================================\n\n", stderr); \
		fputs(__uei_dump, stderr);					\
		fputs("\n================================================================================\n\n", stderr); \
	}									\
	fprintf(stderr, "EXIT: %s", __uei->reason);				\
	if (__uei->msg[0] != '\0')						\
		fprintf(stderr, " (%s)", __uei->msg);				\
	fputs("\n", stderr);							\
	__uei->exit_code;							\
})
|
||||
|
||||
/*
 * We can't import vmlinux.h while compiling user C code. Let's duplicate
 * scx_exit_code definition.
 */
enum scx_exit_code {
	/* Reasons */
	SCX_ECODE_RSN_HOTPLUG	= 1LLU << 32,

	/* Actions */
	SCX_ECODE_ACT_RESTART	= 1LLU << 48,
};

/*
 * Bit layout of the 64-bit exit code: low 32 bits are scheduler-defined, bits
 * 32-47 carry a system reason, bits 48-63 a system-requested action.
 */
enum uei_ecode_mask {
	UEI_ECODE_USER_MASK	= ((1LLU << 32) - 1),
	UEI_ECODE_SYS_RSN_MASK	= ((1LLU << 16) - 1) << 32,
	UEI_ECODE_SYS_ACT_MASK	= ((1LLU << 16) - 1) << 48,
};

/*
 * These macro interpret the ecode returned from UEI_REPORT().
 */
#define UEI_ECODE_USER(__ecode)		((__ecode) & UEI_ECODE_USER_MASK)
#define UEI_ECODE_SYS_RSN(__ecode)	((__ecode) & UEI_ECODE_SYS_RSN_MASK)
#define UEI_ECODE_SYS_ACT(__ecode)	((__ecode) & UEI_ECODE_SYS_ACT_MASK)

/* True when the kernel asked the scheduler to be restarted (e.g. hotplug). */
#define UEI_ECODE_RESTART(__ecode)	(UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
|
||||
|
||||
#endif /* __bpf__ */
|
||||
#endif /* __USER_EXIT_INFO_H */
|
||||
654
src/45-scx-nest/scx_nest.bpf.c
Normal file
654
src/45-scx-nest/scx_nest.bpf.c
Normal file
@@ -0,0 +1,654 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* As described in [0], a Nest scheduler which encourages task placement on
|
||||
* cores that are likely to be running at higher frequency, based upon recent usage.
|
||||
*
|
||||
* [0]: https://hal.inria.fr/hal-03612592/file/paper.pdf
|
||||
*
|
||||
* It operates as a global weighted vtime scheduler (similarly to CFS), while
|
||||
* using the Nest algorithm to choose idle cores at wakeup time.
|
||||
*
|
||||
* It also demonstrates the following niceties.
|
||||
*
|
||||
* - More robust task placement policies.
|
||||
* - Termination notification for userspace.
|
||||
*
|
||||
* While rather simple, this scheduler should work reasonably well on CPUs with
|
||||
* a uniform L3 cache topology. While preemption is not implemented, the fact
|
||||
* that the scheduling queue is shared across all CPUs means that whatever is
|
||||
* at the front of the queue is likely to be executed fairly quickly given
|
||||
* enough number of CPUs.
|
||||
*
|
||||
* Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2023 David Vernet <dvernet@meta.com>
|
||||
* Copyright (c) 2023 Tejun Heo <tj@kernel.org>
|
||||
*/
|
||||
#include <scx/common.bpf.h>
|
||||
|
||||
#include "scx_nest.h"
|
||||
|
||||
/* Mirrors the kernel's TASK_DEAD state bit (vmlinux.h doesn't export it). */
#define TASK_DEAD			0x00000080

char _license[] SEC("license") = "GPL";

enum {
	FALLBACK_DSQ_ID		= 0,	/* single global vtime-ordered DSQ */
	MSEC_PER_SEC		= 1000LLU,
	USEC_PER_MSEC		= 1000LLU,
	NSEC_PER_USEC		= 1000LLU,
	NSEC_PER_MSEC		= USEC_PER_MSEC * NSEC_PER_USEC,
	USEC_PER_SEC		= USEC_PER_MSEC * MSEC_PER_SEC,
	NSEC_PER_SEC		= NSEC_PER_USEC * USEC_PER_SEC,
};

#define CLOCK_BOOTTIME 7
#define NUMA_NO_NODE -1

/* Tunables, overridable from userspace before load (const volatile rodata). */
const volatile u64 p_remove_ns = 2 * NSEC_PER_MSEC;	/* compaction delay for an idle primary core */
const volatile u64 r_max = 5;				/* max cores in the reserve nest */
const volatile u64 r_impatient = 2;			/* primary-miss count before falling back */
const volatile u64 slice_ns = SCX_SLICE_DFL;		/* dispatch slice length */
const volatile bool find_fully_idle = false;		/* prefer fully-idle SMT cores when set */
const volatile u64 sampling_cadence_ns = 1 * NSEC_PER_SEC;	/* stats sampling period */
const volatile u64 r_depth = 5;		/* NOTE(review): unused in the visible code — confirm */

// Used for stats tracking. May be stale at any given time.
u64 stats_primary_mask, stats_reserved_mask, stats_other_mask, stats_idle_mask;

// Used for internal tracking.
static s32 nr_reserved;		/* approximate size of the reserve nest */

static u64 vtime_now;		/* monotonically-advancing global vtime */
UEI_DEFINE(uei);		/* exit info shared with userspace */

extern unsigned long CONFIG_HZ __kconfig;
|
||||
|
||||
/* Per-task scheduling context */
struct task_ctx {
	/*
	 * A temporary cpumask for calculating a task's primary and reserve
	 * mask.
	 */
	struct bpf_cpumask __kptr *tmp_mask;

	/*
	 * The number of times that a task observes that its previous core is
	 * not idle. If this occurs r_impatient times in a row, a core is
	 * attempted to be retrieved from either the reserve nest, or the
	 * fallback nest.
	 */
	u32 prev_misses;

	/*
	 * A core that the task is "attached" to, meaning the last core that it
	 * executed on at least twice in a row, and the core that it first
	 * tries to migrate to on wakeup. The task only migrates to the
	 * attached core if it is idle and in the primary nest.
	 */
	s32 attached_core;

	/*
	 * The last core that the task executed on. This is used to determine
	 * if the task should attach to the core that it will execute on next.
	 */
	s32 prev_cpu;
};

/* Task-local storage holding a struct task_ctx per task (created on demand). */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
|
||||
|
||||
/* Per-CPU scheduling context. */
struct pcpu_ctx {
	/* The timer used to compact the core from the primary nest. */
	struct bpf_timer timer;

	/* Whether the current core has been scheduled for compaction. */
	bool scheduled_compaction;
};

/* One pcpu_ctx per possible CPU, indexed by CPU id (up to 1024 CPUs). */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1024);
	__type(key, s32);
	__type(value, struct pcpu_ctx);
} pcpu_ctxs SEC(".maps");

/* Holder for the periodic stats-sampling timer. */
struct stats_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct stats_timer);
} stats_timer SEC(".maps");

const volatile u32 nr_cpus = 1; /* !0 for veristat, set during init. */

/* The primary (hot) and reserve (warm) nests, shared scheduler-wide. */
private(NESTS) struct bpf_cpumask __kptr *primary_cpumask;
private(NESTS) struct bpf_cpumask __kptr *reserve_cpumask;

/* Per-CPU event counters, one slot per NEST_STAT() index. */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u64));
	__uint(max_entries, NEST_STAT(NR));
} stats SEC(".maps");
|
||||
|
||||
|
||||
/* Bump the per-CPU counter for stat slot @idx; out-of-range indices are
 * silently ignored (lookup returns NULL). */
static __always_inline void stat_inc(u32 idx)
{
	u64 *counter;

	counter = bpf_map_lookup_elem(&stats, &idx);
	if (!counter)
		return;
	*counter += 1;
}
|
||||
|
||||
/* Wraparound-safe vtime comparison: true iff @a precedes @b. */
static inline bool vtime_before(u64 a, u64 b)
{
	s64 delta = (s64)(a - b);

	return delta < 0;
}
|
||||
|
||||
/*
 * Try to place @cpu into the reserve nest @reserved, respecting the r_max
 * cap. @promotion selects which stat is bumped (promoted from "other" vs
 * demoted from primary). If the reserve nest is full, @cpu is instead cleared
 * from @reserved entirely. Caller must hold the RCU read lock protecting
 * @reserved.
 */
static __always_inline void
try_make_core_reserved(s32 cpu, struct bpf_cpumask * reserved, bool promotion)
{
	s32 tmp_nr_reserved;

	/*
	 * This check is racy, but that's OK. If we incorrectly fail to promote
	 * a core to reserve, it's because another context added or removed a
	 * core from reserved in this small window. It will balance out over
	 * subsequent wakeups.
	 */
	tmp_nr_reserved = nr_reserved;
	if (tmp_nr_reserved < r_max) {
		/*
		 * It's possible that we could exceed r_max for a time here,
		 * but that should balance out as more cores are either demoted
		 * or fail to be promoted into the reserve nest.
		 */
		__sync_fetch_and_add(&nr_reserved, 1);
		bpf_cpumask_set_cpu(cpu, reserved);
		if (promotion)
			stat_inc(NEST_STAT(PROMOTED_TO_RESERVED));
		else
			stat_inc(NEST_STAT(DEMOTED_TO_RESERVED));
	} else {
		bpf_cpumask_clear_cpu(cpu, reserved);
		stat_inc(NEST_STAT(RESERVED_AT_CAPACITY));
	}
}
|
||||
|
||||
/*
 * Maintain the task's core attachment: a task attaches to @new_cpu once it's
 * placed on the same core it ran on last time. Always records @prev_cpu as
 * the task's most recent core for the next invocation.
 */
static void update_attached(struct task_ctx *tctx, s32 prev_cpu, s32 new_cpu)
{
	bool ran_here_before = (tctx->prev_cpu == new_cpu);

	if (ran_here_before)
		tctx->attached_core = new_cpu;
	tctx->prev_cpu = prev_cpu;
}
|
||||
|
||||
/*
 * Timer callback (armed in nest_dispatch(), pinned to its CPU) that demotes
 * the current core from the primary nest into the reserve nest after it has
 * sat idle for p_remove_ns. Always returns 0 as required by bpf_timer.
 */
static int compact_primary_core(void *map, int *key, struct bpf_timer *timer)
{
	struct bpf_cpumask *primary, *reserve;
	s32 cpu = bpf_get_smp_processor_id();
	struct pcpu_ctx *pcpu_ctx;

	stat_inc(NEST_STAT(CALLBACK_COMPACTED));
	/*
	 * If we made it to this callback, it means that the timer callback was
	 * never cancelled, and so the core needs to be demoted from the
	 * primary nest.
	 */
	pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu);
	if (!pcpu_ctx) {
		scx_bpf_error("Couldn't lookup pcpu ctx");
		return 0;
	}
	bpf_rcu_read_lock();
	primary = primary_cpumask;
	reserve = reserve_cpumask;
	if (!primary || !reserve) {
		scx_bpf_error("Couldn't find primary or reserve");
		bpf_rcu_read_unlock();
		return 0;
	}

	/* Demote: drop from primary, then try to park in reserve. */
	bpf_cpumask_clear_cpu(cpu, primary);
	try_make_core_reserved(cpu, reserve, false);
	bpf_rcu_read_unlock();
	pcpu_ctx->scheduled_compaction = false;
	return 0;
}
|
||||
|
||||
/*
 * select_cpu callback: implement the Nest placement policy. Preference order:
 * attached core -> previous core in primary -> (optionally fully-)idle
 * primary core -> idle reserve core (promoted) -> any idle core in the task's
 * cpumask. Tasks placed on a CPU are dispatched directly to SCX_DSQ_LOCAL;
 * otherwise the task falls through to nest_enqueue(). Returns the chosen CPU,
 * @prev_cpu when nothing idle was found, or -ENOENT on lookup failure.
 */
s32 BPF_STRUCT_OPS(nest_select_cpu, struct task_struct *p, s32 prev_cpu,
		   u64 wake_flags)
{
	struct bpf_cpumask *p_mask, *primary, *reserve;
	s32 cpu;
	struct task_ctx *tctx;
	struct pcpu_ctx *pcpu_ctx;
	bool direct_to_primary = false, reset_impatient = true;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx)
		return -ENOENT;

	bpf_rcu_read_lock();
	p_mask = tctx->tmp_mask;
	primary = primary_cpumask;
	reserve = reserve_cpumask;
	if (!p_mask || !primary || !reserve) {
		bpf_rcu_read_unlock();
		return -ENOENT;
	}

	tctx->prev_cpu = prev_cpu;

	/* p_mask = CPUs the task may run on that are in the primary nest. */
	bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));

	/* First try to wake the task on its attached core. */
	if (bpf_cpumask_test_cpu(tctx->attached_core, cast_mask(p_mask)) &&
	    scx_bpf_test_and_clear_cpu_idle(tctx->attached_core)) {
		cpu = tctx->attached_core;
		stat_inc(NEST_STAT(WAKEUP_ATTACHED));
		goto migrate_primary;
	}

	/*
	 * Try to stay on the previous core if it's in the primary set, and
	 * there's no hypertwin. If the previous core is the core the task is
	 * attached to, don't bother as we already just tried that above.
	 */
	if (prev_cpu != tctx->attached_core &&
	    bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) &&
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		cpu = prev_cpu;
		stat_inc(NEST_STAT(WAKEUP_PREV_PRIMARY));
		goto migrate_primary;
	}

	if (find_fully_idle) {
		/* Then try any fully idle core in primary. */
		cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask),
					    SCX_PICK_IDLE_CORE);
		if (cpu >= 0) {
			stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_PRIMARY));
			goto migrate_primary;
		}
	}

	/* Then try _any_ idle core in primary, even if its hypertwin is active. */
	cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
	if (cpu >= 0) {
		stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_PRIMARY));
		goto migrate_primary;
	}

	/* Primary miss: after r_impatient misses, skip the reserve nest gate. */
	if (r_impatient > 0 && ++tctx->prev_misses >= r_impatient) {
		direct_to_primary = true;
		tctx->prev_misses = 0;
		stat_inc(NEST_STAT(TASK_IMPATIENT));
	}

	/* Past this point a placement no longer counts as a primary hit. */
	reset_impatient = false;

	/* Then try any fully idle core in reserve. */
	bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(reserve));
	if (find_fully_idle) {
		cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask),
					    SCX_PICK_IDLE_CORE);
		if (cpu >= 0) {
			stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_RESERVE));
			goto promote_to_primary;
		}
	}

	/* Then try _any_ idle core in reserve, even if its hypertwin is active. */
	cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
	if (cpu >= 0) {
		stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_RESERVE));
		goto promote_to_primary;
	}

	/* Then try _any_ idle core in the task's cpumask. */
	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0) {
		/*
		 * We found a core that (we didn't _think_) is in any nest.
		 * This means that we need to either promote the core to the
		 * reserve nest, or if we're going direct to primary due to
		 * r_impatient being exceeded, promote directly to primary.
		 *
		 * We have to do one final check here to see if the core is in
		 * the primary or reserved cpumask because we could potentially
		 * race with the core changing states between AND'ing the
		 * primary and reserve masks with p->cpus_ptr above, and
		 * atomically reserving it from the idle mask with
		 * scx_bpf_pick_idle_cpu(). This is also technically true of
		 * the checks above, but in all of those cases we just put the
		 * core directly into the primary mask so it's not really that
		 * big of a problem. Here, we want to make sure that we don't
		 * accidentally put a core into the reserve nest that was e.g.
		 * already in the primary nest. This is unlikely, but we check
		 * for it on what should be a relatively cold path regardless.
		 */
		stat_inc(NEST_STAT(WAKEUP_IDLE_OTHER));
		if (bpf_cpumask_test_cpu(cpu, cast_mask(primary)))
			goto migrate_primary;
		else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve)))
			goto promote_to_primary;
		else if (direct_to_primary)
			goto promote_to_primary;
		else
			try_make_core_reserved(cpu, reserve, true);
		bpf_rcu_read_unlock();
		return cpu;
	}

	/* Nothing idle anywhere; let enqueue/dispatch sort it out. */
	bpf_rcu_read_unlock();
	return prev_cpu;

promote_to_primary:
	stat_inc(NEST_STAT(PROMOTED_TO_PRIMARY));
migrate_primary:
	if (reset_impatient)
		tctx->prev_misses = 0;
	/* Cancel any pending compaction on the core we're about to occupy. */
	pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu);
	if (pcpu_ctx) {
		if (pcpu_ctx->scheduled_compaction) {
			if (bpf_timer_cancel(&pcpu_ctx->timer) < 0)
				scx_bpf_error("Failed to cancel pcpu timer");
			if (bpf_timer_set_callback(&pcpu_ctx->timer, compact_primary_core))
				scx_bpf_error("Failed to re-arm pcpu timer");
			pcpu_ctx->scheduled_compaction = false;
			stat_inc(NEST_STAT(CANCELLED_COMPACTION));
		}
	} else {
		scx_bpf_error("Failed to lookup pcpu ctx");
	}
	bpf_cpumask_set_cpu(cpu, primary);
	/*
	 * Check to see whether the CPU is in the reserved nest. This can
	 * happen if the core is compacted concurrently with us trying to place
	 * the currently-waking task onto it. Similarly, this is the expected
	 * state of the core if we found the core in the reserve nest and are
	 * promoting it.
	 *
	 * We don't have to worry about racing with any other waking task here
	 * because we've atomically reserved the core with (some variant of)
	 * scx_bpf_pick_idle_cpu().
	 */
	if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) {
		__sync_sub_and_fetch(&nr_reserved, 1);
		bpf_cpumask_clear_cpu(cpu, reserve);
	}
	bpf_rcu_read_unlock();
	update_attached(tctx, prev_cpu, cpu);
	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
	return cpu;
}
|
||||
|
||||
/*
 * enqueue callback: queue @p on the shared fallback DSQ ordered by its
 * weighted vtime. Reached when select_cpu didn't dispatch directly.
 */
void BPF_STRUCT_OPS(nest_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_ctx *tctx;
	u64 vtime = p->scx.dsq_vtime;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("Unable to find task ctx");
		return;
	}

	/*
	 * Limit the amount of budget that an idling task can accumulate
	 * to one slice.
	 */
	if (vtime_before(vtime, vtime_now - slice_ns))
		vtime = vtime_now - slice_ns;

	scx_bpf_dispatch_vtime(p, FALLBACK_DSQ_ID, slice_ns, vtime,
			       enq_flags);
}
|
||||
|
||||
/*
 * dispatch callback: try to consume from the shared fallback DSQ. When the
 * DSQ is empty, keep a still-runnable @prev on a primary CPU; otherwise
 * schedule (or, for a dying task, eagerly perform) compaction of an idle
 * primary core into the reserve nest.
 */
void BPF_STRUCT_OPS(nest_dispatch, s32 cpu, struct task_struct *prev)
{
	struct pcpu_ctx *pcpu_ctx;
	struct bpf_cpumask *primary, *reserve;
	s32 key = cpu;
	bool in_primary;

	primary = primary_cpumask;
	reserve = reserve_cpumask;
	if (!primary || !reserve) {
		scx_bpf_error("No primary or reserve cpumask");
		return;
	}

	pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key);
	if (!pcpu_ctx) {
		scx_bpf_error("Failed to lookup pcpu ctx");
		return;
	}

	if (!scx_bpf_consume(FALLBACK_DSQ_ID)) {
		in_primary = bpf_cpumask_test_cpu(cpu, cast_mask(primary));

		/* Nothing queued: keep running @prev on a primary core. */
		if (prev && (prev->scx.flags & SCX_TASK_QUEUED) && in_primary) {
			scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, slice_ns, 0);
			return;
		}

		stat_inc(NEST_STAT(NOT_CONSUMED));
		if (in_primary) {
			/*
			 * Immediately demote a primary core if the previous
			 * task on it is dying
			 *
			 * Note that we elect to not compact the "first" CPU in
			 * the mask so as to encourage at least one core to
			 * remain in the nest. It would be better to check for
			 * whether there is only one core remaining in the
			 * nest, but BPF doesn't yet have a kfunc for querying
			 * cpumask weight.
			 */
			if ((prev && prev->__state == TASK_DEAD) &&
			    (cpu != bpf_cpumask_first(cast_mask(primary)))) {
				stat_inc(NEST_STAT(EAGERLY_COMPACTED));
				bpf_cpumask_clear_cpu(cpu, primary);
				try_make_core_reserved(cpu, reserve, false);
			} else {
				pcpu_ctx->scheduled_compaction = true;
				/*
				 * The core isn't being used anymore. Set a
				 * timer to remove the core from the nest in
				 * p_remove if it's still unused by that point.
				 */
				bpf_timer_start(&pcpu_ctx->timer, p_remove_ns,
						BPF_F_TIMER_CPU_PIN);
				stat_inc(NEST_STAT(SCHEDULED_COMPACTION));
			}
		}
		return;
	}
	stat_inc(NEST_STAT(CONSUMED));
}
|
||||
|
||||
/* running callback: advance the global vtime floor as tasks start running. */
void BPF_STRUCT_OPS(nest_running, struct task_struct *p)
{
	/*
	 * Global vtime always progresses forward as tasks start executing. The
	 * test and update can be performed concurrently from multiple CPUs and
	 * thus racy. Any error should be contained and temporary. Let's just
	 * live with it.
	 */
	if (vtime_before(vtime_now, p->scx.dsq_vtime))
		vtime_now = p->scx.dsq_vtime;
}
|
||||
|
||||
/*
 * ops.stopping() callback: a task is coming off a CPU.
 *
 * Charge vtime for the portion of the slice that was actually consumed,
 * scaled by the inverse of the task's weight (heavier tasks are charged
 * less vtime per unit of execution).
 */
void BPF_STRUCT_OPS(nest_stopping, struct task_struct *p, bool runnable)
{
	u64 used = slice_ns - p->scx.slice;

	p->scx.dsq_vtime += used * 100 / p->scx.weight;
}
|
||||
|
||||
/*
 * ops.init_task() callback: set up per-task state for a new task.
 *
 * Allocates the task-local storage (this callback can sleep, so the
 * allocation automatically uses GFP_KERNEL) plus a scratch cpumask, and
 * initializes the task's core-attachment bookkeeping.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails.
 */
s32 BPF_STRUCT_OPS(nest_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	struct bpf_cpumask *mask, *old;
	struct task_ctx *tctx;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!tctx)
		return -ENOMEM;

	mask = bpf_cpumask_create();
	if (!mask)
		return -ENOMEM;

	/* Stash the scratch mask in the task ctx; release any prior one. */
	old = bpf_kptr_xchg(&tctx->tmp_mask, mask);
	if (old)
		bpf_cpumask_release(old);

	/* No core attachment or previous CPU yet. */
	tctx->attached_core = -1;
	tctx->prev_cpu = -1;

	return 0;
}
|
||||
|
||||
/*
 * ops.enable() callback: a task is coming under this scheduler's control.
 * Start its vtime at the current global high-water mark so it is neither
 * starved nor unfairly favored relative to existing tasks.
 */
void BPF_STRUCT_OPS(nest_enable, struct task_struct *p)
{
	p->scx.dsq_vtime = vtime_now;
}
|
||||
|
||||
static int stats_timerfn(void *map, int *key, struct bpf_timer *timer)
|
||||
{
|
||||
s32 cpu;
|
||||
struct bpf_cpumask *primary, *reserve;
|
||||
const struct cpumask *idle;
|
||||
stats_primary_mask = 0;
|
||||
stats_reserved_mask = 0;
|
||||
stats_other_mask = 0;
|
||||
stats_idle_mask = 0;
|
||||
long err;
|
||||
|
||||
bpf_rcu_read_lock();
|
||||
primary = primary_cpumask;
|
||||
reserve = reserve_cpumask;
|
||||
if (!primary || !reserve) {
|
||||
bpf_rcu_read_unlock();
|
||||
scx_bpf_error("Failed to lookup primary or reserve");
|
||||
return 0;
|
||||
}
|
||||
|
||||
idle = scx_bpf_get_idle_cpumask();
|
||||
bpf_for(cpu, 0, nr_cpus) {
|
||||
if (bpf_cpumask_test_cpu(cpu, cast_mask(primary)))
|
||||
stats_primary_mask |= (1ULL << cpu);
|
||||
else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve)))
|
||||
stats_reserved_mask |= (1ULL << cpu);
|
||||
else
|
||||
stats_other_mask |= (1ULL << cpu);
|
||||
|
||||
if (bpf_cpumask_test_cpu(cpu, idle))
|
||||
stats_idle_mask |= (1ULL << cpu);
|
||||
}
|
||||
bpf_rcu_read_unlock();
|
||||
scx_bpf_put_idle_cpumask(idle);
|
||||
|
||||
err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0);
|
||||
if (err)
|
||||
scx_bpf_error("Failed to arm stats timer");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
s32 BPF_STRUCT_OPS_SLEEPABLE(nest_init)
|
||||
{
|
||||
struct bpf_cpumask *cpumask;
|
||||
s32 cpu;
|
||||
int err;
|
||||
struct bpf_timer *timer;
|
||||
u32 key = 0;
|
||||
|
||||
err = scx_bpf_create_dsq(FALLBACK_DSQ_ID, NUMA_NO_NODE);
|
||||
if (err) {
|
||||
scx_bpf_error("Failed to create fallback DSQ");
|
||||
return err;
|
||||
}
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
bpf_cpumask_clear(cpumask);
|
||||
cpumask = bpf_kptr_xchg(&primary_cpumask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
|
||||
bpf_cpumask_clear(cpumask);
|
||||
cpumask = bpf_kptr_xchg(&reserve_cpumask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
bpf_for(cpu, 0, nr_cpus) {
|
||||
s32 key = cpu;
|
||||
struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key);
|
||||
|
||||
if (!ctx) {
|
||||
scx_bpf_error("Failed to lookup pcpu_ctx");
|
||||
return -ENOENT;
|
||||
}
|
||||
ctx->scheduled_compaction = false;
|
||||
if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME)) {
|
||||
scx_bpf_error("Failed to initialize pcpu timer");
|
||||
return -EINVAL;
|
||||
}
|
||||
err = bpf_timer_set_callback(&ctx->timer, compact_primary_core);
|
||||
if (err) {
|
||||
scx_bpf_error("Failed to set pcpu timer callback");
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
timer = bpf_map_lookup_elem(&stats_timer, &key);
|
||||
if (!timer) {
|
||||
scx_bpf_error("Failed to lookup central timer");
|
||||
return -ESRCH;
|
||||
}
|
||||
bpf_timer_init(timer, &stats_timer, CLOCK_BOOTTIME);
|
||||
bpf_timer_set_callback(timer, stats_timerfn);
|
||||
err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0);
|
||||
if (err)
|
||||
scx_bpf_error("Failed to arm stats timer");
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * ops.exit() callback: record the exit info so userspace can report the
 * exit code and decide whether to restart the scheduler.
 */
void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
|
||||
|
||||
SCX_OPS_DEFINE(nest_ops,
|
||||
.select_cpu = (void *)nest_select_cpu,
|
||||
.enqueue = (void *)nest_enqueue,
|
||||
.dispatch = (void *)nest_dispatch,
|
||||
.running = (void *)nest_running,
|
||||
.stopping = (void *)nest_stopping,
|
||||
.init_task = (void *)nest_init_task,
|
||||
.enable = (void *)nest_enable,
|
||||
.init = (void *)nest_init,
|
||||
.exit = (void *)nest_exit,
|
||||
.flags = 0,
|
||||
.name = "nest");
|
||||
|
||||
236
src/45-scx-nest/scx_nest.c
Normal file
236
src/45-scx-nest/scx_nest.c
Normal file
@@ -0,0 +1,236 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2023 David Vernet <dvernet@meta.com>
|
||||
* Copyright (c) 2023 Tejun Heo <tj@kernel.org>
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <inttypes.h>
|
||||
#include <signal.h>
|
||||
#include <libgen.h>
|
||||
#include <bpf/bpf.h>
|
||||
#include <scx/common.h>
|
||||
|
||||
#include "scx_nest.skel.h"
|
||||
#include "scx_nest.h"
|
||||
|
||||
#define SAMPLING_CADENCE_S 2
|
||||
|
||||
const char help_fmt[] =
|
||||
"A Nest sched_ext scheduler.\n"
|
||||
"\n"
|
||||
"See the top-level comment in .bpf.c for more details.\n"
|
||||
"\n"
|
||||
"Usage: %s [-p] [-d DELAY] [-m <max>] [-i ITERS]\n"
|
||||
"\n"
|
||||
" -d DELAY_US Delay (us), before removing an idle core from the primary nest (default 2000us / 2ms)\n"
|
||||
" -m R_MAX Maximum number of cores in the reserve nest (default 5)\n"
|
||||
" -i ITERS Number of successive placement failures tolerated before trying to aggressively expand primary nest (default 2), or 0 to disable\n"
|
||||
" -s SLICE_US Override slice duration in us (default 20000us / 20ms)\n"
|
||||
" -I First try to find a fully idle core, and then any idle core, when searching nests. Default behavior is to ignore hypertwins and check for any idle core.\n"
|
||||
" -v Print libbpf debug messages\n"
|
||||
" -h Display this help and exit\n";
|
||||
|
||||
static bool verbose;
|
||||
static volatile int exit_req;
|
||||
|
||||
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
|
||||
{
|
||||
if (level == LIBBPF_DEBUG && !verbose)
|
||||
return 0;
|
||||
return vfprintf(stderr, format, args);
|
||||
}
|
||||
|
||||
static void sigint_handler(int nest)
|
||||
{
|
||||
exit_req = 1;
|
||||
}
|
||||
|
||||
struct nest_stat {
|
||||
const char *label;
|
||||
enum nest_stat_group group;
|
||||
enum nest_stat_idx idx;
|
||||
};
|
||||
|
||||
#define NEST_ST(__stat, __grp, __desc) { \
|
||||
.label = #__stat, \
|
||||
.group = __grp, \
|
||||
.idx = NEST_STAT(__stat) \
|
||||
},
|
||||
static struct nest_stat nest_stats[NEST_STAT(NR)] = {
|
||||
#include "scx_nest_stats_table.h"
|
||||
};
|
||||
#undef NEST_ST
|
||||
|
||||
/*
 * Sum the per-CPU counter values for every stat index into stats[].
 * Indices whose map lookup fails are left at zero (best effort).
 */
static void read_stats(struct scx_nest *skel, u64 *stats)
{
	int nr_cpus = libbpf_num_possible_cpus();
	u64 cnts[NEST_STAT(NR)][nr_cpus];
	int fd = bpf_map__fd(skel->maps.stats);
	u32 idx;

	memset(stats, 0, sizeof(stats[0]) * NEST_STAT(NR));

	for (idx = 0; idx < NEST_STAT(NR); idx++) {
		int cpu;

		if (bpf_map_lookup_elem(fd, &idx, cnts[idx]) < 0)
			continue;
		for (cpu = 0; cpu < nr_cpus; cpu++)
			stats[idx] += cnts[idx][cpu];
	}
}
|
||||
|
||||
/*
 * Print @str followed by a line of dashes of matching length.
 *
 * The dash buffer is fixed-size; clamp the underline length so a long
 * heading cannot overflow it. (The original wrote strlen(str) dashes
 * into a 64-byte buffer unconditionally — a stack buffer overflow for
 * any heading of 64+ characters.)
 */
static void print_underline(const char *str)
{
	char buf[64];
	size_t len;

	len = strlen(str);
	if (len >= sizeof(buf))
		len = sizeof(buf) - 1;
	memset(buf, '-', len);
	buf[len] = '\0';
	printf("\n\n%s\n%s\n", str, buf);
}
|
||||
|
||||
static void print_stat_grp(enum nest_stat_group grp)
|
||||
{
|
||||
const char *group;
|
||||
|
||||
switch (grp) {
|
||||
case STAT_GRP_WAKEUP:
|
||||
group = "Wakeup stats";
|
||||
break;
|
||||
case STAT_GRP_NEST:
|
||||
group = "Nest stats";
|
||||
break;
|
||||
case STAT_GRP_CONSUME:
|
||||
group = "Consume stats";
|
||||
break;
|
||||
default:
|
||||
group = "Unknown stats";
|
||||
break;
|
||||
}
|
||||
|
||||
print_underline(group);
|
||||
}
|
||||
|
||||
static void print_active_nests(const struct scx_nest *skel)
|
||||
{
|
||||
u64 primary = skel->bss->stats_primary_mask;
|
||||
u64 reserved = skel->bss->stats_reserved_mask;
|
||||
u64 other = skel->bss->stats_other_mask;
|
||||
u64 idle = skel->bss->stats_idle_mask;
|
||||
u32 nr_cpus = skel->rodata->nr_cpus, cpu;
|
||||
int idx;
|
||||
char cpus[nr_cpus + 1];
|
||||
|
||||
memset(cpus, 0, nr_cpus + 1);
|
||||
print_underline("Masks");
|
||||
for (idx = 0; idx < 4; idx++) {
|
||||
const char *mask_str;
|
||||
u64 mask, total = 0;
|
||||
|
||||
memset(cpus, '-', nr_cpus);
|
||||
if (idx == 0) {
|
||||
mask_str = "PRIMARY";
|
||||
mask = primary;
|
||||
} else if (idx == 1) {
|
||||
mask_str = "RESERVED";
|
||||
mask = reserved;
|
||||
} else if (idx == 2) {
|
||||
mask_str = "OTHER";
|
||||
mask = other;
|
||||
} else {
|
||||
mask_str = "IDLE";
|
||||
mask = idle;
|
||||
}
|
||||
for (cpu = 0; cpu < nr_cpus; cpu++) {
|
||||
if (mask & (1ULL << cpu)) {
|
||||
cpus[cpu] = '*';
|
||||
total++;
|
||||
}
|
||||
}
|
||||
printf("%-9s(%2" PRIu64 "): | %s |\n", mask_str, total, cpus);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct scx_nest *skel;
|
||||
struct bpf_link *link;
|
||||
__u32 opt;
|
||||
__u64 ecode;
|
||||
|
||||
libbpf_set_print(libbpf_print_fn);
|
||||
signal(SIGINT, sigint_handler);
|
||||
signal(SIGTERM, sigint_handler);
|
||||
restart:
|
||||
skel = SCX_OPS_OPEN(nest_ops, scx_nest);
|
||||
|
||||
skel->rodata->nr_cpus = libbpf_num_possible_cpus();
|
||||
skel->rodata->sampling_cadence_ns = SAMPLING_CADENCE_S * 1000 * 1000 * 1000;
|
||||
|
||||
while ((opt = getopt(argc, argv, "d:m:i:Is:vh")) != -1) {
|
||||
switch (opt) {
|
||||
case 'd':
|
||||
skel->rodata->p_remove_ns = strtoull(optarg, NULL, 0) * 1000;
|
||||
break;
|
||||
case 'm':
|
||||
skel->rodata->r_max = strtoull(optarg, NULL, 0);
|
||||
break;
|
||||
case 'i':
|
||||
skel->rodata->r_impatient = strtoull(optarg, NULL, 0);
|
||||
break;
|
||||
case 'I':
|
||||
skel->rodata->find_fully_idle = true;
|
||||
break;
|
||||
case 's':
|
||||
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = true;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, help_fmt, basename(argv[0]));
|
||||
return opt != 'h';
|
||||
}
|
||||
}
|
||||
|
||||
SCX_OPS_LOAD(skel, nest_ops, scx_nest, uei);
|
||||
link = SCX_OPS_ATTACH(skel, nest_ops, scx_nest);
|
||||
|
||||
while (!exit_req && !UEI_EXITED(skel, uei)) {
|
||||
u64 stats[NEST_STAT(NR)];
|
||||
enum nest_stat_idx i;
|
||||
enum nest_stat_group last_grp = -1;
|
||||
|
||||
read_stats(skel, stats);
|
||||
for (i = 0; i < NEST_STAT(NR); i++) {
|
||||
struct nest_stat *nest_stat;
|
||||
|
||||
nest_stat = &nest_stats[i];
|
||||
if (nest_stat->group != last_grp) {
|
||||
print_stat_grp(nest_stat->group);
|
||||
last_grp = nest_stat->group;
|
||||
}
|
||||
printf("%s=%" PRIu64 "\n", nest_stat->label, stats[nest_stat->idx]);
|
||||
}
|
||||
printf("\n");
|
||||
print_active_nests(skel);
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
sleep(SAMPLING_CADENCE_S);
|
||||
}
|
||||
|
||||
bpf_link__destroy(link);
|
||||
ecode = UEI_REPORT(skel, uei);
|
||||
scx_nest__destroy(skel);
|
||||
|
||||
if (UEI_ECODE_RESTART(ecode))
|
||||
goto restart;
|
||||
return 0;
|
||||
}
|
||||
18
src/45-scx-nest/scx_nest.h
Normal file
18
src/45-scx-nest/scx_nest.h
Normal file
@@ -0,0 +1,18 @@
|
||||
#ifndef __SCX_NEST_H
#define __SCX_NEST_H

/* Display groups used to cluster related stats in the tool's output. */
enum nest_stat_group {
	STAT_GRP_WAKEUP,
	STAT_GRP_NEST,
	STAT_GRP_CONSUME,
};

/* Map a stat name to its enum identifier. */
#define NEST_STAT(__stat) BPFSTAT_##__stat

/*
 * X-macro expansion: each row of scx_nest_stats_table.h contributes one
 * enum value, with NR appended as the total count.
 */
#define NEST_ST(__stat, __grp, __desc) NEST_STAT(__stat),
enum nest_stat_idx {
#include "scx_nest_stats_table.h"
	NEST_ST(NR, 0, 0)
};
#undef NEST_ST

#endif /* __SCX_NEST_H */
|
||||
20
src/45-scx-nest/scx_nest_stats_table.h
Normal file
20
src/45-scx-nest/scx_nest_stats_table.h
Normal file
@@ -0,0 +1,20 @@
|
||||
/*
 * X-macro stats table: NEST_ST(name, group, description).
 * Included twice — once to build enum nest_stat_idx (scx_nest.h) and once
 * to build the nest_stats descriptor array (scx_nest.c). Entry order
 * defines both the enum values and the display order; do not reorder.
 */

/* Wakeup placement outcomes. */
NEST_ST(WAKEUP_ATTACHED, STAT_GRP_WAKEUP, "Attached CPU was idle, and in primary nest")
NEST_ST(WAKEUP_PREV_PRIMARY, STAT_GRP_WAKEUP, "Previous CPU was idle, and in primary nest")
NEST_ST(WAKEUP_FULLY_IDLE_PRIMARY, STAT_GRP_WAKEUP, "Woken up to fully idle primary nest core")
NEST_ST(WAKEUP_ANY_IDLE_PRIMARY, STAT_GRP_WAKEUP, "Woken up to idle logical primary nest core")
NEST_ST(WAKEUP_FULLY_IDLE_RESERVE, STAT_GRP_WAKEUP, "Woken up to fully idle reserve nest core")
NEST_ST(WAKEUP_ANY_IDLE_RESERVE, STAT_GRP_WAKEUP, "Woken up to idle logical reserve nest core")
NEST_ST(WAKEUP_IDLE_OTHER, STAT_GRP_WAKEUP, "Woken to any idle logical core in p->cpus_ptr")

/* Nest membership transitions (promotion/demotion/compaction). */
NEST_ST(TASK_IMPATIENT, STAT_GRP_NEST, "A task was found to be impatient")
NEST_ST(PROMOTED_TO_PRIMARY, STAT_GRP_NEST, "A core was promoted into the primary nest")
NEST_ST(PROMOTED_TO_RESERVED, STAT_GRP_NEST, "A core was promoted into the reserve nest")
NEST_ST(DEMOTED_TO_RESERVED, STAT_GRP_NEST, "A core was demoted into the reserve nest")
NEST_ST(RESERVED_AT_CAPACITY, STAT_GRP_NEST, "Reserved nest was at capacity")
NEST_ST(SCHEDULED_COMPACTION, STAT_GRP_NEST, "Scheduled a primary core to be compacted")
NEST_ST(CANCELLED_COMPACTION, STAT_GRP_NEST, "Cancelled a primary core from being compacted at task wakeup time")
NEST_ST(EAGERLY_COMPACTED, STAT_GRP_NEST, "A core was compacted in ops.dispatch()")
NEST_ST(CALLBACK_COMPACTED, STAT_GRP_NEST, "A core was compacted in the scheduled timer callback")

/* Global DSQ consumption outcomes. */
NEST_ST(CONSUMED, STAT_GRP_CONSUME, "A task was consumed from the global DSQ")
NEST_ST(NOT_CONSUMED, STAT_GRP_CONSUME, "There was no task in the global DSQ")
||||
Reference in New Issue
Block a user