From 42268ad0eb4142245ea40ab01a5690a40e9c3b41 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Sep 2024 11:10:07 -1000 Subject: [PATCH 01/17] sched_ext: Build fix for !CONFIG_SMP move_remote_task_to_local_dsq() is only defined on SMP configs but scx_disaptch_from_dsq() was calling move_remote_task_to_local_dsq() on UP configs too causing build failures. Add a dummy move_remote_task_to_local_dsq() which triggers a warning. Signed-off-by: Tejun Heo Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409241108.jaocHiDJ-lkp@intel.com/ --- kernel/sched/ext.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c09e3dc38c34..d74d1fe06999 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2357,6 +2357,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } } #else /* CONFIG_SMP */ +static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ From 1e123fd73deb16cb362ecefb55c90c9196f4a6c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2024 11:58:44 -1000 Subject: [PATCH 02/17] sched_ext: Add __COMPAT helpers for features added during v6.12 devel cycle cgroup support and scx_bpf_dispatch[_vtime]_from_dsq() are newly added since 8bb30798fd6e ("sched_ext: Fixes incorrect type in bpf_scx_init()") which is the current earliest commit targeted by BPF schedulers. Add compat helpers for them and apply them in the example schedulers. These will be dropped after a few kernel releases. The exact backward compatibility window hasn't been decided yet. Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/compat.bpf.h | 19 +++++++++++++++++++ tools/sched_ext/scx_flatcg.bpf.c | 10 +++++----- tools/sched_ext/scx_qmap.bpf.c | 12 ++++++------ 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 3d2fe1208900..e5afe9efd3f3 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -15,6 +15,25 @@ __ret; \ }) +/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ +#define __COMPAT_scx_bpf_task_cgroup(p) \ + (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ + scx_bpf_task_cgroup((p)) : NULL) + +/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */ +#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? \ + scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0) +#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \ + scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0) +#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \ + scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) +#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \ + scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 3ab2b60781a0..936415b98ae7 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -383,7 +383,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) return; } - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (!cgc) goto out_release; @@ -509,7 +509,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) { struct cgroup *cgrp; - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); update_active_weight_sums(cgrp, true); bpf_cgroup_release(cgrp); } @@ -522,7 +522,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) if (fifo_sched) return; - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (cgc) { /* @@ -565,7 +565,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) if (!taskc->bypassed_at) return; - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (cgc) { __sync_fetch_and_add(&cgc->cvtime_delta, @@ -579,7 +579,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) { struct cgroup *cgrp; - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); update_active_weight_sums(cgrp, false); bpf_cgroup_release(cgrp); } diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 83c8f54c1e31..67e2a7968cc9 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -318,11 +318,11 @@ static bool dispatch_highpri(bool from_timer) if (tctx->highpri) { /* exercise the set_*() and vtime interface too */ - scx_bpf_dispatch_from_dsq_set_slice( + __COMPAT_scx_bpf_dispatch_from_dsq_set_slice( BPF_FOR_EACH_ITER, slice_ns * 2); - scx_bpf_dispatch_from_dsq_set_vtime( + __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime( BPF_FOR_EACH_ITER, highpri_seq++); - scx_bpf_dispatch_vtime_from_dsq( + __COMPAT_scx_bpf_dispatch_vtime_from_dsq( BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); } } @@ -340,9 +340,9 @@ static bool dispatch_highpri(bool from_timer) else cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); - if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, - SCX_DSQ_LOCAL_ON | cpu, - SCX_ENQ_PREEMPT)) { + if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, + SCX_DSQ_LOCAL_ON | cpu, + SCX_ENQ_PREEMPT)) { if (cpu == this_cpu) { dispatched = true; __sync_fetch_and_add(&nr_expedited_local, 1); From a748db0c8c6a88da66c3ab3791bd1a229f4a7fee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2024 12:22:37 -1000 Subject: [PATCH 03/17] tools/sched_ext: Receive misc updates from SCX repo Receive misc tools/sched_ext updates from https://github.com/sched-ext/scx to sync userspace bits. - LSP macros to help language servers. - bpf_cpumask_weight() declaration and cast_mask() helper. - Cosmetic updates to scx_flatcg.bpf.c. Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 15 +++++++++++++++ tools/sched_ext/include/scx/user_exit_info.h | 4 ++++ tools/sched_ext/scx_flatcg.bpf.c | 5 ++--- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f538c75db183..225f61f9bfca 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -7,7 +7,13 @@ #ifndef __SCX_COMMON_BPF_H #define __SCX_COMMON_BPF_H +#ifdef LSP +#define __bpf__ +#include "../vmlinux/vmlinux.h" +#else #include "vmlinux.h" +#endif + #include #include #include @@ -309,6 +315,15 @@ void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym; +u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; + +/* + * Access a cpumask in read-only mode (typically to check bits). + */ +const struct cpumask *cast_mask(struct bpf_cpumask *mask) +{ + return (const struct cpumask *)mask; +} /* rcu */ void bpf_rcu_read_lock(void) __ksym; diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 891693ee604e..8ce2734402e1 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -25,7 +25,11 @@ struct user_exit_info { #ifdef __bpf__ +#ifdef LSP +#include "../vmlinux/vmlinux.h" +#else #include "vmlinux.h" +#endif #include #define UEI_DEFINE(__name) \ diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 936415b98ae7..e272bc39bbbd 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -225,7 +225,7 @@ static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) break; /* - * We can be oppotunistic here and not grab the + * We can be opportunistic here and not grab the * cgv_tree_lock and deal with the occasional races. * However, hweight updates are already cached and * relatively low-frequency. Let's just do the @@ -258,8 +258,7 @@ static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) * and thus can't be updated and repositioned. Instead, we collect the * vtime deltas separately and apply it asynchronously here. */ - delta = cgc->cvtime_delta; - __sync_fetch_and_sub(&cgc->cvtime_delta, delta); + delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta); cvtime = cgv_node->cvtime + delta; /* From c9c809f4137c3c0f962226c10d245d3ce2fd5b7c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: [PATCH 04/17] scx_flatcg: Use a user DSQ for fallback instead of SCX_DSQ_GLOBAL scx_flatcg was using SCX_DSQ_GLOBAL for fallback handling. However, it is assuming that SCX_DSQ_GLOBAL isn't automatically consumed, which was true a while ago but is no longer the case. Also, there are further changes planned for SCX_DSQ_GLOBAL which will disallow explicit consumption from it. Switch to a user DSQ for fallback. Signed-off-by: Tejun Heo Acked-by: David Vernet --- tools/sched_ext/scx_flatcg.bpf.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index e272bc39bbbd..b722baf6da4b 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -49,7 +49,10 @@ /* * Maximum amount of retries to find a valid cgroup. */ -#define CGROUP_MAX_RETRIES 1024 +enum { + FALLBACK_DSQ = 0, + CGROUP_MAX_RETRIES = 1024, +}; char _license[] SEC("license") = "GPL"; @@ -377,7 +380,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); } else { stat_inc(FCG_STAT_GLOBAL); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags); } return; } @@ -780,7 +783,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) pick_next_cgroup: cpuc->cur_at = now; - if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { + if (scx_bpf_consume(FALLBACK_DSQ)) { cpuc->cur_cgid = 0; return; } @@ -837,7 +840,7 @@ int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, int ret; /* - * Technically incorrect as cgroup ID is full 64bit while dq ID is + * Technically incorrect as cgroup ID is full 64bit while dsq ID is * 63bit. Should not be a problem in practice and easy to spot in the * unlikely case that it breaks. */ @@ -925,6 +928,11 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; } +s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init) +{ + return scx_bpf_create_dsq(FALLBACK_DSQ, -1); +} + void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) { UEI_RECORD(uei, ei); @@ -943,6 +951,7 @@ SCX_OPS_DEFINE(flatcg_ops, .cgroup_init = (void *)fcg_cgroup_init, .cgroup_exit = (void *)fcg_cgroup_exit, .cgroup_move = (void *)fcg_cgroup_move, + .init = (void *)fcg_init, .exit = (void *)fcg_exit, .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, .name = "flatcg"); From 63fb3ec80516b256e9fc91de48567f5eda61d135 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: [PATCH 05/17] sched_ext: Allow only user DSQs for scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new() SCX_DSQ_GLOBAL is special in that it can't be used as a priority queue and is consumed implicitly, but all BPF DSQ related kfuncs could be used on it. SCX_DSQ_GLOBAL will be split per-node for scalability and those operations won't make sense anymore. Disallow SCX_DSQ_GLOBAL on scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new(). This means that SCX_DSQ_GLOBAL can only be used as a dispatch target from BPF schedulers. With scx_flatcg, which was using SCX_DSQ_GLOBAL as the fallback DSQ, updated, this shouldn't affect any schedulers. This leaves find_dsq_for_dispatch() the only user of find_non_local_dsq(). Open code and remove find_non_local_dsq(). Signed-off-by: tejun heo Acked-by: David Vernet --- kernel/sched/ext.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d74d1fe06999..ed2b914c42d1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1808,16 +1808,6 @@ static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); } -static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -{ - lockdep_assert(rcu_read_lock_any_held()); - - if (dsq_id == SCX_DSQ_GLOBAL) - return &scx_dsq_global; - else - return find_user_dsq(dsq_id); -} - static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, struct task_struct *p) { @@ -1835,7 +1825,11 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, return &cpu_rq(cpu)->scx.local_dsq; } - dsq = find_non_local_dsq(dsq_id); + if (dsq_id == SCX_DSQ_GLOBAL) + dsq = &scx_dsq_global; + else + dsq = find_user_dsq(dsq_id); + if (unlikely(!dsq)) { scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); @@ -6176,7 +6170,7 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) flush_dispatch_buf(dspc->rq); - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (unlikely(!dsq)) { scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); return false; @@ -6497,7 +6491,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) goto out; } } else { - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (dsq) { ret = READ_ONCE(dsq->nr); goto out; @@ -6546,7 +6540,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; - kit->dsq = find_non_local_dsq(dsq_id); + kit->dsq = find_user_dsq(dsq_id); if (!kit->dsq) return -ENOENT; From bba26bf356d1c1314a7bb24041c64c5784febbb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: [PATCH 06/17] sched_ext: Relocate find_user_dsq() To prepare for the addition of find_global_dsq(). No functional changes. Signed-off-by: tejun heo Acked-by: David Vernet --- kernel/sched/ext.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ed2b914c42d1..acb4db7827a4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1029,6 +1029,11 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +{ + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1803,11 +1808,6 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } -static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) -{ - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -} - static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, struct task_struct *p) { From b7b3b2dbae73b412c2d24b3d0ebf1110991e4510 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: [PATCH 07/17] sched_ext: Split the global DSQ per NUMA node In the bypass mode, the global DSQ is used to schedule all tasks in simple FIFO order. All tasks are queued into the global DSQ and all CPUs try to execute tasks from it. This creates a lot of cross-node cacheline accesses and scheduling across the node boundaries, and can lead to live-lock conditions where the system takes tens of minutes to disable the BPF scheduler while executing in the bypass mode. Split the global DSQ per NUMA node. Each node has its own global DSQ. When a task is dispatched to SCX_DSQ_GLOBAL, it's put into the global DSQ local to the task's CPU and all CPUs in a node only consume its node-local global DSQ. This resolves a livelock condition which could be reliably triggered on an 2x EPYC 7642 system by running `stress-ng --race-sched 1024` together with `stress-ng --workload 80 --workload-threads 10` while repeatedly enabling and disabling a SCX scheduler. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 73 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index acb4db7827a4..949a3c43000a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -925,8 +925,15 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; */ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -/* dispatch queues */ -static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; +/* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is + * to avoid live-locking in bypass mode where all tasks are dispatched to + * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't + * sufficient, it can be further split. + */ +static struct scx_dispatch_q **global_dsqs; static const struct rhashtable_params dsq_hash_params = { .key_len = 8, @@ -1029,6 +1036,11 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +{ + return global_dsqs[cpu_to_node(task_cpu(p))]; +} + static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) { return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); @@ -1642,7 +1654,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, scx_ops_error("attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = &scx_dsq_global; + dsq = find_global_dsq(p); raw_spin_lock(&dsq->lock); } } @@ -1820,20 +1832,20 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return &scx_dsq_global; + return find_global_dsq(p); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == SCX_DSQ_GLOBAL) - dsq = &scx_dsq_global; + dsq = find_global_dsq(p); else dsq = find_user_dsq(dsq_id); if (unlikely(!dsq)) { scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return &scx_dsq_global; + return find_global_dsq(p); } return dsq; @@ -2005,7 +2017,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; - dispatch_enqueue(&scx_dsq_global, p, enq_flags); + dispatch_enqueue(find_global_dsq(p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2391,6 +2403,13 @@ static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) return false; } +static bool consume_global_dsq(struct rq *rq) +{ + int node = cpu_to_node(cpu_of(rq)); + + return consume_dispatch_q(rq, global_dsqs[node]); +} + /** * dispatch_to_local_dsq - Dispatch a task to a local dsq * @rq: current rq which is locked @@ -2424,7 +2443,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, #ifdef CONFIG_SMP if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(find_global_dsq(p), p, + enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2624,7 +2644,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) @@ -2649,7 +2669,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; /* @@ -4924,7 +4944,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, cpu, ret; + int i, cpu, node, ret; if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { @@ -4943,6 +4963,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } } + if (!global_dsqs) { + struct scx_dispatch_q **dsqs; + + dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); + if (!dsqs) { + ret = -ENOMEM; + goto err_unlock; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + for_each_node_state(node, N_POSSIBLE) + kfree(dsqs[node]); + kfree(dsqs); + ret = -ENOMEM; + goto err_unlock; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + dsqs[node] = dsq; + } + + global_dsqs = dsqs; + } + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ret = -EBUSY; goto err_unlock; @@ -5777,7 +5825,6 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); - init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); #ifdef CONFIG_SMP BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); @@ -6053,7 +6100,7 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (!task_can_run_on_remote_rq(p, dst_rq, true)) { - dst_dsq = &scx_dsq_global; + dst_dsq = find_global_dsq(p); dst_rq = src_rq; } } else { From 6f34d8d382d64e7d8e77f5a9ddfd06f4c04937b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Sep 2024 12:56:46 -1000 Subject: [PATCH 08/17] sched_ext: Use shorter slice while bypassing While bypassing, tasks are scheduled in FIFO order which favors tasks that hog CPUs. This can slow down e.g. unloading of the BPF scheduler. While bypassing, guaranteeing timely forward progress is the main goal. There's no point in giving long slices. Shorten the time slice used while bypassing from 20ms to 5ms. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/ext.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 949a3c43000a..d6f6bf6caecc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,7 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { + SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, SCX_DSP_DFL_MAX_BATCH = 32, SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, @@ -1944,6 +1945,7 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { + bool bypassing = scx_rq_bypassing(rq); struct task_struct **ddsp_taskp; unsigned long qseq; @@ -1961,7 +1963,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) + if (bypassing) goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2016,7 +2018,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, global: touch_core_sched(rq, p); /* see the comment in local: */ - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; dispatch_enqueue(find_global_dsq(p), p, enq_flags); } From 1bbcfe620e03aafc542b1c649dea0a9499a4490f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:39 -1000 Subject: [PATCH 09/17] sched_ext: Relocate check_hotplug_seq() call in scx_ops_enable() check_hotplug_seq() is used to detect CPU hotplug event which occurred while the BPF scheduler is being loaded so that initialization can be retried if CPU hotplug events take place before the CPU hotplug callbacks are online. As such, the best place to call it is in the same cpu_read_lock() section that enables the CPU hotplug ops. Currently, it is called in the next cpus_read_lock() block in scx_ops_enable(). The side effect of this placement is a small window in which hotplug sequence detection can trigger unnecessarily, which isn't critical. Move check_hotplug_seq() invocation to the same cpus_read_lock() block as the hotplug operation enablement to close the window and get the invocation out of the way for planned locking updates. Signed-off-by: Tejun Heo Cc: David Vernet --- kernel/sched/ext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d6f6bf6caecc..e8ab7e5ee2dd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5050,6 +5050,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); + check_hotplug_seq(ops); cpus_read_unlock(); ret = validate_ops(ops); @@ -5098,8 +5099,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) cpus_read_lock(); scx_cgroup_lock(); - check_hotplug_seq(ops); - for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); From fc1fcebead344360979ea9407029f9c8a99d718f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:39 -1000 Subject: [PATCH 10/17] sched_ext: Remove SCX_OPS_PREPPING The distinction between SCX_OPS_PREPPING and SCX_OPS_ENABLING is not used anywhere and only adds confusion. Drop SCX_OPS_PREPPING. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e8ab7e5ee2dd..daeb8446ff32 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -779,7 +779,6 @@ enum scx_tg_flags { }; enum scx_ops_enable_state { - SCX_OPS_PREPPING, SCX_OPS_ENABLING, SCX_OPS_ENABLED, SCX_OPS_DISABLING, @@ -787,7 +786,6 @@ enum scx_ops_enable_state { }; static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_PREPPING] = "prepping", [SCX_OPS_ENABLING] = "enabling", [SCX_OPS_ENABLED] = "enabled", [SCX_OPS_DISABLING] = "disabling", @@ -5016,12 +5014,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } /* - * Set scx_ops, transition to PREPPING and clear exit info to arm the + * Set scx_ops, transition to ENABLING and clear exit info to arm the * disable path. Failure triggers full disabling from here on. */ scx_ops = *ops; - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != SCX_OPS_DISABLED); atomic_set(&scx_exit_kind, SCX_EXIT_NONE); @@ -5174,23 +5172,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ preempt_disable(); - /* - * From here on, the disable path must assume that tasks have ops - * enabled and need to be recovered. - * - * Transition to ENABLING fails iff the BPF scheduler has already - * triggered scx_bpf_error(). Returning an error code here would lose - * the recorded error information. Exit indicating success so that the - * error is notified through ops.exit() with all the details. - */ - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { - preempt_enable(); - spin_unlock_irq(&scx_tasks_lock); - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; - goto err_disable_unlock_all; - } - /* * We're fully committed and can't fail. The PREPPED -> ENABLED * transitions here are synchronized against sched_ext_free() through @@ -5221,7 +5202,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); - /* see above ENABLING transition for the explanation on exiting with 0 */ + /* + * Returning an error code here would lose the recorded error + * information. Exit indicating success so that the error is notified + * through ops.exit() with all the details. + */ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); ret = 0; From 8c2090c504e998c8f34ec870bae71dafcc96a6e0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: [PATCH 11/17] sched_ext: Initialize in bypass mode scx_ops_enable() used preempt_disable() around the task iteration loop to switch tasks into SCX to guarantee forward progress of the task which is running scx_ops_enable(). However, in the gap between setting __scx_ops_enabled and preeempt_disable(), an external entity can put tasks including the enabling one into SCX prematurely, which can lead to malfunctions including stalls. The bypass mode can wrap the entire enabling operation and guarantee forward progress no matter what the BPF scheduler does. Use the bypass mode instead to guarantee forward progress while enabling. While at it, release and regrab scx_tasks_lock between the two task iteration locks in scx_ops_enable() for clarity as there is no reason to keep holding the lock between them. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index daeb8446ff32..00883f3ef647 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5075,6 +5075,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) queue_delayed_work(system_unbound_wq, &scx_watchdog_work, scx_watchdog_timeout / 2); + /* + * Once __scx_ops_enabled is set, %current can be switched to SCX + * anytime. This can lead to stalls as some BPF schedulers (e.g. + * userspace scheduling) may not function correctly before all tasks are + * switched. Init in bypass mode to guarantee forward progress. + */ + scx_ops_bypass(true); + /* * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely. @@ -5134,7 +5142,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * disabled. */ spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* @@ -5163,22 +5170,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_lock_irq(&scx_tasks_lock); } scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); /* - * All tasks are prepped but are still ops-disabled. Ensure that - * %current can't be scheduled out and switch everyone. - * preempt_disable() is necessary because we can't guarantee that - * %current won't be starved if scheduled out while switching. + * All tasks are prepped but the tasks are not enabled. Switch everyone. */ - preempt_disable(); + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); /* * We're fully committed and can't fail. The PREPPED -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ - WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - + spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; @@ -5195,12 +5199,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) check_class_changed(task_rq(p), p, old_class, p->prio); } scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); - preempt_enable(); + scx_cgroup_unlock(); cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); + scx_ops_bypass(false); /* * Returning an error code here would lose the recorded error @@ -5241,6 +5245,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); + scx_ops_bypass(false); err_disable_unlock_cpus: cpus_read_unlock(); err_disable: From 9753358a6a2b011478e8efdabbb489216252426f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: [PATCH 12/17] sched_ext: Fix SCX_TASK_INIT -> SCX_TASK_READY transitions in scx_ops_enable() scx_ops_enable() has two task iteration loops. The first one calls scx_ops_init_task() on every task and the latter switches the eligible ones into SCX. The first loop left the tasks in SCX_TASK_INIT state and then the second loop switched it into READY before switching the task into SCX. The distinction between INIT and READY is only meaningful in the fork path where it's used to tell whether the task finished forking so that we can tell ops.exit_task() accordingly. Leaving task in INIT state between the two loops is incosistent with the fork path and incorrect. The following can be triggered by running a program which keeps toggling a task between SCHED_OTHER and SCHED_SCX while enabling a task: sched_ext: Invalid task state transition 1 -> 3 for fish[1526] WARNING: CPU: 2 PID: 1615 at kernel/sched/ext.c:3393 scx_ops_enable_task+0x1a1/0x200 ... Sched_ext: qmap (enabling+all) RIP: 0010:scx_ops_enable_task+0x1a1/0x200 ... switching_to_scx+0x13/0xa0 __sched_setscheduler+0x850/0xa50 do_sched_setscheduler+0x104/0x1c0 __x64_sys_sched_setscheduler+0x18/0x30 do_syscall_64+0x7b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by transitioning to READY in the first loop right after scx_ops_init_task() succeeds. Signed-off-by: Tejun Heo Cc: David Vernet --- kernel/sched/ext.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 00883f3ef647..e83af19de59d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5166,6 +5166,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) goto err_disable_unlock_all; } + scx_set_task_state(p, SCX_TASK_READY); + put_task_struct(p); spin_lock_irq(&scx_tasks_lock); } @@ -5178,7 +5180,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); /* - * We're fully committed and can't fail. The PREPPED -> ENABLED + * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ @@ -5190,7 +5192,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - scx_set_task_state(p, SCX_TASK_READY); __setscheduler_prio(p, p->prio); check_class_changing(task_rq(p), p, old_class); From 4269c603cc26df154e0db303a9347e6ec3cc805e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: [PATCH 13/17] sched_ext: Enable scx_ops_init_task() separately scx_ops_init_task() and the follow-up scx_ops_enable_task() in the fork path were gated by scx_enabled() test and thus __scx_ops_enabled had to be turned on before the first scx_ops_init_task() loop in scx_ops_enable(). However, if an external entity causes sched_class switch before the loop is complete, tasks which are not initialized could be switched to SCX. The following can be reproduced by running a program which keeps toggling a process between SCHED_OTHER and SCHED_EXT using sched_setscheduler(2). sched_ext: Invalid task state transition 0 -> 3 for fish[1623] WARNING: CPU: 1 PID: 1650 at kernel/sched/ext.c:3392 scx_ops_enable_task+0x1a1/0x200 ... Sched_ext: simple (enabling) RIP: 0010:scx_ops_enable_task+0x1a1/0x200 ... switching_to_scx+0x13/0xa0 __sched_setscheduler+0x850/0xa50 do_sched_setscheduler+0x104/0x1c0 __x64_sys_sched_setscheduler+0x18/0x30 do_syscall_64+0x7b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by gating scx_ops_init_task() separately using scx_ops_init_task_enabled. __scx_ops_enabled is now set after all tasks are finished with scx_ops_init_task(). Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e83af19de59d..7729594882d9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -853,6 +853,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -3565,7 +3566,7 @@ int scx_fork(struct task_struct *p) { percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_enabled()) + if (scx_ops_init_task_enabled) return scx_ops_init_task(p, task_group(p), true); else return 0; @@ -3573,7 +3574,7 @@ int scx_fork(struct task_struct *p) void scx_post_fork(struct task_struct *p) { - if (scx_enabled()) { + if (scx_ops_init_task_enabled) { scx_set_task_state(p, SCX_TASK_READY); /* @@ -4453,6 +4454,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) cpus_read_lock(); scx_cgroup_lock(); + scx_ops_init_task_enabled = false; + spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); /* @@ -5132,7 +5135,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_disable_unlock_all; - static_branch_enable_cpuslocked(&__scx_ops_enabled); + WARN_ON_ONCE(scx_ops_init_task_enabled); + scx_ops_init_task_enabled = true; /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem @@ -5175,9 +5179,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_unlock_irq(&scx_tasks_lock); /* - * All tasks are prepped but the tasks are not enabled. Switch everyone. + * All tasks are READY. It's safe to turn on scx_enabled() and switch + * all eligible tasks. */ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); + static_branch_enable_cpuslocked(&__scx_ops_enabled); /* * We're fully committed and can't fail. The task READY -> ENABLED From 568894edbe48f0878f787ed533dc9dbfd09c0fbe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: [PATCH 14/17] sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online() If the BPF scheduler does not implement ops.cgroup_init(), scx_tg_online() didn't set SCX_TG_INITED which meant that ops.cgroup_exit(), even if implemented, won't be called from scx_tg_offline(). This is because SCX_HAS_OP(cgroupt_init) is used to test both whether SCX cgroup operations are enabled and ops.cgroup_init() exists. Fix it by introducing a separate bool scx_cgroup_enabled to gate cgroup operations and use SCX_HAS_OP(cgroup_init) only to test whether ops.cgroup_init() exists. Make all cgroup operations consistently use scx_cgroup_enabled to test whether cgroup operations are enabled. scx_cgroup_enabled is added instead of using scx_enabled() to ease planned locking updates. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7729594882d9..f9675ced75e8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3706,6 +3706,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool scx_cgroup_enabled; static bool cgroup_warned_missing_weight; static bool cgroup_warned_missing_idle; @@ -3725,8 +3726,7 @@ static void scx_cgroup_warn_missing_weight(struct task_group *tg) static void scx_cgroup_warn_missing_idle(struct task_group *tg) { - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_idle) + if (!scx_cgroup_enabled || cgroup_warned_missing_idle) return; if (!tg->idle) @@ -3747,15 +3747,18 @@ int scx_tg_online(struct task_group *tg) scx_cgroup_warn_missing_weight(tg); - if (SCX_HAS_OP(cgroup_init)) { - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + if (scx_cgroup_enabled) { + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = + { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, - tg->css.cgroup, &args); - if (!ret) + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + tg->css.cgroup, &args); + if (ret) + ret = ops_sanitize_err("cgroup_init", ret); + } + if (ret == 0) tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; - else - ret = ops_sanitize_err("cgroup_init", ret); } else { tg->scx_flags |= SCX_TG_ONLINE; } @@ -3786,7 +3789,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) /* released in scx_finish/cancel_attach() */ percpu_down_read(&scx_cgroup_rwsem); - if (!scx_enabled()) + if (!scx_cgroup_enabled) return 0; cgroup_taskset_for_each(p, css, tset) { @@ -3829,7 +3832,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) void scx_move_task(struct task_struct *p) { - if (!scx_enabled()) + if (!scx_cgroup_enabled) return; /* @@ -3865,7 +3868,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct task_struct *p; - if (!scx_enabled()) + if (!scx_cgroup_enabled) goto out_unlock; cgroup_taskset_for_each(p, css, tset) { @@ -3882,7 +3885,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) { percpu_down_read(&scx_cgroup_rwsem); - if (tg->scx_weight != weight) { + if (scx_cgroup_enabled && tg->scx_weight != weight) { if (SCX_HAS_OP(cgroup_set_weight)) SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, tg_cgrp(tg), weight); @@ -4054,6 +4057,9 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); + WARN_ON_ONCE(!scx_cgroup_enabled); + scx_cgroup_enabled = false; + /* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and exit all the inited ones, all online cgroups are exited. @@ -4129,6 +4135,9 @@ static int scx_cgroup_init(void) } rcu_read_unlock(); + WARN_ON_ONCE(scx_cgroup_enabled); + scx_cgroup_enabled = true; + return 0; } From 160216568cddc9d6e7f36133ba41d25459d90de4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: [PATCH 15/17] sched_ext: Decouple locks in scx_ops_disable_workfn() The disable path uses three big locks - scx_fork_rwsem, scx_cgroup_rwsem and cpus_read_lock. Currently, the locks are grabbed together which is prone to locking order problems. With the preceding scx_cgroup_enabled change, we can decouple them: - As cgroup disabling no longer requires modifying a static_key which requires cpus_read_lock(), no need to grab cpus_read_lock() before grabbing scx_cgroup_rwsem. - cgroup can now be independently disabled before tasks are moved back to the fair class. Relocate scx_cgroup_exit() invocation before scx_fork_rwsem is grabbed, drop now unnecessary cpus_read_lock() and move static_key operations out of scx_fork_rwsem. This decouples all three locks in the disable path. Signed-off-by: Tejun Heo Reported-and-tested-by: Aboorva Devarajan Link: http://lkml.kernel.org/r/8cd0ec0c4c7c1bc0119e61fbef0bee9d5e24022d.camel@linux.ibm.com --- kernel/sched/ext.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f9675ced75e8..7d0ce7e6ea1f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4456,21 +4456,23 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* - * Avoid racing against fork and cgroup changes. See scx_ops_enable() - * for explanation on the locking order. + * Shut down cgroup support before tasks so that the cgroup attach path + * doesn't race against scx_ops_exit_task(). + */ + scx_cgroup_lock(); + scx_cgroup_exit(); + scx_cgroup_unlock(); + + /* + * The BPF scheduler is going away. All tasks including %TASK_DEAD ones + * must be switched out and exited synchronously. */ percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); - scx_cgroup_lock(); scx_ops_init_task_enabled = false; spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); - /* - * The BPF scheduler is going away. All tasks including %TASK_DEAD ones - * must be switched out and exited synchronously. - */ while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; @@ -4488,23 +4490,18 @@ static void scx_ops_disable_workfn(struct kthread_work *work) } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); + percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable_cpuslocked(&__scx_ops_enabled); + static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable_cpuslocked(&scx_has_op[i]); - static_branch_disable_cpuslocked(&scx_ops_enq_last); - static_branch_disable_cpuslocked(&scx_ops_enq_exiting); - static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_enq_last); + static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); - scx_cgroup_exit(); - - scx_cgroup_unlock(); - cpus_read_unlock(); - percpu_up_write(&scx_fork_rwsem); - if (ei->kind >= SCX_EXIT_ERROR) { pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", scx_ops.name, ei->reason); From efe231d9debf6db812bebb262407c95b21cdb8a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: [PATCH 16/17] sched_ext: Decouple locks in scx_ops_enable() The enable path uses three big locks - scx_fork_rwsem, scx_cgroup_rwsem and cpus_read_lock. Currently, the locks are grabbed together which is prone to locking order problems. For example, currently, there is a possible deadlock involving scx_fork_rwsem and cpus_read_lock. cpus_read_lock has to nest inside scx_fork_rwsem due to locking order existing in other subsystems. However, there exists a dependency in the other direction during hotplug if hotplug needs to fork a new task, which happens in some cases. This leads to the following deadlock: scx_ops_enable() hotplug percpu_down_write(&cpu_hotplug_lock) percpu_down_write(&scx_fork_rwsem) block on cpu_hotplug_lock kthread_create() waits for kthreadd kthreadd blocks on scx_fork_rwsem Note that this doesn't trigger lockdep because the hotplug side dependency bounces through kthreadd. With the preceding scx_cgroup_enabled change, this can be solved by decoupling cpus_read_lock, which is needed for static_key manipulations, from the other two locks. - Move the first block of static_key manipulations outside of scx_fork_rwsem and scx_cgroup_rwsem. This is now safe with the preceding scx_cgroup_enabled change. - Drop scx_cgroup_rwsem and scx_fork_rwsem between the two task iteration blocks so that __scx_ops_enabled static_key enabling is outside the two rwsems. Signed-off-by: Tejun Heo Reported-and-tested-by: Aboorva Devarajan Link: http://lkml.kernel.org/r/8cd0ec0c4c7c1bc0119e61fbef0bee9d5e24022d.camel@linux.ibm.com --- kernel/sched/ext.c | 67 +++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7d0ce7e6ea1f..0c398ecdb938 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5049,7 +5049,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); if (ret) { ret = ops_sanitize_err("init", ret); - goto err_disable_unlock_cpus; + cpus_read_unlock(); + goto err_disable; } } @@ -5092,54 +5093,30 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ scx_ops_bypass(true); - /* - * Lock out forks, cgroup on/offlining and moves before opening the - * floodgate so that they don't wander into the operations prematurely. - * - * We don't need to keep the CPUs stable but static_branch_*() requires - * cpus_read_lock() and scx_cgroup_rwsem must nest inside - * cpu_hotplug_lock because of the following dependency chain: - * - * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem - * - * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use - * static_branch_*_cpuslocked(). - * - * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the - * following dependency chain: - * - * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock - */ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); - scx_cgroup_lock(); - for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable_cpuslocked(&scx_has_op[i]); + static_branch_enable(&scx_has_op[i]); if (ops->flags & SCX_OPS_ENQ_LAST) - static_branch_enable_cpuslocked(&scx_ops_enq_last); + static_branch_enable(&scx_ops_enq_last); if (ops->flags & SCX_OPS_ENQ_EXITING) - static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + static_branch_enable(&scx_ops_enq_exiting); if (scx_ops.cpu_acquire || scx_ops.cpu_release) - static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); + static_branch_enable(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); - static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_enable(&scx_builtin_idle_enabled); } else { - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_enabled); } /* - * All cgroups should be initialized before letting in tasks. cgroup - * on/offlining and task migrations are already locked out. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. */ - ret = scx_cgroup_init(); - if (ret) - goto err_disable_unlock_all; + percpu_down_write(&scx_fork_rwsem); WARN_ON_ONCE(scx_ops_init_task_enabled); scx_ops_init_task_enabled = true; @@ -5150,7 +5127,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * leaving as sched_ext_free() can handle both prepped and enabled * tasks. Prep all tasks first and then enable them with preemption * disabled. + * + * All cgroups should be initialized before scx_ops_init_task() so that + * the BPF scheduler can reliably track each task's cgroup membership + * from scx_ops_init_task(). Lock out cgroup on/offlining and task + * migrations while tasks are being initialized so that + * scx_cgroup_can_attach() never sees uninitialized tasks. */ + scx_cgroup_lock(); + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; + spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { @@ -5183,19 +5171,22 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* * All tasks are READY. It's safe to turn on scx_enabled() and switch * all eligible tasks. */ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - static_branch_enable_cpuslocked(&__scx_ops_enabled); + static_branch_enable(&__scx_ops_enabled); /* * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ + percpu_down_write(&scx_fork_rwsem); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); while ((p = scx_task_iter_next_locked(&sti))) { @@ -5213,10 +5204,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); - - scx_cgroup_unlock(); - cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); + scx_ops_bypass(false); /* @@ -5259,8 +5248,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); scx_ops_bypass(false); -err_disable_unlock_cpus: - cpus_read_unlock(); err_disable: mutex_unlock(&scx_ops_enable_mutex); /* must be fully disabled before returning */ From 95b873693a0841e02b812e693296a884362fdd51 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Thu, 26 Sep 2024 18:39:49 +0800 Subject: [PATCH 17/17] sched_ext: Remove redundant p->nr_cpus_allowed checker select_rq_task() already checked that 'p->nr_cpus_allowed > 1', 'p->nr_cpus_allowed == 1' checker in scx_select_cpu_dfl() is redundant. Signed-off-by: Zhang Qiao Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0c398ecdb938..3cd7c50a51c5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3074,22 +3074,13 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, * there is an idle core elsewhere on the system. */ cpu = smp_processor_id(); - if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && + if ((wake_flags & SCX_WAKE_SYNC) && !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0) { if (cpumask_test_cpu(cpu, p->cpus_ptr)) goto cpu_found; } - if (p->nr_cpus_allowed == 1) { - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } else { - return prev_cpu; - } - } - /* * If CPU has SMT, any wholly idle CPU is likely a better pick than * partially idle @prev_cpu.