From 5cdeae437a937c5aa510aade3b48ab397044bcf0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 11 Apr 2024 14:34:53 -1000
Subject: [PATCH] scx: Implement scx_bpf_cpuperf_set()

This allows the BPF scheduler to request a specific performance level for
each CPU. SCX defaults to max perf if scx_bpf_cpuperf_set() is not called.
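For example, a scheduler could scale each CPU's performance target with the
weight of the task that starts running on it. The sketch below is
illustrative only; the example_running callback name and the weight-based
policy are made up for this message and are not part of the patch:

	void BPF_STRUCT_OPS(example_running, struct task_struct *p)
	{
		/* default task weight is 100; clamp the target at max perf */
		u32 perf = p->scx.weight * SCX_CPUPERF_ONE / 100;

		scx_bpf_cpuperf_set(scx_bpf_task_cpu(p),
				    perf > SCX_CPUPERF_ONE ? SCX_CPUPERF_ONE : perf);
	}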
---
 kernel/sched/cpufreq_schedutil.c         | 12 ++++-
 kernel/sched/ext.c                       | 35 +++++++++++-
 kernel/sched/ext.h                       |  9 ++++
 kernel/sched/sched.h                     |  1 +
 tools/sched_ext/include/scx/common.bpf.h |  1 +
 tools/sched_ext/scx_qmap.bpf.c           | 65 ++++++++++++++++++++++--
 tools/sched_ext/scx_qmap.c               |  7 ++-
 7 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 972b7dd65af2d..12174c0137a5b 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -197,7 +197,9 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
 
 static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
 {
-	unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
+	unsigned long min, max;
+	unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu) +
+		scx_cpuperf_target(sg_cpu->cpu);
 
 	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
 	util = max(util, boost);
@@ -330,6 +332,14 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
 	unsigned long idle_calls;
 	bool ret;
 
+	/*
+	 * The heuristics in this function are for the fair class. For SCX, the
+	 * performance target comes directly from the BPF scheduler. Let's just
+	 * follow it.
+	 */
+	if (scx_switched_all())
+		return false;
+
 	/* if capped by uclamp_max, always update to be in compliance */
 	if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
 		return false;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b89189450b2e1..edb703f697c6d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4474,7 +4474,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	unsigned long timeout;
-	int i, ret;
+	int i, cpu, ret;
 
 	mutex_lock(&scx_ops_enable_mutex);
 
@@ -4523,6 +4523,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	atomic_long_set(&scx_nr_rejected, 0);
 
+	for_each_possible_cpu(cpu)
+		cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+
 	/*
 	 * Keep CPUs stable during enable so that the BPF scheduler can track
 	 * online CPUs by watching ->on/offline_cpu() after ->init().
@@ -6015,6 +6018,35 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
 	return SCX_CPUPERF_ONE;
 }
 
+/**
+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
+ * @cpu: CPU of interest
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ *
+ * Set the target performance level of @cpu to @perf. @perf is in linear
+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
+ * schedutil cpufreq governor chooses the target frequency. The actual
+ * performance level chosen is dependent on the hardware and cpufreq driver in
+ * use and can be monitored using scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+{
+	if (unlikely(perf > SCX_CPUPERF_ONE)) {
+		scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+		return;
+	}
+
+	if (ops_cpu_valid(cpu, NULL)) {
+		struct rq *rq = cpu_rq(cpu);
+
+		rq->scx.cpuperf_target = perf;
+
+		rcu_read_lock_sched_notrace();
+		cpufreq_update_util(cpu_rq(cpu), 0);
+		rcu_read_unlock_sched_notrace();
+	}
+}
+
 /**
  * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
  */
@@ -6165,6 +6197,7 @@ BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index f6dcdde95af16..d572be7a43243 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -45,6 +45,14 @@ void scx_next_task_picked(struct rq *rq, struct task_struct *p,
 			  const struct sched_class *active);
 void init_sched_ext_class(void);
 
+static inline u32 scx_cpuperf_target(s32 cpu)
+{
+	if (scx_enabled())
+		return cpu_rq(cpu)->scx.cpuperf_target;
+	else
+		return 0;
+}
+
 static inline const struct sched_class *next_active_class(const struct sched_class *class)
 {
 	class++;
@@ -91,6 +99,7 @@ static inline void scx_tick(void) {}
 static inline void scx_next_task_picked(struct rq *rq, struct task_struct *p,
 					const struct sched_class *active) {}
 static inline void init_sched_ext_class(void) {}
+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
 
 #define for_each_active_class for_each_class
 #define for_balance_class_range for_class_range
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e8ef7309f347a..d31db189977ae 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -732,6 +732,7 @@ struct scx_rq {
 	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
 	u32			nr_running;
 	u32			flags;
+	u32			cpuperf_target;		/* [0, SCHED_CAPACITY_SCALE] */
 	bool			cpu_released;
 	cpumask_var_t		cpus_to_kick;
 	cpumask_var_t		cpus_to_kick_if_idle;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 1c02f013812a1..44eb4267d3ac2 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -48,6 +48,7 @@ void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksy
 u32 scx_bpf_nr_cpu_ids(void) __ksym;
 u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym;
 u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym;
+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym;
 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym;
 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym;
 void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym;
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index b8a8ecd65984d..2d496860bcf1b 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -64,6 +64,18 @@
 	},
 };
 
+/*
+ * If enabled, the CPU performance target is set based on the queue index,
+ * per the following table.
+ */
+static const u32 qidx_to_cpuperf_target[] = {
+	[0] = SCX_CPUPERF_ONE * 0 / 4,
+	[1] = SCX_CPUPERF_ONE * 1 / 4,
+	[2] = SCX_CPUPERF_ONE * 2 / 4,
+	[3] = SCX_CPUPERF_ONE * 3 / 4,
+	[4] = SCX_CPUPERF_ONE * 4 / 4,
+};
+
 /*
  * Per-queue sequence numbers to implement core-sched ordering.
 *
@@ -91,6 +103,8 @@ struct {
 struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
+	u32	avg_weight;
+	u32	cpuperf_target;
 };
 
 struct {
@@ -104,6 +118,7 @@ struct {
 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
 u64 nr_core_sched_execed;
 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
 
 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
@@ -300,6 +315,29 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
	}
 }
 
+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
+{
+	struct cpu_ctx *cpuc;
+	u32 zero = 0;
+	int idx;
+
+	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
+		scx_bpf_error("failed to look up cpu_ctx");
+		return;
+	}
+
+	/*
+	 * Use the running avg of weights to select the target cpuperf level.
+	 * This is a demonstration of the cpuperf feature rather than a
+	 * practical strategy to regulate CPU frequency.
+	 */
+	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
+	idx = weight_to_idx(cpuc->avg_weight);
+	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
+
+	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
+}
+
 /*
  * The distance from the head of the queue scaled by the weight of the queue.
  * The lower the number, the older the task and the higher the priority.
@@ -454,21 +492,26 @@ struct {
  */
 static int cpu_mon_timerfn(void *map, int *key, struct bpf_timer *timer)
 {
+	u32 zero = 0;
	u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
+	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
-	int i;
+	int i, nr_online_cpus = 0;
 
	online = scx_bpf_get_online_cpumask();
	if (!online)
		return -ENOMEM;
 
	bpf_for(i, 0, nr_cpu_ids) {
+		struct cpu_ctx *cpuc;
		u32 cap, cur;
 
		if (!bpf_cpumask_test_cpu(i, online))
			continue;
 
+		nr_online_cpus++;
+
		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);
@@ -482,15 +525,30 @@ static int cpu_mon_timerfn(void *map, int *key, struct bpf_timer *timer)
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;
-	}
 
-	scx_bpf_put_cpumask(online);
+		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
+			scx_bpf_error("failed to look up cpu_ctx");
+			goto out;
+		}
+
+		/* collect target */
+		cur = cpuc->cpuperf_target;
+		target_sum += cur;
+		target_min = cur < target_min ? cur : target_min;
+		target_max = cur > target_max ? cur : target_max;
+	}
 
	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;
 
+	cpuperf_target_min = target_min;
+	cpuperf_target_avg = target_sum / nr_online_cpus;
+	cpuperf_target_max = target_max;
+
	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+out:
+	scx_bpf_put_cpumask(online);
	return 0;
 }
 
@@ -524,6 +582,7 @@ SCX_OPS_DEFINE(qmap_ops,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
+	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 41d73267d2a39..5cadc2e0b32db 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -99,10 +99,13 @@ int main(int argc, char **argv)
		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
		       skel->bss->nr_core_sched_execed);
-		printf("cpuperf: cur min/avg/max=%u/%u/%u\n",
+		printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
		       skel->bss->cpuperf_min, skel->bss->cpuperf_avg,
-		       skel->bss->cpuperf_max);
+		       skel->bss->cpuperf_max,
+		       skel->bss->cpuperf_target_min,
+		       skel->bss->cpuperf_target_avg,
+		       skel->bss->cpuperf_target_max);
		fflush(stdout);
		sleep(1);
	}
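With this applied, scx_qmap's once-a-second stats line reports the requested
targets next to the measured levels. Illustrative output (the numbers below
are invented; actual values depend on the hardware, cpufreq driver, and
workload):

	cpuperf: cur min/avg/max=410/623/1024 target min/avg/max=512/768/1024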