From 5cdeae437a937c5aa510aade3b48ab397044bcf0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 11 Apr 2024 14:34:53 -1000
Subject: [PATCH] scx: Implement scx_bpf_cpuperf_set()

This allows the BPF scheduler to request a specific performance level for
each CPU. SCX defaults to max perf if scx_bpf_cpuperf_set() is not called.
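For example, a scheduler could scale each CPU's performance target with the
weight of the task that starts running on it. The sketch below is
illustrative only; the example_running callback name and the weight-based
policy are made up for this message and are not part of the patch:

	void BPF_STRUCT_OPS(example_running, struct task_struct *p)
	{
		/* default task weight is 100; clamp the target at max perf */
		u32 perf = p->scx.weight * SCX_CPUPERF_ONE / 100;

		scx_bpf_cpuperf_set(scx_bpf_task_cpu(p),
				    perf > SCX_CPUPERF_ONE ? SCX_CPUPERF_ONE : perf);
	}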
---
 kernel/sched/cpufreq_schedutil.c         | 12 ++++-
 kernel/sched/ext.c                       | 35 +++++++++++-
 kernel/sched/ext.h                       |  9 ++++
 kernel/sched/sched.h                     |  1 +
 tools/sched_ext/include/scx/common.bpf.h |  1 +
 tools/sched_ext/scx_qmap.bpf.c           | 65 ++++++++++++++++++++++--
 tools/sched_ext/scx_qmap.c               |  7 ++-
 7 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 972b7dd65af2d..12174c0137a5b 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -197,7 +197,9 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
 
 static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
 {
-	unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
+	unsigned long min, max;
+	unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu) +
+		scx_cpuperf_target(sg_cpu->cpu);
 
 	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
 	util = max(util, boost);
@@ -330,6 +332,14 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
 	unsigned long idle_calls;
 	bool ret;
 
+	/*
+	 * The heuristics in this function are for the fair class. For SCX, the
+	 * performance target comes directly from the BPF scheduler. Let's just
+	 * follow it.
+	 */
+	if (scx_switched_all())
+		return false;
+
 	/* if capped by uclamp_max, always update to be in compliance */
 	if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
 		return false;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b89189450b2e1..edb703f697c6d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4474,7 +4474,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	unsigned long timeout;
-	int i, ret;
+	int i, cpu, ret;
 
 	mutex_lock(&scx_ops_enable_mutex);
 
@@ -4523,6 +4523,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	atomic_long_set(&scx_nr_rejected, 0);
 
+	for_each_possible_cpu(cpu)
+		cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+
 	/*
 	 * Keep CPUs stable during enable so that the BPF scheduler can track
 	 * online CPUs by watching ->on/offline_cpu() after ->init().
@@ -6015,6 +6018,35 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
 	return SCX_CPUPERF_ONE;
 }
 
+/**
+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
+ * @cpu: CPU of interest
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ *
+ * Set the target performance level of @cpu to @perf. @perf is in linear
+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
+ * schedutil cpufreq governor chooses the target frequency. The actual
+ * performance level chosen is dependent on the hardware and cpufreq driver in
+ * use and can be monitored using scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+{
+	if (unlikely(perf > SCX_CPUPERF_ONE)) {
+		scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+		return;
+	}
+
+	if (ops_cpu_valid(cpu, NULL)) {
+		struct rq *rq = cpu_rq(cpu);
+
+		rq->scx.cpuperf_target = perf;
+
+		rcu_read_lock_sched_notrace();
+		cpufreq_update_util(cpu_rq(cpu), 0);
+		rcu_read_unlock_sched_notrace();
+	}
+}
+
 /**
  * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
  */
@@ -6165,6 +6197,7 @@ BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index f6dcdde95af16..d572be7a43243 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -45,6 +45,14 @@ void scx_next_task_picked(struct rq *rq, struct task_struct *p,
 			  const struct sched_class *active);
 void init_sched_ext_class(void);
 
+static inline u32 scx_cpuperf_target(s32 cpu)
+{
+	if (scx_enabled())
+		return cpu_rq(cpu)->scx.cpuperf_target;
+	else
+		return 0;
+}
+
 static inline const struct sched_class *next_active_class(const struct sched_class *class)
 {
 	class++;
@@ -91,6 +99,7 @@ static inline void scx_tick(void) {}
 static inline void scx_next_task_picked(struct rq *rq, struct task_struct *p,
 					const struct sched_class *active) {}
 static inline void init_sched_ext_class(void) {}
+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
 
 #define for_each_active_class for_each_class
 #define for_balance_class_range for_class_range
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e8ef7309f347a..d31db189977ae 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -732,6 +732,7 @@ struct scx_rq {
 	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
 	u32			nr_running;
 	u32			flags;
+	u32			cpuperf_target;		/* [0, SCHED_CAPACITY_SCALE] */
 	bool			cpu_released;
 	cpumask_var_t		cpus_to_kick;
 	cpumask_var_t		cpus_to_kick_if_idle;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 1c02f013812a1..44eb4267d3ac2 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -48,6 +48,7 @@ void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksy
 u32 scx_bpf_nr_cpu_ids(void) __ksym;
 u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym;
 u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym;
+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym;
 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym;
 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym;
 void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym;
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index b8a8ecd65984d..2d496860bcf1b 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -64,6 +64,18 @@
 	},
 };
 
+/*
+ * If enabled, the CPU performance target is set based on the queue index,
+ * per the following table.
+ */
+static const u32 qidx_to_cpuperf_target[] = {
+	[0] = SCX_CPUPERF_ONE * 0 / 4,
+	[1] = SCX_CPUPERF_ONE * 1 / 4,
+	[2] = SCX_CPUPERF_ONE * 2 / 4,
+	[3] = SCX_CPUPERF_ONE * 3 / 4,
+	[4] = SCX_CPUPERF_ONE * 4 / 4,
+};
+
 /*
  * Per-queue sequence numbers to implement core-sched ordering.
 *
@@ -91,6 +103,8 @@ struct {
 struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
+	u32	avg_weight;
+	u32	cpuperf_target;
 };
 
 struct {
@@ -104,6 +118,7 @@ struct {
 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
 u64 nr_core_sched_execed;
 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
 
 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
@@ -300,6 +315,29 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
	}
 }
 
+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
+{
+	struct cpu_ctx *cpuc;
+	u32 zero = 0;
+	int idx;
+
+	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
+		scx_bpf_error("failed to look up cpu_ctx");
+		return;
+	}
+
+	/*
+	 * Use the running avg of weights to select the target cpuperf level.
+	 * This is a demonstration of the cpuperf feature rather than a
+	 * practical strategy to regulate CPU frequency.
+	 */
+	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
+	idx = weight_to_idx(cpuc->avg_weight);
+	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
+
+	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
+}
+
 /*
  * The distance from the head of the queue scaled by the weight of the queue.
  * The lower the number, the older the task and the higher the priority.
@@ -454,21 +492,26 @@ struct {
  */
 static int cpu_mon_timerfn(void *map, int *key, struct bpf_timer *timer)
 {
+	u32 zero = 0;
	u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
+	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
-	int i;
+	int i, nr_online_cpus = 0;
 
	online = scx_bpf_get_online_cpumask();
	if (!online)
		return -ENOMEM;
 
	bpf_for(i, 0, nr_cpu_ids) {
+		struct cpu_ctx *cpuc;
		u32 cap, cur;
 
		if (!bpf_cpumask_test_cpu(i, online))
			continue;
 
+		nr_online_cpus++;
+
		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);
@@ -482,15 +525,30 @@ static int cpu_mon_timerfn(void *map, int *key, struct bpf_timer *timer)
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;
-	}
 
-	scx_bpf_put_cpumask(online);
+		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
+			scx_bpf_error("failed to look up cpu_ctx");
+			goto out;
+		}
+
+		/* collect target */
+		cur = cpuc->cpuperf_target;
+		target_sum += cur;
+		target_min = cur < target_min ? cur : target_min;
+		target_max = cur > target_max ? cur : target_max;
+	}
 
	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;
 
+	cpuperf_target_min = target_min;
+	cpuperf_target_avg = target_sum / nr_online_cpus;
+	cpuperf_target_max = target_max;
+
	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+out:
+	scx_bpf_put_cpumask(online);
	return 0;
 }
 
@@ -524,6 +582,7 @@ SCX_OPS_DEFINE(qmap_ops,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
+	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 41d73267d2a39..5cadc2e0b32db 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -99,10 +99,13 @@ int main(int argc, char **argv)
		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
		       skel->bss->nr_core_sched_execed);
-		printf("cpuperf: cur min/avg/max=%u/%u/%u\n",
+		printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
		       skel->bss->cpuperf_min, skel->bss->cpuperf_avg,
-		       skel->bss->cpuperf_max);
+		       skel->bss->cpuperf_max,
+		       skel->bss->cpuperf_target_min,
+		       skel->bss->cpuperf_target_avg,
+		       skel->bss->cpuperf_target_max);
		fflush(stdout);
		sleep(1);
	}
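With this applied, scx_qmap's once-a-second stats line reports the requested
targets next to the measured levels. Illustrative output (the numbers below
are invented; actual values depend on the hardware, cpufreq driver, and
workload):

	cpuperf: cur min/avg/max=410/623/1024 target min/avg/max=512/768/1024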