sched_ext: Implement sched_ext_ops.cpu_acquire/release()
Scheduler classes are strictly ordered and when a higher priority class has
tasks to run, the lower priority ones lose access to the CPU. Being able to
monitor and act on these events is necessary for use cases including
strict core-scheduling and latency management.

This patch adds two operations, ops.cpu_acquire() and ops.cpu_release(). The
former is invoked when a CPU becomes available to the BPF scheduler and the
latter when the CPU is taken away. This patch also implements
scx_bpf_reenqueue_local(), which can be called from ops.cpu_release() to
trigger re-enqueueing of all tasks in the local DSQ of the CPU so that the
tasks can be reassigned to other available CPUs.

scx_example_pair is updated to use .cpu_acquire/release() along with
%SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU
is preempted by a higher priority scheduler class.

scx_example_qmap is updated to use .cpu_acquire/release() to empty the local
DSQ of a preempted CPU. A similar approach can be adopted by BPF schedulers
that want tight control over latency.
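
For illustration only (not part of the patch), a BPF scheduler along the
lines of scx_example_qmap could use the new callbacks roughly as follows.
All names are hypothetical, and the sketch assumes the kfunc declarations
from tools/sched_ext/scx_common.bpf.h:

        /* Count of tasks pushed back to the BPF scheduler on CPU preemption. */
        u64 nr_reenqueued;

        void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu,
                            struct scx_cpu_release_args *args)
        {
                /*
                 * A higher priority sched_class (see args->reason) is taking
                 * the CPU. Pull everything off this CPU's local DSQ; each task
                 * is handed back to ops.enqueue() with SCX_ENQ_REENQ set so it
                 * can be dispatched to a CPU that is still running SCX tasks.
                 */
                nr_reenqueued += scx_bpf_reenqueue_local();
        }

        void BPF_STRUCT_OPS(sketch_cpu_acquire, s32 cpu,
                            struct scx_cpu_acquire_args *args)
        {
                /* The CPU is back under SCX control; nothing to do here. */
        }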

v3: * Drop the const qualifier from scx_cpu_release_args.task. BPF enforces
      access control through the verifier, so the qualifier isn't actually
      operative and only gets in the way when interacting with various
      helpers.

v2: * Add p->scx.kf_mask annotation to allow calling
      scx_bpf_reenqueue_local() from ops.cpu_release() nested inside
      ops.init() and other sleepable operations.

Signed-off-by: David Vernet <[email protected]>
Reviewed-by: Tejun Heo <[email protected]>
Signed-off-by: Tejun Heo <[email protected]>
Acked-by: Josh Don <[email protected]>
Acked-by: Hao Luo <[email protected]>
Acked-by: Barret Rhoden <[email protected]>
Byte-Lab authored and htejun committed Apr 13, 2023
1 parent f8ae50f commit 7f1bb69
Showing 8 changed files with 340 additions and 12 deletions.
53 changes: 52 additions & 1 deletion include/linux/sched/ext.h
@@ -135,6 +135,32 @@ struct scx_cgroup_init_args {
u32 weight;
};

enum scx_cpu_preempt_reason {
/* next task is being scheduled by &sched_class_rt */
SCX_CPU_PREEMPT_RT,
/* next task is being scheduled by &sched_class_dl */
SCX_CPU_PREEMPT_DL,
/* next task is being scheduled by &sched_class_stop */
SCX_CPU_PREEMPT_STOP,
/* unknown reason for SCX being preempted */
SCX_CPU_PREEMPT_UNKNOWN,
};

/*
* Argument container for ops->cpu_acquire(). Currently empty, but may be
* expanded in the future.
*/
struct scx_cpu_acquire_args {};

/* argument container for ops->cpu_release() */
struct scx_cpu_release_args {
/* the reason the CPU was preempted */
enum scx_cpu_preempt_reason reason;

/* the task that's going to be scheduled on the CPU */
struct task_struct *task;
};

/**
* struct sched_ext_ops - Operation table for BPF scheduler implementation
*
@@ -330,6 +356,28 @@ struct sched_ext_ops {
*/
void (*update_idle)(s32 cpu, bool idle);

/**
* cpu_acquire - A CPU is becoming available to the BPF scheduler
* @cpu: The CPU being acquired by the BPF scheduler.
* @args: Acquire arguments, see the struct definition.
*
* A CPU that was previously released from the BPF scheduler is now once
* again under its control.
*/
void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);

/**
* cpu_release - A CPU is taken away from the BPF scheduler
* @cpu: The CPU being released by the BPF scheduler.
* @args: Release arguments, see the struct definition.
*
* The specified CPU is no longer under the control of the BPF
* scheduler. This could be because it was preempted by a higher
* priority sched_class, though there may be other reasons as well. The
* caller should consult @args->reason to determine the cause.
*/
void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);

/**
* prep_enable - Prepare to enable BPF scheduling for a task
* @p: task to prepare BPF scheduling for
@@ -534,12 +582,15 @@ enum scx_kf_mask {
/* all non-sleepables may be nested inside INIT and SLEEPABLE */
SCX_KF_INIT = 1 << 0, /* running ops.init() */
SCX_KF_SLEEPABLE = 1 << 1, /* other sleepable init operations */
/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
SCX_KF_CPU_RELEASE = 1 << 2, /* ops.cpu_release() */
/* ops.dequeue (in REST) may be nested inside DISPATCH */
SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */
SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() */
SCX_KF_REST = 1 << 5, /* other rq-locked operations */

__SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_REST,
__SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
SCX_KF_ENQUEUE | SCX_KF_REST,
__SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_REST,
};

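Continuing the hypothetical sketch from the commit message above, the new
members are wired into a scheduler's struct_ops map on the BPF side in the
same way as the existing callbacks (the real examples live under
tools/sched_ext/):

        SEC(".struct_ops")
        struct sched_ext_ops sketch_ops = {
                /* other callbacks (enqueue, dispatch, ...) omitted */
                .cpu_acquire    = (void *)sketch_cpu_acquire,
                .cpu_release    = (void *)sketch_cpu_release,
                .name           = "sketch",
        };
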
131 changes: 130 additions & 1 deletion kernel/sched/ext.c
@@ -83,6 +83,7 @@ static bool scx_warned_zero_slice;

static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);

struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] =
@@ -304,6 +305,12 @@ static __always_inline bool scx_kf_allowed(u32 mask)
* inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
* boundary thanks to the above in_interrupt() check.
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
scx_ops_error("cpu_release kfunc called from a nested operation");
return false;
}

if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
scx_ops_error("dispatch kfunc called from a nested operation");
@@ -1377,14 +1384,29 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,

lockdep_assert_rq_held(rq);

if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
unlikely(rq->scx.cpu_released)) {
/*
* If the previous sched_class for the current CPU was not SCX,
* notify the BPF scheduler that it again has control of the
* core. This callback complements ->cpu_release(), which is
* emitted in scx_notify_pick_next_task().
*/
if (SCX_HAS_OP(cpu_acquire))
SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}

if (prev_on_scx) {
WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP);
update_curr_scx(rq);

/*
* If @prev is runnable & has slice left, it has priority and
* fetching more just increases latency for the fetched tasks.
* Tell put_prev_task_scx() to put @prev on local_dsq.
* Tell put_prev_task_scx() to put @prev on local_dsq. If the
* BPF scheduler wants to handle this explicitly, it should
* implement ->cpu_release().
*
* See scx_ops_disable_workfn() for the explanation on the
* disabling() test.
@@ -1590,6 +1612,58 @@ static struct task_struct *pick_next_task_scx(struct rq *rq)
return p;
}

static enum scx_cpu_preempt_reason
preempt_reason_from_class(const struct sched_class *class)
{
#ifdef CONFIG_SMP
if (class == &stop_sched_class)
return SCX_CPU_PREEMPT_STOP;
#endif
if (class == &dl_sched_class)
return SCX_CPU_PREEMPT_DL;
if (class == &rt_sched_class)
return SCX_CPU_PREEMPT_RT;
return SCX_CPU_PREEMPT_UNKNOWN;
}

void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task,
const struct sched_class *active)
{
lockdep_assert_rq_held(rq);

/*
* The callback is conceptually meant to convey that the CPU is no
* longer under the control of SCX. Therefore, don't invoke the
* callback if the CPU is staying on SCX, or going idle (in which
* case the SCX scheduler has actively decided not to schedule any
* tasks on the CPU).
*/
if (likely(active >= &ext_sched_class))
return;

/*
* At this point we know that SCX was preempted by a higher priority
* sched_class, so invoke the ->cpu_release() callback if we have not
* done so already. We only send the callback once between SCX being
* preempted, and it regaining control of the CPU.
*
* ->cpu_release() complements ->cpu_acquire(), which is emitted the
* next time that balance_scx() is invoked.
*/
if (!rq->scx.cpu_released) {
if (SCX_HAS_OP(cpu_release)) {
struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(active),
.task = task,
};

SCX_CALL_OP(SCX_KF_CPU_RELEASE,
cpu_release, cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
}

#ifdef CONFIG_SMP

static bool test_and_clear_cpu_idle(int cpu)
@@ -2657,6 +2731,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
static_branch_disable_cpuslocked(&scx_has_op[i]);
static_branch_disable_cpuslocked(&scx_ops_enq_last);
static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
static_branch_disable_cpuslocked(&scx_ops_cpu_preempt);
static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
synchronize_rcu();

@@ -2863,6 +2938,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)

if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable_cpuslocked(&scx_ops_cpu_preempt);

if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
reset_idle_masks();
@@ -3526,6 +3603,56 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
.set = &scx_kfunc_ids_dispatch,
};

/**
* scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
*
* Iterate over all of the tasks currently enqueued on the local DSQ of the
* caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
* processed tasks. Can only be called from ops.cpu_release().
*/
u32 scx_bpf_reenqueue_local(void)
{
u32 nr_enqueued, i;
struct rq *rq;
struct scx_rq *scx_rq;

if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
return 0;

rq = cpu_rq(smp_processor_id());
lockdep_assert_rq_held(rq);
scx_rq = &rq->scx;

/*
* Get the number of tasks on the local DSQ before iterating over it to
* pull off tasks. The enqueue callback below can signal that it wants
* the task to stay on the local DSQ, and we want to prevent the BPF
* scheduler from causing us to loop indefinitely.
*/
nr_enqueued = scx_rq->local_dsq.nr;
for (i = 0; i < nr_enqueued; i++) {
struct task_struct *p;

p = first_local_task(rq);
WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE);
WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
WARN_ON_ONCE(p->scx.holding_cpu != -1);
dispatch_dequeue(scx_rq, p);
do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
}

return nr_enqueued;
}

BTF_SET8_START(scx_kfunc_ids_cpu_release)
BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
BTF_SET8_END(scx_kfunc_ids_cpu_release)

static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_cpu_release,
};

/**
* scx_bpf_kick_cpu - Trigger reschedule on a CPU
* @cpu: cpu to kick
@@ -3862,6 +3989,8 @@ static int __init register_ext_kfuncs(void)
&scx_kfunc_set_enqueue_dispatch)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_dispatch)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_cpu_release)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_any))) {
pr_err("sched_ext: failed to register kfunc sets (%d)\n", ret);
24 changes: 22 additions & 2 deletions kernel/sched/ext.h
@@ -27,6 +27,17 @@ enum scx_enq_flags {
*/
SCX_ENQ_PREEMPT = 1LLU << 32,

/*
* The task being enqueued was previously enqueued on the current CPU's
* %SCX_DSQ_LOCAL, but was removed from it in a call to the
* scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
* invoked in a ->cpu_release() callback, and the task is again
* dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
* task will not be scheduled on the CPU until at least the next
* invocation of the ->cpu_acquire() callback.
*/
SCX_ENQ_REENQ = 1LLU << 40,

/*
* The task being enqueued is the only task available for the cpu. By
* default, ext core keeps executing such tasks but when
@@ -82,6 +93,8 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all);
#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)

DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);

bool task_on_scx(struct task_struct *p);
void scx_pre_fork(struct task_struct *p);
int scx_fork(struct task_struct *p);
@@ -96,20 +109,27 @@ __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type,
#define scx_ops_error(fmt, args...) \
scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args)

void __scx_notify_pick_next_task(struct rq *rq,
struct task_struct *p,
const struct sched_class *active);

static inline void scx_notify_pick_next_task(struct rq *rq,
const struct task_struct *p,
struct task_struct *p,
const struct sched_class *active)
{
#ifdef CONFIG_SMP
if (!scx_enabled())
return;
#ifdef CONFIG_SMP
/*
* Pairs with the smp_load_acquire() issued by a CPU in
* kick_cpus_irq_workfn() who is waiting for this CPU to perform a
* resched.
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
#endif
if (!static_branch_unlikely(&scx_ops_cpu_preempt))
return;
__scx_notify_pick_next_task(rq, p, active);
}

static inline void scx_notify_sched_tick(void)
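
To close the loop on %SCX_ENQ_REENQ above: the enqueue path of the same
hypothetical sketch can detect tasks that came back through
scx_bpf_reenqueue_local() and keep them off the preempted CPU's local DSQ,
e.g. by pushing them to the global DSQ (this assumes the scx_bpf_dispatch()
kfunc and the SCX_DSQ_* / SCX_SLICE_DFL constants):

        void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
        {
                /*
                 * Tasks re-enqueued from ops.cpu_release() carry SCX_ENQ_REENQ.
                 * Send them to the global DSQ so any CPU still running SCX
                 * tasks can pick them up instead of waiting for the preempted
                 * CPU to come back.
                 */
                if (enq_flags & SCX_ENQ_REENQ) {
                        scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
                        return;
                }

                /* Default: queue on the local DSQ of the selected CPU. */
                scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
        }
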
1 change: 1 addition & 0 deletions kernel/sched/sched.h
@@ -708,6 +708,7 @@ struct scx_rq {
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
u32 flags;
bool cpu_released;
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
1 change: 1 addition & 0 deletions tools/sched_ext/scx_common.bpf.h
@@ -68,6 +68,7 @@ void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
u32 scx_bpf_reenqueue_local(void) __ksym;

#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
