sched_ext: Implement scx_bpf_kick_cpu() and task preemption support

It's often useful to wake up and/or trigger reschedule on other CPUs. This
patch adds scx_bpf_kick_cpu() kfunc helper that BPF scheduler can call to
kick the target CPU into the scheduling path.

As a sched_ext task relinquishes its CPU only after its slice is depleted,
this patch also adds SCX_KICK_PREEMPT and SCX_ENQ_PREEMPT, which clear the
slice of the target CPU's current task to guarantee that sched_ext's
scheduling path runs on the CPU.
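
For example, an enqueue path can use SCX_ENQ_PREEMPT to make an urgent task
run immediately. The following is a minimal sketch, not code from this
commit; the BPF_STRUCT_OPS callback shape, SCX_SLICE_DFL and the
is_urgent() predicate are assumptions:

	/* hypothetical, policy-specific predicate */
	static bool is_urgent(struct task_struct *p)
	{
		return false;
	}

	void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
	{
		if (is_urgent(p))
			/*
			 * Queue at the head of the local dsq, zero the
			 * current task's slice and kick the CPU into the
			 * scheduling path (SCX_ENQ_PREEMPT implies
			 * SCX_ENQ_HEAD).
			 */
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
					 enq_flags | SCX_ENQ_PREEMPT);
		else
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
					 enq_flags);
	}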

This patch also adds a new example scheduler, scx_example_central, which
demonstrates central scheduling where one CPU is responsible for making all
scheduling decisions in the system. The central CPU makes scheduling
decisions for all CPUs, queues tasks on the appropriate local DSQs and
preempts the worker CPUs. The worker CPUs in turn preempt the central CPU
when they need tasks to run.
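
Schematically, the two directions of kicking look as follows. This is a
simplified sketch of the idea rather than code from scx_example_central;
the dispatch callback signature and the central_cpu variable are
assumptions:

	static s32 central_cpu;	/* assumed: the CPU making all decisions */

	void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev)
	{
		if (cpu != central_cpu) {
			/*
			 * A worker ran out of tasks: preempt the central
			 * CPU so it makes the next decision for us.
			 */
			scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT);
			return;
		}

		/*
		 * On the central CPU: pick tasks for all CPUs, dispatch
		 * them to the matching local DSQs and preempt-kick the
		 * owning CPUs (the selection loop is elided here).
		 */
	}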

Currently, every CPU depends on its own tick to expire the current task. A
follow-up patch implementing tickless support for sched_ext will allow the
worker CPUs to go full tickless so that they can run completely undisturbed.

v3: * Make scx_example_central switch all tasks by default.

    * Convert to BPF inline iterators.

v2: * Julia Lawall reported that scx_example_central can overflow the
      dispatch buffer and malfunction. As scheduling for other CPUs can't be
      handled by the automatic retry mechanism, fix by implementing explicit
      overflow and retry handling.

    * Updated to use generic BPF cpumask helpers.

Signed-off-by: Tejun Heo <[email protected]>
Reviewed-by: David Vernet <[email protected]>
Acked-by: Josh Don <[email protected]>
Acked-by: Hao Luo <[email protected]>
Acked-by: Barret Rhoden <[email protected]>
Cc: Julia Lawall <[email protected]>
htejun committed Apr 13, 2023
1 parent fcbc9ac commit f7b0019
Showing 9 changed files with 424 additions and 5 deletions.
4 changes: 4 additions & 0 deletions include/linux/sched/ext.h
@@ -408,6 +408,10 @@ struct sched_ext_entity {
* scx_bpf_dispatch() but can also be modified directly by the BPF
* scheduler. Automatically decreased by SCX as the task executes. On
* depletion, a scheduling event is triggered.
*
* This value is cleared to zero if the task is preempted by
* %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
* task ran. Use p->se.sum_exec_runtime instead.
*/
u64 slice;

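Given the semantics above, runtime accounting should be based on
p->se.sum_exec_runtime rather than on the remaining slice. A minimal
sketch, assuming running/stopping callbacks in the style of the example
schedulers and eliding proper per-task storage:

	static u64 started_at;	/* real code would use per-task storage */

	void BPF_STRUCT_OPS(sketch_running, struct task_struct *p)
	{
		started_at = p->se.sum_exec_runtime;
	}

	void BPF_STRUCT_OPS(sketch_stopping, struct task_struct *p, bool runnable)
	{
		/* correct even if p->scx.slice was zeroed by SCX_KICK_PREEMPT */
		bpf_printk("ran for %llu ns",
			   p->se.sum_exec_runtime - started_at);
	}
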
82 changes: 79 additions & 3 deletions kernel/sched/ext.c
@@ -497,7 +497,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
}
}

if (enq_flags & SCX_ENQ_HEAD)
if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
list_add(&p->scx.dsq_node, &dsq->fifo);
else
list_add_tail(&p->scx.dsq_node, &dsq->fifo);
@@ -513,8 +513,16 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,

if (is_local) {
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
bool preempt = false;

if (sched_class_above(&ext_sched_class, rq->curr->sched_class))
if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
rq->curr->sched_class == &ext_sched_class) {
rq->curr->scx.slice = 0;
preempt = true;
}

if (preempt || sched_class_above(&ext_sched_class,
rq->curr->sched_class))
resched_curr(rq);
} else {
raw_spin_unlock(&dsq->lock);
@@ -1888,7 +1896,9 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
* Omitted operations:
*
* - check_preempt_curr: NOOP as it isn't useful in the wakeup path because the
* task isn't tied to the CPU at that point.
* task isn't tied to the CPU at that point. Preemption is implemented by
* resetting the victim task's slice to 0 and triggering reschedule on the
* target CPU.
*
* - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
*
@@ -2715,6 +2725,32 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
.enable_mask = SYSRQ_ENABLE_RTNICE,
};

static void kick_cpus_irq_workfn(struct irq_work *irq_work)
{
struct rq *this_rq = this_rq();
int this_cpu = cpu_of(this_rq);
int cpu;

for_each_cpu(cpu, this_rq->scx.cpus_to_kick) {
struct rq *rq = cpu_rq(cpu);
unsigned long flags;

raw_spin_rq_lock_irqsave(rq, flags);

if (cpu_online(cpu) || cpu == this_cpu) {
if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) &&
rq->curr->sched_class == &ext_sched_class)
rq->curr->scx.slice = 0;
resched_curr(rq);
}

raw_spin_rq_unlock_irqrestore(rq, flags);
}

cpumask_clear(this_rq->scx.cpus_to_kick);
cpumask_clear(this_rq->scx.cpus_to_preempt);
}

void __init init_sched_ext_class(void)
{
int cpu;
@@ -2738,6 +2774,10 @@ void __init init_sched_ext_class(void)

init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
INIT_LIST_HEAD(&rq->scx.watchdog_list);

BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
}

register_sysrq_key('S', &sysrq_sched_ext_reset_op);
@@ -2974,6 +3014,41 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
.set = &scx_kfunc_ids_dispatch,
};

/**
* scx_bpf_kick_cpu - Trigger reschedule on a CPU
* @cpu: cpu to kick
* @flags: SCX_KICK_* flags
*
* Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
* trigger rescheduling on a busy CPU. This can be called from any online
* scx_ops operation and the actual kicking is performed asynchronously through
* an irq work.
*/
void scx_bpf_kick_cpu(s32 cpu, u64 flags)
{
struct rq *rq;

if (!ops_cpu_valid(cpu)) {
scx_ops_error("invalid cpu %d", cpu);
return;
}

preempt_disable();
rq = this_rq();

/*
* Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
* rq locks. We can probably be smarter and avoid bouncing if called
* from ops which don't hold a rq lock.
*/
cpumask_set_cpu(cpu, rq->scx.cpus_to_kick);
if (flags & SCX_KICK_PREEMPT)
cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt);

irq_work_queue(&rq->scx.kick_cpus_irq_work);
preempt_enable();
}

/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
@@ -3195,6 +3270,7 @@ s32 scx_bpf_task_cpu(const struct task_struct *p)
}

BTF_SET8_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
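
On the BPF side, the helper is reached through the __ksym declaration added
to scx_common.bpf.h below. A hedged usage sketch; 'cpu' would come from the
scheduler's own policy:

	static void kick_examples(s32 cpu)
	{
		/* wake up an idle CPU; a task already running there
		 * keeps its slice */
		scx_bpf_kick_cpu(cpu, 0);

		/* zero the current task's slice and force @cpu through
		 * the scheduling path even if it is busy */
		scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
	}
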
12 changes: 12 additions & 0 deletions kernel/sched/ext.h
@@ -19,6 +19,14 @@ enum scx_enq_flags {

/* high 32bits are SCX specific */

/*
* Set the following to trigger preemption when calling
* scx_bpf_dispatch() with a local dsq as the target. The slice of the
* current task is cleared to zero and the CPU is kicked into the
* scheduling path. Implies %SCX_ENQ_HEAD.
*/
SCX_ENQ_PREEMPT = 1LLU << 32,

/*
* The task being enqueued is the only task available for the cpu. By
* default, ext core keeps executing such tasks but when
@@ -51,6 +59,10 @@ enum scx_deq_flags {
SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
};

enum scx_kick_flags {
SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */
};

#ifdef CONFIG_SCHED_CLASS_EXT

extern const struct sched_class ext_sched_class;
3 changes: 3 additions & 0 deletions kernel/sched/sched.h
@@ -692,6 +692,9 @@ struct scx_rq {
u64 ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_preempt;
struct irq_work kick_cpus_irq_work;
};
#endif /* CONFIG_SCHED_CLASS_EXT */

1 change: 1 addition & 0 deletions tools/sched_ext/.gitignore
@@ -1,5 +1,6 @@
scx_example_simple
scx_example_qmap
scx_example_central
*.skel.h
*.subskel.h
/tools/
8 changes: 6 additions & 2 deletions tools/sched_ext/Makefile
@@ -115,7 +115,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
-Wall -Wno-compare-distinct-pointer-types \
-O2 -mcpu=v3

all: scx_example_simple scx_example_qmap
all: scx_example_simple scx_example_qmap scx_example_central

# sort removes libbpf duplicates when not cross-building
MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \
@@ -174,10 +174,14 @@ scx_example_qmap: scx_example_qmap.c scx_example_qmap.skel.h user_exit_info.h
$(CC) $(CFLAGS) -c $< -o $@.o
$(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS)

scx_example_central: scx_example_central.c scx_example_central.skel.h user_exit_info.h
$(CC) $(CFLAGS) -c $< -o $@.o
$(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS)

clean:
rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)
rm -f *.o *.bpf.o *.skel.h *.subskel.h
rm -f scx_example_simple scx_example_qmap
rm -f scx_example_simple scx_example_qmap scx_example_central

.PHONY: all clean

1 change: 1 addition & 0 deletions tools/sched_ext/scx_common.bpf.h
@@ -57,6 +57,7 @@ s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym;