From be8ee18152b0523752f3a44900363838bd1573bb Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Sat, 18 Jan 2025 14:09:27 +0530 Subject: [PATCH 001/310] sched_ext: Fixes typos in comments Fixes some spelling errors in the comments. Signed-off-by: Atul Kumar Pant Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8857c0709bdd..283d7f1addc5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -416,7 +416,7 @@ struct sched_ext_ops { /** * @update_idle: Update the idle state of a CPU - * @cpu: CPU to udpate the idle state for + * @cpu: CPU to update the idle state for * @idle: whether entering or exiting the idle state * * This operation is called when @rq's CPU goes or leaves the idle @@ -1214,7 +1214,7 @@ static bool scx_kf_allowed_if_unlocked(void) /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being interated + * @dsq: user dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -2078,7 +2078,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) /* * list_add_tail() must be used. scx_ops_bypass() depends on tasks being - * appened to the runnable_list. + * appended to the runnable_list. */ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } @@ -2480,7 +2480,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, /* * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artifical delays while the + * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. 
*/ static void scx_ops_breather(struct rq *rq) @@ -3144,7 +3144,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used * to implement the default task ordering. The older the timestamp, the higher - * prority the task - the global FIFO ordering matching the default scheduling + * priority the task - the global FIFO ordering matching the default scheduling * behavior. * * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to @@ -4590,7 +4590,7 @@ static int scx_cgroup_init(void) cgroup_warned_missing_idle = false; /* - * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. */ rcu_read_lock(); From 2279563e3a8cac367b267b09c15cf1e39c06c5cc Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 22 Jan 2025 10:05:25 +0100 Subject: [PATCH 002/310] sched_ext: Include task weight in the error state dump Report the task weight when dumping the task state during an error exit. Moreover, adjust the output format to display dsq_vtime, slice, and weight on the same line. This can help identify whether certain tasks were excessively prioritized or de-prioritized due to large niceness gaps. 
Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 283d7f1addc5..7081c7be5f62 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5277,9 +5277,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, - p->scx.dsq_vtime, p->scx.slice); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { From 74ca334338a4489173d9e50775b13fa20cbd5958 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 23 Jan 2025 13:46:06 +0100 Subject: [PATCH 003/310] selftests/sched_ext: Fix enum resolution All scx enums are now automatically generated from vmlinux.h and they must be initialized using the SCX_ENUM_INIT() macro. Fix the scx selftests to use this macro to properly initialize these values. 
Fixes: 8da7bf2cee27 ("tools/sched_ext: Receive updates from SCX repo") Reported-by: Ihor Solodrai Closes: https://lore.kernel.org/all/Z2tNK2oFDX1OPp8C@slm.duckdns.org/ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- .../testing/selftests/sched_ext/create_dsq.c | 10 ++++---- .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 7 ++++-- .../sched_ext/ddsp_vtimelocal_fail.c | 7 ++++-- .../selftests/sched_ext/dsp_local_on.c | 1 + .../sched_ext/enq_last_no_enq_fails.c | 10 ++++---- .../sched_ext/enq_select_cpu_fails.c | 10 ++++---- tools/testing/selftests/sched_ext/exit.c | 1 + tools/testing/selftests/sched_ext/hotplug.c | 6 +++-- .../selftests/sched_ext/init_enable_count.c | 25 ++++++------------- tools/testing/selftests/sched_ext/maximal.c | 7 ++++-- tools/testing/selftests/sched_ext/minimal.c | 10 ++++---- tools/testing/selftests/sched_ext/prog_run.c | 10 ++++---- .../testing/selftests/sched_ext/reload_loop.c | 9 +++---- .../selftests/sched_ext/select_cpu_dfl.c | 7 ++++-- .../sched_ext/select_cpu_dfl_nodispatch.c | 7 ++++-- .../selftests/sched_ext/select_cpu_dispatch.c | 7 ++++-- .../sched_ext/select_cpu_dispatch_bad_dsq.c | 7 ++++-- .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 7 ++++-- .../selftests/sched_ext/select_cpu_vtime.c | 7 ++++-- 19 files changed, 88 insertions(+), 67 deletions(-) diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c index fa946d9146d4..d67431f57ac6 100644 --- a/tools/testing/selftests/sched_ext/create_dsq.c +++ b/tools/testing/selftests/sched_ext/create_dsq.c @@ -14,11 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct create_dsq *skel; - skel = create_dsq__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = create_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(create_dsq__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git 
a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c index e65d22f23f3b..b6d13496b24e 100644 --- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct ddsp_bogus_dsq_fail *skel; - skel = ddsp_bogus_dsq_fail__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = ddsp_bogus_dsq_fail__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(ddsp_bogus_dsq_fail__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c index abafee587cd6..af9ce4ee8baa 100644 --- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c @@ -14,8 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct ddsp_vtimelocal_fail *skel; - skel = ddsp_vtimelocal_fail__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = ddsp_vtimelocal_fail__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(ddsp_vtimelocal_fail__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c index 0ff27e57fe43..e1f2ce4abfe6 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -15,6 +15,7 @@ static enum scx_test_status setup(void **ctx) skel = dsp_local_on__open(); SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel"); diff --git 
a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c index 73e679953e27..d3387ae03679 100644 --- a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct enq_last_no_enq_fails *skel; - skel = enq_last_no_enq_fails__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = enq_last_no_enq_fails__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_last_no_enq_fails__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c index dd1350e5f002..a80e3a3b3698 100644 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct enq_select_cpu_fails *skel; - skel = enq_select_cpu_fails__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = enq_select_cpu_fails__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_select_cpu_fails__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c index 31bcd06e21cd..9451782689de 100644 --- a/tools/testing/selftests/sched_ext/exit.c +++ b/tools/testing/selftests/sched_ext/exit.c @@ -23,6 +23,7 @@ static enum scx_test_status run(void *ctx) char buf[16]; skel = exit__open(); + SCX_ENUM_INIT(skel); skel->rodata->exit_point = tc; exit__load(skel); link = bpf_map__attach_struct_ops(skel->maps.exit_ops); diff --git a/tools/testing/selftests/sched_ext/hotplug.c 
b/tools/testing/selftests/sched_ext/hotplug.c index 87bf220b1bce..1c9ceb661c43 100644 --- a/tools/testing/selftests/sched_ext/hotplug.c +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -49,8 +49,10 @@ static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) SCX_ASSERT(is_cpu_online()); - skel = hotplug__open_and_load(); - SCX_ASSERT(skel); + skel = hotplug__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(hotplug__load(skel), "Failed to load skel"); /* Testing the offline -> online path, so go offline before starting */ if (onlining) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index 97d45f1e5597..0f3eddc7a17a 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -15,22 +15,6 @@ #define SCHED_EXT 7 -static struct init_enable_count * -open_load_prog(bool global) -{ - struct init_enable_count *skel; - - skel = init_enable_count__open(); - SCX_BUG_ON(!skel, "Failed to open skel"); - - if (!global) - skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; - - SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); - - return skel; -} - static enum scx_test_status run_test(bool global) { struct init_enable_count *skel; @@ -40,7 +24,14 @@ static enum scx_test_status run_test(bool global) struct sched_param param = {}; pid_t pids[num_pre_forks]; - skel = open_load_prog(global); + skel = init_enable_count__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + + if (!global) + skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + + SCX_FAIL_IF(init_enable_count__load(skel), "Failed to load skel"); /* * Fork a bunch of children before we attach the scheduler so that we diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c index f38fc973c380..c6be50a9941d 
100644 --- a/tools/testing/selftests/sched_ext/maximal.c +++ b/tools/testing/selftests/sched_ext/maximal.c @@ -14,8 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct maximal *skel; - skel = maximal__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = maximal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c index 6c5db8ebbf8a..89f7261757ff 100644 --- a/tools/testing/selftests/sched_ext/minimal.c +++ b/tools/testing/selftests/sched_ext/minimal.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct minimal *skel; - skel = minimal__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = minimal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(minimal__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c index 3cd57ef8daaa..05974820ca69 100644 --- a/tools/testing/selftests/sched_ext/prog_run.c +++ b/tools/testing/selftests/sched_ext/prog_run.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct prog_run *skel; - skel = prog_run__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = prog_run__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(prog_run__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c index 5cfba2d6e056..308211d80436 100644 --- a/tools/testing/selftests/sched_ext/reload_loop.c +++ 
b/tools/testing/selftests/sched_ext/reload_loop.c @@ -18,11 +18,10 @@ bool force_exit = false; static enum scx_test_status setup(void **ctx) { - skel = maximal__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = maximal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); return SCX_TEST_PASS; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c index a53a40c2d2f0..5b6e045e1109 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl *skel; - skel = select_cpu_dfl__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dfl__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c index 1d85bf4bf3a3..9b5d232efb7f 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl_nodispatch *skel; - skel = select_cpu_dfl_nodispatch__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dfl_nodispatch__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dfl_nodispatch__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c index 
0309ca8785b3..80283dbc41b7 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch *skel; - skel = select_cpu_dispatch__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c index 47eb6ed7627d..5e72ebbc90a5 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_bad_dsq *skel; - skel = select_cpu_dispatch_bad_dsq__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch_bad_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch_bad_dsq__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c index 48ff028a3c46..aa85949478bc 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_dbl_dsp *skel; - skel = select_cpu_dispatch_dbl_dsp__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch_dbl_dsp__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + 
SCX_FAIL_IF(select_cpu_dispatch_dbl_dsp__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c index b4629c2364f5..1e9b5c9bfff1 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_vtime *skel; - skel = select_cpu_vtime__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_vtime__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_vtime__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; From e76946110137703c16423baf6ee177b751a34b7e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 23 Jan 2025 16:25:35 +0800 Subject: [PATCH 004/310] workqueue: Put the pwq after detaching the rescuer from the pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 68f83057b913("workqueue: Reap workers via kthread_stop() and remove detach_completion") adds code to reap the normal workers but mistakenly does not handle the rescuer and also removes the code waiting for the rescuer in put_unbound_pool(), which caused a use-after-free bug reported by Cheung Wall. To avoid the use-after-free bug, the pool’s reference must be held until the detachment is complete. Therefore, move the code that puts the pwq after detaching the rescuer from the pool. 
Reported-by: cheung wall Cc: cheung wall Link: https://lore.kernel.org/lkml/CAKHoSAvP3iQW+GwmKzWjEAOoPvzeWeoMO0Gz7Pp3_4kxt-RMoA@mail.gmail.com/ Fixes: 68f83057b913("workqueue: Reap workers via kthread_stop() and remove detach_completion") Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33a23c7b2274..ccad33001c58 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3516,12 +3516,6 @@ static int rescuer_thread(void *__rescuer) } } - /* - * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're still attached to it. - */ - put_pwq(pwq); - /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. @@ -3532,6 +3526,12 @@ static int rescuer_thread(void *__rescuer) worker_detach_from_pool(rescuer); + /* + * Put the reference grabbed by send_mayday(). @pool might + * go away any time after it. + */ + put_pwq_unlocked(pwq); + raw_spin_lock_irq(&wq_mayday_lock); } From e9fe182772dcb2630964724fd93e9c90b68ea0fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Jan 2025 10:48:25 -1000 Subject: [PATCH 005/310] sched_ext: selftests/dsp_local_on: Fix sporadic failures dsp_local_on has several incorrect assumptions, one of which is that p->nr_cpus_allowed always tracks p->cpus_ptr. This is not true when a task is scheduled out while migration is disabled - p->cpus_ptr is temporarily overridden to the previous CPU while p->nr_cpus_allowed remains unchanged. This led to sporadic test failures when dsp_local_on_dispatch() tries to put a migration disabled task to a different CPU. Fix it by keeping the previous CPU when migration is disabled. There are SCX schedulers that make use of p->nr_cpus_allowed. They should also implement explicit handling for p->migration_disabled. 
Signed-off-by: Tejun Heo Reported-by: Ihor Solodrai Cc: Andrea Righi Cc: Changwoo Min --- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index fbda6bf54671..758b479bd1ee 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - if (p->nr_cpus_allowed == nr_cpus) + if (p->nr_cpus_allowed == nr_cpus && !p->migration_disabled) target = bpf_get_prandom_u32() % nr_cpus; else target = scx_bpf_task_cpu(p); From d6f3e7d564b2309e1f17e709a70eca78d7ca2bb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Jan 2025 12:22:12 -1000 Subject: [PATCH 006/310] sched_ext: Fix incorrect autogroup migration detection scx_move_task() is called from sched_move_task() and tells the BPF scheduler that cgroup migration is being committed. sched_move_task() is used by both cgroup and autogroup migrations and scx_move_task() tried to filter out autogroup migrations by testing the destination cgroup and PF_EXITING but this is not enough. In fact, without explicitly tagging the thread which is doing the cgroup migration, there is no good way to tell apart scx_move_task() invocations for racing migration to the root cgroup and an autogroup migration. This led to scx_move_task() incorrectly ignoring a migration from non-root cgroup to an autogroup of the root cgroup triggering the following warning: WARNING: CPU: 7 PID: 1 at kernel/sched/ext.c:3725 scx_cgroup_can_attach+0x196/0x340 ... 
Call Trace: cgroup_migrate_execute+0x5b1/0x700 cgroup_attach_task+0x296/0x400 __cgroup_procs_write+0x128/0x140 cgroup_procs_write+0x17/0x30 kernfs_fop_write_iter+0x141/0x1f0 vfs_write+0x31d/0x4a0 __x64_sys_write+0x72/0xf0 do_syscall_64+0x82/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by adding an argument to sched_move_task() that indicates whether the moving is for a cgroup or autogroup migration. After the change, scx_move_task() is called only for cgroup migrations and renamed to scx_cgroup_move_task(). Link: https://github.com/sched-ext/scx/issues/370 Fixes: 819513666966 ("sched_ext: Add cgroup support") Cc: stable@vger.kernel.org # v6.12+ Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/sched/autogroup.c | 4 ++-- kernel/sched/core.c | 7 ++++--- kernel/sched/ext.c | 15 +-------------- kernel/sched/ext.h | 4 ++-- kernel/sched/sched.h | 2 +- 5 files changed, 10 insertions(+), 22 deletions(-) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index db68a964e34e..c4a3ccf6a8ac 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 901170708e2a..e77897a62442 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9042,7 +9042,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. 
*/ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -9071,7 +9071,8 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); - scx_move_task(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -9172,7 +9173,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); scx_cgroup_finish_attach(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7081c7be5f62..c7b159f48834 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4323,24 +4323,11 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) return ops_sanitize_err("cgroup_prep_move", ret); } -void scx_move_task(struct task_struct *p) +void scx_cgroup_move_task(struct task_struct *p) { if (!scx_cgroup_enabled) return; - /* - * We're called from sched_move_task() which handles both cgroup and - * autogroup moves. Ignore the latter. - * - * Also ignore exiting tasks, because in the exit path tasks transition - * from the autogroup to the root group, so task_group_is_autogroup() - * alone isn't able to catch exiting autogroup tasks. This is safe for - * cgroup_move(), because cgroup migrations never happen for PF_EXITING - * tasks. - */ - if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) - return; - /* * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. 
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4d022d17ac7d..1079b56b0f7a 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); -void scx_move_task(struct task_struct *p); +void scx_cgroup_move_task(struct task_struct *p); void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); @@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle); static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_move_task(struct task_struct *p) {} static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 38e0e323dda2..b93c8c3dc05a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); From 5f52bbf2f6e0997394cf9c449d44e1c80ff4282c Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 25 Jan 2025 18:14:12 +0100 
Subject: [PATCH 007/310] tools/sched_ext: Add helper to check task migration state Introduce a new helper for BPF schedulers to determine whether a task can migrate or not (supporting both SMP and UP systems). Fixes: e9fe182772dc ("sched_ext: selftests/dsp_local_on: Fix sporadic failures") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f3e15e9efa76..f254a39b86a5 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -404,6 +404,17 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) return (const struct cpumask *)mask; } +/* + * Return true if task @p cannot migrate to a different CPU, false + * otherwise. + */ +static inline bool is_migration_disabled(const struct task_struct *p) +{ + if (bpf_core_field_exists(p->migration_disabled)) + return p->migration_disabled; + return false; +} + /* rcu */ void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; From 3c7d51b0d29954c40ea3a097e0ec7884b4344331 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 25 Jan 2025 10:36:07 +0100 Subject: [PATCH 008/310] sched_ext: selftests/dsp_local_on: Fix selftest on UP systems In UP systems p->migration_disabled is not available. Fix this by using the portable helper is_migration_disabled(p). 
Fixes: e9fe182772dc ("sched_ext: selftests/dsp_local_on: Fix sporadic failures") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 758b479bd1ee..c02b2aa6fc64 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - if (p->nr_cpus_allowed == nr_cpus && !p->migration_disabled) + if (p->nr_cpus_allowed == nr_cpus && !is_migration_disabled(p)) target = bpf_get_prandom_u32() % nr_cpus; else target = scx_bpf_task_cpu(p); From 1626e5ef0b00386a4fd083fa7c46c8edbd75f9b4 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 27 Jan 2025 23:06:16 +0100 Subject: [PATCH 009/310] sched_ext: Fix lock imbalance in dispatch_to_local_dsq() While performing the rq locking dance in dispatch_to_local_dsq(), we may trigger the following lock imbalance condition, in particular when multiple tasks are rapidly changing CPU affinity (i.e., running a `stress-ng --race-sched 0`): [ 13.413579] ===================================== [ 13.413660] WARNING: bad unlock balance detected! [ 13.413729] 6.13.0-virtme #15 Not tainted [ 13.413792] ------------------------------------- [ 13.413859] kworker/1:1/80 is trying to release lock (&rq->__lock) at: [ 13.413954] [] dispatch_to_local_dsq+0x108/0x1a0 [ 13.414111] but there are no more locks to release! 
[ 13.414176] [ 13.414176] other info that might help us debug this: [ 13.414258] 1 lock held by kworker/1:1/80: [ 13.414318] #0: ffff8b66feb41698 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x20/0x90 [ 13.414612] [ 13.414612] stack backtrace: [ 13.415255] CPU: 1 UID: 0 PID: 80 Comm: kworker/1:1 Not tainted 6.13.0-virtme #15 [ 13.415505] Workqueue: 0x0 (events) [ 13.415567] Sched_ext: dsp_local_on (enabled+all), task: runnable_at=-2ms [ 13.415570] Call Trace: [ 13.415700] [ 13.415744] dump_stack_lvl+0x78/0xe0 [ 13.415806] ? dispatch_to_local_dsq+0x108/0x1a0 [ 13.415884] print_unlock_imbalance_bug+0x11b/0x130 [ 13.415965] ? dispatch_to_local_dsq+0x108/0x1a0 [ 13.416226] lock_release+0x231/0x2c0 [ 13.416326] _raw_spin_unlock+0x1b/0x40 [ 13.416422] dispatch_to_local_dsq+0x108/0x1a0 [ 13.416554] flush_dispatch_buf+0x199/0x1d0 [ 13.416652] balance_one+0x194/0x370 [ 13.416751] balance_scx+0x61/0x1e0 [ 13.416848] prev_balance+0x43/0xb0 [ 13.416947] __pick_next_task+0x6b/0x1b0 [ 13.417052] __schedule+0x20d/0x1740 This happens because dispatch_to_local_dsq() is racing with dispatch_dequeue() and, when the latter wins, we incorrectly assume that the task has been moved to dst_rq. Fix by properly tracking the currently locked rq. Fixes: 4d3ca89bdd31 ("sched_ext: Refactor consume_remote_task()") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c7b159f48834..a6d6d6dadde5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2575,6 +2575,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); +#ifdef CONFIG_SMP + struct rq *locked_rq = rq; +#endif /* * We're synchronized against dequeue through DISPATCHING. 
As @p can't @@ -2611,8 +2614,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); /* switch to @src_rq lock */ - if (rq != src_rq) { - raw_spin_rq_unlock(rq); + if (locked_rq != src_rq) { + raw_spin_rq_unlock(locked_rq); + locked_rq = src_rq; raw_spin_rq_lock(src_rq); } @@ -2630,6 +2634,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); + /* task has been moved to dst_rq, which is now locked */ + locked_rq = dst_rq; } /* if the destination CPU is idle, wake it up */ @@ -2638,8 +2644,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } /* switch back to @rq lock */ - if (rq != dst_rq) { - raw_spin_rq_unlock(dst_rq); + if (locked_rq != rq) { + raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } #else /* CONFIG_SMP */ From b69bb476dee99d564d65d418e9a20acca6f32c3f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 30 Jan 2025 16:05:42 -0800 Subject: [PATCH 010/310] cgroup: fix race between fork and cgroup.kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tejun reported the following race between fork() and cgroup.kill at [1]. Tejun: I was looking at cgroup.kill implementation and wondering whether there could be a race window. So, __cgroup_kill() does the following: k1. Set CGRP_KILL. k2. Iterate tasks and deliver SIGKILL. k3. Clear CGRP_KILL. The copy_process() does the following: c1. Copy a bunch of stuff. c2. Grab siglock. c3. Check fatal_signal_pending(). c4. Commit to forking. c5. Release siglock. c6. Call cgroup_post_fork() which puts the task on the css_set and tests CGRP_KILL. The intention seems to be that either a forking task gets SIGKILL and terminates on c3 or it sees CGRP_KILL on c6 and kills the child. However, I don't see what guarantees that k3 can't happen before c6. ie. 
After a forking task passes c5, k2 can take place and then before the forking task reaches c6, k3 can happen. Then, nobody would send SIGKILL to the child. What am I missing? This is indeed a race. One way to fix this race is by taking cgroup_threadgroup_rwsem in write mode in __cgroup_kill() as the fork() side takes cgroup_threadgroup_rwsem in read mode from cgroup_can_fork() to cgroup_post_fork(). However that would be heavy handed as this adds one more potential stall scenario for cgroup.kill which is usually called under extreme situation like memory pressure. To fix this race, let's maintain a sequence number per cgroup which gets incremented on __cgroup_kill() call. On the fork() side, the cgroup_can_fork() will cache the sequence number locally and recheck it against the cgroup's sequence number at cgroup_post_fork() site. If the sequence numbers mismatch, it means __cgroup_kill() has been called and we should send SIGKILL to the newly created task. Reported-by: Tejun Heo Closes: https://lore.kernel.org/all/Z5QHE2Qn-QZ6M-KW@slm.duckdns.org/ [1] Fixes: 661ee6280931 ("cgroup: introduce cgroup.kill") Cc: stable@vger.kernel.org # v5.14+ Signed-off-by: Shakeel Butt Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 6 +++--- include/linux/sched/task.h | 1 + kernel/cgroup/cgroup.c | 20 ++++++++++++-------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b20d2d8ef7c..17960a1e858d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -71,9 +71,6 @@ enum { /* Cgroup is frozen. */ CGRP_FROZEN, - - /* Control group has to be killed. */ - CGRP_KILL, }; /* cgroup_root->flags */ @@ -461,6 +458,9 @@ struct cgroup { int nr_threaded_children; /* # of live threaded child cgroups */ + /* sequence number for cgroup.kill, serialized by css_set_lock. 
*/ + unsigned int kill_seq; + struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 0f2aeb37bbb0..ca1db4b92c32 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -43,6 +43,7 @@ struct kernel_clone_args { void *fn_arg; struct cgroup *cgrp; struct css_set *cset; + unsigned int kill_seq; }; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436..afc665b7b1fe 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - set_bit(CGRP_KILL, &cgrp->flags); + cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); @@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp) send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); - - spin_lock_irq(&css_set_lock); - clear_bit(CGRP_KILL, &cgrp->flags); - spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) @@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; @@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only 
link regular threads */ if (likely(child->pid)) { - if (kargs->cgrp) + if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; - else + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; @@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child, * child down right after we finished preparing it for * userspace. */ - kill = test_bit(CGRP_KILL, &cgrp_flags); + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); From 029b6ce733712a41421955194b113f283dcb1026 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 2 Feb 2025 12:37:48 +0900 Subject: [PATCH 011/310] sched_ext: Fix incorrect time delta calculation in time_delta() When (s64)(after - before) > 0, the code returns the result of (s64)(after - before) > 0 while the intended result should be (s64)(after - before). That happens because the middle operand of the ternary operator was omitted incorrectly, returning the result of (s64)(after - before) > 0. Thus, add the middle operand -- (s64)(after - before) -- to return the correct time calculation. Fixes: d07be814fc71 ("sched_ext: Add time helpers for BPF schedulers") Signed-off-by: Changwoo Min Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f254a39b86a5..d72b60a0c582 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -432,7 +432,7 @@ void bpf_rcu_read_unlock(void) __ksym; */ static inline s64 time_delta(u64 after, u64 before) { - return (s64)(after - before) > 0 ? : 0; + return (s64)(after - before) > 0 ? 
(s64)(after - before) : 0; } /** From a8c9a453387640dbe45761970f41301a6985e7fa Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Thu, 16 Jan 2025 06:24:36 -0800 Subject: [PATCH 012/310] ASoC: fsl_micfil: Enable default case in micfil_set_quality() If 'micfil->quality' received from micfil_quality_set() somehow ends up with an unpredictable value, switch() operator will fail to initialize local variable qsel before regmap_update_bits() tries to utilize it. While it is unlikely, play it safe and enable a default case that returns -EINVAL error. Found by Linux Verification Center (linuxtesting.org) with static analysis tool SVACE. Fixes: bea1d61d5892 ("ASoC: fsl_micfil: rework quality setting") Cc: stable@vger.kernel.org Signed-off-by: Nikita Zhandarovich Link: https://patch.msgid.link/20250116142436.22389-1-n.zhandarovich@fintech.ru Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_micfil.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/fsl/fsl_micfil.c b/sound/soc/fsl/fsl_micfil.c index 1075598a6647..fa4136683392 100644 --- a/sound/soc/fsl/fsl_micfil.c +++ b/sound/soc/fsl/fsl_micfil.c @@ -183,6 +183,8 @@ static int micfil_set_quality(struct fsl_micfil *micfil) case QUALITY_VLOW2: qsel = MICFIL_QSEL_VLOW2_QUALITY; break; + default: + return -EINVAL; } return regmap_update_bits(micfil->regmap, REG_MICFIL_CTRL2, From f0ada00a9b3801b71d203b0033b7612b687b7d72 Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 9 Jan 2025 14:27:44 +0530 Subject: [PATCH 013/310] dt-bindings: clock: qcom: Add GPU clocks for QCS8300 The QCS8300 GPU clock controller is a derivative of SA8775P, but has few additional clocks and minor differences. Hence, reuse gpucc bindings of SA8775P and add additional clocks required for QCS8300. 
Acked-by: Krzysztof Kozlowski Signed-off-by: Imran Shaik Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-1-63e8ac268b02@quicinc.com Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/clock/qcom,gpucc.yaml | 3 +++ include/dt-bindings/clock/qcom,qcs8300-gpucc.h | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,qcs8300-gpucc.h diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml index 0858fd635282..4cdff6161bf0 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml @@ -8,6 +8,7 @@ title: Qualcomm Graphics Clock & Reset Controller maintainers: - Taniya Das + - Imran Shaik description: | Qualcomm graphics clock control module provides the clocks, resets and power @@ -23,10 +24,12 @@ description: | include/dt-bindings/clock/qcom,gpucc-sm8150.h include/dt-bindings/clock/qcom,gpucc-sm8250.h include/dt-bindings/clock/qcom,gpucc-sm8350.h + include/dt-bindings/clock/qcom,qcs8300-gpucc.h properties: compatible: enum: + - qcom,qcs8300-gpucc - qcom,sdm845-gpucc - qcom,sa8775p-gpucc - qcom,sc7180-gpucc diff --git a/include/dt-bindings/clock/qcom,qcs8300-gpucc.h b/include/dt-bindings/clock/qcom,qcs8300-gpucc.h new file mode 100644 index 000000000000..afa187467b4c --- /dev/null +++ b/include/dt-bindings/clock/qcom,qcs8300-gpucc.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GPUCC_QCS8300_H +#define _DT_BINDINGS_CLK_QCOM_GPUCC_QCS8300_H + +#include "qcom,sa8775p-gpucc.h" + +/* QCS8300 introduces below new clocks compared to SA8775P */ + +/* GPU_CC clocks */ +#define GPU_CC_CX_ACCU_SHIFT_CLK 23 +#define GPU_CC_GX_ACCU_SHIFT_CLK 24 + +#endif From 0e193cc558e32a879c717bb2d53a1cf8628b5d20 Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 9 Jan 2025 14:27:46 +0530 Subject: [PATCH 014/310] dt-bindings: clock: qcom: Add CAMCC clocks for QCS8300 The QCS8300 camera clock controller is a derivative of SA8775P, but has an additional clock and minor differences. Hence, reuse the SA8775P camera bindings and add additional clock required for QCS8300. Reviewed-by: Vladimir Zapolskiy Acked-by: Krzysztof Kozlowski Signed-off-by: Imran Shaik Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-3-63e8ac268b02@quicinc.com Signed-off-by: Rob Herring (Arm) --- .../bindings/clock/qcom,sa8775p-camcc.yaml | 6 +++++- include/dt-bindings/clock/qcom,qcs8300-camcc.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 include/dt-bindings/clock/qcom,qcs8300-camcc.h diff --git a/Documentation/devicetree/bindings/clock/qcom,sa8775p-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sa8775p-camcc.yaml index 36a60d8f5ae3..81623f59d11d 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sa8775p-camcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sa8775p-camcc.yaml @@ -8,16 +8,20 @@ title: Qualcomm Camera Clock & Reset Controller on SA8775P maintainers: - Taniya Das + - Imran Shaik description: | Qualcomm camera clock control module provides the clocks, resets and power domains on SA8775p. 
- See also: include/dt-bindings/clock/qcom,sa8775p-camcc.h + See also: + include/dt-bindings/clock/qcom,qcs8300-camcc.h + include/dt-bindings/clock/qcom,sa8775p-camcc.h properties: compatible: enum: + - qcom,qcs8300-camcc - qcom,sa8775p-camcc clocks: diff --git a/include/dt-bindings/clock/qcom,qcs8300-camcc.h b/include/dt-bindings/clock/qcom,qcs8300-camcc.h new file mode 100644 index 000000000000..fc535c847859 --- /dev/null +++ b/include/dt-bindings/clock/qcom,qcs8300-camcc.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_QCS8300_CAM_CC_H +#define _DT_BINDINGS_CLK_QCOM_QCS8300_CAM_CC_H + +#include "qcom,sa8775p-camcc.h" + +/* QCS8300 introduces below new clocks compared to SA8775P */ + +/* CAM_CC clocks */ +#define CAM_CC_TITAN_TOP_ACCU_SHIFT_CLK 86 + +#endif From 3e86e57356f0e2284454d82c7200807c6fa9e65b Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 9 Jan 2025 14:27:48 +0530 Subject: [PATCH 015/310] dt-bindings: clock: qcom: Add QCS8300 video clock controller The QCS8300 video clock controller is a derivative of SA8775P, but QCS8300 has minor difference. Hence, reuse the SA8775P videocc bindings for QCS8300 platform. 
Acked-by: Krzysztof Kozlowski Reviewed-by: Dmitry Baryshkov Signed-off-by: Imran Shaik Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-5-63e8ac268b02@quicinc.com Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/clock/qcom,sa8775p-videocc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/clock/qcom,sa8775p-videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,sa8775p-videocc.yaml index 928131bff4c1..07e5d811d816 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sa8775p-videocc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sa8775p-videocc.yaml @@ -18,6 +18,7 @@ description: | properties: compatible: enum: + - qcom,qcs8300-videocc - qcom,sa8775p-videocc clocks: From e6649328dc07bff6227367eda6f1b2263d6c10f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 29 Jan 2025 14:35:27 +0100 Subject: [PATCH 016/310] of: address: Add kunit test for __of_address_resource_bounds() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The overflow checking has to deal with different datatypes and edgecases. Add a new kunit testcase to make sure it works correctly. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250129-of-address-overflow-v3-1-95d1760ed791@linutronix.de Signed-off-by: Rob Herring (Arm) --- drivers/of/address.c | 5 +- drivers/of/of_private.h | 4 ++ drivers/of/of_test.c | 119 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 126 insertions(+), 2 deletions(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index 125833e5ce52..d177a2b9edaf 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -16,6 +16,8 @@ #include #include /* for bus_dma_region */ +#include + /* Uncomment me to enable of_dump_addr() debugging output */ // #define DEBUG @@ -183,7 +185,7 @@ static u64 of_bus_pci_map(__be32 *addr, const __be32 *range, int na, int ns, #endif /* CONFIG_PCI */ -static int __of_address_resource_bounds(struct resource *r, u64 start, u64 size) +VISIBLE_IF_KUNIT int __of_address_resource_bounds(struct resource *r, u64 start, u64 size) { if (overflows_type(start, r->start)) return -EOVERFLOW; @@ -197,6 +199,7 @@ static int __of_address_resource_bounds(struct resource *r, u64 start, u64 size) return 0; } +EXPORT_SYMBOL_IF_KUNIT(__of_address_resource_bounds); /* * of_pci_range_to_resource - Create a resource from an of_pci_range diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index f3e1193c8ded..1bdc7ceef3c5 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -208,4 +208,8 @@ static void __maybe_unused of_dump_addr(const char *s, const __be32 *addr, int n static void __maybe_unused of_dump_addr(const char *s, const __be32 *addr, int na) { } #endif +#if IS_ENABLED(CONFIG_KUNIT) +int __of_address_resource_bounds(struct resource *r, u64 start, u64 size); +#endif + #endif /* _LINUX_OF_PRIVATE_H */ diff --git a/drivers/of/of_test.c b/drivers/of/of_test.c index b0557ded838f..8bba5a72c9c7 100644 --- a/drivers/of/of_test.c +++ b/drivers/of/of_test.c @@ -2,6 +2,7 @@ /* * KUnit tests for OF APIs */ +#include #include #include @@ -54,8 +55,124 @@ static 
struct kunit_suite of_dtb_suite = { .init = of_dtb_test_init, }; +struct of_address_resource_bounds_case { + u64 start; + u64 size; + int ret; + + u64 res_start; + u64 res_end; +}; + +static void of_address_resource_bounds_case_desc(const struct of_address_resource_bounds_case *p, + char *name) +{ + snprintf(name, KUNIT_PARAM_DESC_SIZE, "start=0x%016llx,size=0x%016llx", p->start, p->size); +} + +static const struct of_address_resource_bounds_case of_address_resource_bounds_cases[] = { + { + .start = 0, + .size = 0, + .ret = 0, + .res_start = 0, + .res_end = -1, + }, + { + .start = 0, + .size = 0x1000, + .ret = 0, + .res_start = 0, + .res_end = 0xfff, + }, + { + .start = 0x1000, + .size = 0, + .ret = 0, + .res_start = 0x1000, + .res_end = 0xfff, + }, + { + .start = 0x1000, + .size = 0x1000, + .ret = 0, + .res_start = 0x1000, + .res_end = 0x1fff, + }, + { + .start = 1, + .size = RESOURCE_SIZE_MAX, + .ret = 0, + .res_start = 1, + .res_end = RESOURCE_SIZE_MAX, + }, + { + .start = RESOURCE_SIZE_MAX, + .size = 1, + .ret = 0, + .res_start = RESOURCE_SIZE_MAX, + .res_end = RESOURCE_SIZE_MAX, + }, + { + .start = 2, + .size = RESOURCE_SIZE_MAX, + .ret = -EOVERFLOW, + }, + { + .start = RESOURCE_SIZE_MAX, + .size = 2, + .ret = -EOVERFLOW, + }, + { + .start = ULL(0x100000000), + .size = 1, + .ret = sizeof(resource_size_t) > sizeof(u32) ? 0 : -EOVERFLOW, + .res_start = ULL(0x100000000), + .res_end = ULL(0x100000000), + }, + { + .start = 0x1000, + .size = 0xffffffff, + .ret = sizeof(resource_size_t) > sizeof(u32) ? 
0 : -EOVERFLOW, + .res_start = 0x1000, + .res_end = ULL(0x100000ffe), + }, +}; + +KUNIT_ARRAY_PARAM(of_address_resource_bounds, + of_address_resource_bounds_cases, of_address_resource_bounds_case_desc); + +static void of_address_resource_bounds(struct kunit *test) +{ + const struct of_address_resource_bounds_case *param = test->param_value; + struct resource r; /* Intentionally uninitialized */ + int ret; + + if (!IS_ENABLED(CONFIG_OF_ADDRESS)) + kunit_skip(test, "CONFIG_OF_ADDRESS not enabled\n"); + + ret = __of_address_resource_bounds(&r, param->start, param->size); + KUNIT_EXPECT_EQ(test, param->ret, ret); + if (ret == 0) { + KUNIT_EXPECT_EQ(test, (resource_size_t)param->res_start, r.start); + KUNIT_EXPECT_EQ(test, (resource_size_t)param->res_end, r.end); + KUNIT_EXPECT_EQ(test, param->size, resource_size(&r)); + } +} + +static struct kunit_case of_address_test_cases[] = { + KUNIT_CASE_PARAM(of_address_resource_bounds, of_address_resource_bounds_gen_params), + {} +}; + +static struct kunit_suite of_address_suite = { + .name = "of_address", + .test_cases = of_address_test_cases, +}; + kunit_test_suites( - &of_dtb_suite, + &of_dtb_suite, &of_address_suite, ); MODULE_DESCRIPTION("KUnit tests for OF APIs"); +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); MODULE_LICENSE("GPL"); From ac5a41b472b4ef8bb37d7550796d059b377b4646 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Mon, 27 Jan 2025 21:12:02 +0100 Subject: [PATCH 017/310] Revert "mmc: sdhci_am654: Add sdhci_am654_start_signal_voltage_switch" This reverts commit 941a7abd4666912b84ab209396fdb54b0dae685d. This commit uses presence of device-tree properties vmmc-supply and vqmmc-supply for deciding whether to enable a quirk affecting timing of clock and data. The intention was to address issues observed with eMMC and SD on AM62 platforms. This new quirk is however also enabled for AM64 breaking microSD access on the SolidRun HummingBoard-T which is supported in-tree since v6.11, causing a regression. 
During boot microSD initialization now fails with the error below: [ 2.008520] mmc1: SDHCI controller on fa00000.mmc [fa00000.mmc] using ADMA 64-bit [ 2.115348] mmc1: error -110 whilst initialising SD card The heuristics for enabling the quirk are clearly not correct as they break at least one but potentially many existing boards. Revert the change and restore original behaviour until a more appropriate method of selecting the quirk is derived. Fixes: 941a7abd4666 ("mmc: sdhci_am654: Add sdhci_am654_start_signal_voltage_switch") Closes: https://lore.kernel.org/linux-mmc/a70fc9fc-186f-4165-a652-3de50733763a@solid-run.com/ Cc: stable@vger.kernel.org Signed-off-by: Josua Mayer Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20250127-am654-mmc-regression-v2-1-9bb39fb12810@solid-run.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index b73f673db92b..f75c31815ab0 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -155,7 +155,6 @@ struct sdhci_am654_data { u32 tuning_loop; #define SDHCI_AM654_QUIRK_FORCE_CDTEST BIT(0) -#define SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA BIT(1) }; struct window { @@ -357,29 +356,6 @@ static void sdhci_j721e_4bit_set_clock(struct sdhci_host *host, sdhci_set_clock(host, clock); } -static int sdhci_am654_start_signal_voltage_switch(struct mmc_host *mmc, struct mmc_ios *ios) -{ - struct sdhci_host *host = mmc_priv(mmc); - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct sdhci_am654_data *sdhci_am654 = sdhci_pltfm_priv(pltfm_host); - int ret; - - if ((sdhci_am654->quirks & SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA) && - ios->signal_voltage == MMC_SIGNAL_VOLTAGE_180) { - if (!IS_ERR(mmc->supply.vqmmc)) { - ret = mmc_regulator_set_vqmmc(mmc, ios); - if (ret < 0) { - pr_err("%s: Switching to 1.8V signalling voltage failed,\n", - 
mmc_hostname(mmc)); - return -EIO; - } - } - return 0; - } - - return sdhci_start_signal_voltage_switch(mmc, ios); -} - static u8 sdhci_am654_write_power_on(struct sdhci_host *host, u8 val, int reg) { writeb(val, host->ioaddr + reg); @@ -868,11 +844,6 @@ static int sdhci_am654_get_of_property(struct platform_device *pdev, if (device_property_read_bool(dev, "ti,fails-without-test-cd")) sdhci_am654->quirks |= SDHCI_AM654_QUIRK_FORCE_CDTEST; - /* Suppress v1p8 ena for eMMC and SD with vqmmc supply */ - if (!!of_parse_phandle(dev->of_node, "vmmc-supply", 0) == - !!of_parse_phandle(dev->of_node, "vqmmc-supply", 0)) - sdhci_am654->quirks |= SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA; - sdhci_get_of_property(pdev); return 0; @@ -969,7 +940,6 @@ static int sdhci_am654_probe(struct platform_device *pdev) goto err_pltfm_free; } - host->mmc_host_ops.start_signal_voltage_switch = sdhci_am654_start_signal_voltage_switch; host->mmc_host_ops.execute_tuning = sdhci_am654_execute_tuning; pm_runtime_get_noresume(dev); From 3e68abf2b9cebe76c6cd4b1aca8e95cd671035a3 Mon Sep 17 00:00:00 2001 From: Andy-ld Lu Date: Thu, 23 Jan 2025 17:26:01 +0800 Subject: [PATCH 018/310] mmc: mtk-sd: Fix register settings for hs400(es) mode For hs400(es) mode, the 'hs400-ds-delay' is typically configured in the dts. However, some projects may only define 'mediatek,hs400-ds-dly3', which can lead to initialization failures in hs400es mode. CMD13 reported response crc error in the mmc_switch_status() just after switching to hs400es mode. [ 1.914038][ T82] mmc0: mmc_select_hs400es failed, error -84 [ 1.914954][ T82] mmc0: error -84 whilst initialising MMC card Currently, the hs400_ds_dly3 value is set within the tuning function. This means that the PAD_DS_DLY3 field is not configured before tuning process, which is the reason for the above-mentioned CMD13 response crc error. 
Move the PAD_DS_DLY3 field configuration into msdc_prepare_hs400_tuning(), and add a value check of hs400_ds_delay to prevent overwriting by zero when the 'hs400-ds-delay' is not set in the dts. In addition, since hs400(es) only tune the PAD_DS_DLY1, the PAD_DS_DLY2_SEL bit should be cleared to bypass it. Fixes: c4ac38c6539b ("mmc: mtk-sd: Add HS400 online tuning support") Signed-off-by: Andy-ld Lu Reviewed-by: AngeloGioacchino Del Regno Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250123092644.7359-1-andy-ld.lu@mediatek.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 4b6e91372526..345ea91629e0 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -273,6 +273,7 @@ #define MSDC_PAD_TUNE_CMD2_SEL BIT(21) /* RW */ #define PAD_DS_TUNE_DLY_SEL BIT(0) /* RW */ +#define PAD_DS_TUNE_DLY2_SEL BIT(1) /* RW */ #define PAD_DS_TUNE_DLY1 GENMASK(6, 2) /* RW */ #define PAD_DS_TUNE_DLY2 GENMASK(11, 7) /* RW */ #define PAD_DS_TUNE_DLY3 GENMASK(16, 12) /* RW */ @@ -318,6 +319,7 @@ /* EMMC50_PAD_DS_TUNE mask */ #define PAD_DS_DLY_SEL BIT(16) /* RW */ +#define PAD_DS_DLY2_SEL BIT(15) /* RW */ #define PAD_DS_DLY1 GENMASK(14, 10) /* RW */ #define PAD_DS_DLY3 GENMASK(4, 0) /* RW */ @@ -2504,13 +2506,23 @@ static int msdc_execute_tuning(struct mmc_host *mmc, u32 opcode) static int msdc_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_ios *ios) { struct msdc_host *host = mmc_priv(mmc); + host->hs400_mode = true; - if (host->top_base) - writel(host->hs400_ds_delay, - host->top_base + EMMC50_PAD_DS_TUNE); - else - writel(host->hs400_ds_delay, host->base + PAD_DS_TUNE); + if (host->top_base) { + if (host->hs400_ds_dly3) + sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE, + PAD_DS_DLY3, host->hs400_ds_dly3); + if (host->hs400_ds_delay) + writel(host->hs400_ds_delay, + host->top_base 
+ EMMC50_PAD_DS_TUNE); + } else { + if (host->hs400_ds_dly3) + sdr_set_field(host->base + PAD_DS_TUNE, + PAD_DS_TUNE_DLY3, host->hs400_ds_dly3); + if (host->hs400_ds_delay) + writel(host->hs400_ds_delay, host->base + PAD_DS_TUNE); + } /* hs400 mode must set it to 0 */ sdr_clr_bits(host->base + MSDC_PATCH_BIT2, MSDC_PATCH_BIT2_CFGCRCSTS); /* to improve read performance, set outstanding to 2 */ @@ -2530,14 +2542,11 @@ static int msdc_execute_hs400_tuning(struct mmc_host *mmc, struct mmc_card *card if (host->top_base) { sdr_set_bits(host->top_base + EMMC50_PAD_DS_TUNE, PAD_DS_DLY_SEL); - if (host->hs400_ds_dly3) - sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE, - PAD_DS_DLY3, host->hs400_ds_dly3); + sdr_clr_bits(host->top_base + EMMC50_PAD_DS_TUNE, + PAD_DS_DLY2_SEL); } else { sdr_set_bits(host->base + PAD_DS_TUNE, PAD_DS_TUNE_DLY_SEL); - if (host->hs400_ds_dly3) - sdr_set_field(host->base + PAD_DS_TUNE, - PAD_DS_TUNE_DLY3, host->hs400_ds_dly3); + sdr_clr_bits(host->base + PAD_DS_TUNE, PAD_DS_TUNE_DLY2_SEL); } host->hs400_tuning = true; From 4fd2707e3e71bfd5d4df4f4c9656a009f09dfc7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Mon, 3 Feb 2025 16:12:49 +0100 Subject: [PATCH 019/310] spi: atmel-quadspi: Fix warning in doc-comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The doc-comment for `struct atmel_qspi_pcal` had a typo in one of the struct members' name, causing a warning with the `W=1` option. 
Fixes: 5af42209a4d2 ("spi: atmel-quadspi: Add support for sama7g5 QSPI") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501311707.Ltj0qXse-lkp@intel.com/ Signed-off-by: Bence Csókás Link: https://patch.msgid.link/20250203151249.79876-2-csokas.bence@prolan.hu Signed-off-by: Mark Brown --- drivers/spi/atmel-quadspi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index abdc49d9d940..d8c9be64d006 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -235,8 +235,8 @@ /** * struct atmel_qspi_pcal - Pad Calibration Clock Division * @pclk_rate: peripheral clock rate. - * @pclkdiv: calibration clock division. The clock applied to the calibration - * cell is divided by pclkdiv + 1. + * @pclk_div: calibration clock division. The clock applied to the calibration + * cell is divided by pclk_div + 1. */ struct atmel_qspi_pcal { u32 pclk_rate; From 9e8b21410f310c50733f6e1730bae5a8e30d3570 Mon Sep 17 00:00:00 2001 From: Selvarasu Ganesan Date: Sat, 18 Jan 2025 11:31:33 +0530 Subject: [PATCH 020/310] usb: gadget: f_midi: Fixing wMaxPacketSize exceeded issue during MIDI bind retries The current implementation sets the wMaxPacketSize of bulk in/out endpoints to 1024 bytes at the end of the f_midi_bind function. However, in cases where there is a failure in the first midi bind attempt, consider rebinding. This scenario may encounter an f_midi_bind issue due to the previous bind setting the bulk endpoint's wMaxPacketSize to 1024 bytes, which exceeds the ep->maxpacket_limit where configured dwc3 TX/RX FIFO's maxpacket size of 512 bytes for IN/OUT endpoints in support HS speed only. Here the term "rebind" in this context refers to attempting to bind the MIDI function a second time in certain scenarios. 
The situations where rebinding is considered include: * When there is a failure in the first UDC write attempt, which may be caused by other functions bind along with MIDI. * Runtime composition change : Example : MIDI,ADB to MIDI. Or MIDI to MIDI,ADB. This commit addresses this issue by resetting the wMaxPacketSize before endpoint claim. And here there is no need to reset all values in the usb endpoint descriptor structure, as all members except wMaxPacketSize and bEndpointAddress have predefined values. This ensures that the endpoint is restored to its expected configuration, preventing conflicts with the value of ep->maxpacket_limit. It also aligns with the approach used in other function drivers, which treat endpoint descriptors as if they were full speed before endpoint claim. Fixes: 46decc82ffd5 ("usb: gadget: unconditionally allocate hs/ss descriptor in bind operation") Cc: stable@vger.kernel.org Signed-off-by: Selvarasu Ganesan Link: https://lore.kernel.org/r/20250118060134.927-1-selvarasu.g@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 837fcdfa3840..9b991cf5b0f8 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -907,6 +907,15 @@ static int f_midi_bind(struct usb_configuration *c, struct usb_function *f) status = -ENODEV; + /* + * Reset wMaxPacketSize with maximum packet size of FS bulk transfer before + * endpoint claim. This ensures that the wMaxPacketSize does not exceed the + * limit during bind retries where configured dwc3 TX/RX FIFO's maxpacket + * size of 512 bytes for IN/OUT endpoints in support HS speed only. 
+ */ + bulk_in_desc.wMaxPacketSize = cpu_to_le16(64); + bulk_out_desc.wMaxPacketSize = cpu_to_le16(64); + /* allocate instance-specific endpoints */ midi->in_ep = usb_ep_autoconfig(cdev->gadget, &bulk_in_desc); if (!midi->in_ep) From 309005e448c1f3e4b81e4416406991b7c3339c1d Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Mon, 20 Jan 2025 15:42:51 +0100 Subject: [PATCH 021/310] usb: phy: generic: Use proper helper for property detection Since commit c141ecc3cecd7 ("of: Warn when of_property_read_bool() is used on non-boolean properties") a warning is raised if this function is used for property detection. of_property_present() is the correct helper for this. Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20250120144251.580981-1-alexander.stein@ew.tq-group.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/phy/phy-generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/phy/phy-generic.c b/drivers/usb/phy/phy-generic.c index 6c3ececf9137..8423be59ec0f 100644 --- a/drivers/usb/phy/phy-generic.c +++ b/drivers/usb/phy/phy-generic.c @@ -212,7 +212,7 @@ int usb_phy_gen_create_phy(struct device *dev, struct usb_phy_generic *nop) if (of_property_read_u32(node, "clock-frequency", &clk_rate)) clk_rate = 0; - needs_clk = of_property_read_bool(node, "clocks"); + needs_clk = of_property_present(node, "clocks"); } nop->gpiod_reset = devm_gpiod_get_optional(dev, "reset", GPIOD_ASIS); From 1ed3af5a2aaefd0ecd887ecabdc8da07220e31fe Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Tue, 21 Jan 2025 23:11:23 +0000 Subject: [PATCH 022/310] usb: dwc3: Document nostream_work Add missing description to the nostream_work of dwc3_ep. The work is used by bulk multi-stream endpoints for a NoStream event to reinitiate the stream if needed. 
Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250120182219.30dcb3c6@canb.auug.org.au/ Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/7cdeaa346d24907712aac533c1c5f90a03151189.1737500936.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index ac7c730f81ac..c955039bb4f6 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -717,6 +717,7 @@ struct dwc3_event_buffer { /** * struct dwc3_ep - device side endpoint representation * @endpoint: usb endpoint + * @nostream_work: work for handling bulk NoStream * @cancelled_list: list of cancelled requests for this endpoint * @pending_list: list of pending requests for this endpoint * @started_list: list of started requests on this endpoint From 335a1fc1193481f8027f176649c72868172f6f8b Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 22 Jan 2025 03:12:31 -0500 Subject: [PATCH 023/310] usb: gadget: udc: renesas_usb3: Fix compiler warning drivers/usb/gadget/udc/renesas_usb3.c: In function 'renesas_usb3_probe': drivers/usb/gadget/udc/renesas_usb3.c:2638:73: warning: '%d' directive output may be truncated writing between 1 and 11 bytes into a region of size 6 [-Wformat-truncation=] 2638 | snprintf(usb3_ep->ep_name, sizeof(usb3_ep->ep_name), "ep%d", i); ^~~~~~~~~~~~~~~~~~~~~~~~ ^~ ^ Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501201409.BIQPtkeB-lkp@intel.com/ Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20250122081231.47594-1-guoren@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/renesas_usb3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c 
index fce5c41d9f29..89b304cf6d03 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -310,7 +310,7 @@ struct renesas_usb3_request { struct list_head queue; }; -#define USB3_EP_NAME_SIZE 8 +#define USB3_EP_NAME_SIZE 16 struct renesas_usb3_ep { struct usb_ep ep; struct renesas_usb3 *usb3; From 2240fed37afbcdb5e8b627bc7ad986891100e05d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 22 Jan 2025 14:26:17 -0500 Subject: [PATCH 024/310] USB: hub: Ignore non-compliant devices with too many configs or interfaces Robert Morris created a test program which can cause usb_hub_to_struct_hub() to dereference a NULL or inappropriate pointer: Oops: general protection fault, probably for non-canonical address 0xcccccccccccccccc: 0000 [#1] SMP DEBUG_PAGEALLOC PTI CPU: 7 UID: 0 PID: 117 Comm: kworker/7:1 Not tainted 6.13.0-rc3-00017-gf44d154d6e3d #14 Hardware name: FreeBSD BHYVE/BHYVE, BIOS 14.0 10/17/2021 Workqueue: usb_hub_wq hub_event RIP: 0010:usb_hub_adjust_deviceremovable+0x78/0x110 ... Call Trace: ? die_addr+0x31/0x80 ? exc_general_protection+0x1b4/0x3c0 ? asm_exc_general_protection+0x26/0x30 ? usb_hub_adjust_deviceremovable+0x78/0x110 hub_probe+0x7c7/0xab0 usb_probe_interface+0x14b/0x350 really_probe+0xd0/0x2d0 ? __pfx___device_attach_driver+0x10/0x10 __driver_probe_device+0x6e/0x110 driver_probe_device+0x1a/0x90 __device_attach_driver+0x7e/0xc0 bus_for_each_drv+0x7f/0xd0 __device_attach+0xaa/0x1a0 bus_probe_device+0x8b/0xa0 device_add+0x62e/0x810 usb_set_configuration+0x65d/0x990 usb_generic_driver_probe+0x4b/0x70 usb_probe_device+0x36/0xd0 The cause of this error is that the device has two interfaces, and the hub driver binds to interface 1 instead of interface 0, which is where usb_hub_to_struct_hub() looks. We can prevent the problem from occurring by refusing to accept hub devices that violate the USB spec by having more than one configuration or interface. 
Reported-and-tested-by: Robert Morris Cc: stable Closes: https://lore.kernel.org/linux-usb/95564.1737394039@localhost/ Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/c27f3bf4-63d8-4fb5-ac82-09e3cd19f61c@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index c3f839637cb5..0cd44f1fd56d 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1848,6 +1848,17 @@ static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id) desc = intf->cur_altsetting; hdev = interface_to_usbdev(intf); + /* + * The USB 2.0 spec prohibits hubs from having more than one + * configuration or interface, and we rely on this prohibition. + * Refuse to accept a device that violates it. + */ + if (hdev->descriptor.bNumConfigurations > 1 || + hdev->actconfig->desc.bNumInterfaces > 1) { + dev_err(&intf->dev, "Invalid hub with more than one config or interface\n"); + return -EINVAL; + } + /* * Set default autosuspend delay as 0 to speedup bus suspend, * based on the below considerations: From 58cd423820d5b5610977e55e4acdd06628829ede Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Fri, 24 Jan 2025 18:33:25 +0100 Subject: [PATCH 025/310] usb: dwc2: gadget: remove of_node reference upon udc_stop In dwc2_hsotg_udc_start(), e.g. when binding composite driver, "of_node" is set to hsotg->dev->of_node. It causes errors when binding the gadget driver several times, on stm32mp157c-ev1 board. Below error is seen: "pin PA10 already requested by 49000000.usb-otg; cannot claim for gadget.0" The first time, no issue is seen as when registering the driver, of_node isn't NULL: -> gadget_dev_desc_UDC_store -> usb_gadget_register_driver_owner -> driver_register ... -> really_probe -> pinctrl_bind_pins (no effect) Then dwc2_hsotg_udc_start() sets of_node. 
The second time (stop the gadget, reconfigure it, then start it again), of_node has been set, so the probing code tries to acquire pins for the gadget. These pins are held by the controller, hence the error. So clear gadget.dev.of_node in the udc_stop() routine to avoid the issue. Fixes: 7d7b22928b90 ("usb: gadget: s3c-hsotg: Propagate devicetree to gadget drivers") Cc: stable Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/20250124173325.2747710-1-fabrice.gasnier@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/gadget.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index e7bf9cc635be..bd4c788f03bc 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -4615,6 +4615,7 @@ static int dwc2_hsotg_udc_stop(struct usb_gadget *gadget) spin_lock_irqsave(&hsotg->lock, flags); hsotg->driver = NULL; + hsotg->gadget.dev.of_node = NULL; hsotg->gadget.speed = USB_SPEED_UNKNOWN; hsotg->enabled = 0; From da1668997052ed1cb00322e1f3b63702615c9429 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Thu, 30 Jan 2025 19:50:34 +0000 Subject: [PATCH 026/310] usb: gadget: f_midi: fix MIDI Streaming descriptor lengths While the MIDI jacks are configured correctly, and the MIDIStreaming endpoint descriptors are filled with the correct information, bNumEmbMIDIJack and bLength are set incorrectly in these descriptors. This does not matter when the numbers of in and out ports are equal, but when they differ the host will receive broken descriptors with uninitialized stack memory leaking into the descriptor for whichever value is smaller. The precise meaning of "in" and "out" in the port counts is not clearly defined and can be confusing. But elsewhere the driver consistently uses this to match the USB meaning of IN and OUT viewed from the host, so that "in" ports send data to the host and "out" ports receive data from it. 
Cc: stable Fixes: c8933c3f79568 ("USB: gadget: f_midi: allow a dynamic number of input and output ports") Signed-off-by: John Keeping Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/20250130195035.3883857-1-jkeeping@inmusicbrands.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 9b991cf5b0f8..47260d65066a 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -1009,11 +1009,11 @@ static int f_midi_bind(struct usb_configuration *c, struct usb_function *f) } /* configure the endpoint descriptors ... */ - ms_out_desc.bLength = USB_DT_MS_ENDPOINT_SIZE(midi->in_ports); - ms_out_desc.bNumEmbMIDIJack = midi->in_ports; + ms_out_desc.bLength = USB_DT_MS_ENDPOINT_SIZE(midi->out_ports); + ms_out_desc.bNumEmbMIDIJack = midi->out_ports; - ms_in_desc.bLength = USB_DT_MS_ENDPOINT_SIZE(midi->out_ports); - ms_in_desc.bNumEmbMIDIJack = midi->out_ports; + ms_in_desc.bLength = USB_DT_MS_ENDPOINT_SIZE(midi->in_ports); + ms_in_desc.bNumEmbMIDIJack = midi->in_ports; /* ... and add them to the list */ endpoint_descriptor_index = i; From d275a5e0c5f528b4b877ec683b8cd8bfced96af5 Mon Sep 17 00:00:00 2001 From: Devarsh Thakkar Date: Mon, 3 Feb 2025 21:24:31 +0530 Subject: [PATCH 027/310] dt-bindings: display: ti: Fix compatible for am62a7 dss Fix incorrect format of compatible string (comma instead of hyphen) for TI's AM62A7 SoC. 
s/ti,am62a7,dss/ti,am62a7-dss Fixes: 7959ceb767e4 ("dt-bindings: display: ti: Add support for am62a7 dss") Reviewed-by: Krzysztof Kozlowski Signed-off-by: Devarsh Thakkar Link: https://lore.kernel.org/r/20250203155431.2174170-1-devarsht@ti.com Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml index 55e3e490d0e6..31c4ffcb599c 100644 --- a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml +++ b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml @@ -23,7 +23,7 @@ properties: compatible: enum: - ti,am625-dss - - ti,am62a7,dss + - ti,am62a7-dss - ti,am65x-dss reg: From 3648027de1fa91a0c80cffd3ecff263d06e62605 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 10 Jan 2025 18:51:46 +0100 Subject: [PATCH 028/310] arm64: Fix 5-level paging support in kexec/hibernate trampoline Add the missing code to allocate P4D level page tables when cloning the kernel page tables. This fixes a crash that may be observed when attempting to resume from hibernation on an LPA2 capable system with 4k pages, which therefore uses 5 levels of paging. Presumably, kexec is equally affected. 
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250110175145.785702-2-ardb+git@google.com Signed-off-by: Will Deacon --- arch/arm64/mm/trans_pgd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 19c67ed1a21f..18543b603c77 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -162,6 +162,13 @@ static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp, unsigned long next; unsigned long addr = start; + if (pgd_none(READ_ONCE(*dst_pgdp))) { + dst_p4dp = trans_alloc(info); + if (!dst_p4dp) + return -ENOMEM; + pgd_populate(NULL, dst_pgdp, dst_p4dp); + } + dst_p4dp = p4d_offset(dst_pgdp, start); src_p4dp = p4d_offset(src_pgdp, start); do { From f458b2165d7ac0f2401fff48f19c8f864e7e1e38 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 17 Jan 2025 07:55:22 -0500 Subject: [PATCH 029/310] arm64: Kconfig: Remove selecting replaced HAVE_FUNCTION_GRAPH_RETVAL Commit a3ed4157b7d8 ("fgraph: Replace fgraph_ret_regs with ftrace_regs") replaces the config HAVE_FUNCTION_GRAPH_RETVAL with the config HAVE_FUNCTION_GRAPH_FREGS, and it replaces all the select commands in the various architecture Kconfig files. In the arm64 architecture, the commit adds the 'select HAVE_FUNCTION_GRAPH_FREGS', but fails to remove the 'select HAVE_FUNCTION_GRAPH_RETVAL', i.e., the select on the replaced config. Remove selecting the replaced config. No functional change, just cleanup. 
Fixes: a3ed4157b7d8 ("fgraph: Replace fgraph_ret_regs with ftrace_regs") Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20250117125522.99071-1-lukas.bulwahn@redhat.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fcdd0ed3eca8..940343beb3d4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -225,7 +225,6 @@ config ARM64 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_RETVAL select HAVE_GCC_PLUGINS select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && \ HW_PERF_EVENTS && HAVE_PERF_EVENTS_NMI From f64f9dddd1f58c41c140034f7d2b0beeef1bc548 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 24 Jan 2025 17:33:22 +0000 Subject: [PATCH 030/310] arm64/gcs: Fix documentation for HWCAP In one of the renumberings of the GCS hwcap a stray reference to HWCAP2 was left, fix it. Reported-by: David Spickett Fixes: 7058bf87cd59 ("arm64/gcs: Document the ABI for Guarded Control Stacks") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20250124-arm64-gcs-hwcap-doc-v1-1-fa9368b01ca6@kernel.org Signed-off-by: Will Deacon --- Documentation/arch/arm64/gcs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst index 1f65a3193e77..226c0b008456 100644 --- a/Documentation/arch/arm64/gcs.rst +++ b/Documentation/arch/arm64/gcs.rst @@ -37,7 +37,7 @@ intended to be exhaustive. shadow stacks rather than GCS. * Support for GCS is reported to userspace via HWCAP_GCS in the aux vector - AT_HWCAP2 entry. + AT_HWCAP entry. * GCS is enabled per thread. While there is support for disabling GCS at runtime this should be done with great care. 
From 21fed7c223e20e694b91dbf25936d922a50c8b19 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 3 Feb 2025 20:11:04 +0000 Subject: [PATCH 031/310] arm64/hwcap: Remove stray references to SF8MMx Due to SME currently being disabled when removing the SF8MMx support it wasn't noticed that there were some stray references in the hwcap table, delete them. Fixes: 819935464cb2 ("arm64/hwcap: Describe 2024 dpISA extensions to userspace") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20250203-arm64-remove-sf8mmx-v1-1-6f1da3dbff82@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4eb7c6698ae4..f0910f20fbf8 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3180,8 +3180,6 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8MM8), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8MM4), HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), From ba69e0750b0362870294adab09339a0c39c3beaf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2025 18:21:35 +0100 Subject: [PATCH 032/310] efi: Avoid cold plugged memory for placing the kernel UEFI 2.11 introduced EFI_MEMORY_HOT_PLUGGABLE to annotate system memory regions that are 'cold plugged' at boot, i.e., hot pluggable memory that is available from early boot, and described as system RAM by the firmware. 
Existing loaders and EFI applications running in the boot context will happily use this memory for allocating data structures that cannot be freed or moved at runtime, and this prevents the memory from being unplugged. Going forward, the new EFI_MEMORY_HOT_PLUGGABLE attribute should be tested, and memory annotated as such should be avoided for such allocations. In the EFI stub, there are a couple of occurrences where, instead of the high-level AllocatePages() UEFI boot service, a low-level code sequence is used that traverses the EFI memory map and carves out the requested number of pages from a free region. This is needed, e.g., for allocating as low as possible, or for allocating pages at random. While AllocatePages() should presumably avoid special purpose memory and cold plugged regions, this manual approach needs to incorporate this logic itself, in order to prevent the kernel itself from ending up in a hot unpluggable region, preventing it from being unplugged. So add the EFI_MEMORY_HOT_PLUGGABLE macro definition, and check for it where appropriate. 
Cc: stable@vger.kernel.org Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 6 ++++-- drivers/firmware/efi/libstub/randomalloc.c | 3 +++ drivers/firmware/efi/libstub/relocate.c | 3 +++ include/linux/efi.h | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 8296bf985d1d..7309394b8fc9 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -934,13 +934,15 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_RO | EFI_MEMORY_WP | EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_NV | EFI_MEMORY_SP | EFI_MEMORY_CPU_CRYPTO | - EFI_MEMORY_RUNTIME | EFI_MEMORY_MORE_RELIABLE)) + EFI_MEMORY_MORE_RELIABLE | EFI_MEMORY_HOT_PLUGGABLE | + EFI_MEMORY_RUNTIME)) snprintf(pos, size, "|attr=0x%016llx]", (unsigned long long)attr); else snprintf(pos, size, - "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", attr & EFI_MEMORY_RUNTIME ? "RUN" : "", + attr & EFI_MEMORY_HOT_PLUGGABLE ? "HP" : "", attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "", attr & EFI_MEMORY_CPU_CRYPTO ? "CC" : "", attr & EFI_MEMORY_SP ? 
"SP" : "", diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index e5872e38d9a4..5a732018be36 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -25,6 +25,9 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, if (md->type != EFI_CONVENTIONAL_MEMORY) return 0; + if (md->attribute & EFI_MEMORY_HOT_PLUGGABLE) + return 0; + if (efi_soft_reserve_enabled() && (md->attribute & EFI_MEMORY_SP)) return 0; diff --git a/drivers/firmware/efi/libstub/relocate.c b/drivers/firmware/efi/libstub/relocate.c index 99b45d1cd624..d4264bfb6dc1 100644 --- a/drivers/firmware/efi/libstub/relocate.c +++ b/drivers/firmware/efi/libstub/relocate.c @@ -53,6 +53,9 @@ efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (desc->attribute & EFI_MEMORY_HOT_PLUGGABLE) + continue; + if (efi_soft_reserve_enabled() && (desc->attribute & EFI_MEMORY_SP)) continue; diff --git a/include/linux/efi.h b/include/linux/efi.h index 053c57e61869..db293d7de686 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -128,6 +128,7 @@ typedef struct { #define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ #define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ #define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ +#define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 From bbc4578537e350d5bf8a7a2c7d054d6b163b3c41 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2025 18:21:36 +0100 Subject: [PATCH 033/310] efi: Use BIT_ULL() constants for memory attributes For legibility, use the existing BIT_ULL() to generate the u64 type EFI memory attribute macros. 
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/efi.h b/include/linux/efi.h index db293d7de686..7d63d1d75f22 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -114,22 +114,22 @@ typedef struct { #define EFI_MAX_MEMORY_TYPE 16 /* Attribute values: */ -#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ -#define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ -#define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ -#define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ -#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ -#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ -#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ -#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ -#define EFI_MEMORY_NV ((u64)0x0000000000008000ULL) /* non-volatile */ -#define EFI_MEMORY_MORE_RELIABLE \ - ((u64)0x0000000000010000ULL) /* higher reliability */ -#define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ -#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ -#define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ +#define EFI_MEMORY_UC BIT_ULL(0) /* uncached */ +#define EFI_MEMORY_WC BIT_ULL(1) /* write-coalescing */ +#define EFI_MEMORY_WT BIT_ULL(2) /* write-through */ +#define EFI_MEMORY_WB BIT_ULL(3) /* write-back */ +#define EFI_MEMORY_UCE BIT_ULL(4) /* uncached, exported */ +#define EFI_MEMORY_WP BIT_ULL(12) /* write-protect */ +#define EFI_MEMORY_RP BIT_ULL(13) /* read-protect */ +#define EFI_MEMORY_XP BIT_ULL(14) /* execute-protect */ +#define EFI_MEMORY_NV BIT_ULL(15) /* non-volatile */ +#define EFI_MEMORY_MORE_RELIABLE BIT_ULL(16) /* higher reliability */ +#define EFI_MEMORY_RO BIT_ULL(17) /* read-only */ +#define EFI_MEMORY_SP BIT_ULL(18) /* 
soft reserved */ +#define EFI_MEMORY_CPU_CRYPTO BIT_ULL(19) /* supports encryption */ #define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ -#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ +#define EFI_MEMORY_RUNTIME BIT_ULL(63) /* range requires runtime mapping */ + #define EFI_MEMORY_DESCRIPTOR_VERSION 1 #define EFI_PAGE_SHIFT 12 From e8ed246ded863eb862806c5591afdcf70012ab5e Mon Sep 17 00:00:00 2001 From: Andre Werner Date: Tue, 21 Jan 2025 08:18:19 +0100 Subject: [PATCH 034/310] serial: sc16is7xx: Fix IRQ number check behavior The logical meaning of the previous version is wrong due to a typo. If the IRQ equals 0, no interrupt pin is available and polling mode shall be used. Additionally, this fix adds a check for IRQ < 0 to increase robustness, because documentation still says that negative IRQ values cannot be absolutely ruled-out. Fixes: 104c1b9dde9d ("serial: sc16is7xx: Add polling mode if no IRQ pin is available") Signed-off-by: Andre Werner Reviewed-by: Jiri Slaby Reviewed-by: Andy Shevchenko Reviewed-by: Maarten Brock Reviewed-by: Hugo Villeneuve Link: https://lore.kernel.org/r/20250121071819.1346672-1-andre.werner@systec-electronic.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sc16is7xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index 7b51cdc274fd..560f45ed19ae 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -1561,7 +1561,7 @@ int sc16is7xx_probe(struct device *dev, const struct sc16is7xx_devtype *devtype, /* Always ask for fixed clock rate from a property. 
*/ device_property_read_u32(dev, "clock-frequency", &uartclk); - s->polling = !!irq; + s->polling = (irq <= 0); if (s->polling) dev_dbg(dev, "No interrupt pin definition, falling back to polling mode\n"); From 166ac2bba167d575e7146beaa66093bc7c072f43 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:46 +0200 Subject: [PATCH 035/310] serial: port: Assign ->iotype correctly when ->iobase is set Currently the ->iotype is always assigned to the UPIO_MEM when the respective property is not found. However, this will not support the cases when user wants to have UPIO_PORT to be set or preserved. Support this scenario by checking ->iobase value and default the ->iotype respectively. Fixes: 1117a6fdc7c1 ("serial: 8250_of: Switch to use uart_read_port_properties()") Fixes: e894b6005dce ("serial: port: Introduce a common helper to read properties") Cc: stable Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_port.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index d35f1d24156c..f28d0633fe6b 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -173,6 +173,7 @@ EXPORT_SYMBOL(uart_remove_one_port); * The caller is responsible to initialize the following fields of the @port * ->dev (must be valid) * ->flags + * ->iobase * ->mapbase * ->mapsize * ->regshift (if @use_defaults is false) @@ -214,7 +215,7 @@ static int __uart_read_properties(struct uart_port *port, bool use_defaults) /* Read the registers I/O access type (default: MMIO 8-bit) */ ret = device_property_read_u32(dev, "reg-io-width", &value); if (ret) { - port->iotype = UPIO_MEM; + port->iotype = port->iobase ? 
UPIO_PORT : UPIO_MEM; } else { switch (value) { case 1: From e8486bd50ecf63c9a1e25271f258a8d959f2672f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:47 +0200 Subject: [PATCH 036/310] serial: port: Always update ->iotype in __uart_read_properties() The documentation of the __uart_read_properties() states that ->iotype member is always altered after the function call, but the code doesn't do that in the case when use_defaults == false and the value of reg-io-width is unsupported. Make sure the code follows the documentation. Note, the current users of the uart_read_and_validate_port_properties() will fail and the change doesn't affect their behaviour, neither users of uart_read_port_properties() will be affected since the alteration happens there even in the current code flow. Fixes: e894b6005dce ("serial: port: Introduce a common helper to read properties") Cc: stable Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index f28d0633fe6b..85285c56fabf 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -228,11 +228,11 @@ static int __uart_read_properties(struct uart_port *port, bool use_defaults) port->iotype = device_is_big_endian(dev) ? 
UPIO_MEM32BE : UPIO_MEM32; break; default: + port->iotype = UPIO_UNKNOWN; if (!use_defaults) { dev_err(dev, "Unsupported reg-io-width (%u)\n", value); return -EINVAL; } - port->iotype = UPIO_UNKNOWN; break; } } From 12397549b5014071e1d2b315509f68eb93ef9144 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:48 +0200 Subject: [PATCH 037/310] serial: port: Make ->iotype validation global in __uart_read_properties() In order to make code robust against potential changes in the future move ->iotype validation outside of switch in __uart_read_properties(). If any code will be added in between that might leave the ->iotype value unknown the validation catches this up. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-4-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_port.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index 85285c56fabf..2fc48cd63f6c 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -229,14 +229,15 @@ static int __uart_read_properties(struct uart_port *port, bool use_defaults) break; default: port->iotype = UPIO_UNKNOWN; - if (!use_defaults) { - dev_err(dev, "Unsupported reg-io-width (%u)\n", value); - return -EINVAL; - } break; } } + if (!use_defaults && port->iotype == UPIO_UNKNOWN) { + dev_err(dev, "Unsupported reg-io-width (%u)\n", value); + return -EINVAL; + } + /* Read the address mapping base offset (default: no offset) */ ret = device_property_read_u32(dev, "reg-offset", &value); if (ret) From fe310f75327edbc042c7cc0df32c6b9ec29eb93a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:49 +0200 Subject: [PATCH 038/310] serial: 8250_of: Remove unneeded ->iotype assignment If ->iobase is set the default will be UPIO_PORT for ->iotype after the uart_read_and_validate_port_properties() 
call. Hence no need to assign that explicitly. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-5-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_of.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c index 64aed7efc569..11c860ea80f6 100644 --- a/drivers/tty/serial/8250/8250_of.c +++ b/drivers/tty/serial/8250/8250_of.c @@ -110,7 +110,6 @@ static int of_platform_serial_setup(struct platform_device *ofdev, spin_lock_init(&port->lock); if (resource_type(&resource) == IORESOURCE_IO) { - port->iotype = UPIO_PORT; port->iobase = resource.start; } else { port->mapbase = resource.start; From 34bbb5d5137f32be3186a995a5ad4c60aaad11a7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:50 +0200 Subject: [PATCH 039/310] serial: 8250_platform: Remove unneeded ->iotype assignment If ->iobase is set the default will be UPIO_PORT for ->iotype after the uart_read_and_validate_port_properties() call. Hence no need to assign that explicitly. Otherwise it will be UPIO_MEM. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-6-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_platform.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/tty/serial/8250/8250_platform.c b/drivers/tty/serial/8250/8250_platform.c index 8bdc1879d952..c0343bfb8064 100644 --- a/drivers/tty/serial/8250/8250_platform.c +++ b/drivers/tty/serial/8250/8250_platform.c @@ -112,7 +112,6 @@ static int serial8250_probe_acpi(struct platform_device *pdev) struct device *dev = &pdev->dev; struct uart_8250_port uart = { }; struct resource *regs; - unsigned char iotype; int ret, line; regs = platform_get_mem_or_io(pdev, 0); @@ -122,13 +121,11 @@ static int serial8250_probe_acpi(struct platform_device *pdev) switch (resource_type(regs)) { case IORESOURCE_IO: uart.port.iobase = regs->start; - iotype = UPIO_PORT; break; case IORESOURCE_MEM: uart.port.mapbase = regs->start; uart.port.mapsize = resource_size(regs); uart.port.flags = UPF_IOREMAP; - iotype = UPIO_MEM; break; default: return -EINVAL; @@ -147,12 +144,6 @@ static int serial8250_probe_acpi(struct platform_device *pdev) if (ret) return ret; - /* - * The previous call may not set iotype correctly when reg-io-width - * property is absent and it doesn't support IO port resource. - */ - uart.port.iotype = iotype; - line = serial8250_register_8250_port(&uart); if (line < 0) return line; From 0f3fd9cf6491f5beecbb65abb41556c56135340c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:51 +0200 Subject: [PATCH 040/310] serial: 8250_pnp: Remove unneeded ->iotype assignment If ->iobase is set the default will be UPIO_PORT for ->iotype after the uart_read_and_validate_port_properties() call. Hence no need to assign that explicitly. Otherwise it will be UPIO_MEM. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-7-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pnp.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/tty/serial/8250/8250_pnp.c b/drivers/tty/serial/8250/8250_pnp.c index 7c06ae79d8e2..7a837fdf9df1 100644 --- a/drivers/tty/serial/8250/8250_pnp.c +++ b/drivers/tty/serial/8250/8250_pnp.c @@ -436,7 +436,6 @@ serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) { struct uart_8250_port uart, *port; int ret, flags = dev_id->driver_data; - unsigned char iotype; long line; if (flags & UNKNOWN_DEV) { @@ -448,14 +447,11 @@ serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) memset(&uart, 0, sizeof(uart)); if ((flags & CIR_PORT) && pnp_port_valid(dev, 2)) { uart.port.iobase = pnp_port_start(dev, 2); - iotype = UPIO_PORT; } else if (pnp_port_valid(dev, 0)) { uart.port.iobase = pnp_port_start(dev, 0); - iotype = UPIO_PORT; } else if (pnp_mem_valid(dev, 0)) { uart.port.mapbase = pnp_mem_start(dev, 0); uart.port.mapsize = pnp_mem_len(dev, 0); - iotype = UPIO_MEM; uart.port.flags = UPF_IOREMAP; } else return -ENODEV; @@ -471,12 +467,6 @@ serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) if (ret) return ret; - /* - * The previous call may not set iotype correctly when reg-io-width - * property is absent and it doesn't support IO port resource. - */ - uart.port.iotype = iotype; - if (flags & CIR_PORT) { uart.port.flags |= UPF_FIXED_PORT | UPF_FIXED_TYPE; uart.port.type = PORT_8250_CIR; From fc016ef7da64fd473d73ee6c261ba1b0b47afe2b Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 4 Feb 2025 13:39:41 +0800 Subject: [PATCH 041/310] ASoC: Intel: sof_sdw: Add lookup of quirk using PCI subsystem ID Add lookup of PCI subsystem vendor:device ID to find a quirk. 
The subsystem ID (SSID) is part of the PCI specification to uniquely identify a particular system-specific implementation of a hardware device. Unlike DMI information, it identifies the sound hardware itself, rather than a specific model of PC. SSID can be more reliable and stable than DMI strings, and is preferred by some vendors as the way to identify the actual sound hardware. Signed-off-by: Richard Fitzgerald Reviewed-by: Liam Girdwood Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250204053943.93596-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index b0d35fda7b17..381fae5943fe 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "sof_sdw_common.h" #include "../../codecs/rt711.h" @@ -751,6 +752,22 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { {} }; +static const struct snd_pci_quirk sof_sdw_ssid_quirk_table[] = { + {} +}; + +static void sof_sdw_check_ssid_quirk(const struct snd_soc_acpi_mach *mach) +{ + const struct snd_pci_quirk *quirk_entry; + + quirk_entry = snd_pci_quirk_lookup_id(mach->mach_params.subsystem_vendor, + mach->mach_params.subsystem_device, + sof_sdw_ssid_quirk_table); + + if (quirk_entry) + sof_sdw_quirk = quirk_entry->value; +} + static struct snd_soc_dai_link_component platform_component[] = { { /* name might be overridden during probe */ @@ -1278,6 +1295,13 @@ static int mc_probe(struct platform_device *pdev) snd_soc_card_set_drvdata(card, ctx); + if (mach->mach_params.subsystem_id_set) { + snd_soc_card_set_pci_ssid(card, + mach->mach_params.subsystem_vendor, + mach->mach_params.subsystem_device); + sof_sdw_check_ssid_quirk(mach); + } + dmi_check_system(sof_sdw_quirk_table); if (quirk_override != 
-1) { @@ -1293,12 +1317,6 @@ static int mc_probe(struct platform_device *pdev) for (i = 0; i < ctx->codec_info_list_count; i++) codec_info_list[i].amp_num = 0; - if (mach->mach_params.subsystem_id_set) { - snd_soc_card_set_pci_ssid(card, - mach->mach_params.subsystem_vendor, - mach->mach_params.subsystem_device); - } - ret = sof_card_dai_links_create(card); if (ret < 0) return ret; From 0843449708085c4fb45a3c325c2fbced556f6abf Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 4 Feb 2025 13:39:42 +0800 Subject: [PATCH 042/310] ASoC: Intel: sof_sdw: Add quirk for Asus Zenbook S14 Asus laptops with sound PCI subsystem ID 1043:1e13 have the DMICs connected to the host instead of the CS42L43 so need the SOC_SDW_CODEC_MIC quirk. Signed-off-by: Richard Fitzgerald Reviewed-by: Liam Girdwood Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250204053943.93596-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 381fae5943fe..683e15b459a1 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -753,6 +753,7 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { }; static const struct snd_pci_quirk sof_sdw_ssid_quirk_table[] = { + SND_PCI_QUIRK(0x1043, 0x1e13, "ASUS Zenbook S14", SOC_SDW_CODEC_MIC), {} }; From d8989106287d3735c7e7fc6acb3811d62ebb666c Mon Sep 17 00:00:00 2001 From: Uday M Bhat Date: Tue, 4 Feb 2025 13:39:43 +0800 Subject: [PATCH 043/310] ASoC: Intel: sof_sdw: Add support for Fatcat board with BT offload enabled in PTL platform This change adds an entry for fatcat boards in soundwire quirk table and also, enables BT offload for PTL RVP. 
Signed-off-by: Uday M Bhat Signed-off-by: Jairaj Arava Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250204053943.93596-4-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 683e15b459a1..203b07d4d833 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -749,6 +749,16 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { }, .driver_data = (void *)(SOC_SDW_PCH_DMIC), }, + { + .callback = sof_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Google"), + DMI_MATCH(DMI_PRODUCT_NAME, "Fatcat"), + }, + .driver_data = (void *)(SOC_SDW_PCH_DMIC | + SOF_BT_OFFLOAD_SSP(2) | + SOF_SSP_BT_OFFLOAD_PRESENT), + }, {} }; From 3588b76db7ba798f54dee39a55708b16e1c61de4 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 4 Feb 2025 11:31:33 +0800 Subject: [PATCH 044/310] ASoC: Intel: soc-acpi-intel-tgl-match: declare adr as ull MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The adr is u64. 
Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Ranjani Sridharan Link: https://patch.msgid.link/20250204033134.92332-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- .../soc/intel/common/soc-acpi-intel-tgl-match.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-tgl-match.c b/sound/soc/intel/common/soc-acpi-intel-tgl-match.c index 6f8c06413665..b77aafb0bfb6 100644 --- a/sound/soc/intel/common/soc-acpi-intel-tgl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-tgl-match.c @@ -658,25 +658,25 @@ static const struct snd_soc_acpi_endpoint cs35l56_7_fb_endpoints[] = { static const struct snd_soc_acpi_adr_device cs35l56_sdw_eight_1_4_fb_adr[] = { { - .adr = 0x00003301fa355601, + .adr = 0x00003301fa355601ull, .num_endpoints = ARRAY_SIZE(cs35l56_l_fb_endpoints), .endpoints = cs35l56_l_fb_endpoints, .name_prefix = "AMP1" }, { - .adr = 0x00003201fa355601, + .adr = 0x00003201fa355601ull, .num_endpoints = ARRAY_SIZE(cs35l56_2_fb_endpoints), .endpoints = cs35l56_2_fb_endpoints, .name_prefix = "AMP2" }, { - .adr = 0x00003101fa355601, + .adr = 0x00003101fa355601ull, .num_endpoints = ARRAY_SIZE(cs35l56_4_fb_endpoints), .endpoints = cs35l56_4_fb_endpoints, .name_prefix = "AMP3" }, { - .adr = 0x00003001fa355601, + .adr = 0x00003001fa355601ull, .num_endpoints = ARRAY_SIZE(cs35l56_6_fb_endpoints), .endpoints = cs35l56_6_fb_endpoints, .name_prefix = "AMP4" @@ -685,25 +685,25 @@ static const struct snd_soc_acpi_adr_device cs35l56_sdw_eight_1_4_fb_adr[] = { static const struct snd_soc_acpi_adr_device cs35l56_sdw_eight_5_8_fb_adr[] = { { - .adr = 0x00013701fa355601, + .adr = 0x00013701fa355601ull, .num_endpoints = ARRAY_SIZE(cs35l56_r_fb_endpoints), .endpoints = cs35l56_r_fb_endpoints, .name_prefix = "AMP8" }, { - .adr = 0x00013601fa355601, + .adr = 0x00013601fa355601ull, .num_endpoints = ARRAY_SIZE(cs35l56_3_fb_endpoints), .endpoints = cs35l56_3_fb_endpoints, .name_prefix = "AMP7" 
}, { - .adr = 0x00013501fa355601, + .adr = 0x00013501fa355601ull, .num_endpoints = ARRAY_SIZE(cs35l56_5_fb_endpoints), .endpoints = cs35l56_5_fb_endpoints, .name_prefix = "AMP6" }, { - .adr = 0x00013401fa355601, + .adr = 0x00013401fa355601ull, .num_endpoints = ARRAY_SIZE(cs35l56_7_fb_endpoints), .endpoints = cs35l56_7_fb_endpoints, .name_prefix = "AMP5" From 20efccc53abf99fa52ea30a43dec758f6b6b9940 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 4 Feb 2025 11:31:34 +0800 Subject: [PATCH 045/310] ASoC: Intel: soc-acpi-intel-mtl-match: declare adr as ull MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The adr is u64. Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Ranjani Sridharan Link: https://patch.msgid.link/20250204033134.92332-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-acpi-intel-mtl-match.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c index 770e2194a283..9e611e3667ad 100644 --- a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c @@ -330,7 +330,7 @@ static const struct snd_soc_acpi_adr_device rt1316_3_single_adr[] = { static const struct snd_soc_acpi_adr_device rt1318_1_single_adr[] = { { - .adr = 0x000130025D131801, + .adr = 0x000130025D131801ull, .num_endpoints = 1, .endpoints = &single_endpoint, .name_prefix = "rt1318-1" From 6b24e67b4056ba83b1e95e005b7e50fdb1cc6cf4 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Tue, 4 Feb 2025 16:13:10 +0000 Subject: [PATCH 046/310] ASoC: rockchip: i2s-tdm: fix shift config for SND_SOC_DAIFMT_DSP_[AB] Commit 2f45a4e289779 ("ASoC: rockchip: i2s_tdm: Fixup config for SND_SOC_DAIFMT_DSP_A/B") applied a partial change to fix the configuration for DSP A and DSP B formats. 
The shift control also needs updating to set the correct offset for frame data compared to LRCK. Set the correct values. Fixes: 081068fd64140 ("ASoC: rockchip: add support for i2s-tdm controller") Signed-off-by: John Keeping Link: https://patch.msgid.link/20250204161311.2117240-1-jkeeping@inmusicbrands.com Signed-off-by: Mark Brown --- sound/soc/rockchip/rockchip_i2s_tdm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/rockchip/rockchip_i2s_tdm.c b/sound/soc/rockchip/rockchip_i2s_tdm.c index 7f5fcaecee4b..78ab88843f86 100644 --- a/sound/soc/rockchip/rockchip_i2s_tdm.c +++ b/sound/soc/rockchip/rockchip_i2s_tdm.c @@ -451,11 +451,11 @@ static int rockchip_i2s_tdm_set_fmt(struct snd_soc_dai *cpu_dai, break; case SND_SOC_DAIFMT_DSP_A: val = I2S_TXCR_TFS_TDM_PCM; - tdm_val = TDM_SHIFT_CTRL(0); + tdm_val = TDM_SHIFT_CTRL(2); break; case SND_SOC_DAIFMT_DSP_B: val = I2S_TXCR_TFS_TDM_PCM; - tdm_val = TDM_SHIFT_CTRL(2); + tdm_val = TDM_SHIFT_CTRL(4); break; default: ret = -EINVAL; From 2a03d2da55b4cd5c86b360db0e917ee93b3f0cb9 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 9 Jan 2025 06:35:49 +0200 Subject: [PATCH 047/310] dt-bindings: nvmem: qcom,qfprom: Add SAR2130P compatible Document compatible for the QFPROM on SAR2130P platform. 
Signed-off-by: Dmitry Baryshkov Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250109-sar2130p-nvmem-v4-5-633739fe5f11@linaro.org Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml index d37f544ab8aa..39c209249c9c 100644 --- a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml +++ b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml @@ -36,6 +36,7 @@ properties: - qcom,qcs404-qfprom - qcom,qcs615-qfprom - qcom,qcs8300-qfprom + - qcom,sar2130p-qfprom - qcom,sc7180-qfprom - qcom,sc7280-qfprom - qcom,sc8280xp-qfprom From aff2355d260e47e780cd96af127beaab18a664b1 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Tue, 4 Feb 2025 19:45:06 +0200 Subject: [PATCH 048/310] spi: pxa2xx: Fix regression when toggling chip select on LPSS devices The commit 78b435c9044a ("spi: pxa2xx: Introduce __lpss_ssp_update_priv() helper") broke speaker output on my ASUS UX5304MA laptop. The problem is in inverted value that got written in the private register. Simple bug, simple fix. Fixes: 78b435c9044a ("spi: pxa2xx: Introduce __lpss_ssp_update_priv() helper") Signed-off-by: Mark Lord Tested-by: Mark Lord Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250204174506.149978-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 5f9cac41baff..06711a62fa3d 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -399,7 +399,7 @@ static void lpss_ssp_cs_control(struct spi_device *spi, bool enable) lpss_ssp_select_cs(spi, config); mask = LPSS_CS_CONTROL_CS_HIGH; - __lpss_ssp_update_priv(drv_data, config->reg_cs_ctrl, mask, enable ? 
mask : 0); + __lpss_ssp_update_priv(drv_data, config->reg_cs_ctrl, mask, enable ? 0 : mask); if (config->cs_clk_stays_gated) { /* * Changing CS alone when dynamic clock gating is on won't From fd079124112c6e11c1bca2e7c71470a2d60bc363 Mon Sep 17 00:00:00 2001 From: Bharadwaj Raju Date: Wed, 5 Feb 2025 00:59:53 +0530 Subject: [PATCH 049/310] selftests/cgroup: use bash in test_cpuset_v1_hp.sh The script uses non-POSIX features like `[[` for conditionals and hence does not work when run with a POSIX /bin/sh. Change the shebang to /bin/bash instead, like the other tests in cgroup. Signed-off-by: Bharadwaj Raju Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh index 3f45512fb512..7406c24be1ac 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # Test the special cpuset v1 hotplug case where a cpuset become empty of From dabbd325b25edb5cdd99c94391817202dd54b651 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 4 Feb 2025 23:50:08 +0000 Subject: [PATCH 050/310] ASoC: simple-card-utils.c: add missing dlc->of_node commit 90de551c1bf ("ASoC: simple-card-utils.c: enable multi Component support") added multi Component support, but failed to add dlc->of_node. Because of this, the sound device list will indicate a strange name if it is a DPCM connection and the driver supports dai->driver->dai_args, like below > aplay -l card X: sndulcbmix [xxxx], device 0: fe.(null).rsnd-dai.0 (*) [] ... ^^^^^^ It will be fixed by this patch > aplay -l card X: sndulcbmix [xxxx], device 0: fe.sound@ec500000.rsnd-dai.0 (*) [] ... 
^^^^^^^^^^^^^^ Signed-off-by: Kuninori Morimoto Reviewed-by: Daniel Baluta Link: https://patch.msgid.link/87ikpp2rtb.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/generic/simple-card-utils.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c index dd414634b4ac..c2445c5ccd84 100644 --- a/sound/soc/generic/simple-card-utils.c +++ b/sound/soc/generic/simple-card-utils.c @@ -1092,6 +1092,7 @@ int graph_util_parse_dai(struct device *dev, struct device_node *ep, args.np = ep; dai = snd_soc_get_dai_via_args(&args); if (dai) { + dlc->of_node = node; dlc->dai_name = snd_soc_dai_name_get(dai); dlc->dai_args = snd_soc_copy_dai_args(dev, &args); if (!dlc->dai_args) From 32ffed055dcee17f6705f545b069e44a66067808 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Wed, 5 Feb 2025 00:43:43 +0000 Subject: [PATCH 051/310] regmap-irq: Add missing kfree() Add kfree() for "d->main_status_buf" to the error-handling path to prevent a memory leak. 
Fixes: a2d21848d921 ("regmap: regmap-irq: Add main status register support") Cc: stable@vger.kernel.org # v5.1+ Signed-off-by: Jiasheng Jiang Link: https://patch.msgid.link/20250205004343.14413-1-jiashengjiangcool@gmail.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-irq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index 0bcd81389a29..978613407ea3 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -906,6 +906,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, kfree(d->wake_buf); kfree(d->mask_buf_def); kfree(d->mask_buf); + kfree(d->main_status_buf); kfree(d->status_buf); kfree(d->status_reg_buf); if (d->config_buf) { @@ -981,6 +982,7 @@ void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d) kfree(d->wake_buf); kfree(d->mask_buf_def); kfree(d->mask_buf); + kfree(d->main_status_buf); kfree(d->status_reg_buf); kfree(d->status_buf); if (d->config_buf) { From 76b0a22d4cf7dc9091129560fdc04e73eb9db4cb Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Sat, 1 Feb 2025 11:39:30 -0300 Subject: [PATCH 052/310] ALSA: hda/realtek: Limit mic boost on Positivo ARN50 The internal mic boost on the Positivo ARN50 is too high. Fix this by applying the ALC269_FIXUP_LIMIT_INT_MIC_BOOST fixup to the machine to limit the gain. 
Signed-off-by: Edson Juliano Drosdeck Link: https://patch.msgid.link/20250201143930.25089-1-edson.drosdeck@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8192be394d0d..ae0beb52e7b0 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11045,6 +11045,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d72, 0x1945, "Redmi G", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1947, "RedmiBook Air", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1f66, 0x0105, "Ayaneo Portable Game Player", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x2014, 0x800a, "Positivo ARN50", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x2782, 0x0214, "VAIO VJFE-CL", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x2782, 0x0228, "Infinix ZERO BOOK 13", ALC269VB_FIXUP_INFINIX_ZERO_BOOK_13), SND_PCI_QUIRK(0x2782, 0x0232, "CHUWI CoreBook XPro", ALC269VB_FIXUP_CHUWI_COREBOOK_XPRO), From b0eddc21900fb44f8c5db95710479865e3700fbd Mon Sep 17 00:00:00 2001 From: Varadarajan Narayanan Date: Wed, 5 Feb 2025 13:16:56 +0530 Subject: [PATCH 053/310] regulator: qcom_smd: Add l2, l5 sub-node to mp5496 regulator Adding l2, l5 sub-node entry to mp5496 regulator node. 
Cc: stable@vger.kernel.org Acked-by: Rob Herring Signed-off-by: Varadarajan Narayanan Link: https://patch.msgid.link/20250205074657.4142365-2-quic_varada@quicinc.com Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml index f2fd2df68a9e..b7241ce975b9 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml @@ -22,7 +22,7 @@ description: Each sub-node is identified using the node's name, with valid values listed for each of the pmics below. - For mp5496, s1, s2 + For mp5496, s1, s2, l2, l5 For pm2250, s1, s2, s3, s4, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15, l16, l17, l18, l19, l20, l21, l22 From 796106e29e5df6cd4b4e2b51262a8a19e9fa0625 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 5 Feb 2025 00:20:36 +0000 Subject: [PATCH 054/310] ASoC: rsnd: indicate unsupported clock rate It will indicate "unsupported clock rate" when setup clock failed. But it is unclear what kind of rate was failed. Indicate it. 
Signed-off-by: Kuninori Morimoto Reviewed-by: Yoshihiro Shimoda Link: https://patch.msgid.link/874j192qej.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/ssi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/renesas/rcar/ssi.c b/sound/soc/renesas/rcar/ssi.c index b3d4e8ae07ef..0c6424a1fcac 100644 --- a/sound/soc/renesas/rcar/ssi.c +++ b/sound/soc/renesas/rcar/ssi.c @@ -336,7 +336,8 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, return 0; rate_err: - dev_err(dev, "unsupported clock rate\n"); + dev_err(dev, "unsupported clock rate (%d)\n", rate); + return ret; } From c3fc002b206c6c83d1e3702b979733002ba6fb2c Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 5 Feb 2025 00:20:42 +0000 Subject: [PATCH 055/310] ASoC: rsnd: don't indicate warning on rsnd_kctrl_accept_runtime() rsnd_kctrl_accept_runtime() (1) is used for runtime convert rate (= Synchronous SRC Mode). Now, the rsnd driver has 2 kctrls for it (A): "SRC Out Rate Switch" (B): "SRC Out Rate" // it calls (1) (A): can be called anytime (B): can be called only at runtime, and will indicate a warning if it was used at non-runtime. To use runtime convert rate (= Synchronous SRC Mode), the user might use commands in the order below. (X): > amixer set "SRC Out Rate" on > aplay xxx.wav & (Y): > amixer set "SRC Out Rate" 48010 // convert rate to 48010Hz (Y): calls B (X): calls both A and B. In this case, when the user calls (X), it calls both (A) and (B), but it has not yet started running. So, (B) will indicate a warning. This warning was added by commit b5c088689847 ("ASoC: rsnd: add warning message to rsnd_kctrl_accept_runtime()"), but the message sounds like the operation was not correct. Let's update the warning message. 
The message is very SRC specific, implement it in src.c Signed-off-by: Kuninori Morimoto Reviewed-by: Yoshihiro Shimoda Link: https://patch.msgid.link/8734gt2qed.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/core.c | 14 -------------- sound/soc/renesas/rcar/rsnd.h | 1 - sound/soc/renesas/rcar/src.c | 18 +++++++++++++++++- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/sound/soc/renesas/rcar/core.c b/sound/soc/renesas/rcar/core.c index d3709fd0409e..f3f0c3f0bb9f 100644 --- a/sound/soc/renesas/rcar/core.c +++ b/sound/soc/renesas/rcar/core.c @@ -1770,20 +1770,6 @@ int rsnd_kctrl_accept_anytime(struct rsnd_dai_stream *io) return 1; } -int rsnd_kctrl_accept_runtime(struct rsnd_dai_stream *io) -{ - struct snd_pcm_runtime *runtime = rsnd_io_to_runtime(io); - struct rsnd_priv *priv = rsnd_io_to_priv(io); - struct device *dev = rsnd_priv_to_dev(priv); - - if (!runtime) { - dev_warn(dev, "Can't update kctrl when idle\n"); - return 0; - } - - return 1; -} - struct rsnd_kctrl_cfg *rsnd_kctrl_init_m(struct rsnd_kctrl_cfg_m *cfg) { cfg->cfg.val = cfg->val; diff --git a/sound/soc/renesas/rcar/rsnd.h b/sound/soc/renesas/rcar/rsnd.h index a5f54b65313c..04c70690f7a2 100644 --- a/sound/soc/renesas/rcar/rsnd.h +++ b/sound/soc/renesas/rcar/rsnd.h @@ -742,7 +742,6 @@ struct rsnd_kctrl_cfg_s { #define rsnd_kctrl_vals(x) ((x).val) /* = (x).cfg.val[0] */ int rsnd_kctrl_accept_anytime(struct rsnd_dai_stream *io); -int rsnd_kctrl_accept_runtime(struct rsnd_dai_stream *io); struct rsnd_kctrl_cfg *rsnd_kctrl_init_m(struct rsnd_kctrl_cfg_m *cfg); struct rsnd_kctrl_cfg *rsnd_kctrl_init_s(struct rsnd_kctrl_cfg_s *cfg); int rsnd_kctrl_new(struct rsnd_mod *mod, diff --git a/sound/soc/renesas/rcar/src.c b/sound/soc/renesas/rcar/src.c index e7f86db0d94c..309918029772 100644 --- a/sound/soc/renesas/rcar/src.c +++ b/sound/soc/renesas/rcar/src.c @@ -531,6 +531,22 @@ static irqreturn_t rsnd_src_interrupt(int irq, void *data) return IRQ_HANDLED; 
} +static int rsnd_src_kctrl_accept_runtime(struct rsnd_dai_stream *io) +{ + struct snd_pcm_runtime *runtime = rsnd_io_to_runtime(io); + + if (!runtime) { + struct rsnd_priv *priv = rsnd_io_to_priv(io); + struct device *dev = rsnd_priv_to_dev(priv); + + dev_warn(dev, "\"SRC Out Rate\" can use during running\n"); + + return 0; + } + + return 1; +} + static int rsnd_src_probe_(struct rsnd_mod *mod, struct rsnd_dai_stream *io, struct rsnd_priv *priv) @@ -594,7 +610,7 @@ static int rsnd_src_pcm_new(struct rsnd_mod *mod, rsnd_io_is_play(io) ? "SRC Out Rate" : "SRC In Rate", - rsnd_kctrl_accept_runtime, + rsnd_src_kctrl_accept_runtime, rsnd_src_set_convert_rate, &src->sync, 192000); From 89f9cf185885d4358aa92b48e51d0f09b71775aa Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 5 Feb 2025 00:20:48 +0000 Subject: [PATCH 056/310] ASoC: rsnd: adjust convert rate limitation Current rsnd driver supports Synchronous SRC Mode, but the HW allows updating the rate only within 1% of the current rate. Adjust to it. Basically, this feature is used to fine-tune subtle differences that occur during sampling rate conversion in SRC. So, it should be called within a 1% margin of rate difference. If the difference is over 1%, it will be applied in 1% increments using a loop, without indicating an error message. 
Cc: Yoshihiro Shimoda Signed-off-by: Kuninori Morimoto Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://patch.msgid.link/871pwd2qe8.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/src.c | 98 ++++++++++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 22 deletions(-) diff --git a/sound/soc/renesas/rcar/src.c b/sound/soc/renesas/rcar/src.c index 309918029772..7d73b183bda6 100644 --- a/sound/soc/renesas/rcar/src.c +++ b/sound/soc/renesas/rcar/src.c @@ -35,6 +35,7 @@ struct rsnd_src { struct rsnd_mod *dma; struct rsnd_kctrl_cfg_s sen; /* sync convert enable */ struct rsnd_kctrl_cfg_s sync; /* sync convert */ + u32 current_sync_rate; int irq; }; @@ -100,7 +101,7 @@ static u32 rsnd_src_convert_rate(struct rsnd_dai_stream *io, if (!rsnd_src_sync_is_enabled(mod)) return rsnd_io_converted_rate(io); - convert_rate = src->sync.val; + convert_rate = src->current_sync_rate; if (!convert_rate) convert_rate = rsnd_io_converted_rate(io); @@ -201,13 +202,73 @@ static const u32 chan222222[] = { static void rsnd_src_set_convert_rate(struct rsnd_dai_stream *io, struct rsnd_mod *mod) { + struct snd_pcm_runtime *runtime = rsnd_io_to_runtime(io); struct rsnd_priv *priv = rsnd_mod_to_priv(mod); - struct device *dev = rsnd_priv_to_dev(priv); + struct rsnd_src *src = rsnd_mod_to_src(mod); + u32 fin, fout, new_rate; + int inc, cnt, rate; + u64 base, val; + + if (!runtime) + return; + + if (!rsnd_src_sync_is_enabled(mod)) + return; + + fin = rsnd_src_get_in_rate(priv, io); + fout = rsnd_src_get_out_rate(priv, io); + + new_rate = src->sync.val; + + if (!new_rate) + new_rate = fout; + + /* Do nothing if no diff */ + if (new_rate == src->current_sync_rate) + return; + + /* + * SRCm_IFSVR::INTIFS can change within 1% + * see + * SRCm_IFSVR::INTIFS Note + */ + inc = fout / 100; + cnt = abs(new_rate - fout) / inc; + if (fout > new_rate) + inc *= -1; + + /* + * After start running SRC, we can update only SRC_IFSVR + * for 
Synchronous Mode + */ + base = (u64)0x0400000 * fin; + rate = fout; + for (int i = 0; i < cnt; i++) { + val = base; + rate += inc; + do_div(val, rate); + + rsnd_mod_write(mod, SRC_IFSVR, val); + } + val = base; + do_div(val, new_rate); + + rsnd_mod_write(mod, SRC_IFSVR, val); + + /* update current_sync_rate */ + src->current_sync_rate = new_rate; +} + +static void rsnd_src_init_convert_rate(struct rsnd_dai_stream *io, + struct rsnd_mod *mod) +{ struct snd_pcm_runtime *runtime = rsnd_io_to_runtime(io); + struct rsnd_priv *priv = rsnd_mod_to_priv(mod); + struct device *dev = rsnd_priv_to_dev(priv); int is_play = rsnd_io_is_play(io); int use_src = 0; u32 fin, fout; - u32 ifscr, fsrate, adinr; + u32 ifscr, adinr; u32 cr, route; u32 i_busif, o_busif, tmp; const u32 *bsdsr_table; @@ -245,26 +306,15 @@ static void rsnd_src_set_convert_rate(struct rsnd_dai_stream *io, adinr = rsnd_get_adinr_bit(mod, io) | chan; /* - * SRC_IFSCR / SRC_IFSVR - */ - ifscr = 0; - fsrate = 0; - if (use_src) { - u64 n; - - ifscr = 1; - n = (u64)0x0400000 * fin; - do_div(n, fout); - fsrate = n; - } - - /* + * SRC_IFSCR * SRC_SRCCR / SRC_ROUTE_MODE0 */ + ifscr = 0; cr = 0x00011110; route = 0x0; if (use_src) { route = 0x1; + ifscr = 0x1; if (rsnd_src_sync_is_enabled(mod)) { cr |= 0x1; @@ -335,7 +385,6 @@ static void rsnd_src_set_convert_rate(struct rsnd_dai_stream *io, rsnd_mod_write(mod, SRC_SRCIR, 1); /* initialize */ rsnd_mod_write(mod, SRC_ADINR, adinr); rsnd_mod_write(mod, SRC_IFSCR, ifscr); - rsnd_mod_write(mod, SRC_IFSVR, fsrate); rsnd_mod_write(mod, SRC_SRCCR, cr); rsnd_mod_write(mod, SRC_BSDSR, bsdsr_table[idx]); rsnd_mod_write(mod, SRC_BSISR, bsisr_table[idx]); @@ -348,6 +397,9 @@ static void rsnd_src_set_convert_rate(struct rsnd_dai_stream *io, rsnd_adg_set_src_timesel_gen2(mod, io, fin, fout); + /* update SRC_IFSVR */ + rsnd_src_set_convert_rate(io, mod); + return; convert_rate_err: @@ -467,7 +519,8 @@ static int rsnd_src_init(struct rsnd_mod *mod, int ret; /* reset sync convert_rate */ 
- src->sync.val = 0; + src->sync.val = + src->current_sync_rate = 0; ret = rsnd_mod_power_on(mod); if (ret < 0) @@ -475,7 +528,7 @@ static int rsnd_src_init(struct rsnd_mod *mod, rsnd_src_activation(mod); - rsnd_src_set_convert_rate(io, mod); + rsnd_src_init_convert_rate(io, mod); rsnd_src_status_clear(mod); @@ -493,7 +546,8 @@ static int rsnd_src_quit(struct rsnd_mod *mod, rsnd_mod_power_off(mod); /* reset sync convert_rate */ - src->sync.val = 0; + src->sync.val = + src->current_sync_rate = 0; return 0; } @@ -601,7 +655,7 @@ static int rsnd_src_pcm_new(struct rsnd_mod *mod, "SRC Out Rate Switch" : "SRC In Rate Switch", rsnd_kctrl_accept_anytime, - rsnd_src_set_convert_rate, + rsnd_src_init_convert_rate, &src->sen, 1); if (ret < 0) return ret; From d8d99c3b5c485f339864aeaa29f76269cc0ea975 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 5 Feb 2025 15:52:31 +0200 Subject: [PATCH 057/310] ASoC: SOF: stream-ipc: Check for cstream nullity in sof_ipc_msg_data() The nullity of sps->cstream should be checked similarly as it is done in sof_set_stream_data_offset() function. Assuming that it is not NULL if sps->stream is NULL is incorrect and can lead to NULL pointer dereference. 
Fixes: 090349a9feba ("ASoC: SOF: Add support for compress API for stream data/offset") Cc: stable@vger.kernel.org Reported-by: Curtis Malainey Closes: https://github.com/thesofproject/linux/pull/5214 Signed-off-by: Peter Ujfalusi Reviewed-by: Daniel Baluta Reviewed-by: Ranjani Sridharan Reviewed-by: Bard Liao Reviewed-by: Curtis Malainey Link: https://patch.msgid.link/20250205135232.19762-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/stream-ipc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/stream-ipc.c b/sound/soc/sof/stream-ipc.c index 794c7bbccbaf..8262443ac89a 100644 --- a/sound/soc/sof/stream-ipc.c +++ b/sound/soc/sof/stream-ipc.c @@ -43,7 +43,7 @@ int sof_ipc_msg_data(struct snd_sof_dev *sdev, return -ESTRPIPE; posn_offset = stream->posn_offset; - } else { + } else if (sps->cstream) { struct sof_compr_stream *sstream = sps->cstream->runtime->private_data; @@ -51,6 +51,10 @@ int sof_ipc_msg_data(struct snd_sof_dev *sdev, return -ESTRPIPE; posn_offset = sstream->posn_offset; + + } else { + dev_err(sdev->dev, "%s: No stream opened\n", __func__); + return -EINVAL; } snd_sof_dsp_mailbox_read(sdev, posn_offset, p, sz); From 46c7b901e2a03536df5a3cb40b3b26e2be505df6 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 5 Feb 2025 15:52:32 +0200 Subject: [PATCH 058/310] ASoC: SOF: pcm: Clear the substream pointer to NULL on close The spcm->stream[substream->stream].substream is set during open and was left untouched. After the first PCM stream it will never be NULL and we have code which checks for substream NULLity as an indication of whether the stream is active or not. For the compressed cstream pointer the same has been done, this change will correct the handling of PCM streams. 
Fixes: 090349a9feba ("ASoC: SOF: Add support for compress API for stream data/offset") Cc: stable@vger.kernel.org Reported-by: Curtis Malainey Closes: https://github.com/thesofproject/linux/pull/5214 Signed-off-by: Peter Ujfalusi Reviewed-by: Daniel Baluta Reviewed-by: Ranjani Sridharan Reviewed-by: Bard Liao Reviewed-by: Curtis Malainey Link: https://patch.msgid.link/20250205135232.19762-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/pcm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/sof/pcm.c b/sound/soc/sof/pcm.c index 35a7462d8b69..c5c6353f18ce 100644 --- a/sound/soc/sof/pcm.c +++ b/sound/soc/sof/pcm.c @@ -511,6 +511,8 @@ static int sof_pcm_close(struct snd_soc_component *component, */ } + spcm->stream[substream->stream].substream = NULL; + return 0; } From 038e33fcd40e59b60cdca561c2a39998e6759e08 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Thu, 9 Jan 2025 16:41:49 +0100 Subject: [PATCH 059/310] dt-bindings: display: Add powertip,{st7272|hx8238a} as DT Schema description This patch provides the DT Schema description of: - powertip,st7272 320 x 240 LCD display - powertip,hx8238a 320 x 240 LCD display Used with the different HW revisions of btt3 devices. 
Signed-off-by: Lukasz Majewski Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20250109154149.1212631-1-lukma@denx.de Signed-off-by: Rob Herring (Arm) --- .../display/panel/powertip,hx8238a.yaml | 29 +++++++++++++++++++ .../display/panel/powertip,st7272.yaml | 29 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 Documentation/devicetree/bindings/display/panel/powertip,hx8238a.yaml create mode 100644 Documentation/devicetree/bindings/display/panel/powertip,st7272.yaml diff --git a/Documentation/devicetree/bindings/display/panel/powertip,hx8238a.yaml b/Documentation/devicetree/bindings/display/panel/powertip,hx8238a.yaml new file mode 100644 index 000000000000..b7d74faeb5d5 --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/powertip,hx8238a.yaml @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/display/panel/powertip,hx8238a.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Powertip Electronic Technology Co. 320 x 240 LCD panel + +maintainers: + - Lukasz Majewski + +allOf: + - $ref: panel-dpi.yaml# + +properties: + compatible: + items: + - const: powertip,hx8238a + - {} # panel-dpi, but not listed here to avoid false select + + height-mm: true + panel-timing: true + port: true + power-supply: true + width-mm: true + +additionalProperties: false + +... diff --git a/Documentation/devicetree/bindings/display/panel/powertip,st7272.yaml b/Documentation/devicetree/bindings/display/panel/powertip,st7272.yaml new file mode 100644 index 000000000000..f3622800f13f --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/powertip,st7272.yaml @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/display/panel/powertip,st7272.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Powertip Electronic Technology Co. 
320 x 240 LCD panel + +maintainers: + - Lukasz Majewski + +allOf: + - $ref: panel-dpi.yaml# + +properties: + compatible: + items: + - const: powertip,st7272 + - {} # panel-dpi, but not listed here to avoid false select + + height-mm: true + panel-timing: true + port: true + power-supply: true + width-mm: true + +additionalProperties: false + +... From 679074942c2502a95842a80471d8fb718165ac77 Mon Sep 17 00:00:00 2001 From: Vitaly Rodionov Date: Wed, 5 Feb 2025 16:08:46 +0000 Subject: [PATCH 060/310] ASoC: arizona/madera: use fsleep() in up/down DAPM event delays. Using `fsleep` instead of `msleep` resolves some customer complaints regarding the precision of up/down DAPM event timing. `fsleep()` automatically selects the appropriate sleep function, making the delay time more predictable. Signed-off-by: Vitaly Rodionov Link: https://patch.msgid.link/20250205160849.500306-1-vitalyr@opensource.cirrus.com Reviewed-by: Charles Keepax Signed-off-by: Mark Brown --- sound/soc/codecs/arizona.c | 14 +++++++------- sound/soc/codecs/madera.c | 10 +++++----- sound/soc/codecs/wm5110.c | 8 ++++---- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sound/soc/codecs/arizona.c b/sound/soc/codecs/arizona.c index 402b9a2ff024..68cdb1027d0c 100644 --- a/sound/soc/codecs/arizona.c +++ b/sound/soc/codecs/arizona.c @@ -967,7 +967,7 @@ int arizona_out_ev(struct snd_soc_dapm_widget *w, case ARIZONA_OUT3L_ENA_SHIFT: case ARIZONA_OUT3R_ENA_SHIFT: priv->out_up_pending++; - priv->out_up_delay += 17; + priv->out_up_delay += 17000; break; case ARIZONA_OUT4L_ENA_SHIFT: case ARIZONA_OUT4R_ENA_SHIFT: @@ -977,7 +977,7 @@ int arizona_out_ev(struct snd_soc_dapm_widget *w, case WM8997: break; default: - priv->out_up_delay += 10; + priv->out_up_delay += 10000; break; } break; @@ -999,7 +999,7 @@ int arizona_out_ev(struct snd_soc_dapm_widget *w, if (!priv->out_up_pending && priv->out_up_delay) { dev_dbg(component->dev, "Power up delay: %d\n", priv->out_up_delay); - msleep(priv->out_up_delay); 
+ fsleep(priv->out_up_delay); priv->out_up_delay = 0; } break; @@ -1017,7 +1017,7 @@ int arizona_out_ev(struct snd_soc_dapm_widget *w, case ARIZONA_OUT3L_ENA_SHIFT: case ARIZONA_OUT3R_ENA_SHIFT: priv->out_down_pending++; - priv->out_down_delay++; + priv->out_down_delay += 1000; break; case ARIZONA_OUT4L_ENA_SHIFT: case ARIZONA_OUT4R_ENA_SHIFT: @@ -1028,10 +1028,10 @@ int arizona_out_ev(struct snd_soc_dapm_widget *w, break; case WM8998: case WM1814: - priv->out_down_delay += 5; + priv->out_down_delay += 5000; break; default: - priv->out_down_delay++; + priv->out_down_delay += 1000; break; } break; @@ -1053,7 +1053,7 @@ int arizona_out_ev(struct snd_soc_dapm_widget *w, if (!priv->out_down_pending && priv->out_down_delay) { dev_dbg(component->dev, "Power down delay: %d\n", priv->out_down_delay); - msleep(priv->out_down_delay); + fsleep(priv->out_down_delay); priv->out_down_delay = 0; } break; diff --git a/sound/soc/codecs/madera.c b/sound/soc/codecs/madera.c index a840a2eb92b9..af109761f359 100644 --- a/sound/soc/codecs/madera.c +++ b/sound/soc/codecs/madera.c @@ -2323,10 +2323,10 @@ int madera_out_ev(struct snd_soc_dapm_widget *w, case CS42L92: case CS47L92: case CS47L93: - out_up_delay = 6; + out_up_delay = 6000; break; default: - out_up_delay = 17; + out_up_delay = 17000; break; } @@ -2357,7 +2357,7 @@ int madera_out_ev(struct snd_soc_dapm_widget *w, case MADERA_OUT3R_ENA_SHIFT: priv->out_up_pending--; if (!priv->out_up_pending) { - msleep(priv->out_up_delay); + fsleep(priv->out_up_delay); priv->out_up_delay = 0; } break; @@ -2376,7 +2376,7 @@ int madera_out_ev(struct snd_soc_dapm_widget *w, case MADERA_OUT3L_ENA_SHIFT: case MADERA_OUT3R_ENA_SHIFT: priv->out_down_pending++; - priv->out_down_delay++; + priv->out_down_delay += 1000; break; default: break; @@ -2393,7 +2393,7 @@ int madera_out_ev(struct snd_soc_dapm_widget *w, case MADERA_OUT3R_ENA_SHIFT: priv->out_down_pending--; if (!priv->out_down_pending) { - msleep(priv->out_down_delay); + 
fsleep(priv->out_down_delay); priv->out_down_delay = 0; } break; diff --git a/sound/soc/codecs/wm5110.c b/sound/soc/codecs/wm5110.c index 502196253d42..64eee0d2347d 100644 --- a/sound/soc/codecs/wm5110.c +++ b/sound/soc/codecs/wm5110.c @@ -302,7 +302,7 @@ static int wm5110_hp_pre_enable(struct snd_soc_dapm_widget *w) } else { wseq = wm5110_no_dre_left_enable; nregs = ARRAY_SIZE(wm5110_no_dre_left_enable); - priv->out_up_delay += 10; + priv->out_up_delay += 10000; } break; case ARIZONA_OUT1R_ENA_SHIFT: @@ -312,7 +312,7 @@ static int wm5110_hp_pre_enable(struct snd_soc_dapm_widget *w) } else { wseq = wm5110_no_dre_right_enable; nregs = ARRAY_SIZE(wm5110_no_dre_right_enable); - priv->out_up_delay += 10; + priv->out_up_delay += 10000; } break; default: @@ -338,7 +338,7 @@ static int wm5110_hp_pre_disable(struct snd_soc_dapm_widget *w) snd_soc_component_update_bits(component, ARIZONA_SPARE_TRIGGERS, ARIZONA_WS_TRG1, 0); - priv->out_down_delay += 27; + priv->out_down_delay += 27000; } break; case ARIZONA_OUT1R_ENA_SHIFT: @@ -350,7 +350,7 @@ static int wm5110_hp_pre_disable(struct snd_soc_dapm_widget *w) snd_soc_component_update_bits(component, ARIZONA_SPARE_TRIGGERS, ARIZONA_WS_TRG2, 0); - priv->out_down_delay += 27; + priv->out_down_delay += 27000; } break; default: From 1d44a30ae3f9195cb4eb7d81bb9ced2776232094 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 5 Feb 2025 16:48:04 +0000 Subject: [PATCH 061/310] ASoC: cs35l41: Fallback to using HID for system_name if no SUB is available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For systems which load firmware on the cs35l41 which use ACPI, the _SUB value is used to differentiate firmware and tuning files for the individual systems. In the case where a system does not have a _SUB defined in ACPI node for cs35l41, there needs to be a fallback to allow the files for that system to be differentiated. 
Since all ACPI nodes for cs35l41 should have a HID defined, the HID should be a safe option. Signed-off-by: Stefan Binding Reviewed-by: André Almeida Tested-by: André Almeida Link: https://patch.msgid.link/20250205164806.414020-1-sbinding@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l41.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c index 07a5cab35fe1..30b89018b113 100644 --- a/sound/soc/codecs/cs35l41.c +++ b/sound/soc/codecs/cs35l41.c @@ -1150,19 +1150,28 @@ static int cs35l41_dsp_init(struct cs35l41_private *cs35l41) static int cs35l41_acpi_get_name(struct cs35l41_private *cs35l41) { - acpi_handle handle = ACPI_HANDLE(cs35l41->dev); + struct acpi_device *adev = ACPI_COMPANION(cs35l41->dev); + acpi_handle handle = acpi_device_handle(adev); + const char *hid; const char *sub; - /* If there is no ACPI_HANDLE, there is no ACPI for this system, return 0 */ - if (!handle) + /* If there is no acpi_device, there is no ACPI for this system, return 0 */ + if (!adev) return 0; sub = acpi_get_subsystem_id(handle); if (IS_ERR(sub)) { - /* If bad ACPI, return 0 and fallback to legacy firmware path, otherwise fail */ - if (PTR_ERR(sub) == -ENODATA) - return 0; - else + /* If no _SUB, fallback to _HID, otherwise fail */ + if (PTR_ERR(sub) == -ENODATA) { + hid = acpi_device_hid(adev); + /* If dummy hid, return 0 and fallback to legacy firmware path */ + if (!strcmp(hid, "device")) + return 0; + sub = kstrdup(hid, GFP_KERNEL); + if (!sub) + sub = ERR_PTR(-ENOMEM); + + } else return PTR_ERR(sub); } From 6fd60136d256b3b948333ebdb3835f41a95ab7ef Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 6 Feb 2025 10:46:42 +0200 Subject: [PATCH 062/310] ASoC: SOF: ipc4-topology: Harden loops for looking up ALH copiers Other, non DAI copier widgets could have the same stream name (sname) as the ALH copier and in that case the copier->data is NULL, no 
alh_data is attached, which could lead to NULL pointer dereference. We could check for this NULL pointer in sof_ipc4_prepare_copier_module() and avoid the crash, but a similar loop in sof_ipc4_widget_setup_comp_dai() will miscalculate the ALH device count, causing broken audio. The correct fix is to harden the matching logic by making sure that the 1. widget is a DAI widget - so dai = w->private is valid 2. the dai (and thus the copier) is ALH copier Fixes: a150345aa758 ("ASoC: SOF: ipc4-topology: add SoundWire/ALH aggregation support") Reported-by: Seppo Ingalsuo Link: https://github.com/thesofproject/sof/pull/9652 Signed-off-by: Peter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Reviewed-by: Bard Liao Link: https://patch.msgid.link/20250206084642.14988-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index c04c62478827..6d5cda813e48 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -765,10 +765,16 @@ static int sof_ipc4_widget_setup_comp_dai(struct snd_sof_widget *swidget) } list_for_each_entry(w, &sdev->widget_list, list) { - if (w->widget->sname && + struct snd_sof_dai *alh_dai; + + if (!WIDGET_IS_DAI(w->id) || !w->widget->sname || strcmp(w->widget->sname, swidget->widget->sname)) continue; + alh_dai = w->private; + if (alh_dai->type != SOF_DAI_INTEL_ALH) + continue; + blob->alh_cfg.device_count++; } @@ -2061,11 +2067,13 @@ sof_ipc4_prepare_copier_module(struct snd_sof_widget *swidget, list_for_each_entry(w, &sdev->widget_list, list) { u32 node_type; - if (w->widget->sname && + if (!WIDGET_IS_DAI(w->id) || !w->widget->sname || strcmp(w->widget->sname, swidget->widget->sname)) continue; dai = w->private; + if (dai->type != SOF_DAI_INTEL_ALH) + continue; alh_copier = (struct sof_ipc4_copier *)dai->private; alh_data = 
&alh_copier->data; node_type = SOF_IPC4_GET_NODE_TYPE(alh_data->gtw_cfg.node_id); From 33b7dc7843dbdc9b90c91d11ba30b107f9138ffd Mon Sep 17 00:00:00 2001 From: Terry Cheong Date: Thu, 6 Feb 2025 11:47:23 +0200 Subject: [PATCH 063/310] ASoC: SOF: Intel: hda: add softdep pre to snd-hda-codec-hdmi module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In environments without KMOD, requesting the module may fail to load snd-hda-codec-hdmi, resulting in HDMI audio not being usable. Add a softdep to load the HDMI codec module first to ensure we can load it correctly. Signed-off-by: Terry Cheong Reviewed-by: Bard Liao Reviewed-by: Johny Lin Reviewed-by: Péter Ujfalusi Signed-off-by: Peter Ujfalusi Link: https://patch.msgid.link/20250206094723.18013-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-codec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/sof/intel/hda-codec.c b/sound/soc/sof/intel/hda-codec.c index 568f3dfe822f..2f9925830d1d 100644 --- a/sound/soc/sof/intel/hda-codec.c +++ b/sound/soc/sof/intel/hda-codec.c @@ -454,6 +454,7 @@ int hda_codec_i915_exit(struct snd_sof_dev *sdev) } EXPORT_SYMBOL_NS_GPL(hda_codec_i915_exit, "SND_SOC_SOF_HDA_AUDIO_CODEC_I915"); +MODULE_SOFTDEP("pre: snd-hda-codec-hdmi"); #endif MODULE_LICENSE("Dual BSD/GPL"); From 3588b1c0fde2f58d166e3f94a5a58d64b893526c Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 6 Feb 2025 17:57:47 +0900 Subject: [PATCH 064/310] spi: sn-f-ospi: Fix division by zero When there is no dummy cycle in the spi-nor commands, both dummy bus cycle bytes and width are zero. Because of the cpu's warning when divided by zero, the warning should be avoided. Return just zero to avoid such calculations. 
Fixes: 1b74dd64c861 ("spi: Add Socionext F_OSPI SPI flash controller driver") Co-developed-by: Kohei Ito Signed-off-by: Kohei Ito Signed-off-by: Kunihiko Hayashi Link: https://patch.msgid.link/20250206085747.3834148-1-hayashi.kunihiko@socionext.com Signed-off-by: Mark Brown --- drivers/spi/spi-sn-f-ospi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/spi-sn-f-ospi.c b/drivers/spi/spi-sn-f-ospi.c index 6ad4b729897e..c4969f66a0ba 100644 --- a/drivers/spi/spi-sn-f-ospi.c +++ b/drivers/spi/spi-sn-f-ospi.c @@ -116,6 +116,9 @@ struct f_ospi { static u32 f_ospi_get_dummy_cycle(const struct spi_mem_op *op) { + if (!op->dummy.nbytes) + return 0; + return (op->dummy.nbytes * 8) / op->dummy.buswidth; } From fb97bc2e47f694f79d6358d981ae0428db8e8088 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 Jan 2025 15:21:53 +0100 Subject: [PATCH 065/310] drm/tests: hdmi: Fix WW_MUTEX_SLOWPATH failures The light_up_connector helper function in the HDMI infrastructure unit tests uses drm_atomic_set_crtc_for_connector(), but fails when it returns an error. This function can return EDEADLK though if the sequence needs to be restarted, and WW_MUTEX_SLOWPATH is meant to test that we handle it properly. Let's handle EDEADLK and restart the sequence in our tests as well. 
Fixes: eb66d34d793e ("drm/tests: Add output bpc tests") Reported-by: Dave Airlie Closes: https://lore.kernel.org/r/CAPM=9tzJ4-ERDxvuwrCyUPY0=+P44orhp1kLWVGL7MCfpQjMEQ@mail.gmail.com/ Link: https://lore.kernel.org/r/20241031091558.2435850-1-mripard@kernel.org Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250129-test-kunit-v2-1-fe59c43805d5@kernel.org Signed-off-by: Maxime Ripard --- drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c index b976a5e9aef5..8e6eb94075a5 100644 --- a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c +++ b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c @@ -70,10 +70,17 @@ static int light_up_connector(struct kunit *test, state = drm_kunit_helper_atomic_state_alloc(test, drm, ctx); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, state); +retry: conn_state = drm_atomic_get_connector_state(state, connector); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, conn_state); ret = drm_atomic_set_crtc_for_connector(conn_state, crtc); + if (ret == -EDEADLK) { + drm_atomic_state_clear(state); + ret = drm_modeset_backoff(ctx); + if (!ret) + goto retry; + } KUNIT_EXPECT_EQ(test, ret, 0); crtc_state = drm_atomic_get_crtc_state(state, crtc); From bb4f929a8875b4801db95b8cf3b2c527c1e475e0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 Jan 2025 15:21:54 +0100 Subject: [PATCH 066/310] drm/tests: hdmi: Remove redundant assignments Some tests have the drm pointer assigned multiple times to the same value. Drop the redundant assignments. 
Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250129-test-kunit-v2-2-fe59c43805d5@kernel.org Signed-off-by: Maxime Ripard --- drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c index 8e6eb94075a5..a36422aa9e27 100644 --- a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c +++ b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c @@ -481,7 +481,6 @@ static void drm_test_check_broadcast_rgb_auto_cea_mode_vic_1(struct kunit *test) mode = drm_kunit_display_mode_from_cea_vic(test, drm, 1); KUNIT_ASSERT_NOT_NULL(test, mode); - drm = &priv->drm; crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -595,7 +594,6 @@ static void drm_test_check_broadcast_rgb_full_cea_mode_vic_1(struct kunit *test) mode = drm_kunit_display_mode_from_cea_vic(test, drm, 1); KUNIT_ASSERT_NOT_NULL(test, mode); - drm = &priv->drm; crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -711,7 +709,6 @@ static void drm_test_check_broadcast_rgb_limited_cea_mode_vic_1(struct kunit *te mode = drm_kunit_display_mode_from_cea_vic(test, drm, 1); KUNIT_ASSERT_NOT_NULL(test, mode); - drm = &priv->drm; crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1313,7 +1310,6 @@ static void drm_test_check_output_bpc_format_vic_1(struct kunit *test) rate = mode->clock * 1500; KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_EXPECT_EQ(test, ret, 0); From 6b6bfd63e1626ceedc738b2a06505aa5b46c1481 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 Jan 2025 15:21:55 +0100 Subject: [PATCH 067/310] drm/tests: hdmi: Reorder DRM entities 
variables assignment The tests all deviate slightly in how they assign their local pointers to DRM entities. This makes refactoring pretty difficult, so let's just move the assignment as soon as the entities are allocated. Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250129-test-kunit-v2-3-fe59c43805d5@kernel.org Signed-off-by: Maxime Ripard --- .../drm/tests/drm_hdmi_state_helper_test.c | 81 ++++++++++--------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c index a36422aa9e27..925724b57878 100644 --- a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c +++ b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c @@ -289,15 +289,16 @@ static void drm_test_check_broadcast_rgb_crtc_mode_changed(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; + conn = &priv->connector; + ctx = drm_kunit_helper_acquire_ctx_alloc(test); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - conn = &priv->connector; preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -352,15 +353,16 @@ static void drm_test_check_broadcast_rgb_crtc_mode_not_changed(struct kunit *tes 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; + conn = &priv->connector; + ctx = drm_kunit_helper_acquire_ctx_alloc(test); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - conn = &priv->connector; preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -415,6 +417,8 @@ static void drm_test_check_broadcast_rgb_auto_cea_mode(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm 
= &priv->drm; + crtc = priv->crtc; conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); @@ -425,8 +429,6 @@ static void drm_test_check_broadcast_rgb_auto_cea_mode(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -526,6 +528,8 @@ static void drm_test_check_broadcast_rgb_full_cea_mode(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); @@ -536,8 +540,6 @@ static void drm_test_check_broadcast_rgb_full_cea_mode(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -641,6 +643,8 @@ static void drm_test_check_broadcast_rgb_limited_cea_mode(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); @@ -651,8 +655,6 @@ static void drm_test_check_broadcast_rgb_limited_cea_mode(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -758,6 +760,8 @@ static void drm_test_check_output_bpc_crtc_mode_changed(struct kunit *test) 10); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -770,8 +774,6 @@ static void drm_test_check_output_bpc_crtc_mode_changed(struct kunit *test) 
preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -832,6 +834,8 @@ static void drm_test_check_output_bpc_crtc_mode_not_changed(struct kunit *test) 10); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -844,8 +848,6 @@ static void drm_test_check_output_bpc_crtc_mode_not_changed(struct kunit *test) preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -903,6 +905,8 @@ static void drm_test_check_output_bpc_dvi(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_dvi_1080p, @@ -918,8 +922,6 @@ static void drm_test_check_output_bpc_dvi(struct kunit *test) preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -950,6 +952,8 @@ static void drm_test_check_tmds_char_rate_rgb_8bpc(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_max_200mhz, @@ -963,8 +967,6 @@ static void drm_test_check_tmds_char_rate_rgb_8bpc(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -997,6 +999,8 @@ static void 
drm_test_check_tmds_char_rate_rgb_10bpc(struct kunit *test) 10); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz, @@ -1010,8 +1014,6 @@ static void drm_test_check_tmds_char_rate_rgb_10bpc(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1044,6 +1046,8 @@ static void drm_test_check_tmds_char_rate_rgb_12bpc(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz, @@ -1057,8 +1061,6 @@ static void drm_test_check_tmds_char_rate_rgb_12bpc(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1095,15 +1097,16 @@ static void drm_test_check_hdmi_funcs_reject_rate(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; + conn = &priv->connector; + ctx = drm_kunit_helper_acquire_ctx_alloc(test); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - conn = &priv->connector; preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1151,6 +1154,8 @@ static void drm_test_check_max_tmds_rate_bpc_fallback(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, 
test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -1174,8 +1179,6 @@ static void drm_test_check_max_tmds_rate_bpc_fallback(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 10, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1220,6 +1223,8 @@ static void drm_test_check_max_tmds_rate_format_fallback(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -1246,8 +1251,6 @@ static void drm_test_check_max_tmds_rate_format_fallback(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1343,6 +1346,8 @@ static void drm_test_check_output_bpc_format_driver_rgb_only(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -1374,8 +1379,6 @@ static void drm_test_check_output_bpc_format_driver_rgb_only(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1410,6 +1413,8 @@ static void drm_test_check_output_bpc_format_display_rgb_only(struct kunit *test 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, 
test_edid_hdmi_1080p_rgb_max_200mhz, @@ -1441,8 +1446,6 @@ static void drm_test_check_output_bpc_format_display_rgb_only(struct kunit *test rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1476,6 +1479,8 @@ static void drm_test_check_output_bpc_format_driver_8bpc_only(struct kunit *test 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz, @@ -1499,8 +1504,6 @@ static void drm_test_check_output_bpc_format_driver_8bpc_only(struct kunit *test rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1536,6 +1539,8 @@ static void drm_test_check_output_bpc_format_display_8bpc_only(struct kunit *tes 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_max_340mhz, @@ -1559,8 +1564,6 @@ static void drm_test_check_output_bpc_format_display_8bpc_only(struct kunit *tes rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); From 5d14c08a47460e8eedf0185a28b116420ea7f29d Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 Jan 2025 15:21:56 +0100 Subject: [PATCH 068/310] drm/tests: hdmi: Fix recursive locking The find_preferred_mode() functions takes the mode_config mutex, but due to the 
order most tests have, is called with the crtc_ww_class_mutex taken. This raises a warning for a circular dependency when running the tests with lockdep. Reorder the tests to call find_preferred_mode before the acquire context has been created to avoid the issue. Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250129-test-kunit-v2-4-fe59c43805d5@kernel.org Signed-off-by: Maxime Ripard --- .../drm/tests/drm_hdmi_state_helper_test.c | 114 +++++++++--------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c index 925724b57878..23ecc00accb2 100644 --- a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c +++ b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c @@ -293,12 +293,12 @@ static void drm_test_check_broadcast_rgb_crtc_mode_changed(struct kunit *test) crtc = priv->crtc; conn = &priv->connector; - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -357,12 +357,12 @@ static void drm_test_check_broadcast_rgb_crtc_mode_not_changed(struct kunit *tes crtc = priv->crtc; conn = &priv->connector; - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -422,13 +422,13 @@ static void drm_test_check_broadcast_rgb_auto_cea_mode(struct kunit *test) conn = &priv->connector; KUNIT_ASSERT_TRUE(test, 
conn->display_info.is_hdmi); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -533,13 +533,13 @@ static void drm_test_check_broadcast_rgb_full_cea_mode(struct kunit *test) conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -648,13 +648,13 @@ static void drm_test_check_broadcast_rgb_limited_cea_mode(struct kunit *test) conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -768,12 +768,12 @@ static void drm_test_check_output_bpc_crtc_mode_changed(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, 
preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -842,12 +842,12 @@ static void drm_test_check_output_bpc_crtc_mode_not_changed(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -916,12 +916,12 @@ static void drm_test_check_output_bpc_dvi(struct kunit *test) info = &conn->display_info; KUNIT_ASSERT_FALSE(test, info->is_hdmi); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -960,13 +960,13 @@ static void drm_test_check_tmds_char_rate_rgb_8bpc(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_max_200mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1007,13 +1007,13 @@ static void drm_test_check_tmds_char_rate_rgb_10bpc(struct kunit *test) 
ARRAY_SIZE(test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1054,13 +1054,13 @@ static void drm_test_check_tmds_char_rate_rgb_12bpc(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1101,12 +1101,12 @@ static void drm_test_check_hdmi_funcs_reject_rate(struct kunit *test) crtc = priv->crtc; conn = &priv->connector; - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1166,9 +1166,6 @@ static void drm_test_check_max_tmds_rate_bpc_fallback(struct kunit *test) KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); 
KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); @@ -1179,6 +1176,9 @@ static void drm_test_check_max_tmds_rate_bpc_fallback(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 10, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1235,9 +1235,6 @@ static void drm_test_check_max_tmds_rate_format_fallback(struct kunit *test) KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); @@ -1251,6 +1248,9 @@ static void drm_test_check_max_tmds_rate_format_fallback(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1297,9 +1297,6 @@ static void drm_test_check_output_bpc_format_vic_1(struct kunit *test) KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - mode = drm_kunit_display_mode_from_cea_vic(test, drm, 1); KUNIT_ASSERT_NOT_NULL(test, mode); @@ -1313,6 +1310,9 @@ static void drm_test_check_output_bpc_format_vic_1(struct kunit *test) rate = mode->clock * 1500; KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + crtc = priv->crtc; ret 
= light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1358,9 +1358,6 @@ static void drm_test_check_output_bpc_format_driver_rgb_only(struct kunit *test) KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); @@ -1379,6 +1376,9 @@ static void drm_test_check_output_bpc_format_driver_rgb_only(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1425,9 +1425,6 @@ static void drm_test_check_output_bpc_format_display_rgb_only(struct kunit *test KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); @@ -1446,6 +1443,9 @@ static void drm_test_check_output_bpc_format_display_rgb_only(struct kunit *test rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1491,9 +1491,6 @@ static void drm_test_check_output_bpc_format_driver_8bpc_only(struct kunit *test KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); 
KUNIT_ASSERT_NOT_NULL(test, preferred); @@ -1504,6 +1501,9 @@ static void drm_test_check_output_bpc_format_driver_8bpc_only(struct kunit *test rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1551,9 +1551,6 @@ static void drm_test_check_output_bpc_format_display_8bpc_only(struct kunit *tes KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); @@ -1564,6 +1561,9 @@ static void drm_test_check_output_bpc_format_display_8bpc_only(struct kunit *tes rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); From 0b06000704f8ae72056ad777a67742b7799d6660 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 6 Feb 2025 20:38:08 +0800 Subject: [PATCH 069/310] ASoC: tas2781: drop a redundant code Report from internal ticket, priv->cali_data.data devm_kzalloc twice, drop the first one, it is the unnecessary one. 
Signed-off-by: Shenghao Ding Link: https://patch.msgid.link/20250206123808.1590-1-shenghao-ding@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2781-i2c.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index a730ab6ad4e3..90c5b2e74d12 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -2,7 +2,7 @@ // // ALSA SoC Texas Instruments TAS2563/TAS2781 Audio Smart Amplifier // -// Copyright (C) 2022 - 2024 Texas Instruments Incorporated +// Copyright (C) 2022 - 2025 Texas Instruments Incorporated // https://www.ti.com // // The TAS2563/TAS2781 driver implements a flexible and configurable @@ -1260,8 +1260,6 @@ static int tasdevice_create_cali_ctrls(struct tasdevice_priv *priv) (cali_data->cali_dat_sz_per_dev + 1) + 1 + 15 + 1; priv->cali_data.total_sz = priv->ndev * (cali_data->cali_dat_sz_per_dev + 1); - priv->cali_data.data = devm_kzalloc(priv->dev, - ext_cali_data->max, GFP_KERNEL); cali_ctrls[i].name = cali_name; cali_ctrls[i].iface = SNDRV_CTL_ELEM_IFACE_MIXER; cali_ctrls[i].info = snd_soc_bytes_info_ext; From ca0f4fe7cf7183bfbdc67ca2de56ae1fc3a8db2b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 6 Feb 2025 10:21:38 -0700 Subject: [PATCH 070/310] arm64: Handle .ARM.attributes section in linker scripts A recent LLVM commit [1] started generating an .ARM.attributes section similar to the one that exists for 32-bit, which results in orphan section warnings (or errors if CONFIG_WERROR is enabled) from the linker because it is not handled in the arm64 linker scripts. 
ld.lld: error: arch/arm64/kernel/vdso/vgettimeofday.o:(.ARM.attributes) is being placed in '.ARM.attributes' ld.lld: error: arch/arm64/kernel/vdso/vgetrandom.o:(.ARM.attributes) is being placed in '.ARM.attributes' ld.lld: error: vmlinux.a(lib/vsprintf.o):(.ARM.attributes) is being placed in '.ARM.attributes' ld.lld: error: vmlinux.a(lib/win_minmax.o):(.ARM.attributes) is being placed in '.ARM.attributes' ld.lld: error: vmlinux.a(lib/xarray.o):(.ARM.attributes) is being placed in '.ARM.attributes' Discard the new sections in the necessary linker scripts to resolve the warnings, as the kernel and vDSO do not need to retain it, similar to the .note.gnu.property section. Cc: stable@vger.kernel.org Fixes: b3e5d80d0c48 ("arm64/build: Warn on orphan section placement") Link: https://github.com/llvm/llvm-project/commit/ee99c4d4845db66c4daa2373352133f4b237c942 [1] Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250206-arm64-handle-arm-attributes-in-linker-script-v3-1-d53d169913eb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso/vdso.lds.S | 1 + arch/arm64/kernel/vmlinux.lds.S | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 4ec32e86a8da..47ad6944f9f0 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -41,6 +41,7 @@ SECTIONS */ /DISCARD/ : { *(.note.GNU-stack .note.gnu.property) + *(.ARM.attributes) } .note : { *(.note.*) } :text :note diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f84c71f04d9e..e73326bd3ff7 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -162,6 +162,7 @@ SECTIONS /DISCARD/ : { *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) + *(.ARM.attributes) } . 
= KIMAGE_VADDR; From 875d742cf5327c93cba1f11e12b08d3cce7a88d2 Mon Sep 17 00:00:00 2001 From: Radu Rendec Date: Thu, 6 Feb 2025 12:44:20 -0500 Subject: [PATCH 071/310] arm64: cacheinfo: Avoid out-of-bounds write to cacheinfo array The loop that detects/populates cache information already has a bounds check on the array size but does not account for cache levels with separate data/instructions cache. Fix this by incrementing the index for any populated leaf (instead of any populated level). Fixes: 5d425c186537 ("arm64: kernel: add support for cpu cache information") Signed-off-by: Radu Rendec Link: https://lore.kernel.org/r/20250206174420.2178724-1-rrendec@redhat.com Signed-off-by: Will Deacon --- arch/arm64/kernel/cacheinfo.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index d9c9218fa1fd..309942b06c5b 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -101,16 +101,18 @@ int populate_cache_leaves(unsigned int cpu) unsigned int level, idx; enum cache_type type; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct cacheinfo *infos = this_cpu_ci->info_list; for (idx = 0, level = 1; level <= this_cpu_ci->num_levels && - idx < this_cpu_ci->num_leaves; idx++, level++) { + idx < this_cpu_ci->num_leaves; level++) { type = get_cache_type(level); if (type == CACHE_TYPE_SEPARATE) { - ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (idx + 1 >= this_cpu_ci->num_leaves) + break; + ci_leaf_init(&infos[idx++], CACHE_TYPE_DATA, level); + ci_leaf_init(&infos[idx++], CACHE_TYPE_INST, level); } else { - ci_leaf_init(this_leaf++, type, level); + ci_leaf_init(&infos[idx++], type, level); } } return 0; From 02458fbfaa0170aabf8506f7d4ed054f02414251 Mon Sep 17 00:00:00 2001 From: Rupinderjit Singh Date: Thu, 6 Feb 2025 15:58:03 +0000 
Subject: [PATCH 072/310] gpu: host1x: Fix a use of uninitialized mutex commit c8347f915e67 ("gpu: host1x: Fix boot regression for Tegra") caused a use of uninitialized mutex leading to below warning when CONFIG_DEBUG_MUTEXES and CONFIG_DEBUG_LOCK_ALLOC are enabled. [ 41.662843] ------------[ cut here ]------------ [ 41.663012] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ 41.663035] WARNING: CPU: 4 PID: 794 at kernel/locking/mutex.c:587 __mutex_lock+0x670/0x878 [ 41.663458] Modules linked in: rtw88_8822c(+) bluetooth(+) rtw88_pci rtw88_core mac80211 aquantia libarc4 crc_itu_t cfg80211 tegra194_cpufreq dwmac_tegra(+) arm_dsu_pmu stmmac_platform stmmac pcs_xpcs rfkill at24 host1x(+) tegra_bpmp_thermal ramoops reed_solomon fuse loop nfnetlink xfs mmc_block rpmb_core ucsi_ccg ina3221 crct10dif_ce xhci_tegra ghash_ce lm90 sha2_ce sha256_arm64 sha1_ce sdhci_tegra pwm_fan sdhci_pltfm sdhci gpio_keys rtc_tegra cqhci mmc_core phy_tegra_xusb i2c_tegra tegra186_gpc_dma i2c_tegra_bpmp spi_tegra114 dm_mirror dm_region_hash dm_log dm_mod [ 41.665078] CPU: 4 UID: 0 PID: 794 Comm: (udev-worker) Not tainted 6.11.0-29.31_1538613708.el10.aarch64+debug #1 [ 41.665838] Hardware name: NVIDIA NVIDIA Jetson AGX Orin Developer Kit/Jetson, BIOS 36.3.0-gcid-35594366 02/26/2024 [ 41.672555] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 41.679636] pc : __mutex_lock+0x670/0x878 [ 41.683834] lr : __mutex_lock+0x670/0x878 [ 41.688035] sp : ffff800084b77090 [ 41.691446] x29: ffff800084b77160 x28: ffffdd4bebf7b000 x27: ffffdd4be96b1000 [ 41.698799] x26: 1fffe0002308361c x25: 1ffff0001096ee18 x24: 0000000000000000 [ 41.706149] x23: 0000000000000000 x22: 0000000000000002 x21: ffffdd4be6e3c7a0 [ 41.713500] x20: ffff800084b770f0 x19: ffff00011841b1e8 x18: 0000000000000000 [ 41.720675] x17: 0000000000000000 x16: 0000000000000000 x15: 0720072007200720 [ 41.728023] x14: 0000000000000000 x13: 0000000000000001 x12: ffff6001a96eaab3 [ 41.735375] x11: 1fffe001a96eaab2 x10: ffff6001a96eaab2 x9 
: ffffdd4be4838bbc [ 41.742723] x8 : 00009ffe5691554e x7 : ffff000d4b755593 x6 : 0000000000000001 [ 41.749985] x5 : ffff000d4b755590 x4 : 1fffe0001d88f001 x3 : dfff800000000000 [ 41.756988] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000ec478000 [ 41.764251] Call trace: [ 41.766695] __mutex_lock+0x670/0x878 [ 41.770373] mutex_lock_nested+0x2c/0x40 [ 41.774134] host1x_intr_start+0x54/0xf8 [host1x] [ 41.778863] host1x_runtime_resume+0x150/0x228 [host1x] [ 41.783935] pm_generic_runtime_resume+0x84/0xc8 [ 41.788485] __rpm_callback+0xa0/0x478 [ 41.792422] rpm_callback+0x15c/0x1a8 [ 41.795922] rpm_resume+0x698/0xc08 [ 41.799597] __pm_runtime_resume+0xa8/0x140 [ 41.803621] host1x_probe+0x810/0xbc0 [host1x] [ 41.807909] platform_probe+0xcc/0x1a8 [ 41.811845] really_probe+0x188/0x800 [ 41.815347] __driver_probe_device+0x164/0x360 [ 41.819810] driver_probe_device+0x64/0x1a8 [ 41.823834] __driver_attach+0x180/0x490 [ 41.827773] bus_for_each_dev+0x104/0x1a0 [ 41.831797] driver_attach+0x44/0x68 [ 41.835296] bus_add_driver+0x23c/0x4e8 [ 41.839235] driver_register+0x15c/0x3a8 [ 41.843170] __platform_register_drivers+0xa4/0x208 [ 41.848159] tegra_host1x_init+0x4c/0xff8 [host1x] [ 41.853147] do_one_initcall+0xd4/0x380 [ 41.856997] do_init_module+0x1dc/0x698 [ 41.860758] load_module+0xc70/0x1300 [ 41.864435] __do_sys_init_module+0x1a8/0x1d0 [ 41.868721] __arm64_sys_init_module+0x74/0xb0 [ 41.873183] invoke_syscall.constprop.0+0xdc/0x1e8 [ 41.877997] do_el0_svc+0x154/0x1d0 [ 41.881671] el0_svc+0x54/0x140 [ 41.884820] el0t_64_sync_handler+0x120/0x130 [ 41.889285] el0t_64_sync+0x1a4/0x1a8 [ 41.892960] irq event stamp: 69737 [ 41.896370] hardirqs last enabled at (69737): [] _raw_spin_unlock_irqrestore+0x44/0xe8 [ 41.905739] hardirqs last disabled at (69736): [] clk_enable_lock+0x98/0x198 [ 41.914314] softirqs last enabled at (68082): [] handle_softirqs+0x4c8/0x890 [ 41.922977] softirqs last disabled at (67945): [] __do_softirq+0x1c/0x28 [ 41.931289] ---[ end trace 
0000000000000000 ]--- Inside the probe function, when pm_runtime_enable() is called, the PM core invokes a resume callback if the Host1x device is in a suspended state. As can be seen in the logs above, this leads to a host1x_intr_start() call, which tries to acquire a mutex. However, host1x_intr_init(), where the mutex is initialised, only gets called after pm_runtime_enable(), so the mutex is used before it has been initialised. Fix this by moving the mutex initialisation ahead of the runtime PM enablement function pm_runtime_enable() in probe. Fixes: c8347f915e67 ("gpu: host1x: Fix boot regression for Tegra") Signed-off-by: Rupinderjit Singh Reviewed-by: Jon Hunter Tested-by: Jon Hunter Signed-off-by: Thierry Reding Link: https://patchwork.ozlabs.org/project/linux-tegra/patch/20250206155803.201942-1-rusingh@redhat.com/ --- drivers/gpu/host1x/dev.c | 2 ++ drivers/gpu/host1x/intr.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 7b1d091f3c09..46cae925b095 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -619,6 +619,8 @@ static int host1x_probe(struct platform_device *pdev) goto free_contexts; } + mutex_init(&host->intr_mutex); + pm_runtime_enable(&pdev->dev); err = devm_tegra_core_dev_init_opp_table_common(&pdev->dev); diff --git a/drivers/gpu/host1x/intr.c b/drivers/gpu/host1x/intr.c index b3285dd10180..f77a678949e9 100644 --- a/drivers/gpu/host1x/intr.c +++ b/drivers/gpu/host1x/intr.c @@ -104,8 +104,6 @@ int host1x_intr_init(struct host1x *host) unsigned int id; int i, err; - mutex_init(&host->intr_mutex); - for (id = 0; id < host1x_syncpt_nb_pts(host); ++id) { struct host1x_syncpt *syncpt = &host->syncpt[id]; From 3b32b7f638fe61e9d29290960172f4e360e38233 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Sun, 19 Jan 2025 10:58:29 +0800 Subject: [PATCH 073/310] drm/panthor: avoid garbage value in panthor_ioctl_dev_query() 'priorities_info' is
uninitialized, and the uninitialized value is copied to the user object when calling PANTHOR_UOBJ_SET(). Use memset() to initialize 'priorities_info' to avoid this garbage value problem. Fixes: f70000ef2352 ("drm/panthor: Add DEV_QUERY_GROUP_PRIORITIES_INFO dev query") Signed-off-by: Su Hui Reviewed-by: Dan Carpenter Reviewed-by: Boris Brezillon Reviewed-by: Steven Price Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20250119025828.1168419-1-suhui@nfschina.com --- drivers/gpu/drm/panthor/panthor_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c index d5dcd3d1b33a..08136e790ca0 100644 --- a/drivers/gpu/drm/panthor/panthor_drv.c +++ b/drivers/gpu/drm/panthor/panthor_drv.c @@ -802,6 +802,7 @@ static void panthor_query_group_priorities_info(struct drm_file *file, { int prio; + memset(arg, 0, sizeof(*arg)); for (prio = PANTHOR_GROUP_PRIORITY_REALTIME; prio >= 0; prio--) { if (!group_priority_permit(file, prio)) arg->allowed_mask |= BIT(prio); From db5fd3cf8bf41b84b577b8ad5234ea95f327c9be Mon Sep 17 00:00:00 2001 From: Muhammad Adeel Date: Fri, 7 Feb 2025 14:24:32 +0000 Subject: [PATCH 074/310] cgroup: Remove steal time from usage_usec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPU usage time is the time when user, system or both are using the CPU. Steal time is the time when the CPU is waiting to be run by the hypervisor. It should not be added to the CPU usage time, so remove it from the usage_usec entry.
Fixes: 936f2a70f2077 ("cgroup: add cpu.stat file to root cgroup") Acked-by: Axel Busch Acked-by: Michal Koutný Signed-off-by: Muhammad Adeel Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 5877974ece92..aac91466279f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -590,7 +590,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; - cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; From 3724062ca2b1364f02cf44dbea1a552227844ad1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 14 Jan 2025 13:57:58 -0800 Subject: [PATCH 075/310] objtool: Ignore dangling jump table entries Clang sometimes leaves dangling unused jump table entries which point to the end of the function. Ignore them. Closes: https://lore.kernel.org/20250113235835.vqgvb7cdspksy5dn@jpoimboe Reported-by: Klaus Kusche Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/ee25c0b7e80113e950bd1d4c208b671d35774ff4.1736891751.git.jpoimboe@kernel.org --- tools/objtool/check.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 753dbc4f8198..3520a45ebde8 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1975,6 +1975,14 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, reloc_addend(reloc) == pfunc->offset) break; + /* + * Clang sometimes leaves dangling unused jump table entries + * which point to the end of the function. Ignore them. 
+ */ + if (reloc->sym->sec == pfunc->sec && + reloc_addend(reloc) == pfunc->offset + pfunc->len) + goto next; + dest_insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); if (!dest_insn) break; @@ -1992,6 +2000,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, alt->insn = dest_insn; alt->next = insn->alts; insn->alts = alt; +next: prev_offset = reloc_offset(reloc); } From 7e501637bd5b702a2fa627e903a0025654110e1e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Feb 2025 11:12:08 +0100 Subject: [PATCH 076/310] objtool: Move dodgy linker warn to verbose The ld.lld breakage is fixed in the latest llvm release (?) but will not be backported, meaning we're stuck with a broken linker for a fair while. Let's not spam all clang build logs; move the warning to verbose. Signed-off-by: Peter Zijlstra (Intel) --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3520a45ebde8..497cb8dfb3eb 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2273,7 +2273,7 @@ static int read_annotate(struct objtool_file *file, if (sec->sh.sh_entsize != 8) { static bool warned = false; - if (!warned) { + if (!warned && opts.verbose) { WARN("%s: dodgy linker, sh_entsize != 8", sec->name); warned = true; } From bcc6244e13b4d4903511a1ea84368abf925031c0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 29 Jan 2025 20:53:03 +0100 Subject: [PATCH 077/310] sched: Clarify wake_up_q()'s write to task->wake_q.next Clarify that wake_up_q() does an atomic write to task->wake_q.next, after which a concurrent __wake_q_add() can immediately overwrite task->wake_q.next again.
Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250129-sched-wakeup-prettier-v1-1-2f51f5f663fa@google.com --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e5a6bf587f9..8931d9b1e895 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1055,9 +1055,10 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - /* Task can safely be re-inserted now: */ node = node->next; - task->wake_q.next = NULL; + /* pairs with cmpxchg_relaxed() in __wake_q_add() */ + WRITE_ONCE(task->wake_q.next, NULL); + /* Task can safely be re-inserted now. */ /* * wake_up_process() executes a full barrier, which pairs with From 469c76a83bb9f6b2c7b2989c46617c4fe01fee79 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 29 Jan 2025 08:05:14 +0000 Subject: [PATCH 078/310] perf/x86/rapl: Fix the error checking order After the commit b4943b8bfc41 ("perf/x86/rapl: Add core energy counter support for AMD CPUs"), the default "perf record"/"perf top" command is broken in systems where there isn't a PMU registered for type PERF_TYPE_RAW. This is due to the change in order of error checks in rapl_pmu_event_init() Due to which we return -EINVAL instead of -ENOENT, when we reach here from the fallback loop in perf_init_event(). Move the "PMU and event type match" back to the beginning of the function so that we return -ENOENT early on. 
Closes: https://lore.kernel.org/all/uv7mz6vew2bzgre5jdpmwldxljp5djzmuiksqdcdwipfm4zm7w@ribobcretidk/ Fixes: b4943b8bfc41 ("perf/x86/rapl: Add core energy counter support for AMD CPUs") Reported-by: Koichiro Den Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250129080513.30353-1-dhananjay.ugwekar@amd.com --- arch/x86/events/rapl.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index d3bb3865c1b1..4952faf03e82 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -370,6 +370,10 @@ static int rapl_pmu_event_init(struct perf_event *event) unsigned int rapl_pmu_idx; struct rapl_pmus *rapl_pmus; + /* only look at RAPL events */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + /* unsupported modes and filters */ if (event->attr.sample_period) /* no sampling */ return -EINVAL; @@ -387,10 +391,6 @@ static int rapl_pmu_event_init(struct perf_event *event) rapl_pmus_scope = rapl_pmus->pmu.scope; if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { - /* only look at RAPL package events */ - if (event->attr.type != rapl_pmus_pkg->pmu.type) - return -ENOENT; - cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; @@ -398,10 +398,6 @@ static int rapl_pmu_event_init(struct perf_event *event) bit = cfg - 1; event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { - /* only look at RAPL core events */ - if (event->attr.type != rapl_pmus_core->pmu.type) - return -ENOENT; - cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; From 0a5561501397e2bbd0fb0e300eb489f72a90597a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 29 Jan 2025 07:48:18 -0800 Subject: [PATCH 079/310] perf/x86/intel: Clean up 
PEBS-via-PT on hybrid The PEBS-via-PT feature is exposed for the e-core of some hybrid platforms, e.g., ADL and MTL. But it never works. $ dmesg | grep PEBS [ 1.793888] core: cpu_atom PMU driver: PEBS-via-PT $ perf record -c 1000 -e '{intel_pt/branch=0/, cpu_atom/cpu-cycles,aux-output/pp}' -C8 Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cpu_atom/cpu-cycles,aux-output/pp). "dmesg | grep -i perf" may provide additional information. The "PEBS-via-PT" is printed if the corresponding bit of per-PMU capabilities is set. Since the feature is supported by the e-core HW, perf sets the bit for e-core. However, for Intel PT, if a feature is not supported on all CPUs, it is not supported at all. The PEBS-via-PT event cannot be created successfully. The PEBS-via-PT is no longer enumerated on the latest hybrid platform. It will be deprecated on future platforms with Arch PEBS. Let's remove it from the existing hybrid platforms. Fixes: d9977c43bff8 ("perf/x86: Register hybrid PMUs") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250129154820.3755948-2-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 10 ---------- arch/x86/events/intel/ds.c | 10 +++++++++- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7601196d1d18..966f7832497d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4941,11 +4941,6 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); - if (pmu->intel_cap.pebs_output_pt_available) - pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; - else - pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; - intel_pmu_check_event_constraints(pmu->event_constraints, pmu->cntr_mask64, pmu->fixed_cntr_mask64, @@ -5023,9 +5018,6 @@ static bool init_hybrid_pmu(int cpu) pr_info("%s PMU driver: ", 
pmu->name); - if (pmu->intel_cap.pebs_output_pt_available) - pr_cont("PEBS-via-PT "); - pr_cont("\n"); x86_pmu_show_pmu_cap(&pmu->pmu); @@ -6370,11 +6362,9 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; if (pmu->pmu_type & hybrid_small_tiny) { pmu->intel_cap.perf_metrics = 0; - pmu->intel_cap.pebs_output_pt_available = 1; pmu->mid_ack = true; } else if (pmu->pmu_type & hybrid_big) { pmu->intel_cap.perf_metrics = 1; - pmu->intel_cap.pebs_output_pt_available = 0; pmu->late_ack = true; } } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba74e1198328..c2e2eae7309c 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2578,7 +2578,15 @@ void __init intel_ds_init(void) } pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); - if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) { + /* + * The PEBS-via-PT is not supported on hybrid platforms, + * because not all CPUs of a hybrid machine support it. + * The global x86_pmu.intel_cap, which only contains the + * common capabilities, is used to check the availability + * of the feature. The per-PMU pebs_output_pt_available + * in a hybrid machine should be ignored. + */ + if (x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; } From 47a973fd75639fe80d59f9e1860113bb2a0b112b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 29 Jan 2025 07:48:19 -0800 Subject: [PATCH 080/310] perf/x86/intel: Fix ARCH_PERFMON_NUM_COUNTER_LEAF The EAX of the CPUID Leaf 023H enumerates the mask of valid sub-leaves. To tell the availability of the sub-leaf 1 (enumerate the counter mask), perf should check the bit 1 (0x2) of EAX, rather than bit 0 (0x1). The error is not user-visible on bare metal, because the sub-leaf 0 and the sub-leaf 1 are always available.
However, it may bring issues in a virtualization environment when a VMM only enumerates the sub-leaf 0. Introduce the cpuid35_e?x to replace the macros, which makes the implementation style consistent. Fixes: eb467aaac21e ("perf/x86/intel: Support Architectural PerfMon Extension leaf") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250129154820.3755948-3-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 18 ++++++++++-------- arch/x86/include/asm/perf_event.h | 28 +++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 966f7832497d..f3d5b718f93f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4905,20 +4905,22 @@ static inline bool intel_pmu_broken_perf_cap(void) static void update_pmu_cap(struct x86_hybrid_pmu *pmu) { - unsigned int sub_bitmaps, eax, ebx, ecx, edx; + unsigned int cntr, fixed_cntr, ecx, edx; + union cpuid35_eax eax; + union cpuid35_ebx ebx; - cpuid(ARCH_PERFMON_EXT_LEAF, &sub_bitmaps, &ebx, &ecx, &edx); + cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); - if (ebx & ARCH_PERFMON_EXT_UMASK2) + if (ebx.split.umask2) pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; - if (ebx & ARCH_PERFMON_EXT_EQ) + if (ebx.split.eq) pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; - if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) { + if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, - &eax, &ebx, &ecx, &edx); - pmu->cntr_mask64 = eax; - pmu->fixed_cntr_mask64 = ebx; + &cntr, &fixed_cntr, &ecx, &edx); + pmu->cntr_mask64 = cntr; + pmu->fixed_cntr_mask64 = fixed_cntr; } if (!intel_pmu_broken_perf_cap()) { diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 1ac79f361645..0ba8d20f2d1d 100644 --- a/arch/x86/include/asm/perf_event.h +++ 
b/arch/x86/include/asm/perf_event.h @@ -188,11 +188,33 @@ union cpuid10_edx { * detection/enumeration details: */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 -#define ARCH_PERFMON_EXT_UMASK2 0x1 -#define ARCH_PERFMON_EXT_EQ 0x2 -#define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +union cpuid35_eax { + struct { + unsigned int leaf0:1; + /* Counters Sub-Leaf */ + unsigned int cntr_subleaf:1; + /* Auto Counter Reload Sub-Leaf */ + unsigned int acr_subleaf:1; + /* Events Sub-Leaf */ + unsigned int events_subleaf:1; + unsigned int reserved:28; + } split; + unsigned int full; +}; + +union cpuid35_ebx { + struct { + /* UnitMask2 Supported */ + unsigned int umask2:1; + /* EQ-bit Supported */ + unsigned int eq:1; + unsigned int reserved:30; + } split; + unsigned int full; +}; + /* * Intel Architectural LBR CPUID detection/enumeration details: */ From c631a2de7ae48d50434bdc205d901423f8577c65 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Jan 2025 17:07:21 -0800 Subject: [PATCH 081/310] perf/x86/intel: Ensure LBRs are disabled when a CPU is starting Explicitly clear DEBUGCTL.LBR when a CPU is starting, prior to purging the LBR MSRs themselves, as at least one system has been found to transfer control to the kernel with LBRs enabled (it's unclear whether it's a BIOS flaw or a CPU goof). Because the kernel preserves the original DEBUGCTL, even when toggling LBRs, leaving DEBUGCTL.LBR as is results in running with LBRs enabled at all times. 
Closes: https://lore.kernel.org/all/c9d8269bff69f6359731d758e3b1135dedd7cc61.camel@redhat.com Reported-by: Maxim Levitsky Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Maxim Levitsky Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250131010721.470503-1-seanjc@google.com --- arch/x86/events/intel/core.c | 5 ++++- arch/x86/include/asm/msr-index.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f3d5b718f93f..e86333eee266 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5042,8 +5042,11 @@ static void intel_pmu_cpu_starting(int cpu) init_debug_store_on_cpu(cpu); /* - * Deal with CPUs that don't clear their LBRs on power-up. + * Deal with CPUs that don't clear their LBRs on power-up, and that may + * even boot with LBRs enabled. */ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && x86_pmu.lbr_nr) + msr_clear_bit(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR_BIT); intel_pmu_lbr_reset(); cpuc->lbr_sel = NULL; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9a71880eec07..72765b2fe0d8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -395,7 +395,8 @@ #define MSR_IA32_PASID_VALID BIT_ULL(31) /* DEBUGCTLMSR bits (others vary by model): */ -#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_LBR_BIT 0 /* last branch recording */ +#define DEBUGCTLMSR_LBR (1UL << DEBUGCTLMSR_LBR_BIT) #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) From 9ab127a18018fb06bd42a54ed38bb7b8c449d686 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Jan 2025 08:10:02 +0100 Subject: [PATCH 082/310] drm/hisilicon/hibmc: select CONFIG_DRM_DISPLAY_DP_HELPER Without the DP helper code, the newly added displayport support causes a link 
failure: x86_64-linux-ld: drivers/gpu/drm/hisilicon/hibmc/dp/dp_aux.o: in function `hibmc_dp_aux_init': dp_aux.c:(.text+0x37e): undefined reference to `drm_dp_aux_init' x86_64-linux-ld: drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.o: in function `hibmc_dp_link_set_pattern': dp_link.c:(.text+0xae): undefined reference to `drm_dp_dpcd_write' x86_64-linux-ld: drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.o: in function `hibmc_dp_link_get_adjust_train': dp_link.c:(.text+0x121): undefined reference to `drm_dp_get_adjust_request_voltage' x86_64-linux-ld: dp_link.c:(.text+0x12e): undefined reference to `drm_dp_get_adjust_request_pre_emphasis' x86_64-linux-ld: drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.o: in function `hibmc_dp_link_training': dp_link.c:(.text+0x2b0): undefined reference to `drm_dp_dpcd_write' x86_64-linux-ld: dp_link.c:(.text+0x2e3): undefined reference to `drm_dp_dpcd_write' Add both DRM_DISPLAY_DP_HELPER and DRM_DISPLAY_HELPER, which is in turn required by the former. Fixes: 0ab6ea261c1f ("drm/hisilicon/hibmc: add dp module in hibmc") Signed-off-by: Arnd Bergmann Reviewed-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20250127071059.617567-1-arnd@kernel.org Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/hisilicon/hibmc/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/hisilicon/hibmc/Kconfig b/drivers/gpu/drm/hisilicon/hibmc/Kconfig index 93b8d32e3be1..98d77d74999d 100644 --- a/drivers/gpu/drm/hisilicon/hibmc/Kconfig +++ b/drivers/gpu/drm/hisilicon/hibmc/Kconfig @@ -4,6 +4,8 @@ config DRM_HISI_HIBMC depends on DRM && PCI depends on MMU select DRM_CLIENT_SELECTION + select DRM_DISPLAY_HELPER + select DRM_DISPLAY_DP_HELPER select DRM_KMS_HELPER select DRM_VRAM_HELPER select DRM_TTM From 2fa0fbeb69edd367b7c44f484e8dc5a5a1a311ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Feb 2025 10:58:23 -1000 Subject: [PATCH 083/310] sched_ext: Implement auto local dispatching of migration disabled tasks 
Migration disabled tasks are special and pinned to their previous CPUs. They tripped up some unsuspecting BPF schedulers as their ->nr_cpus_allowed may not agree with the bits set in ->cpus_ptr. Make it easier for BPF schedulers by automatically dispatching them to the pinned local DSQs by default. If a BPF scheduler wants to handle migration disabled tasks explicitly, it can set SCX_OPS_ENQ_MIGRATION_DISABLED. Signed-off-by: Tejun Heo Acked-by: Andrea Righi --- kernel/sched/ext.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a6d6d6dadde5..efdbf4d85a21 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -122,6 +122,19 @@ enum scx_ops_flags { */ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). 
+ */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + /* * CPU cgroup support flags */ @@ -130,6 +143,7 @@ enum scx_ops_flags { SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -882,6 +896,7 @@ static bool scx_warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); @@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, unlikely(p->flags & PF_EXITING)) goto local; + /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ + if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && + is_migration_disabled(p)) + goto local; + if (!SCX_HAS_OP(enqueue)) goto global; @@ -5052,6 +5072,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&scx_has_op[i]); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); @@ -5661,6 +5682,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); + if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) + static_branch_enable(&scx_ops_enq_migration_disabled); if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); From 32966821574cd2917bd60f2554f435fe527f4702 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Feb 2025 10:59:06 -1000 Subject: [PATCH 084/310] sched_ext: Fix migration disabled handling in targeted dispatches A dispatch operation that can target a 
specific local DSQ - scx_bpf_dsq_move_to_local() or scx_bpf_dsq_move() - checks whether the task can be migrated to the target CPU using task_can_run_on_remote_rq(). If the task can't be migrated to the targeted CPU, it is bounced through a global DSQ. task_can_run_on_remote_rq() assumes that the task is on a CPU that's different from the targeted CPU but the callers don't uphold the assumption and may call the function when the task is already on the target CPU. When such a task has migration disabled, task_can_run_on_remote_rq() ends up incorrectly returning %false, unnecessarily bouncing the task to a global DSQ. Fix it by updating the callers to only call task_can_run_on_remote_rq() when the task is on a different CPU than the target CPU. As this is a bit subtle, for clarity and documentation: - Make task_can_run_on_remote_rq() trigger SCHED_WARN_ON() if the task is on the same CPU as the target CPU. - is_migration_disabled() test in task_can_run_on_remote_rq() cannot trigger if the task is on a different CPU than the target CPU as the preceding task_allowed_on_cpu() test should fail beforehand. Convert the test into SCHED_WARN_ON(). Signed-off-by: Tejun Heo Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Fixes: 0366017e0973 ("sched_ext: Use task_can_run_on_remote_rq() test in dispatch_to_local_dsq()") Cc: stable@vger.kernel.org # v6.12+ --- kernel/sched/ext.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index efdbf4d85a21..e01144340d67 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2333,12 +2333,16 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * * - The BPF scheduler is bypassed while the rq is offline and we can always say * no to the BPF scheduler initiated migrations while offline. + * + * The caller must ensure that @p and @rq are on different CPUs.
*/ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { int cpu = cpu_of(rq); + SCHED_WARN_ON(task_cpu(p) == cpu); + /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline @@ -2352,8 +2356,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, return false; } - if (unlikely(is_migration_disabled(p))) - return false; + /* + * If @p has migration disabled, @p->cpus_ptr only contains its current + * CPU and the above task_allowed_on_cpu() test should have failed. + */ + SCHED_WARN_ON(is_migration_disabled(p)); if (!scx_rq_online(rq)) return false; @@ -2457,7 +2464,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dst_dsq = find_global_dsq(p); dst_rq = src_rq; } @@ -2611,7 +2619,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } #ifdef CONFIG_SMP - if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dispatch_enqueue(find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; From c53fbdb60fb61fd6bda2bc0dc89837966625c5dc Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 7 Feb 2025 14:54:37 +0000 Subject: [PATCH 085/310] KVM: arm64: Improve error handling from check_host_shared_guest() The check_host_shared_guest() path expects to find a last-level valid PTE in the guest's stage-2 page-table. However, it checks the PTE's level before its validity, which makes it hard for callers to figure out what went wrong. To make error handling simpler, check the PTE's validity first. 
Signed-off-by: Quentin Perret Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250207145438.1333475-2-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 7ad7b133b81a..41847c04b270 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -943,10 +943,10 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); if (ret) return ret; - if (level != KVM_PGTABLE_LAST_LEVEL) - return -E2BIG; if (!kvm_pte_valid(pte)) return -ENOENT; + if (level != KVM_PGTABLE_LAST_LEVEL) + return -E2BIG; state = guest_get_page_state(pte, ipa); if (state != PKVM_PAGE_SHARED_BORROWED) From eabc7aaef7a553b64bf6e631ce04526af6c8d104 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 7 Feb 2025 14:54:38 +0000 Subject: [PATCH 086/310] KVM: arm64: Simplify np-guest hypercalls When the handling of a guest stage-2 permission fault races with an MMU notifier, the faulting page might be gone from the guest's stage-2 by the point we attempt to call (p)kvm_pgtable_stage2_relax_perms(). In the normal KVM case, this leads to returning -EAGAIN which user_mem_abort() handles correctly by simply re-entering the guest. However, the pKVM hypercall implementation has additional logic to check the page state using __check_host_shared_guest() which gets confused with absence of a page mapped at the requested IPA and returns -ENOENT, hence breaking user_mem_abort() and hilarity ensues. Luckily, several of the hypercalls for managing the stage-2 page-table of NP guests have no effect on the pKVM ownership tracking (wrprotect, test_clear_young, mkyoung, and crucially relax_perms), so the extra state checking logic is in fact not strictly necessary. 
So, to fix the discrepancy between standard KVM and pKVM, let's just drop the superfluous __check_host_shared_guest() logic from those hypercalls and make the extra state checking a debug assertion dependent on CONFIG_NVHE_EL2_DEBUG as we already do for other transitions. Signed-off-by: Quentin Perret Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250207145438.1333475-3-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 69 +++++++++++++++------------ 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 41847c04b270..4c2f6a6a2efe 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -998,63 +998,73 @@ int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm) return ret; } -int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) { - struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); - u64 ipa = hyp_pfn_to_phys(gfn); u64 phys; int ret; - if (prot & ~KVM_PGTABLE_PROT_RWX) - return -EINVAL; + if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) + return; host_lock_component(); guest_lock_component(vm); ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); host_unlock_component(); - return ret; + WARN_ON(ret && ret != -ENOENT); } -int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; int ret; - host_lock_component(); - guest_lock_component(vm); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = 
kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); - host_unlock_component(); return ret; } -int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) { u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; int ret; - host_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + assert_host_shared_guest(vm, ipa); guest_lock_component(vm); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + guest_unlock_component(vm); - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); + return ret; +} +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + int ret; + + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); guest_unlock_component(vm); - host_unlock_component(); return ret; } @@ -1063,18 +1073,15 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; int ret; - host_lock_component(); - guest_lock_component(vm); - - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); guest_unlock_component(vm); - host_unlock_component(); return ret; } From dc9c5166c3cb044f8a001e397195242fd6796eee Mon Sep 17 00:00:00 2001 From: 
Christophe Leroy Date: Mon, 3 Feb 2025 11:14:57 +0100 Subject: [PATCH 087/310] powerpc/code-patching: Disable KASAN report during patching via temporary mm Erhard reports the following KASAN hit on Talos II (power9) with kernel 6.13: [ 12.028126] ================================================================== [ 12.028198] BUG: KASAN: user-memory-access in copy_to_kernel_nofault+0x8c/0x1a0 [ 12.028260] Write of size 8 at addr 0000187e458f2000 by task systemd/1 [ 12.028346] CPU: 87 UID: 0 PID: 1 Comm: systemd Tainted: G T 6.13.0-P9-dirty #3 [ 12.028408] Tainted: [T]=RANDSTRUCT [ 12.028446] Hardware name: T2P9D01 REV 1.01 POWER9 0x4e1202 opal:skiboot-bc106a0 PowerNV [ 12.028500] Call Trace: [ 12.028536] [c000000008dbf3b0] [c000000001656a48] dump_stack_lvl+0xbc/0x110 (unreliable) [ 12.028609] [c000000008dbf3f0] [c0000000006e2fc8] print_report+0x6b0/0x708 [ 12.028666] [c000000008dbf4e0] [c0000000006e2454] kasan_report+0x164/0x300 [ 12.028725] [c000000008dbf600] [c0000000006e54d4] kasan_check_range+0x314/0x370 [ 12.028784] [c000000008dbf640] [c0000000006e6310] __kasan_check_write+0x20/0x40 [ 12.028842] [c000000008dbf660] [c000000000578e8c] copy_to_kernel_nofault+0x8c/0x1a0 [ 12.028902] [c000000008dbf6a0] [c0000000000acfe4] __patch_instructions+0x194/0x210 [ 12.028965] [c000000008dbf6e0] [c0000000000ade80] patch_instructions+0x150/0x590 [ 12.029026] [c000000008dbf7c0] [c0000000001159bc] bpf_arch_text_copy+0x6c/0xe0 [ 12.029085] [c000000008dbf800] [c000000000424250] bpf_jit_binary_pack_finalize+0x40/0xc0 [ 12.029147] [c000000008dbf830] [c000000000115dec] bpf_int_jit_compile+0x3bc/0x930 [ 12.029206] [c000000008dbf990] [c000000000423720] bpf_prog_select_runtime+0x1f0/0x280 [ 12.029266] [c000000008dbfa00] [c000000000434b18] bpf_prog_load+0xbb8/0x1370 [ 12.029324] [c000000008dbfb70] [c000000000436ebc] __sys_bpf+0x5ac/0x2e00 [ 12.029379] [c000000008dbfd00] [c00000000043a228] sys_bpf+0x28/0x40 [ 12.029435] [c000000008dbfd20] [c000000000038eb4] 
system_call_exception+0x334/0x610 [ 12.029497] [c000000008dbfe50] [c00000000000c270] system_call_vectored_common+0xf0/0x280 [ 12.029561] --- interrupt: 3000 at 0x3fff82f5cfa8 [ 12.029608] NIP: 00003fff82f5cfa8 LR: 00003fff82f5cfa8 CTR: 0000000000000000 [ 12.029660] REGS: c000000008dbfe80 TRAP: 3000 Tainted: G T (6.13.0-P9-dirty) [ 12.029735] MSR: 900000000280f032 CR: 42004848 XER: 00000000 [ 12.029855] IRQMASK: 0 GPR00: 0000000000000169 00003fffdcf789a0 00003fff83067100 0000000000000005 GPR04: 00003fffdcf78a98 0000000000000090 0000000000000000 0000000000000008 GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR12: 0000000000000000 00003fff836ff7e0 c000000000010678 0000000000000000 GPR16: 0000000000000000 0000000000000000 00003fffdcf78f28 00003fffdcf78f90 GPR20: 0000000000000000 0000000000000000 0000000000000000 00003fffdcf78f80 GPR24: 00003fffdcf78f70 00003fffdcf78d10 00003fff835c7239 00003fffdcf78bd8 GPR28: 00003fffdcf78a98 0000000000000000 0000000000000000 000000011f547580 [ 12.030316] NIP [00003fff82f5cfa8] 0x3fff82f5cfa8 [ 12.030361] LR [00003fff82f5cfa8] 0x3fff82f5cfa8 [ 12.030405] --- interrupt: 3000 [ 12.030444] ================================================================== Commit c28c15b6d28a ("powerpc/code-patching: Use temporary mm for Radix MMU") is inspired by x86 but, unlike x86, it doesn't disable KASAN reports during patching. This wasn't a problem at the beginning because __patch_mem() is not instrumented. Commit 465cabc97b42 ("powerpc/code-patching: introduce patch_instructions()") uses copy_to_kernel_nofault() to copy several instructions at once. But when using temporary mm the destination is not regular kernel memory but a kind of kernel-like memory located in user address space. Because it is not in kernel address space it is not covered by KASAN shadow memory. Since commit e4137f08816b ("mm, kasan, kmsan: instrument copy_from/to_kernel_nofault") KASAN reports bad accesses from copy_to_kernel_nofault().
Here a bad access to user memory is reported because KASAN detects the lack of shadow memory and the address is below TASK_SIZE. Do like x86 in commit b3fd8e83ada0 ("x86/alternatives: Use temporary mm for text poking") and disable KASAN reports during patching when using temporary mm. Reported-by: Erhard Furtner Close: https://lore.kernel.org/all/20250201151435.48400261@yea/ Fixes: 465cabc97b42 ("powerpc/code-patching: introduce patch_instructions()") Signed-off-by: Christophe Leroy Acked-by: Michael Ellerman Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/1c05b2a1b02ad75b981cfc45927e0b4a90441046.1738577687.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/code-patching.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index af97fbb3c257..81c0f673eb25 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -493,7 +493,9 @@ static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool rep orig_mm = start_using_temp_mm(patching_mm); + kasan_disable_current(); err = __patch_instructions(patch_addr, code, len, repeat_instr); + kasan_enable_current(); /* context synchronisation performed by __patch_instructions */ stop_using_temp_mm(patching_mm, orig_mm); From 61bcc752d1b81fde3cae454ff20c1d3c359df500 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 12 Jan 2025 19:24:46 +0100 Subject: [PATCH 088/310] powerpc/64s: Rewrite __real_pte() and __rpte_to_hidx() as static inline Rewrite __real_pte() and __rpte_to_hidx() as static inline in order to avoid following warnings/errors when building with 4k page size: CC arch/powerpc/mm/book3s64/hash_tlb.o arch/powerpc/mm/book3s64/hash_tlb.c: In function 'hpte_need_flush': arch/powerpc/mm/book3s64/hash_tlb.c:49:16: error: variable 'offset' set but not used [-Werror=unused-but-set-variable] 49 | int i, offset; | ^~~~~~ CC arch/powerpc/mm/book3s64/hash_native.o 
arch/powerpc/mm/book3s64/hash_native.c: In function 'native_flush_hash_range': arch/powerpc/mm/book3s64/hash_native.c:782:29: error: variable 'index' set but not used [-Werror=unused-but-set-variable] 782 | unsigned long hash, index, hidx, shift, slot; | ^~~~~ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501081741.AYFwybsq-lkp@intel.com/ Fixes: ff31e105464d ("powerpc/mm/hash64: Store the slot information at the right offset for hugetlb") Signed-off-by: Christophe Leroy Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/e0d340a5b7bd478ecbf245d826e6ab2778b74e06.1736706263.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index c3efacab4b94..aa90a048f319 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -77,9 +77,17 @@ /* * With 4K page size the real_pte machinery is all nops. */ -#define __real_pte(e, p, o) ((real_pte_t){(e)}) +static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep, int offset) +{ + return (real_pte_t){pte}; +} + #define __rpte_to_pte(r) ((r).pte) -#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT) + +static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index) +{ + return pte_val(__rpte_to_pte(rpte)) >> H_PAGE_F_GIX_SHIFT; +} #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \ do { \ From a1f7b7ff0e10ae574d388131596390157222f986 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 10 Feb 2025 10:17:27 +0200 Subject: [PATCH 089/310] PCI: pci_ids: add INTEL_HDA_PTL_H Add Intel PTL-H audio Device ID. 
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Acked-by: Bjorn Helgaas Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250210081730.22916-2-peter.ujfalusi@linux.intel.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index de5deb1a0118..1a2594a38199 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3134,6 +3134,7 @@ #define PCI_DEVICE_ID_INTEL_HDA_LNL_P 0xa828 #define PCI_DEVICE_ID_INTEL_S21152BB 0xb152 #define PCI_DEVICE_ID_INTEL_HDA_BMG 0xe2f7 +#define PCI_DEVICE_ID_INTEL_HDA_PTL_H 0xe328 #define PCI_DEVICE_ID_INTEL_HDA_PTL 0xe428 #define PCI_DEVICE_ID_INTEL_HDA_CML_R 0xf0c8 #define PCI_DEVICE_ID_INTEL_HDA_RKL_S 0xf1c8 From 214e6be2d91d5d58f28d3a37630480077a1aafbd Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 10 Feb 2025 10:17:28 +0200 Subject: [PATCH 090/310] ALSA: hda: intel-dsp-config: Add PTL-H support Use same recipes as PTL for PTL-H. 
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250210081730.22916-3-peter.ujfalusi@linux.intel.com --- sound/hda/intel-dsp-config.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c index f564ec7af194..ce3ae2cba660 100644 --- a/sound/hda/intel-dsp-config.c +++ b/sound/hda/intel-dsp-config.c @@ -539,6 +539,11 @@ static const struct config_entry config_table[] = { .flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE, .device = PCI_DEVICE_ID_INTEL_HDA_PTL, }, + { + .flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE, + .device = PCI_DEVICE_ID_INTEL_HDA_PTL_H, + }, + #endif }; From 4e9c87cfcd0584f2a2e2f352a43ff003d688f3a4 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 10 Feb 2025 10:17:29 +0200 Subject: [PATCH 091/310] ASoC: SOF: Intel: pci-ptl: Add support for PTL-H PTL-H uses the same configuration as PTL. 
Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Acked-by: Mark Brown Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250210081730.22916-4-peter.ujfalusi@linux.intel.com --- sound/soc/sof/intel/pci-ptl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/sof/intel/pci-ptl.c b/sound/soc/sof/intel/pci-ptl.c index 0aacdfac9fb4..c4fb6a2441b7 100644 --- a/sound/soc/sof/intel/pci-ptl.c +++ b/sound/soc/sof/intel/pci-ptl.c @@ -50,6 +50,7 @@ static const struct sof_dev_desc ptl_desc = { /* PCI IDs */ static const struct pci_device_id sof_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, HDA_PTL, &ptl_desc) }, /* PTL */ + { PCI_DEVICE_DATA(INTEL, HDA_PTL_H, &ptl_desc) }, /* PTL-H */ { 0, } }; MODULE_DEVICE_TABLE(pci, sof_pci_ids); From d7e2447a4d51de5c3c03e3b7892898e98ddd9769 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 10 Feb 2025 10:17:30 +0200 Subject: [PATCH 092/310] ALSA: hda: hda-intel: add Panther Lake-H support Add Intel PTL-H audio Device ID. 
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250210081730.22916-5-peter.ujfalusi@linux.intel.com --- sound/pci/hda/hda_intel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 7d7f9aac50a9..67540e037309 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2496,6 +2496,8 @@ static const struct pci_device_id azx_ids[] = { { PCI_DEVICE_DATA(INTEL, HDA_ARL, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE) }, /* Panther Lake */ { PCI_DEVICE_DATA(INTEL, HDA_PTL, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_LNL) }, + /* Panther Lake-H */ + { PCI_DEVICE_DATA(INTEL, HDA_PTL_H, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_LNL) }, /* Apollolake (Broxton-P) */ { PCI_DEVICE_DATA(INTEL, HDA_APL, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_BROXTON) }, /* Gemini-Lake */ From 70e90680c2592c38c62e5716f1296a2d74bae7af Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:46:33 +0100 Subject: [PATCH 093/310] ALSA: Switch to use hrtimer_setup() hrtimer_setup() takes the callback function pointer as argument and initializes the timer completely. Replace hrtimer_init() and the open coded initialization of hrtimer::function with the new setup mechanism. Patch was created by using Coccinelle. 
Acked-by: Zack Rusin Signed-off-by: Nam Cao Cc: Takashi Iwai Link: https://patch.msgid.link/598031332ce738c82286a158cb66eb7e735b2e79.1738746904.git.namcao@linutronix.de Signed-off-by: Takashi Iwai --- sound/core/hrtimer.c | 3 +-- sound/drivers/dummy.c | 3 +-- sound/drivers/pcsp/pcsp.c | 3 +-- sound/sh/sh_dac_audio.c | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c index 147c1fea4708..e9c60dce59fb 100644 --- a/sound/core/hrtimer.c +++ b/sound/core/hrtimer.c @@ -66,9 +66,8 @@ static int snd_hrtimer_open(struct snd_timer *t) stime = kzalloc(sizeof(*stime), GFP_KERNEL); if (!stime) return -ENOMEM; - hrtimer_init(&stime->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); stime->timer = t; - stime->hrt.function = snd_hrtimer_callback; + hrtimer_setup(&stime->hrt, snd_hrtimer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL); t->private_data = stime; return 0; } diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c index 8f5df9b3aaaa..c1a3efb633c5 100644 --- a/sound/drivers/dummy.c +++ b/sound/drivers/dummy.c @@ -457,8 +457,7 @@ static int dummy_hrtimer_create(struct snd_pcm_substream *substream) if (!dpcm) return -ENOMEM; substream->runtime->private_data = dpcm; - hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - dpcm->timer.function = dummy_hrtimer_callback; + hrtimer_setup(&dpcm->timer, dummy_hrtimer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dpcm->substream = substream; atomic_set(&dpcm->running, 0); return 0; diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c index 78c9b1c7590f..e8482c2290c3 100644 --- a/sound/drivers/pcsp/pcsp.c +++ b/sound/drivers/pcsp/pcsp.c @@ -103,8 +103,7 @@ static int snd_card_pcsp_probe(int devnum, struct device *dev) if (devnum != 0) return -EINVAL; - hrtimer_init(&pcsp_chip.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pcsp_chip.timer.function = pcsp_do_timer; + hrtimer_setup(&pcsp_chip.timer, pcsp_do_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 
err = snd_devm_card_new(dev, index, id, THIS_MODULE, 0, &card); if (err < 0) diff --git a/sound/sh/sh_dac_audio.c b/sound/sh/sh_dac_audio.c index 3f5422145c5e..84a4b17a0cc2 100644 --- a/sound/sh/sh_dac_audio.c +++ b/sound/sh/sh_dac_audio.c @@ -312,8 +312,7 @@ static int snd_sh_dac_create(struct snd_card *card, chip->card = card; - hrtimer_init(&chip->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - chip->hrtimer.function = sh_dac_audio_timer; + hrtimer_setup(&chip->hrtimer, sh_dac_audio_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); dac_audio_reset(chip); chip->rate = 8000; From 8dbccafce3c8ae026606f5c7bc6637667d9d5595 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 10 Feb 2025 09:17:58 +0000 Subject: [PATCH 094/310] KVM: arm64: Fix __pkvm_host_mkyoung_guest() return value Don't use an uninitialised stack variable, and just return 0 on the non-error path. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502100911.8c9DbtKD-lkp@intel.com/ Reviewed-by: Quentin Perret Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 4c2f6a6a2efe..19c3c631708c 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1073,7 +1073,6 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 ipa = hyp_pfn_to_phys(gfn); - int ret; if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; @@ -1083,5 +1082,5 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); guest_unlock_component(vm); - return ret; + return 0; } From 78ccf6a6bae11e451e20a52dd2bc2ab98f66326b Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 10 Feb 2025 11:19:53 +0800 Subject: [PATCH 095/310] ASoC: Intel: soc-acpi-intel-ptl-match: revise typo of 
rt712_vb + rt1320 support s/lnl/ptl Fixes: bd40d912728f ("ASoC: Intel: soc-acpi-intel-ptl-match: add rt712_vb + rt1320 support") Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250210031954.6287-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-acpi-intel-ptl-match.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c index 9eb4a43e3e7a..e487c4e1c034 100644 --- a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c @@ -270,7 +270,7 @@ static const struct snd_soc_acpi_link_adr lnl_sdw_rt713_vb_l2_rt1320_l13[] = { {} }; -static const struct snd_soc_acpi_link_adr lnl_sdw_rt712_vb_l2_rt1320_l1[] = { +static const struct snd_soc_acpi_link_adr ptl_sdw_rt712_vb_l2_rt1320_l1[] = { { .mask = BIT(2), .num_adr = ARRAY_SIZE(rt712_vb_2_group1_adr), @@ -337,10 +337,10 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_sdw_machines[] = { }, { .link_mask = BIT(1) | BIT(2), - .links = lnl_sdw_rt712_vb_l2_rt1320_l1, + .links = ptl_sdw_rt712_vb_l2_rt1320_l1, .drv_name = "sof_sdw", .machine_check = snd_soc_acpi_intel_sdca_is_device_rt712_vb, - .sof_tplg_filename = "sof-lnl-rt712-l2-rt1320-l1.tplg" + .sof_tplg_filename = "sof-ptl-rt712-l2-rt1320-l1.tplg" }, { .link_mask = BIT(1) | BIT(2) | BIT(3), From cb78b8dc7834066539253c039f276b3625fecd9f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 10 Feb 2025 11:19:54 +0800 Subject: [PATCH 096/310] ASoC: Intel: soc-acpi-intel-ptl-match: revise typo of rt713_vb_l2_rt1320_l13 s/lnl/ptl Fixes: a7ebb0255188 ("ASoC: Intel: soc-acpi-intel-ptl-match: add rt713_vb_l2_rt1320_l13 support") Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Link: 
https://patch.msgid.link/20250210031954.6287-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-acpi-intel-ptl-match.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c index e487c4e1c034..dd7993b76dee 100644 --- a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c @@ -251,7 +251,7 @@ static const struct snd_soc_acpi_link_adr ptl_rvp[] = { {} }; -static const struct snd_soc_acpi_link_adr lnl_sdw_rt713_vb_l2_rt1320_l13[] = { +static const struct snd_soc_acpi_link_adr ptl_sdw_rt713_vb_l2_rt1320_l13[] = { { .mask = BIT(2), .num_adr = ARRAY_SIZE(rt713_vb_2_adr), @@ -344,10 +344,10 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_sdw_machines[] = { }, { .link_mask = BIT(1) | BIT(2) | BIT(3), - .links = lnl_sdw_rt713_vb_l2_rt1320_l13, + .links = ptl_sdw_rt713_vb_l2_rt1320_l13, .drv_name = "sof_sdw", .machine_check = snd_soc_acpi_intel_sdca_is_device_rt712_vb, - .sof_tplg_filename = "sof-lnl-rt713-l2-rt1320-l13.tplg" + .sof_tplg_filename = "sof-ptl-rt713-l2-rt1320-l13.tplg" }, {}, }; From 91b98d5a6e8067c5226207487681a48f0d651e46 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Fri, 7 Feb 2025 13:46:02 +0200 Subject: [PATCH 097/310] ASoC: SOF: amd: Add post_fw_run_delay ACP quirk Stress testing resume from suspend on Valve Steam Deck OLED (Galileo) revealed that the DSP firmware could enter an unrecoverable faulty state, where the kernel ring buffer is flooded with IPC related error messages: [ +0.017002] snd_sof_amd_vangogh 0000:04:00.5: acp_sof_ipc_send_msg: Failed to acquire HW lock [ +0.000054] snd_sof_amd_vangogh 0000:04:00.5: ipc3_tx_msg_unlocked: ipc message send for 0x30100000 failed: -22 [ +0.000005] snd_sof_amd_vangogh 0000:04:00.5: Failed to setup widget PIPELINE.6.ACPHS1.IN [ +0.000004] snd_sof_amd_vangogh 0000:04:00.5: Failed to restore 
pipeline after resume -22 [ +0.000003] snd_sof_amd_vangogh 0000:04:00.5: PM: dpm_run_callback(): pci_pm_resume returns -22 [ +0.000009] snd_sof_amd_vangogh 0000:04:00.5: PM: failed to resume async: error -22 [...] [ +0.002582] PM: suspend exit [ +0.065085] snd_sof_amd_vangogh 0000:04:00.5: ipc tx error for 0x30130000 (msg/reply size: 12/0): -22 [ +0.000499] snd_sof_amd_vangogh 0000:04:00.5: error: failed widget list set up for pcm 1 dir 0 [ +0.000011] snd_sof_amd_vangogh 0000:04:00.5: error: set pcm hw_params after resume [ +0.000006] snd_sof_amd_vangogh 0000:04:00.5: ASoC: error at snd_soc_pcm_component_prepare on 0000:04:00.5: -22 [...] A system reboot would be necessary to restore the speakers functionality. However, by delaying a bit any host to DSP transmission right after the firmware boot completed, the issue could not be reproduced anymore and sound continued to work flawlessly even after performing thousands of suspend/resume cycles. Introduce the post_fw_run_delay ACP quirk to allow providing the aforementioned delay via the snd_sof_dsp_ops->post_fw_run() callback for the affected devices. 
Signed-off-by: Cristian Ciocaltea Link: https://patch.msgid.link/20250207-sof-vangogh-fixes-v1-1-67824c1e4c9a@collabora.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp.c | 1 + sound/soc/sof/amd/acp.h | 1 + sound/soc/sof/amd/vangogh.c | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index 33648ff8b833..9e13c96528be 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -27,6 +27,7 @@ MODULE_PARM_DESC(enable_fw_debug, "Enable Firmware debug"); static struct acp_quirk_entry quirk_valve_galileo = { .signed_fw_image = true, .skip_iram_dram_size_mod = true, + .post_fw_run_delay = true, }; const struct dmi_system_id acp_sof_quirk_table[] = { diff --git a/sound/soc/sof/amd/acp.h b/sound/soc/sof/amd/acp.h index 800594440f73..2a19d82d6200 100644 --- a/sound/soc/sof/amd/acp.h +++ b/sound/soc/sof/amd/acp.h @@ -220,6 +220,7 @@ struct sof_amd_acp_desc { struct acp_quirk_entry { bool signed_fw_image; bool skip_iram_dram_size_mod; + bool post_fw_run_delay; }; /* Common device data struct for ACP devices */ diff --git a/sound/soc/sof/amd/vangogh.c b/sound/soc/sof/amd/vangogh.c index 8e2672106ac6..d5f1dddd43e7 100644 --- a/sound/soc/sof/amd/vangogh.c +++ b/sound/soc/sof/amd/vangogh.c @@ -11,6 +11,7 @@ * Hardware interface for Audio DSP on Vangogh platform */ +#include #include #include @@ -136,6 +137,20 @@ static struct snd_soc_dai_driver vangogh_sof_dai[] = { }, }; +static int sof_vangogh_post_fw_run_delay(struct snd_sof_dev *sdev) +{ + /* + * Resuming from suspend in some cases my cause the DSP firmware + * to enter an unrecoverable faulty state. Delaying a bit any host + * to DSP transmission right after firmware boot completion seems + * to resolve the issue. 
+ */ + if (!sdev->first_boot) + usleep_range(100, 150); + + return 0; +} + /* Vangogh ops */ struct snd_sof_dsp_ops sof_vangogh_ops; EXPORT_SYMBOL_NS(sof_vangogh_ops, "SND_SOC_SOF_AMD_COMMON"); @@ -157,6 +172,9 @@ int sof_vangogh_ops_init(struct snd_sof_dev *sdev) if (quirks->signed_fw_image) sof_vangogh_ops.load_firmware = acp_sof_load_signed_firmware; + + if (quirks->post_fw_run_delay) + sof_vangogh_ops.post_fw_run = sof_vangogh_post_fw_run_delay; } return 0; From 2ecbc2e9f3b19e2199e8bc3ba603d299f1985f09 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Fri, 7 Feb 2025 13:46:03 +0200 Subject: [PATCH 098/310] ASoC: SOF: amd: Drop unused includes from Vangogh driver Remove all the includes for headers which are not (directly) used from the Vangogh SOF driver sources. Signed-off-by: Cristian Ciocaltea Reviewed-by: Venkata Prasad Potturu Link: https://patch.msgid.link/20250207-sof-vangogh-fixes-v1-2-67824c1e4c9a@collabora.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/pci-vangogh.c | 2 -- sound/soc/sof/amd/vangogh.c | 4 ---- 2 files changed, 6 deletions(-) diff --git a/sound/soc/sof/amd/pci-vangogh.c b/sound/soc/sof/amd/pci-vangogh.c index 53f64d6bc91b..28f2d4050a67 100644 --- a/sound/soc/sof/amd/pci-vangogh.c +++ b/sound/soc/sof/amd/pci-vangogh.c @@ -13,11 +13,9 @@ #include #include -#include #include #include -#include "../ops.h" #include "../sof-pci-dev.h" #include "../../amd/mach-config.h" #include "acp.h" diff --git a/sound/soc/sof/amd/vangogh.c b/sound/soc/sof/amd/vangogh.c index d5f1dddd43e7..6ed5f9aaa414 100644 --- a/sound/soc/sof/amd/vangogh.c +++ b/sound/soc/sof/amd/vangogh.c @@ -12,13 +12,9 @@ */ #include -#include #include -#include "../ops.h" -#include "../sof-audio.h" #include "acp.h" -#include "acp-dsp-offset.h" #define I2S_HS_INSTANCE 0 #define I2S_BT_INSTANCE 1 From ac84ca815adb4171a4276b1d44096b75f6a150b7 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Fri, 7 Feb 2025 13:46:04 +0200 Subject: [PATCH 099/310] ASoC: SOF: amd: Handle 
IPC replies before FW_BOOT_COMPLETE In some cases, e.g. during resuming from suspend, there is a possibility that some IPC reply messages get received by the host while the DSP firmware has not yet reached the complete boot state. Detect when this happens and do not attempt to process the unexpected replies from DSP. Instead, provide proper debugging support. Signed-off-by: Cristian Ciocaltea Link: https://patch.msgid.link/20250207-sof-vangogh-fixes-v1-3-67824c1e4c9a@collabora.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp-ipc.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/sound/soc/sof/amd/acp-ipc.c b/sound/soc/sof/amd/acp-ipc.c index 5f371d9263f3..12caefd08788 100644 --- a/sound/soc/sof/amd/acp-ipc.c +++ b/sound/soc/sof/amd/acp-ipc.c @@ -167,6 +167,7 @@ irqreturn_t acp_sof_ipc_irq_thread(int irq, void *context) if (sdev->first_boot && sdev->fw_state != SOF_FW_BOOT_COMPLETE) { acp_mailbox_read(sdev, sdev->dsp_box.offset, &status, sizeof(status)); + if ((status & SOF_IPC_PANIC_MAGIC_MASK) == SOF_IPC_PANIC_MAGIC) { snd_sof_dsp_panic(sdev, sdev->dsp_box.offset + sizeof(status), true); @@ -188,13 +189,21 @@ irqreturn_t acp_sof_ipc_irq_thread(int irq, void *context) dsp_ack = snd_sof_dsp_read(sdev, ACP_DSP_BAR, ACP_SCRATCH_REG_0 + dsp_ack_write); if (dsp_ack) { - spin_lock_irq(&sdev->ipc_lock); - /* handle immediate reply from DSP core */ - acp_dsp_ipc_get_reply(sdev); - snd_sof_ipc_reply(sdev, 0); - /* set the done bit */ - acp_dsp_ipc_dsp_done(sdev); - spin_unlock_irq(&sdev->ipc_lock); + if (likely(sdev->fw_state == SOF_FW_BOOT_COMPLETE)) { + spin_lock_irq(&sdev->ipc_lock); + + /* handle immediate reply from DSP core */ + acp_dsp_ipc_get_reply(sdev); + snd_sof_ipc_reply(sdev, 0); + /* set the done bit */ + acp_dsp_ipc_dsp_done(sdev); + + spin_unlock_irq(&sdev->ipc_lock); + } else { + dev_dbg_ratelimited(sdev->dev, "IPC reply before FW_BOOT_COMPLETE: %#x\n", + dsp_ack); + } + ipc_irq = true; } From 
ccc8480d90e8cb60f06bd90e227f34784927e19f Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Fri, 7 Feb 2025 13:46:05 +0200 Subject: [PATCH 100/310] ASoC: SOF: amd: Add branch prediction hint in ACP IRQ handler The conditional involving sdev->first_boot in acp_sof_ipc_irq_thread() will succeed only once, i.e. during the very first run of the DSP firmware. Use the unlikely() annotation to help improve branch prediction accuracy. Signed-off-by: Cristian Ciocaltea Reviewed-by: Venkata Prasad Potturu Link: https://patch.msgid.link/20250207-sof-vangogh-fixes-v1-4-67824c1e4c9a@collabora.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp-ipc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/amd/acp-ipc.c b/sound/soc/sof/amd/acp-ipc.c index 12caefd08788..22d4b807e1bb 100644 --- a/sound/soc/sof/amd/acp-ipc.c +++ b/sound/soc/sof/amd/acp-ipc.c @@ -165,7 +165,7 @@ irqreturn_t acp_sof_ipc_irq_thread(int irq, void *context) int dsp_msg, dsp_ack; unsigned int status; - if (sdev->first_boot && sdev->fw_state != SOF_FW_BOOT_COMPLETE) { + if (unlikely(sdev->first_boot && sdev->fw_state != SOF_FW_BOOT_COMPLETE)) { acp_mailbox_read(sdev, sdev->dsp_box.offset, &status, sizeof(status)); if ((status & SOF_IPC_PANIC_MAGIC_MASK) == SOF_IPC_PANIC_MAGIC) { From 9759ae2cee7cd42b95f1c48aa3749bd02b5ddb08 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 17 Jan 2025 13:58:00 +0800 Subject: [PATCH 101/310] iommu: Fix potential memory leak in iopf_queue_remove_device() The iopf_queue_remove_device() helper removes a device from the per-iommu iopf queue when PRI is disabled on the device. It responds to all outstanding iopf's with an IOMMU_PAGE_RESP_INVALID code and detaches the device from the queue. However, it fails to release the group structure that represents a group of iopf's awaiting a response after responding to the hardware. This can cause a memory leak if iopf_queue_remove_device() is called with pending iopf's. 
Fix it by calling iopf_free_group() after the iopf group is responded. Fixes: 199112327135 ("iommu: Track iopf group instead of last fault") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250117055800.782462-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgfault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 4674e618797c..8b5926c1452e 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -478,6 +478,7 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) ops->page_response(dev, iopf, &resp); list_del_init(&group->pending_node); + iopf_free_group(group); } mutex_unlock(&fault_param->lock); From fc876c9524e2a9f816f51d533ed31df789cff65a Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Wed, 5 Feb 2025 10:40:42 +0530 Subject: [PATCH 102/310] drm/xe/client: bo->client does not need bos_lock bos_lock is to protect list of bos used by client, it is not required to protect bo->client so bring it outside of bos_lock. 
Fixes: b27970f3e11c ("drm/xe: Add tracking support for bos per client") Signed-off-by: Tejas Upadhyay Reviewed-by: Himal Prasad Ghimiray Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20250205051042.1991192-1-tejas.upadhyay@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit f74fd53ba34551b7626193fb70c17226f06e9bf1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_drm_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c index 63f30b6df70b..2d4874d2b922 100644 --- a/drivers/gpu/drm/xe/xe_drm_client.c +++ b/drivers/gpu/drm/xe/xe_drm_client.c @@ -135,8 +135,8 @@ void xe_drm_client_add_bo(struct xe_drm_client *client, XE_WARN_ON(bo->client); XE_WARN_ON(!list_empty(&bo->client_link)); - spin_lock(&client->bos_lock); bo->client = xe_drm_client_get(client); + spin_lock(&client->bos_lock); list_add_tail(&bo->client_link, &client->bos_list); spin_unlock(&client->bos_lock); } From 53139b3f9998ea07289e7b70b909fea2264a0de9 Mon Sep 17 00:00:00 2001 From: Krzysztof Karas Date: Thu, 30 Jan 2025 09:19:31 +0000 Subject: [PATCH 103/310] drm/i915/selftests: avoid using uninitialized context There is an error path in igt_ppgtt_alloc(), which leads to ww object being passed down to i915_gem_ww_ctx_fini() without initialization. Correct that by only putting ppgtt->vm and returning early. 
Fixes: 480ae79537b2 ("drm/i915/selftests: Prepare gtt tests for obj->mm.lock removal") Signed-off-by: Krzysztof Karas Reviewed-by: Mikolaj Wasiak Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/iuaonpjc3rywmvhna6umjlvzilocn2uqsrxfxfob24e2taocbi@lkaivvfp4777 (cherry picked from commit 8d8334632ea62424233ac6529712868241d0f8df) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index 5c397a2df70e..5d27e1c733c5 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -168,7 +168,7 @@ static int igt_ppgtt_alloc(void *arg) return PTR_ERR(ppgtt); if (!ppgtt->vm.allocate_va_range) - goto err_ppgtt_cleanup; + goto ppgtt_vm_put; /* * While we only allocate the page tables here and so we could @@ -236,7 +236,7 @@ static int igt_ppgtt_alloc(void *arg) goto retry; } i915_gem_ww_ctx_fini(&ww); - +ppgtt_vm_put: i915_vm_put(&ppgtt->vm); return err; } From b19181638182d1f5c43757b471c056b6196c8ca3 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Mon, 10 Feb 2025 16:32:50 +0000 Subject: [PATCH 104/310] ASoC: cs35l41: Fix acpi_device_hid() not found MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function acpi_device_hid() is only defined if CONFIG_ACPI is set. Use #ifdef CONFIG_ACPI to ensure that cs35l41 driver only calls this function if CONFIG_ACPI is defined. 
Fixes: 1d44a30ae3f9 ("ASoC: cs35l41: Fallback to using HID for system_name if no SUB is available") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502090100.SbXmGFqs-lkp@intel.com/ Signed-off-by: Stefan Binding Reviewed-by: André Almeida Link: https://patch.msgid.link/20250210163256.1722350-1-sbinding@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l41.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c index 30b89018b113..ff4134bee858 100644 --- a/sound/soc/codecs/cs35l41.c +++ b/sound/soc/codecs/cs35l41.c @@ -1148,6 +1148,7 @@ static int cs35l41_dsp_init(struct cs35l41_private *cs35l41) return ret; } +#ifdef CONFIG_ACPI static int cs35l41_acpi_get_name(struct cs35l41_private *cs35l41) { struct acpi_device *adev = ACPI_COMPANION(cs35l41->dev); @@ -1180,6 +1181,12 @@ static int cs35l41_acpi_get_name(struct cs35l41_private *cs35l41) return 0; } +#else +static int cs35l41_acpi_get_name(struct cs35l41_private *cs35l41) +{ + return 0; +} +#endif /* CONFIG_ACPI */ int cs35l41_probe(struct cs35l41_private *cs35l41, const struct cs35l41_hw_cfg *hw_cfg) { From f3f08c3acfb8860e07a22814a344e83c99ad7398 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 10 Feb 2025 09:27:09 -1000 Subject: [PATCH 105/310] sched_ext: Fix incorrect assumption about migration disabled tasks in task_can_run_on_remote_rq() While fixing migration disabled task handling, 32966821574c ("sched_ext: Fix migration disabled handling in targeted dispatches") assumed that a migration disabled task's ->cpus_ptr would only have the pinned CPU. While this is eventually true for migration disabled tasks that are switched out, ->cpus_ptr update is performed by migrate_disable_switch() which is called right before context_switch() in __schedule(). 
However, the task is enqueued earlier during pick_next_task() via put_prev_task_scx(), so there is a race window where another CPU can see the task on a DSQ. If the CPU tries to dispatch the migration disabled task while in that window, task_allowed_on_cpu() will succeed and task_can_run_on_remote_rq() will subsequently trigger SCHED_WARN(is_migration_disabled()). WARNING: CPU: 8 PID: 1837 at kernel/sched/ext.c:2466 task_can_run_on_remote_rq+0x12e/0x140 Sched_ext: layered (enabled+all), task: runnable_at=-10ms RIP: 0010:task_can_run_on_remote_rq+0x12e/0x140 ... consume_dispatch_q+0xab/0x220 scx_bpf_dsq_move_to_local+0x58/0xd0 bpf_prog_84dd17b0654b6cf0_layered_dispatch+0x290/0x1cfa bpf__sched_ext_ops_dispatch+0x4b/0xab balance_one+0x1fe/0x3b0 balance_scx+0x61/0x1d0 prev_balance+0x46/0xc0 __pick_next_task+0x73/0x1c0 __schedule+0x206/0x1730 schedule+0x3a/0x160 __do_sys_sched_yield+0xe/0x20 do_syscall_64+0xbb/0x1e0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fix it by converting the SCHED_WARN() back to a regular failure path. Also, perform the migration disabled test before task_allowed_on_cpu() test so that BPF schedulers which fail to handle migration disabled tasks can be noticed easily. While at it, adjust scx_ops_error() message for !task_allowed_on_cpu() case for brevity and consistency. 
Signed-off-by: Tejun Heo Fixes: 32966821574c ("sched_ext: Fix migration disabled handling in targeted dispatches") Acked-by: Andrea Righi Reported-by: Jake Hillion --- kernel/sched/ext.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e01144340d67..54edd0e2132a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2343,6 +2343,25 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, SCHED_WARN_ON(task_cpu(p) == cpu); + /* + * If @p has migration disabled, @p->cpus_ptr is updated to contain only + * the pinned CPU in migrate_disable_switch() while @p is being switched + * out. However, put_prev_task_scx() is called before @p->cpus_ptr is + * updated and thus another CPU may see @p on a DSQ inbetween leading to + * @p passing the below task_allowed_on_cpu() check while migration is + * disabled. + * + * Test the migration disabled state first as the race window is narrow + * and the BPF scheduler failing to check migration disabled state can + * easily be masked if task_allowed_on_cpu() is done first. 
+ */ + if (unlikely(is_migration_disabled(p))) { + if (trigger_error) + scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); + return false; + } + /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline @@ -2351,17 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ if (!task_allowed_on_cpu(p, cpu)) { if (trigger_error) - scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", - cpu_of(rq), p->comm, p->pid); + scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } - /* - * If @p has migration disabled, @p->cpus_ptr only contains its current - * CPU and the above task_allowed_on_cpu() test should have failed. - */ - SCHED_WARN_ON(is_migration_disabled(p)); - if (!scx_rq_online(rq)) return false; From de1d0d160f64ee76df1d364d521b2faf465a091c Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Thu, 6 Feb 2025 18:46:00 +0100 Subject: [PATCH 106/310] gpio: bcm-kona: Fix GPIO lock/unlock for banks above bank 0 The GPIO lock/unlock functions clear/write a bit to the relevant register for each bank. However, due to an oversight the bit that was being written was based on the total GPIO number, not the index of the GPIO within the relevant bank, causing it to fail for any GPIO above 32 (thus any GPIO for banks above bank 0). Fix lock/unlock for these banks by using the correct bit. 
Fixes: bdb93c03c550 ("gpio: bcm281xx: Centralize register locking") Reviewed-by: Florian Fainelli Reviewed-by: Markus Mayer Signed-off-by: Artur Weber Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250206-kona-gpio-fixes-v2-1-409135eab780@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-bcm-kona.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c index 5321ef98f442..77bd4ec93a23 100644 --- a/drivers/gpio/gpio-bcm-kona.c +++ b/drivers/gpio/gpio-bcm-kona.c @@ -86,11 +86,12 @@ static void bcm_kona_gpio_lock_gpio(struct bcm_kona_gpio *kona_gpio, u32 val; unsigned long flags; int bank_id = GPIO_BANK(gpio); + int bit = GPIO_BIT(gpio); raw_spin_lock_irqsave(&kona_gpio->lock, flags); val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); - val |= BIT(gpio); + val |= BIT(bit); bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); @@ -102,11 +103,12 @@ static void bcm_kona_gpio_unlock_gpio(struct bcm_kona_gpio *kona_gpio, u32 val; unsigned long flags; int bank_id = GPIO_BANK(gpio); + int bit = GPIO_BIT(gpio); raw_spin_lock_irqsave(&kona_gpio->lock, flags); val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); - val &= ~BIT(gpio); + val &= ~BIT(bit); bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); From 57f5db77a915cc29461a679a6bcae7097967be1a Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Thu, 6 Feb 2025 18:46:01 +0100 Subject: [PATCH 107/310] gpio: bcm-kona: Make sure GPIO bits are unlocked when requesting IRQ The settings for all GPIOs are locked by default in bcm_kona_gpio_reset. The settings for a GPIO are unlocked when requesting it as a GPIO, but not when requesting it as an interrupt, causing the IRQ settings to not get applied. Fix this by making sure to unlock the right bits when an IRQ is requested. 
To avoid a situation where an IRQ being released causes a lock despite the same GPIO being used by a GPIO request or vice versa, add an unlock counter and only lock if it reaches 0. Fixes: 757651e3d60e ("gpio: bcm281xx: Add GPIO driver") Reviewed-by: Florian Fainelli Reviewed-by: Markus Mayer Signed-off-by: Artur Weber Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250206-kona-gpio-fixes-v2-2-409135eab780@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-bcm-kona.c | 67 +++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c index 77bd4ec93a23..17f3f210fee9 100644 --- a/drivers/gpio/gpio-bcm-kona.c +++ b/drivers/gpio/gpio-bcm-kona.c @@ -69,6 +69,22 @@ struct bcm_kona_gpio { struct bcm_kona_gpio_bank { int id; int irq; + /* + * Used to keep track of lock/unlock operations for each GPIO in the + * bank. + * + * All GPIOs are locked by default (see bcm_kona_gpio_reset), and the + * unlock count for all GPIOs is 0 by default. Each unlock increments + * the counter, and each lock decrements the counter. + * + * The lock function only locks the GPIO once its unlock counter is + * down to 0. This is necessary because the GPIO is unlocked in two + * places in this driver: once for requested GPIOs, and once for + * requested IRQs. Since it is possible for a GPIO to be requested + * as both a GPIO and an IRQ, we need to ensure that we don't lock it + * too early. 
+ */ + u8 gpio_unlock_count[GPIO_PER_BANK]; /* Used in the interrupt handler */ struct bcm_kona_gpio *kona_gpio; }; @@ -87,14 +103,23 @@ static void bcm_kona_gpio_lock_gpio(struct bcm_kona_gpio *kona_gpio, unsigned long flags; int bank_id = GPIO_BANK(gpio); int bit = GPIO_BIT(gpio); + struct bcm_kona_gpio_bank *bank = &kona_gpio->banks[bank_id]; - raw_spin_lock_irqsave(&kona_gpio->lock, flags); + if (bank->gpio_unlock_count[bit] == 0) { + dev_err(kona_gpio->gpio_chip.parent, + "Unbalanced locks for GPIO %u\n", gpio); + return; + } - val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); - val |= BIT(bit); - bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); + if (--bank->gpio_unlock_count[bit] == 0) { + raw_spin_lock_irqsave(&kona_gpio->lock, flags); - raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); + val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); + val |= BIT(bit); + bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); + + raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); + } } static void bcm_kona_gpio_unlock_gpio(struct bcm_kona_gpio *kona_gpio, @@ -104,14 +129,19 @@ static void bcm_kona_gpio_unlock_gpio(struct bcm_kona_gpio *kona_gpio, unsigned long flags; int bank_id = GPIO_BANK(gpio); int bit = GPIO_BIT(gpio); + struct bcm_kona_gpio_bank *bank = &kona_gpio->banks[bank_id]; - raw_spin_lock_irqsave(&kona_gpio->lock, flags); + if (bank->gpio_unlock_count[bit] == 0) { + raw_spin_lock_irqsave(&kona_gpio->lock, flags); - val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); - val &= ~BIT(bit); - bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); + val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); + val &= ~BIT(bit); + bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); - raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); + raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); + } + + ++bank->gpio_unlock_count[bit]; } static int bcm_kona_gpio_get_dir(struct 
gpio_chip *chip, unsigned gpio) @@ -362,6 +392,7 @@ static void bcm_kona_gpio_irq_mask(struct irq_data *d) kona_gpio = irq_data_get_irq_chip_data(d); reg_base = kona_gpio->reg_base; + raw_spin_lock_irqsave(&kona_gpio->lock, flags); val = readl(reg_base + GPIO_INT_MASK(bank_id)); @@ -384,6 +415,7 @@ static void bcm_kona_gpio_irq_unmask(struct irq_data *d) kona_gpio = irq_data_get_irq_chip_data(d); reg_base = kona_gpio->reg_base; + raw_spin_lock_irqsave(&kona_gpio->lock, flags); val = readl(reg_base + GPIO_INT_MSKCLR(bank_id)); @@ -479,15 +511,26 @@ static void bcm_kona_gpio_irq_handler(struct irq_desc *desc) static int bcm_kona_gpio_irq_reqres(struct irq_data *d) { struct bcm_kona_gpio *kona_gpio = irq_data_get_irq_chip_data(d); + unsigned int gpio = d->hwirq; + + /* + * We need to unlock the GPIO before any other operations are performed + * on the relevant GPIO configuration registers + */ + bcm_kona_gpio_unlock_gpio(kona_gpio, gpio); - return gpiochip_reqres_irq(&kona_gpio->gpio_chip, d->hwirq); + return gpiochip_reqres_irq(&kona_gpio->gpio_chip, gpio); } static void bcm_kona_gpio_irq_relres(struct irq_data *d) { struct bcm_kona_gpio *kona_gpio = irq_data_get_irq_chip_data(d); + unsigned int gpio = d->hwirq; + + /* Once we no longer use it, lock the GPIO again */ + bcm_kona_gpio_lock_gpio(kona_gpio, gpio); - gpiochip_relres_irq(&kona_gpio->gpio_chip, d->hwirq); + gpiochip_relres_irq(&kona_gpio->gpio_chip, gpio); } static struct irq_chip bcm_gpio_irq_chip = { From 615279db222c3ac56d5c93716efd72b843295c1f Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Thu, 6 Feb 2025 18:46:02 +0100 Subject: [PATCH 108/310] gpio: bcm-kona: Add missing newline to dev_err format string Add a missing newline to the format string of the "Couldn't get IRQ for bank..." error message. 
Fixes: 757651e3d60e ("gpio: bcm281xx: Add GPIO driver") Reviewed-by: Florian Fainelli Reviewed-by: Markus Mayer Signed-off-by: Artur Weber Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250206-kona-gpio-fixes-v2-3-409135eab780@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-bcm-kona.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c index 17f3f210fee9..64908f1a5e7f 100644 --- a/drivers/gpio/gpio-bcm-kona.c +++ b/drivers/gpio/gpio-bcm-kona.c @@ -659,7 +659,7 @@ static int bcm_kona_gpio_probe(struct platform_device *pdev) bank->irq = platform_get_irq(pdev, i); bank->kona_gpio = kona_gpio; if (bank->irq < 0) { - dev_err(dev, "Couldn't get IRQ for bank %d", i); + dev_err(dev, "Couldn't get IRQ for bank %d\n", i); ret = -ENOENT; goto err_irq_domain; } From 2afd96a4a0b1d62c7a44227e535b073926d73368 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Tue, 11 Feb 2025 16:39:41 +0800 Subject: [PATCH 109/310] ALSA: hda/tas2781: Update tas2781 hda SPI driver Because of a platform firmware issue, the SPI device was found to be unstable, so add a status check before firmware download, and remove some operations which are not necessary at the current stage. 
Signed-off-by: Baojun Xu Fixes: bb5f86ea50ff ("ALSA: hda/tas2781: Add tas2781 hda SPI driver") Link: https://patch.msgid.link/20250211083941.5574-1-baojun.xu@ti.com Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_spi.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_spi.c b/sound/pci/hda/tas2781_hda_spi.c index a42fa990e7b9..04db80af53c0 100644 --- a/sound/pci/hda/tas2781_hda_spi.c +++ b/sound/pci/hda/tas2781_hda_spi.c @@ -912,7 +912,7 @@ static void tasdev_fw_ready(const struct firmware *fmw, void *context) struct tasdevice_priv *tas_priv = context; struct tas2781_hda *tas_hda = dev_get_drvdata(tas_priv->dev); struct hda_codec *codec = tas_priv->codec; - int i, j, ret; + int i, j, ret, val; pm_runtime_get_sync(tas_priv->dev); guard(mutex)(&tas_priv->codec_lock); @@ -981,13 +981,16 @@ static void tasdev_fw_ready(const struct firmware *fmw, void *context) /* Perform AMP reset before firmware download. */ tas_priv->rcabin.profile_cfg_id = TAS2781_PRE_POST_RESET_CFG; - tasdevice_spi_tuning_switch(tas_priv, 0); tas2781_spi_reset(tas_priv); tas_priv->rcabin.profile_cfg_id = 0; - tasdevice_spi_tuning_switch(tas_priv, 1); tas_priv->fw_state = TASDEVICE_DSP_FW_ALL_OK; - ret = tasdevice_spi_prmg_load(tas_priv, 0); + ret = tasdevice_spi_dev_read(tas_priv, TAS2781_REG_CLK_CONFIG, &val); + if (ret < 0) + goto out; + + if (val == TAS2781_REG_CLK_CONFIG_RESET) + ret = tasdevice_spi_prmg_load(tas_priv, 0); if (ret < 0) { dev_err(tas_priv->dev, "FW download failed = %d\n", ret); goto out; @@ -1001,7 +1004,6 @@ static void tasdev_fw_ready(const struct firmware *fmw, void *context) * If calibrated data occurs error, dsp will still works with default * calibrated data inside algo. 
*/ - tas_priv->save_calibration(tas_priv); out: if (fmw) @@ -1160,7 +1162,8 @@ static int tas2781_runtime_suspend(struct device *dev) guard(mutex)(&tas_hda->priv->codec_lock); - tasdevice_spi_tuning_switch(tas_hda->priv, 1); + if (tas_hda->priv->playback_started) + tasdevice_spi_tuning_switch(tas_hda->priv, 1); tas_hda->priv->cur_book = -1; tas_hda->priv->cur_conf = -1; @@ -1174,7 +1177,8 @@ static int tas2781_runtime_resume(struct device *dev) guard(mutex)(&tas_hda->priv->codec_lock); - tasdevice_spi_tuning_switch(tas_hda->priv, 0); + if (tas_hda->priv->playback_started) + tasdevice_spi_tuning_switch(tas_hda->priv, 0); return 0; } @@ -1189,12 +1193,9 @@ static int tas2781_system_suspend(struct device *dev) return ret; /* Shutdown chip before system suspend */ - tasdevice_spi_tuning_switch(tas_hda->priv, 1); - tas2781_spi_reset(tas_hda->priv); - /* - * Reset GPIO may be shared, so cannot reset here. - * However beyond this point, amps may be powered down. - */ + if (tas_hda->priv->playback_started) + tasdevice_spi_tuning_switch(tas_hda->priv, 1); + return 0; } From db79e75460fc59b19f9c89d4b068e61cee59f37d Mon Sep 17 00:00:00 2001 From: "Chester A. Unal" Date: Fri, 24 Jan 2025 10:28:00 +0000 Subject: [PATCH 110/310] USB: serial: option: add MeiG Smart SLM828 MeiG Smart SLM828 is an LTE-A CAT6 modem with the mPCIe form factor. The "Cls=ff(vend.) Sub=10 Prot=02" and "Cls=ff(vend.) Sub=10 Prot=03" interfaces respond to AT commands. Add these interfaces. The product ID the modem uses is shared across multiple modems. Therefore, add comments to describe which interface is used for which modem. T: Bus=01 Lev=01 Prnt=05 Port=01 Cnt=01 Dev#= 6 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2dee ProdID=4d22 Rev=05.04 S: Manufacturer=MEIG S: Product=LTE-A Module S: SerialNumber=4da7ec42 C: #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=10 Prot=01 Driver=(none) E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=10 Prot=02 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=10 Prot=03 Driver=(none) E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=10 Prot=04 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=88(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=10 Prot=05 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Chester A. Unal Link: https://lore.kernel.org/20250124-for-johan-meig-slm828-v2-1-6b4cd3f6344f@arinc9.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 1e2ae0c6c41c..887a1c687b52 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -621,7 +621,10 @@ static void option_instat_callback(struct urb *urb); /* MeiG Smart Technology products */ #define MEIGSMART_VENDOR_ID 0x2dee -/* MeiG Smart SRM815/SRM825L based on Qualcomm 315 */ +/* + * MeiG Smart SLM828, SRM815, and SRM825L use the same product ID. SLM828 is + * based on Qualcomm SDX12. SRM815 and SRM825L are based on Qualcomm 315. 
+ */ #define MEIGSMART_PRODUCT_SRM825L 0x4d22 /* MeiG Smart SLM320 based on UNISOC UIS8910 */ #define MEIGSMART_PRODUCT_SLM320 0x4d41 @@ -2405,10 +2408,12 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM770A, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0, 0) }, /* MeiG Smart SRM815 */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0x10, 0x02) }, /* MeiG Smart SLM828 */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0x10, 0x03) }, /* MeiG Smart SLM828 */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, /* MeiG Smart SRM815 and SRM825L */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, /* MeiG Smart SRM825L */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, /* MeiG Smart SRM825L */ { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff), /* TCL IK512 MBIM */ .driver_info = NCTRL(1) }, { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff), /* TCL IK512 ECM */ From c979fb5ece2dc11cc9cc3d5c66f750e210bfdee2 Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Wed, 5 Feb 2025 18:16:45 +0100 Subject: [PATCH 111/310] USB: serial: 
option: add Telit Cinterion FN990B compositions Add the following Telit Cinterion FN990B40 compositions: 0x10d0: rmnet + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) + tty (diag) + DPL + QDSS (Qualcomm Debug SubSystem) + adb T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10d0 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN990 S: SerialNumber=43b38f19 C: #Ifs= 9 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8c(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=8d(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 8 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=usbfs E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10d1: MBIM + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) + tty (diag) + DPL + QDSS (Qualcomm Debug SubSystem) + adb T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 16 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10d1 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN990 S: SerialNumber=43b38f19 C: #Ifs=10 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8c(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 8 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=8d(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=usbfs E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10d2: RNDIS + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) + tty (diag) + DPL + QDSS (Qualcomm Debug SubSystem) + adb T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 18 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10d2 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN990 S: SerialNumber=43b38f19 C: #Ifs=10 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=ef(misc ) Sub=04 Prot=01 Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8c(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 8 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=8d(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=usbfs E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10d3: ECM + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) + tty (diag) + DPL + QDSS (Qualcomm Debug SubSystem) + adb T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 20 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10d3 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN990 S: SerialNumber=43b38f19 C: #Ifs=10 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8c(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 8 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=8d(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=usbfs E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Cc: stable@vger.kernel.org Signed-off-by: Fabio Porcedda Reviewed-by: Daniele Palmas Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 887a1c687b52..7f6eff505085 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1406,6 +1406,22 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(0) | NCTRL(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c8, 0xff), /* Telit FE910C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d0, 0x60) }, /* Telit FN990B (rmnet) */ + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d0, 0x40) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d0, 0x30), + .driver_info = NCTRL(5) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d1, 0x60) }, /* Telit FN990B (MBIM) */ + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d1, 0x40) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d1, 0x30), + .driver_info = NCTRL(6) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d2, 0x60) }, /* Telit FN990B (RNDIS) */ + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d2, 0x40) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d2, 0x30), + .driver_info = NCTRL(6) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d3, 0x60) }, /* Telit FN990B (ECM) */ + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d3, 0x40) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d3, 0x30), + .driver_info = NCTRL(6) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), From 12606fe73f33647c5e79bf666833bf0b225e649d Mon Sep 17 
00:00:00 2001 From: Fabio Porcedda Date: Wed, 5 Feb 2025 18:16:47 +0100 Subject: [PATCH 112/310] USB: serial: option: fix Telit Cinterion FN990A name The correct name for FN990 is FN990A so use it in order to avoid confusion with FN990B. Signed-off-by: Fabio Porcedda Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 7f6eff505085..4a59a40f750a 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1370,15 +1370,15 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1063, 0xff), /* Telit LN920 (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1070, 0xff), /* Telit FN990 (rmnet) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1070, 0xff), /* Telit FN990A (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1071, 0xff), /* Telit FN990 (MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1071, 0xff), /* Telit FN990A (MBIM) */ .driver_info = NCTRL(0) | RSVD(1) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1072, 0xff), /* Telit FN990 (RNDIS) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1072, 0xff), /* Telit FN990A (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1073, 0xff), /* Telit FN990 (ECM) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1073, 0xff), /* Telit FN990A (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff), /* Telit FN990 (PCIe) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff), /* Telit FN990A (PCIe) */ .driver_info = RSVD(0) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1080, 0xff), /* Telit FE990 (rmnet) */ 
.driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, From 6aa8a63c471eb6756aabd03f880feffe6a7af6c9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 11 Feb 2025 15:45:16 +0100 Subject: [PATCH 113/310] USB: serial: option: drop MeiG Smart defines Several MeiG Smart modems apparently use the same product id, making the defines even less useful. Drop them in favour of using comments consistently to make the id table slightly less unwieldy. Cc: stable@vger.kernel.org Acked-by: Chester A. Unal Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 4a59a40f750a..58bd54e8c483 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -619,18 +619,6 @@ static void option_instat_callback(struct urb *urb); /* Luat Air72*U series based on UNISOC UIS8910 uses UNISOC's vendor ID */ #define LUAT_PRODUCT_AIR720U 0x4e00 -/* MeiG Smart Technology products */ -#define MEIGSMART_VENDOR_ID 0x2dee -/* - * MeiG Smart SLM828, SRM815, and SRM825L use the same product ID. SLM828 is - * based on Qualcomm SDX12. SRM815 and SRM825L are based on Qualcomm 315. 
- */ -#define MEIGSMART_PRODUCT_SRM825L 0x4d22 -/* MeiG Smart SLM320 based on UNISOC UIS8910 */ -#define MEIGSMART_PRODUCT_SLM320 0x4d41 -/* MeiG Smart SLM770A based on ASR1803 */ -#define MEIGSMART_PRODUCT_SLM770A 0x4d57 - /* Device flags */ /* Highest interface number which can be used with NCTRL() and RSVD() */ @@ -2366,6 +2354,14 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a05, 0xff) }, /* Fibocom FM650-CN (NCM mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a06, 0xff) }, /* Fibocom FM650-CN (RNDIS mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a07, 0xff) }, /* Fibocom FM650-CN (MBIM mode) */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d41, 0xff, 0, 0) }, /* MeiG Smart SLM320 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d57, 0xff, 0, 0) }, /* MeiG Smart SLM770A */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0, 0) }, /* MeiG Smart SRM815 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0x10, 0x02) }, /* MeiG Smart SLM828 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0x10, 0x03) }, /* MeiG Smart SLM828 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x30) }, /* MeiG Smart SRM815 and SRM825L */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x40) }, /* MeiG Smart SRM825L */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x60) }, /* MeiG Smart SRM825L */ { USB_DEVICE_INTERFACE_CLASS(0x2df3, 0x9d03, 0xff) }, /* LongSung M5710 */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ @@ -2422,14 +2418,6 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, - { 
USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM770A, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0, 0) }, /* MeiG Smart SRM815 */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0x10, 0x02) }, /* MeiG Smart SLM828 */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0x10, 0x03) }, /* MeiG Smart SLM828 */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, /* MeiG Smart SRM815 and SRM825L */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, /* MeiG Smart SRM825L */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, /* MeiG Smart SRM825L */ { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff), /* TCL IK512 MBIM */ .driver_info = NCTRL(1) }, { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff), /* TCL IK512 ECM */ From 35e21de48e693af1dcfdbf2dc3d73dcfa3c8f2d9 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Tue, 11 Feb 2025 16:48:06 +0100 Subject: [PATCH 114/310] regulator: core: let dt properties override driver init_data This reverts commit cd7a38c40b231350a3cd0fd774f4e6bb68c4b411. When submitting the change above, it was thought that the origin of the init_data should be a clear choice, from the driver or from DT but not both. It turns out some devices, such as qcom-msm8974-lge-nexus5-hammerhead, relied on the old behaviour to override the init_data provided by the driver, making it some kind of default if none is provided by the platform. 
Using the init_data provided by the driver when it is present broke these devices so revert the change to fixup the situation and add a comment to make things a bit more clear Reported-by: Luca Weiss Closes: https://lore.kernel.org/lkml/5857103.DvuYhMxLoT@lucaweiss.eu Fixes: cd7a38c40b23 ("regulator: core: do not silently ignore provided init_data") Signed-off-by: Jerome Brunet Link: https://patch.msgid.link/20250211-regulator-init-data-fixup-v1-1-5ce1c6cff990@baylibre.com Signed-off-by: Mark Brown --- drivers/regulator/core.c | 61 ++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 89578b91c468..4ddf0efead68 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -5774,43 +5774,36 @@ regulator_register(struct device *dev, goto clean; } - if (config->init_data) { - /* - * Providing of_match means the framework is expected to parse - * DT to get the init_data. This would conflict with provided - * init_data, if set. Warn if it happens. - */ - if (regulator_desc->of_match) - dev_warn(dev, "Using provided init data - OF match ignored\n"); + /* + * DT may override the config->init_data provided if the platform + * needs to do so. If so, config->init_data is completely ignored. + */ + init_data = regulator_of_get_init_data(dev, regulator_desc, config, + &rdev->dev.of_node); + /* + * Sometimes not all resources are probed already so we need to take + * that into account. This happens most the time if the ena_gpiod comes + * from a gpio extender or something else. + */ + if (PTR_ERR(init_data) == -EPROBE_DEFER) { + ret = -EPROBE_DEFER; + goto clean; + } + + /* + * We need to keep track of any GPIO descriptor coming from the + * device tree until we have handled it over to the core. 
If the + * config that was passed in to this function DOES NOT contain + * a descriptor, and the config after this call DOES contain + * a descriptor, we definitely got one from parsing the device + * tree. + */ + if (!cfg->ena_gpiod && config->ena_gpiod) + dangling_of_gpiod = true; + if (!init_data) { init_data = config->init_data; rdev->dev.of_node = of_node_get(config->of_node); - - } else { - init_data = regulator_of_get_init_data(dev, regulator_desc, - config, - &rdev->dev.of_node); - - /* - * Sometimes not all resources are probed already so we need to - * take that into account. This happens most the time if the - * ena_gpiod comes from a gpio extender or something else. - */ - if (PTR_ERR(init_data) == -EPROBE_DEFER) { - ret = -EPROBE_DEFER; - goto clean; - } - - /* - * We need to keep track of any GPIO descriptor coming from the - * device tree until we have handled it over to the core. If the - * config that was passed in to this function DOES NOT contain a - * descriptor, and the config after this call DOES contain a - * descriptor, we definitely got one from parsing the device - * tree. - */ - if (!cfg->ena_gpiod && config->ena_gpiod) - dangling_of_gpiod = true; } ww_mutex_init(&rdev->mutex, &regulator_ww_class); From 8d1d1e8d3345b56d3d8a64f845962c71468cd776 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 30 Jan 2025 14:56:07 +0100 Subject: [PATCH 115/310] s390/configs: Remove CONFIG_LSM s390 defconfig does not have BPF LSM, resulting in systemd[1]: bpf-restrict-fs: BPF LSM hook not enabled in the kernel, BPF LSM not supported. with the respective kernels. The other architectures do not explicitly set it, and the default values have BPF in them, so just drop it.
Reported-by: Marc Hartmayer Acked-by: Vasily Gorbik Signed-off-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - arch/s390/configs/zfcpdump_defconfig | 1 - 3 files changed, 3 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index d6beec5292a0..44f01a4bc810 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -740,7 +740,6 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8cfbfb10bba8..8bcd37edd3c9 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -725,7 +725,6 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index bcbaa069de96..853b2326a171 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -62,7 +62,6 @@ CONFIG_ZFCP=y # CONFIG_INOTIFY_USER is not set # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_LSM="yama,loadpin,safesetid,integrity" # CONFIG_ZLIB_DFLTCC is not set CONFIG_XZ_DEC_MICROLZMA=y CONFIG_PRINTK_TIME=y From 32ae4a2992529e2c7934e422035fad1d9b0f1fb5 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Fri, 31 Jan 2025 12:02:55 +0100 Subject: [PATCH 116/310] s390/cio: Fix CHPID "configure" attribute caching In some environments, the SCLP firmware interface used to query a CHPID's configured state is not 
supported. On these environments, rapidly reading the corresponding sysfs attribute produces inconsistent results: $ cat /sys/devices/css0/chp0.00/configure cat: /sys/devices/css0/chp0.00/configure: Operation not supported $ cat /sys/devices/css0/chp0.00/configure 3 This occurs for example when Linux is run as a KVM guest. The inconsistency is a result of CIO using cached results for generating the value of the "configure" attribute while failing to handle the situation where no data was returned by SCLP. Fix this by not updating the cache-expiration timestamp when SCLP returns no data. With the fix applied, the system response is consistent: $ cat /sys/devices/css0/chp0.00/configure cat: /sys/devices/css0/chp0.00/configure: Operation not supported $ cat /sys/devices/css0/chp0.00/configure cat: /sys/devices/css0/chp0.00/configure: Operation not supported Reviewed-by: Vineeth Vijayan Reviewed-by: Eric Farman Tested-by: Eric Farman Signed-off-by: Peter Oberparleiter Signed-off-by: Vasily Gorbik --- drivers/s390/cio/chp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 4a0b3f19bd8e..4f01b1929240 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -695,7 +695,8 @@ static int info_update(void) if (time_after(jiffies, chp_info_expires)) { /* Data is too old, update. 
*/ rc = sclp_chp_read_info(&chp_info); - chp_info_expires = jiffies + CHP_INFO_UPDATE_INTERVAL ; + if (!rc) + chp_info_expires = jiffies + CHP_INFO_UPDATE_INTERVAL; } mutex_unlock(&info_lock); From 6166caf3bbe2429e4fac71b77e1c8254f2690383 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 10 Feb 2025 09:56:53 +0100 Subject: [PATCH 117/310] s390/bitops: Disable arch_test_bit() optimization for PROFILE_ALL_BRANCHES With PROFILE_ALL_BRANCHES enabled gcc sometimes fails to handle __builtin_constant_p() correctly: In function 'arch_test_bit', inlined from 'node_state' at include/linux/nodemask.h:423:9, inlined from 'warn_if_node_offline' at include/linux/gfp.h:252:2, inlined from '__alloc_pages_node_noprof' at include/linux/gfp.h:267:2, inlined from 'alloc_pages_node_noprof' at include/linux/gfp.h:296:9, inlined from 'vm_area_alloc_pages.constprop' at mm/vmalloc.c:3591:11: >> arch/s390/include/asm/bitops.h:60:17: warning: 'asm' operand 2 probably does not match constraints 60 | asm volatile( | ^~~ >> arch/s390/include/asm/bitops.h:60:17: error: impossible constraint in 'asm' Therefore disable the optimization for this case. 
This is similar to commit 63678eecec57 ("s390/preempt: disable __preempt_count_add() optimization for PROFILE_ALL_BRANCHES") Fixes: b2bc1b1a77c0 ("s390/bitops: Provide optimized arch_test_bit()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502091912.xL2xTCGw-lkp@intel.com/ Acked-by: Vasily Gorbik Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/bitops.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index d5125296ade2..a5ca0a947691 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -53,7 +53,11 @@ static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsig unsigned long mask; int cc; - if (__builtin_constant_p(nr)) { + /* + * With CONFIG_PROFILE_ALL_BRANCHES enabled gcc fails to + * handle __builtin_constant_p() in some cases. + */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && __builtin_constant_p(nr)) { addr = (const volatile unsigned char *)ptr; addr += (nr ^ (BITS_PER_LONG - BITS_PER_BYTE)) / BITS_PER_BYTE; mask = 1UL << (nr & (BITS_PER_BYTE - 1)); From 05793884a1f30509e477de9da233ab73584b1c8c Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 7 Feb 2025 13:30:16 +0100 Subject: [PATCH 118/310] s390/pci: Pull search for parent PF out of zpci_iov_setup_virtfn() This creates a new zpci_iov_find_parent_pf() function which a future commit can use to find if a VF has a configured parent PF. Use zdev->rid instead of zdev->devfn such that the new function can be used before it has been decided if the RID will be exposed and zdev->devfn is set. Also handle the hypothetical case that the RID is not available but there is an otherwise matching zbus.
Fixes: 25f39d3dcb48 ("s390/pci: Ignore RID for isolated VFs") Cc: stable@vger.kernel.org Reviewed-by: Halil Pasic Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_iov.c | 56 ++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c index ead062bf2b41..c7fdf5e79b3c 100644 --- a/arch/s390/pci/pci_iov.c +++ b/arch/s390/pci/pci_iov.c @@ -60,18 +60,35 @@ static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, in return 0; } -int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +/** + * zpci_iov_find_parent_pf - Find the parent PF, if any, of the given function + * @zbus: The bus that the PCI function is on, or would be added on + * @zdev: The PCI function + * + * Finds the parent PF, if it exists and is configured, of the given PCI function + * and increments its refcount. Th PF is searched for on the provided bus so the + * caller has to ensure that this is the correct bus to search. This function may + * be used before adding the PCI function to a zbus. + * + * Return: Pointer to the struct pci_dev of the parent PF or NULL if it not + * found. If the function is not a VF or has no RequesterID information, + * NULL is returned as well. 
+ */ +static struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) { - int i, cand_devfn; - struct zpci_dev *zdev; + int i, vfid, devfn, cand_devfn; struct pci_dev *pdev; - int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ - int rc = 0; if (!zbus->multifunction) - return 0; - - /* If the parent PF for the given VF is also configured in the + return NULL; + /* Non-VFs and VFs without RID available don't have a parent */ + if (!zdev->vfn || !zdev->rid_available) + return NULL; + /* Linux vfid starts at 0 vfn at 1 */ + vfid = zdev->vfn - 1; + devfn = zdev->rid & ZPCI_RID_MASK_DEVFN; + /* + * If the parent PF for the given VF is also configured in the * instance, it must be on the same zbus. * We can then identify the parent PF by checking what * devfn the VF would have if it belonged to that PF using the PF's @@ -85,15 +102,26 @@ int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn if (!pdev) continue; cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); - if (cand_devfn == virtfn->devfn) { - rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); - /* balance pci_get_slot() */ - pci_dev_put(pdev); - break; - } + if (cand_devfn == devfn) + return pdev; /* balance pci_get_slot() */ pci_dev_put(pdev); } } + return NULL; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + struct zpci_dev *zdev = to_zpci(virtfn); + struct pci_dev *pdev_pf; + int rc = 0; + + pdev_pf = zpci_iov_find_parent_pf(zbus, zdev); + if (pdev_pf) { + /* Linux' vfids start at 0 while zdev->vfn starts at 1 */ + rc = zpci_iov_link_virtfn(pdev_pf, virtfn, zdev->vfn - 1); + pci_dev_put(pdev_pf); + } return rc; } From 2844ddbd540fc84d7571cca65d6c43088e4d6952 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 7 Feb 2025 13:30:17 +0100 Subject: [PATCH 119/310] s390/pci: Fix handling of isolated VFs In contrast to the commit message of the fixed commit VFs whose parent PF is not configured are not always 
isolated, that is put on their own PCI domain. This is because for VFs to be added to an existing PCI domain it is enough for that PCI domain to share the same topology ID or PCHID. Such a matching PCI domain without a parent PF may exist when a PF from the same PCI card created the domain with the VF being a child of a different, non accessible, PF. While not causing technical issues it makes the rules which VFs are isolated inconsistent. Fix this by explicitly checking that the parent PF exists on the PCI domain determined by the topology ID or PCHID before registering the VF. This works because a parent PF which is under control of this Linux instance must be enabled and configured at the point where its child VFs appear because otherwise SR-IOV could not have been enabled on the parent. Fixes: 25f39d3dcb48 ("s390/pci: Ignore RID for isolated VFs") Cc: stable@vger.kernel.org Reviewed-by: Halil Pasic Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_bus.c | 20 ++++++++++++++++++++ arch/s390/pci/pci_iov.c | 2 +- arch/s390/pci/pci_iov.h | 7 +++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 857afbc4828f..39a481ec4a40 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -331,6 +331,17 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) return rc; } +static bool zpci_bus_is_isolated_vf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + struct pci_dev *pdev; + + pdev = zpci_iov_find_parent_pf(zbus, zdev); + if (!pdev) + return true; + pci_dev_put(pdev); + return false; +} + int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) { bool topo_is_tid = zdev->tid_avail; @@ -345,6 +356,15 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) topo = topo_is_tid ? 
zdev->tid : zdev->pchid; zbus = zpci_bus_get(topo, topo_is_tid); + /* + * An isolated VF gets its own domain/bus even if there exists + * a matching domain/bus already + */ + if (zbus && zpci_bus_is_isolated_vf(zbus, zdev)) { + zpci_bus_put(zbus); + zbus = NULL; + } + if (!zbus) { zbus = zpci_bus_alloc(topo, topo_is_tid); if (!zbus) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c index c7fdf5e79b3c..191e56a623f6 100644 --- a/arch/s390/pci/pci_iov.c +++ b/arch/s390/pci/pci_iov.c @@ -74,7 +74,7 @@ static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, in * found. If the function is not a VF or has no RequesterID information, * NULL is returned as well. */ -static struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) { int i, vfid, devfn, cand_devfn; struct pci_dev *pdev; diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h index e3fa4e77fc86..d2c2793eb0f3 100644 --- a/arch/s390/pci/pci_iov.h +++ b/arch/s390/pci/pci_iov.h @@ -19,6 +19,8 @@ void zpci_iov_map_resources(struct pci_dev *pdev); int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev); + #else /* CONFIG_PCI_IOV */ static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} @@ -28,5 +30,10 @@ static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *v { return 0; } + +static inline struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + return NULL; +} #endif /* CONFIG_PCI_IOV */ #endif /* __S390_PCI_IOV_h */ From c195b9c6ab9c383d7aa3f4a65879b3ca90cb378b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 8 Feb 2025 15:49:07 +0800 Subject: [PATCH 120/310] thermal/netlink: Prevent userspace segmentation fault by adjusting UAPI header The intel-lpmd tool [1], which 
uses the THERMAL_GENL_ATTR_CPU_CAPABILITY attribute to receive HFI events from kernel space, encounters a segmentation fault after commit 1773572863c4 ("thermal: netlink: Add the commands and the events for the thresholds"). The issue arises because the THERMAL_GENL_ATTR_CPU_CAPABILITY raw value was changed while intel_lpmd still uses the old value. Although intel_lpmd can be updated to check the THERMAL_GENL_VERSION and use the appropriate THERMAL_GENL_ATTR_CPU_CAPABILITY value, the commit itself is questionable. The commit introduced a new element in the middle of enum thermal_genl_attr, which affects many existing attributes and introduces potential risks and unnecessary maintenance burdens for userspace thermal netlink event users. Solve the issue by moving the newly introduced THERMAL_GENL_ATTR_TZ_PREV_TEMP attribute to the end of the enum thermal_genl_attr. This ensures that all existing thermal generic netlink attributes remain unaffected. Link: https://github.com/intel/intel-lpmd [1] Fixes: 1773572863c4 ("thermal: netlink: Add the commands and the events for the thresholds") Signed-off-by: Zhang Rui Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/20250208074907.5679-1-rui.zhang@intel.com [ rjw: Subject edits ] Signed-off-by: Rafael J. 
Wysocki --- include/uapi/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index 349718c271eb..46a2633d33aa 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -30,7 +30,6 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_TZ, THERMAL_GENL_ATTR_TZ_ID, THERMAL_GENL_ATTR_TZ_TEMP, - THERMAL_GENL_ATTR_TZ_PREV_TEMP, THERMAL_GENL_ATTR_TZ_TRIP, THERMAL_GENL_ATTR_TZ_TRIP_ID, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, @@ -54,6 +53,7 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_THRESHOLD, THERMAL_GENL_ATTR_THRESHOLD_TEMP, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, + THERMAL_GENL_ATTR_TZ_PREV_TEMP, __THERMAL_GENL_ATTR_MAX, }; #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) From a6768c4f92e152265590371975d44c071a5279c7 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 11 Feb 2025 09:47:11 +0100 Subject: [PATCH 121/310] thermal/cpufreq_cooling: Remove structure member documentation The structure member documentation refers to a member which does not exist any more. Remove it. Link: https://lore.kernel.org/all/202501220046.h3PMBCti-lkp@intel.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501220046.h3PMBCti-lkp@intel.com/ Signed-off-by: Daniel Lezcano Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250211084712.2746705-1-daniel.lezcano@linaro.org [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/cpufreq_cooling.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 280071be30b1..6b7ab1814c12 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -57,8 +57,6 @@ struct time_in_idle { * @max_level: maximum cooling level. One less than total number of valid * cpufreq frequencies. 
* @em: Reference on the Energy Model of the device - * @cdev: thermal_cooling_device pointer to keep track of the - * registered cooling device. * @policy: cpufreq policy. * @cooling_ops: cpufreq callbacks to thermal cooling device ops * @idle_time: idle time stats From f1bf10d7e909fe898a112f5cae1e97ce34d6484d Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 11 Feb 2025 10:00:25 +0000 Subject: [PATCH 122/310] cifs: pick channels for individual subrequests The netfs library could break down a read request into multiple subrequests. When multichannel is used, there is potential to improve performance when each of these subrequests pick a different channel. Today we call cifs_pick_channel when the main read request is initialized in cifs_init_request. This change moves this to cifs_prepare_read, which is the right place to pick channel since it gets called for each subrequest. Interestingly cifs_prepare_write already does channel selection for individual subreq, but looks like it was missed for read. This is especially important when multichannel is used with increased rasize. In my test setup, with rasize set to 8MB, a sequential read of large file was taking 11.5s without this change. With the change, it completed in 9s. The difference is even more significant with bigger rasize.
Cc: Cc: David Howells Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 - fs/smb/client/file.c | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index ac1f890a0d54..4bdd6a43e521 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1508,7 +1508,6 @@ struct cifs_io_parms { struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server; pid_t pid; }; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 79de2f2f9c41..8582cf61242c 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -147,7 +147,7 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); size_t size; int rc = 0; @@ -156,6 +156,8 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) rdata->xid = get_xid(); rdata->have_xid = true; } + + server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -198,7 +200,7 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server = rdata->server; int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", @@ -266,7 +268,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file 
*file) open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); - req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) req->pid = req->cfile->pid; } else if (rreq->origin != NETFS_WRITEBACK) { From 174448badb4409491bfba2e6b46f7aa078741c5e Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Wed, 12 Feb 2025 14:40:46 +0800 Subject: [PATCH 123/310] ALSA: hda/realtek: Fixup ALC225 depop procedure Headset MIC will not function when power_save=0. Fixes: 1fd50509fe14 ("ALSA: hda/realtek: Update ALC225 depop procedure") Link: https://bugzilla.kernel.org/show_bug.cgi?id=219743 Signed-off-by: Kailang Yang Link: https://lore.kernel.org/0474a095ab0044d0939ec4bf4362423d@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ae0beb52e7b0..224616fbec4f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -3788,6 +3788,7 @@ static void alc225_init(struct hda_codec *codec) AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE); msleep(75); + alc_update_coef_idx(codec, 0x4a, 3 << 10, 0); alc_update_coefex_idx(codec, 0x57, 0x04, 0x0007, 0x4); /* Hight power */ } } From 8743d66979e494c5378563e6b5a32e913380abd8 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 11 Feb 2025 14:32:01 -0600 Subject: [PATCH 124/310] gpiolib: acpi: Add a quirk for Acer Nitro ANV14 Spurious immediate wake up events are reported on Acer Nitro ANV14. GPIO 11 is specified as an edge triggered input and also a wake source but this pin is supposed to be an output pin for an LED, so it's effectively floating. Block the interrupt from getting set up for this GPIO on this device.
Cc: stable@vger.kernel.org Reported-by: Delgan Tested-by: Delgan Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3954 Signed-off-by: Mario Limonciello Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20250211203222.761206-1-superm1@kernel.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 1f9fe50bba00..f7746c57ba76 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1689,6 +1689,20 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .ignore_wake = "PNP0C50:00@8", }, }, + { + /* + * Spurious wakeups from GPIO 11 + * Found in BIOS 1.04 + * https://gitlab.freedesktop.org/drm/amd/-/issues/3954 + */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_FAMILY, "Acer Nitro V 14"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_interrupt = "AMDI0030:00@11", + }, + }, {} /* Terminating entry */ }; From d262a192d38e527faa5984629aabda2e0d1c4f54 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 12 Feb 2025 07:46:28 +0100 Subject: [PATCH 125/310] powerpc/code-patching: Fix KASAN hit by not flagging text patching area as VM_ALLOC Erhard reported the following KASAN hit while booting his PowerMac G4 with a KASAN-enabled kernel 6.13-rc6: BUG: KASAN: vmalloc-out-of-bounds in copy_to_kernel_nofault+0xd8/0x1c8 Write of size 8 at addr f1000000 by task chronyd/1293 CPU: 0 UID: 123 PID: 1293 Comm: chronyd Tainted: G W 6.13.0-rc6-PMacG4 #2 Tainted: [W]=WARN Hardware name: PowerMac3,6 7455 0x80010303 PowerMac Call Trace: [c2437590] [c1631a84] dump_stack_lvl+0x70/0x8c (unreliable) [c24375b0] [c0504998] print_report+0xdc/0x504 [c2437610] [c050475c] kasan_report+0xf8/0x108 [c2437690] [c0505a3c] kasan_check_range+0x24/0x18c [c24376a0] [c03fb5e4] copy_to_kernel_nofault+0xd8/0x1c8 [c24376c0] [c004c014] 
patch_instructions+0x15c/0x16c [c2437710] [c00731a8] bpf_arch_text_copy+0x60/0x7c [c2437730] [c0281168] bpf_jit_binary_pack_finalize+0x50/0xac [c2437750] [c0073cf4] bpf_int_jit_compile+0xb30/0xdec [c2437880] [c0280394] bpf_prog_select_runtime+0x15c/0x478 [c24378d0] [c1263428] bpf_prepare_filter+0xbf8/0xc14 [c2437990] [c12677ec] bpf_prog_create_from_user+0x258/0x2b4 [c24379d0] [c027111c] do_seccomp+0x3dc/0x1890 [c2437ac0] [c001d8e0] system_call_exception+0x2dc/0x420 [c2437f30] [c00281ac] ret_from_syscall+0x0/0x2c --- interrupt: c00 at 0x5a1274 NIP: 005a1274 LR: 006a3b3c CTR: 005296c8 REGS: c2437f40 TRAP: 0c00 Tainted: G W (6.13.0-rc6-PMacG4) MSR: 0200f932 CR: 24004422 XER: 00000000 GPR00: 00000166 af8f3fa0 a7ee3540 00000001 00000000 013b6500 005a5858 0200f932 GPR08: 00000000 00001fe9 013d5fc8 005296c8 2822244c 00b2fcd8 00000000 af8f4b57 GPR16: 00000000 00000001 00000000 00000000 00000000 00000001 00000000 00000002 GPR24: 00afdbb0 00000000 00000000 00000000 006e0004 013ce060 006e7c1c 00000001 NIP [005a1274] 0x5a1274 LR [006a3b3c] 0x6a3b3c --- interrupt: c00 The buggy address belongs to the virtual mapping at [f1000000, f1002000) created by: text_area_cpu_up+0x20/0x190 The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:00000000 index:0x0 pfn:0x76e30 flags: 0x80000000(zone=2) raw: 80000000 00000000 00000122 00000000 00000000 00000000 ffffffff 00000001 raw: 00000000 page dumped because: kasan: bad access detected Memory state around the buggy address: f0ffff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f0ffff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >f1000000: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 ^ f1000080: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f1000100: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 ================================================================== f8 corresponds to KASAN_VMALLOC_INVALID which means the area is not initialised hence not supposed to be used yet. 
Powerpc text patching infrastructure allocates a virtual memory area using get_vm_area() and flags it as VM_ALLOC. But that flag is meant to be used for vmalloc() and vmalloc() allocated memory is not supposed to be used before a call to __vmalloc_node_range() which is never called for that area. That went undetected until commit e4137f08816b ("mm, kasan, kmsan: instrument copy_from/to_kernel_nofault") The area allocated by text_area_cpu_up() is not vmalloc memory, it is mapped directly on demand when needed by map_kernel_page(). There is no VM flag corresponding to such usage, so just pass no flag. That way the area will be unpoisoned and usable immediately. Reported-by: Erhard Furtner Closes: https://lore.kernel.org/all/20250112135832.57c92322@yea/ Fixes: 37bc3e5fd764 ("powerpc/lib/code-patching: Use alternate map for patch_instruction()") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/06621423da339b374f48c0886e3a5db18e896be8.1739342693.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/code-patching.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 81c0f673eb25..f84e0337cc02 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -108,7 +108,7 @@ static int text_area_cpu_up(unsigned int cpu) unsigned long addr; int err; - area = get_vm_area(PAGE_SIZE, VM_ALLOC); + area = get_vm_area(PAGE_SIZE, 0); if (!area) { WARN_ONCE(1, "Failed to create text area for cpu %d\n", cpu); From b9644fbfbcab13da7f8b37bef7c51e5b8407d031 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 12 Feb 2025 10:18:49 +0800 Subject: [PATCH 126/310] gpio: stmpe: Check return value of stmpe_reg_read in stmpe_gpio_irq_sync_unlock The stmpe_reg_read function can fail, but its return value is not checked in stmpe_gpio_irq_sync_unlock. This can lead to silent failures and incorrect behavior if the hardware access fails.
This patch adds checks for the return value of stmpe_reg_read. If the function fails, an error message is logged and the function returns early to avoid further issues. Fixes: b888fb6f2a27 ("gpio: stmpe: i2c transfer are forbiden in atomic context") Cc: stable@vger.kernel.org # 4.16+ Signed-off-by: Wentao Liang Link: https://lore.kernel.org/r/20250212021849.275-1-vulab@iscas.ac.cn Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-stmpe.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 75a3633ceddb..222279a9d82b 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -191,7 +191,7 @@ static void stmpe_gpio_irq_sync_unlock(struct irq_data *d) [REG_IE][CSB] = STMPE_IDX_IEGPIOR_CSB, [REG_IE][MSB] = STMPE_IDX_IEGPIOR_MSB, }; - int i, j; + int ret, i, j; /* * STMPE1600: to be able to get IRQ from pins, @@ -199,8 +199,16 @@ static void stmpe_gpio_irq_sync_unlock(struct irq_data *d) * GPSR or GPCR registers */ if (stmpe->partnum == STMPE1600) { - stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_LSB]); - stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_CSB]); + ret = stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_LSB]); + if (ret < 0) { + dev_err(stmpe->dev, "Failed to read GPMR_LSB: %d\n", ret); + goto err; + } + ret = stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_CSB]); + if (ret < 0) { + dev_err(stmpe->dev, "Failed to read GPMR_CSB: %d\n", ret); + goto err; + } } for (i = 0; i < CACHE_NR_REGS; i++) { @@ -222,6 +230,7 @@ static void stmpe_gpio_irq_sync_unlock(struct irq_data *d) } } +err: mutex_unlock(&stmpe_gpio->irq_lock); } From 56d5f3eba3f5de0efdd556de4ef381e109b973a9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 11 Feb 2025 18:15:59 +0100 Subject: [PATCH 127/310] acct: perform last write from workqueue In [1] it was reported that the acct(2) system call can be used to trigger NULL deref in cases where it is set to write to a file 
that triggers an internal lookup. This can e.g., happen when pointing acct(2) to /sys/power/resume. At the point where the write to this file happens the calling task has already exited and called exit_fs(). A lookup will thus trigger a NULL-deref when accessing current->fs. Reorganize the code so that the final write happens from the workqueue but with the caller's credentials. This preserves the (strange) permission model and has almost no regression risk. This api should stop to exist though. Link: https://lore.kernel.org/r/20250127091811.3183623-1-quzicheng@huawei.com [1] Link: https://lore.kernel.org/r/20250211-work-acct-v1-1-1c16aecab8b3@kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Zicheng Qu Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- kernel/acct.c | 120 +++++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 50 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 31222e8cd534..48283efe8a12 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -103,48 +103,50 @@ struct bsd_acct_struct { atomic_long_t count; struct rcu_head rcu; struct mutex lock; - int active; + bool active; + bool check_space; unsigned long needcheck; struct file *file; struct pid_namespace *ns; struct work_struct work; struct completion done; + acct_t ac; }; -static void do_acct_process(struct bsd_acct_struct *acct); +static void fill_ac(struct bsd_acct_struct *acct); +static void acct_write_process(struct bsd_acct_struct *acct); /* * Check the amount of free space and suspend/resume accordingly. 
*/ -static int check_free_space(struct bsd_acct_struct *acct) +static bool check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_after_jiffies(acct->needcheck)) - goto out; + if (!acct->check_space) + return acct->active; /* May block */ if (vfs_statfs(&acct->file->f_path, &sbuf)) - goto out; + return acct->active; if (acct->active) { u64 suspend = sbuf.f_blocks * SUSPEND; do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { - acct->active = 0; + acct->active = false; pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { - acct->active = 1; + acct->active = true; pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; -out: return acct->active; } @@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin) { struct bsd_acct_struct *acct = to_acct(pin); mutex_lock(&acct->lock); - do_acct_process(acct); + /* + * Fill the accounting struct with the exiting task's info + * before punting to the workqueue. + */ + fill_ac(acct); schedule_work(&acct->work); wait_for_completion(&acct->done); cmpxchg(&acct->ns->bacct, pin, NULL); @@ -202,6 +208,9 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; + + /* We were fired by acct_pin_kill() which holds acct->lock. */ + acct_write_process(acct); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -430,13 +439,27 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. 
*/ -static void fill_ac(acct_t *ac) +static void fill_ac(struct bsd_acct_struct *acct) { struct pacct_struct *pacct = &current->signal->pacct; + struct file *file = acct->file; + acct_t *ac = &acct->ac; u64 elapsed, run_time; time64_t btime; struct tty_struct *tty; + lockdep_assert_held(&acct->lock); + + if (time_is_after_jiffies(acct->needcheck)) { + acct->check_space = false; + + /* Don't fill in @ac if nothing will be written. */ + if (!acct->active) + return; + } else { + acct->check_space = true; + } + /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. @@ -484,64 +507,61 @@ static void fill_ac(acct_t *ac) ac->ac_majflt = encode_comp_t(pacct->ac_majflt); ac->ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(&current->sighand->siglock); -} -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct) -{ - acct_t ac; - unsigned long flim; - const struct cred *orig_cred; - struct file *file = acct->file; - - /* - * Accounting records are not subject to resource limits. - */ - flim = rlimit(RLIMIT_FSIZE); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - /* - * First check to see if there is enough free_space to continue - * the process accounting system. 
- */ - if (!check_free_space(acct)) - goto out; - - fill_ac(&ac); /* we really need to bite the bullet and change layout */ - ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); - ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); + ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid()); + ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid()); #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ - ac.ac_uid16 = ac.ac_uid; - ac.ac_gid16 = ac.ac_gid; + ac->ac_uid16 = ac->ac_uid; + ac->ac_gid16 = ac->ac_gid; #elif ACCT_VERSION == 3 { struct pid_namespace *ns = acct->ns; - ac.ac_pid = task_tgid_nr_ns(current, ns); + ac->ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), - ns); + ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); } #endif +} + +static void acct_write_process(struct bsd_acct_struct *acct) +{ + struct file *file = acct->file; + const struct cred *cred; + acct_t *ac = &acct->ac; + + /* Perform file operations on behalf of whoever enabled accounting */ + cred = override_creds(file->f_cred); + /* - * Get freeze protection. If the fs is frozen, just skip the write - * as we could deadlock the system otherwise. + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. 
*/ - if (file_start_write_trylock(file)) { + if (check_free_space(acct) && file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, &ac, sizeof(acct_t), &pos); + __kernel_write(file, ac, sizeof(acct_t), &pos); file_end_write(file); } -out: + + revert_creds(cred); +} + +static void do_acct_process(struct bsd_acct_struct *acct) +{ + unsigned long flim; + + /* Accounting records are not subject to resource limits. */ + flim = rlimit(RLIMIT_FSIZE); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + fill_ac(acct); + acct_write_process(acct); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - revert_creds(orig_cred); } /** From 890ed45bde808c422c3c27d3285fc45affa0f930 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 11 Feb 2025 18:16:00 +0100 Subject: [PATCH 128/310] acct: block access to kernel internal filesystems There's no point in allowing anything kernel internal nor procfs or sysfs. Link: https://lore.kernel.org/r/20250127091811.3183623-1-quzicheng@huawei.com Link: https://lore.kernel.org/r/20250211-work-acct-v1-2-1c16aecab8b3@kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reviewed-by: Amir Goldstein Reported-by: Zicheng Qu Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- kernel/acct.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/acct.c b/kernel/acct.c index 48283efe8a12..6520baa13669 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -243,6 +243,20 @@ static int acct_on(struct filename *pathname) return -EACCES; } + /* Exclude kernel kernel internal filesystems. */ + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + + /* Exclude procfs and sysfs. 
*/ + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + if (!(file->f_mode & FMODE_CAN_WRITE)) { kfree(acct); filp_close(file, NULL); From e977499820782ab1c69f354d9f41b6d9ad1f43d9 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Mon, 10 Feb 2025 15:36:54 +0100 Subject: [PATCH 129/310] drm/xe: Carve out wopcm portion from the stolen memory The top of stolen memory is WOPCM, which shouldn't be accessed. Remove this portion from the stolen memory region for discrete platforms. This was already done for integrated, but was missing for discrete platforms. This also moves get_wopcm_size() so detect_bar2_dgfx() and detect_bar2_integrated can use the same function. v2: Improve commit message and suitable stable version tag(Lucas) Fixes: d8b52a02cb40 ("drm/xe: Implement stolen memory.") Cc: Maarten Lankhorst Cc: Matthew Auld Cc: Lucas De Marchi Cc: stable@vger.kernel.org # v6.11+ Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20250210143654.2076747-1-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit 2c7f45cc7e197a792ce5c693e56ea48f60b312da) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c | 54 ++++++++++++++------------ 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c index 423856cc18d4..d414421f8c13 100644 --- a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c @@ -57,12 +57,35 @@ bool xe_ttm_stolen_cpu_access_needs_ggtt(struct xe_device *xe) return GRAPHICS_VERx100(xe) < 1270 && !IS_DGFX(xe); } +static u32 get_wopcm_size(struct xe_device *xe) +{ + u32 wopcm_size; + u64 val; + + val = xe_mmio_read64_2x32(xe_root_tile_mmio(xe), STOLEN_RESERVED); + val = REG_FIELD_GET64(WOPCM_SIZE_MASK, val); + + switch (val) { + case 0x5 ... 0x6: + val--; + fallthrough; + case 0x0 ... 
0x3: + wopcm_size = (1U << val) * SZ_1M; + break; + default: + WARN(1, "Missing case wopcm_size=%llx\n", val); + wopcm_size = 0; + } + + return wopcm_size; +} + static s64 detect_bar2_dgfx(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) { struct xe_tile *tile = xe_device_get_root_tile(xe); struct xe_mmio *mmio = xe_root_tile_mmio(xe); struct pci_dev *pdev = to_pci_dev(xe->drm.dev); - u64 stolen_size; + u64 stolen_size, wopcm_size; u64 tile_offset; u64 tile_size; @@ -74,7 +97,13 @@ static s64 detect_bar2_dgfx(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) if (drm_WARN_ON(&xe->drm, tile_size < mgr->stolen_base)) return 0; + /* Carve out the top of DSM as it contains the reserved WOPCM region */ + wopcm_size = get_wopcm_size(xe); + if (drm_WARN_ON(&xe->drm, !wopcm_size)) + return 0; + stolen_size = tile_size - mgr->stolen_base; + stolen_size -= wopcm_size; /* Verify usage fits in the actual resource available */ if (mgr->stolen_base + stolen_size <= pci_resource_len(pdev, LMEM_BAR)) @@ -89,29 +118,6 @@ static s64 detect_bar2_dgfx(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) return ALIGN_DOWN(stolen_size, SZ_1M); } -static u32 get_wopcm_size(struct xe_device *xe) -{ - u32 wopcm_size; - u64 val; - - val = xe_mmio_read64_2x32(xe_root_tile_mmio(xe), STOLEN_RESERVED); - val = REG_FIELD_GET64(WOPCM_SIZE_MASK, val); - - switch (val) { - case 0x5 ... 0x6: - val--; - fallthrough; - case 0x0 ... 
0x3: - wopcm_size = (1U << val) * SZ_1M; - break; - default: - WARN(1, "Missing case wopcm_size=%llx\n", val); - wopcm_size = 0; - } - - return wopcm_size; -} - static u32 detect_bar2_integrated(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) { struct pci_dev *pdev = to_pci_dev(xe->drm.dev); From 06521ac0485effdcc9c792cb0b40ed8e6f2f5fb8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Feb 2025 13:33:24 +0000 Subject: [PATCH 130/310] io_uring/waitid: don't abuse io_tw_state struct io_tw_state is managed by core io_uring, and opcode handling code must never try to cheat and create their own instances, it's plain incorrect. io_waitid_complete() attempts exactly that outside of the task work context, and even though the ring is locked, there would be no one to reap the requests from the defer completion list. It only works now because luckily it's called before io_uring_try_cancel_uring_cmd(), which flushes completions. Fixes: f31ecf671ddc4 ("io_uring: add IORING_OP_WAITID support") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/waitid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 853e97a7b0ec..c4096d93a287 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -118,7 +118,6 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); @@ -131,7 +130,6 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - io_req_task_complete(req, &ts); } static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) @@ -153,6 +151,7 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 
list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); + io_req_queue_tw_complete(req, -ECANCELED); return true; } @@ -258,6 +257,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) } io_waitid_complete(req, ret); + io_req_task_complete(req, ts); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, From 8802766324e1f5d414a81ac43365c20142e85603 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Feb 2025 13:46:46 +0000 Subject: [PATCH 131/310] io_uring/kbuf: reallocate buf lists on upgrade IORING_REGISTER_PBUF_RING can reuse an old struct io_buffer_list if it was created for legacy selected buffer and has been emptied. It violates the requirement that most of the field should stay stable after publish. Always reallocate it instead. Cc: stable@vger.kernel.org Reported-by: Pumpkin Chang Fixes: 2fcabce2d7d34 ("io_uring: disallow mixed provided buffer group registrations") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 04bf493eecae..8e72de7712ac 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -415,6 +415,13 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) } } +static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + scoped_guard(mutex, &ctx->mmap_lock) + WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl); + io_put_bl(ctx, bl); +} + int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); @@ -636,12 +643,13 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) /* if mapped buffer ring OR classic exists, don't allow */ if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if 
(!bl) - return -ENOMEM; + io_destroy_bl(ctx, bl); } + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; ring_size = flex_array_size(br, bufs, reg.ring_entries); From a8de7f100bb5989d9c3627d3a223ee1c863f3b69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:51 -0800 Subject: [PATCH 132/310] KVM: x86: Reject Hyper-V's SEND_IPI hypercalls if local APIC isn't in-kernel Advertise support for Hyper-V's SEND_IPI and SEND_IPI_EX hypercalls if and only if the local APIC is emulated/virtualized by KVM, and explicitly reject said hypercalls if the local APIC is emulated in userspace, i.e. don't rely on userspace to opt-in to KVM_CAP_HYPERV_ENFORCE_CPUID. Rejecting SEND_IPI and SEND_IPI_EX fixes a NULL-pointer dereference if Hyper-V enlightenments are exposed to the guest without an in-kernel local APIC: dump_stack+0xbe/0xfd __kasan_report.cold+0x34/0x84 kasan_report+0x3a/0x50 __apic_accept_irq+0x3a/0x5c0 kvm_hv_send_ipi.isra.0+0x34e/0x820 kvm_hv_hypercall+0x8d9/0x9d0 kvm_emulate_hypercall+0x506/0x7e0 __vmx_handle_exit+0x283/0xb60 vmx_handle_exit+0x1d/0xd0 vcpu_enter_guest+0x16b0/0x24c0 vcpu_run+0xc0/0x550 kvm_arch_vcpu_ioctl_run+0x170/0x6d0 kvm_vcpu_ioctl+0x413/0xb20 __se_sys_ioctl+0x111/0x160 do_syscall_64+0x30/0x40 entry_SYSCALL_64_after_hwframe+0x67/0xd1 Note, checking the sending vCPU is sufficient, as the per-VM irqchip_mode can't be modified after vCPUs are created, i.e. if one vCPU has an in-kernel local APIC, then all vCPUs have an in-kernel local APIC. 
Reported-by: Dongjie Zou Fixes: 214ff83d4473 ("KVM: x86: hyperv: implement PV IPI send hypercalls") Fixes: 2bc39970e932 ("x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID") Cc: stable@vger.kernel.org Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6a6dd5a84f22..6ebeb6cea6c0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2226,6 +2226,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -2852,7 +2855,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; From 0b6db0dc43eefb4f89181546785c3609fd276524 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:52 -0800 Subject: [PATCH 133/310] KVM: selftests: Mark test_hv_cpuid_e2big() static in Hyper-V CPUID test Make the Hyper-V CPUID test's local helper test_hv_cpuid_e2big() static, it's not used outside of the test (and isn't intended to be). 
Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/hyperv_cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 4f5881d4ef66..9a0fcc713350 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -111,7 +111,7 @@ static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, } } -void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 cpuid = {.nent = 0}; int ret; From cd5a0c2f0faeb4a3fab3b78f6693a2d55ee51efa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:53 -0800 Subject: [PATCH 134/310] KVM: selftests: Manage CPUID array in Hyper-V CPUID test's core helper Allocate, get, and free the CPUID array in the Hyper-V CPUID test in the test's core helper, instead of copy+pasting code at each call site. In addition to deduplicating a small amount of code, restricting visibility of the array to a single invocation of the core test prevents "leaking" an array across test cases. Passing in @vcpu to the helper will also allow pivoting on VM-scoped information without needing to pass more booleans, e.g. to conditionally assert on features that require an in-kernel APIC. To avoid use-after-free bugs due to overzealous and careless developers, opportunistically add a comment to explain that the system-scoped helper caches the Hyper-V CPUID entries, i.e. that the caller is not responsible for freeing the memory. 
Cc: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-4-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/x86/hyperv_cpuid.c | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 9a0fcc713350..3188749ec6e1 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -41,13 +41,18 @@ static bool smt_possible(void) return res; } -static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, - bool evmcs_expected) +static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) { + const struct kvm_cpuid2 *hv_cpuid_entries; int i; int nent_expected = 10; u32 test_val; + if (vcpu) + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); + else + hv_cpuid_entries = kvm_get_supported_hv_cpuid(); + TEST_ASSERT(hv_cpuid_entries->nent == nent_expected, "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" " (returned %d)", @@ -109,6 +114,13 @@ static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, * entry->edx); */ } + + /* + * Note, the CPUID array returned by the system-scoped helper is a one- + * time allocation, i.e. must not be freed. 
+ */ + if (vcpu) + free((void *)hv_cpuid_entries); } static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) @@ -129,7 +141,6 @@ static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) int main(int argc, char *argv[]) { struct kvm_vm *vm; - const struct kvm_cpuid2 *hv_cpuid_entries; struct kvm_vcpu *vcpu; TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); @@ -138,10 +149,7 @@ int main(int argc, char *argv[]) /* Test vCPU ioctl version */ test_hv_cpuid_e2big(vm, vcpu); - - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); - test_hv_cpuid(hv_cpuid_entries, false); - free((void *)hv_cpuid_entries); + test_hv_cpuid(vcpu, false); if (!kvm_cpu_has(X86_FEATURE_VMX) || !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { @@ -149,9 +157,7 @@ int main(int argc, char *argv[]) goto do_sys; } vcpu_enable_evmcs(vcpu); - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); - test_hv_cpuid(hv_cpuid_entries, true); - free((void *)hv_cpuid_entries); + test_hv_cpuid(vcpu, true); do_sys: /* Test system ioctl version */ @@ -161,9 +167,7 @@ int main(int argc, char *argv[]) } test_hv_cpuid_e2big(vm, NULL); - - hv_cpuid_entries = kvm_get_supported_hv_cpuid(); - test_hv_cpuid(hv_cpuid_entries, kvm_cpu_has(X86_FEATURE_VMX)); + test_hv_cpuid(NULL, kvm_cpu_has(X86_FEATURE_VMX)); out: kvm_vm_free(vm); From e36454461c5ebe6372952560b2abad5dc9ac579d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:54 -0800 Subject: [PATCH 135/310] KVM: selftests: Add CPUID tests for Hyper-V features that need in-kernel APIC Add testcases to x86's Hyper-V CPUID test to verify that KVM advertises support for features that require an in-kernel local APIC appropriately, i.e. that KVM hides support from the vCPU-scoped ioctl if the VM doesn't have an in-kernel local APIC. 
Cc: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/hyperv_cpuid.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 3188749ec6e1..4e920705681a 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -43,6 +43,7 @@ static bool smt_possible(void) static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) { + const bool has_irqchip = !vcpu || vcpu->vm->has_irqchip; const struct kvm_cpuid2 *hv_cpuid_entries; int i; int nent_expected = 10; @@ -85,12 +86,19 @@ static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) entry->eax, evmcs_expected ); break; + case 0x40000003: + TEST_ASSERT(has_irqchip || !(entry->edx & BIT(19)), + "\"Direct\" Synthetic Timers should require in-kernel APIC"); + break; case 0x40000004: test_val = entry->eax & (1UL << 18); TEST_ASSERT(!!test_val == !smt_possible(), "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); + + TEST_ASSERT(has_irqchip || !(entry->eax & BIT(10)), + "Cluster IPI (i.e. SEND_IPI) should require in-kernel APIC"); break; case 0x4000000A: TEST_ASSERT(entry->eax & (1UL << 19), @@ -145,9 +153,14 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); + /* Test the vCPU ioctl without an in-kernel local APIC. 
*/ + vm = vm_create_barebones(); + vcpu = __vm_vcpu_add(vm, 0); + test_hv_cpuid(vcpu, false); + kvm_vm_free(vm); /* Test vCPU ioctl version */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); test_hv_cpuid_e2big(vm, vcpu); test_hv_cpuid(vcpu, false); From 46d6c6f3ef0eaff71c2db6d77d4e2ebb7adac34f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 29 Jan 2025 17:08:25 -0800 Subject: [PATCH 136/310] KVM: nSVM: Enter guest mode before initializing nested NPT MMU When preparing vmcb02 for nested VMRUN (or state restore), "enter" guest mode prior to initializing the MMU for nested NPT so that guest_mode is set in the MMU's role. KVM's model is that all L2 MMUs are tagged with guest_mode, as the behavior of hypervisor MMUs tends to be significantly different than kernel MMUs. Practically speaking, the bug is relatively benign, as KVM only directly queries role.guest_mode in kvm_mmu_free_guest_mode_roots() and kvm_mmu_page_ad_need_write_protect(), which SVM doesn't use, and in paths that are optimizations (mmu_page_zap_pte() and shadow_mmu_try_split_huge_pages()). And while the role is incorporated into shadow page usage, because nested NPT requires KVM to be using NPT for L1, reusing shadow pages across L1 and L2 is impossible as L1 MMUs will always have direct=1, while L2 MMUs will have direct=0. Hoist the TLB processing and setting of HF_GUEST_MASK to the beginning of the flow instead of forcing guest_mode in the MMU, as nothing in nested_vmcb02_prepare_control() between the old and new locations touches TLB flush requests or HF_GUEST_MASK, i.e. there's no reason to present inconsistent vCPU state to the MMU. 
Fixes: 69cb877487de ("KVM: nSVM: move MMU setup to nested_prepare_vmcb_control") Cc: stable@vger.kernel.org Reported-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://lore.kernel.org/r/20250130010825.220346-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/nested.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 74c20dbb92da..d4ac4a1f8b81 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5540,7 +5540,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d77b094d9a4d..04c375bf1ac2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -646,6 +646,11 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. @@ -762,11 +767,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. 
From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 24 Jan 2025 17:18:33 -0800 Subject: [PATCH 137/310] KVM: x86: Load DR6 with guest value only before entering .vcpu_run() loop Move the conditional loading of hardware DR6 with the guest's DR6 value out of the core .vcpu_run() loop to fix a bug where KVM can load hardware with a stale vcpu->arch.dr6. When the guest accesses a DR and host userspace isn't debugging the guest, KVM disables DR interception and loads the guest's values into hardware on VM-Enter and saves them on VM-Exit. This allows the guest to access DRs at will, e.g. so that a sequence of DR accesses to configure a breakpoint only generates one VM-Exit. For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest) and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop. But for DR6, the guest's value doesn't need to be loaded into hardware for KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas VMX requires software to manually load the guest value, and so loading the guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done _inside_ the core run loop. Unfortunately, saving the guest values on VM-Exit is initiated by common x86, again outside of the core run loop. If the guest modifies DR6 (in hardware, when DR interception is disabled), and then the next VM-Exit is a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and clobber the guest's actual value. The bug shows up primarily with nested VMX because KVM handles the VMX preemption timer in the fastpath, and the window between hardware DR6 being modified (in guest context) and DR6 being read by guest software is orders of magnitude larger in a nested setup. E.g. 
in non-nested, the VMX preemption timer would need to fire precisely between #DB injection and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the window where hardware DR6 is "dirty" extends all the way from L1 writing DR6 to VMRESUME (in L1). L1's view: ========== CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0 A: L1 Writes DR6 CPU 0/KVM-7289 [023] d.... 2925.640963: : Set DRs, DR6 = 0xffff0ff1 B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec D: L1 reads DR6, arch.dr6 = 0 CPU 0/KVM-7289 [023] d.... 2925.640969: : Sync DRs, DR6 = 0xffff0ff0 CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0 L2 reads DR6, L1 disables DR interception CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216 CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0 CPU 0/KVM-7289 [023] d.... 2925.640983: : Set DRs, DR6 = 0xffff0ff0 L2 detects failure CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT L1 reads DR6 (confirms failure) CPU 0/KVM-7289 [023] d.... 2925.640990: : Sync DRs, DR6 = 0xffff0ff0 L0's view: ========== L2 reads DR6, arch.dr6 = 0 CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216 CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216 L2 => L1 nested VM-Exit CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216 CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23 CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23 CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23 L1 writes DR7, L0 disables DR interception CPU 23/KVM-5046 [001] d.... 
3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007 CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23 L0 writes DR6 = 0 (arch.dr6) CPU 23/KVM-5046 [001] d.... 3410.005613: : Set DRs, DR6 = 0xffff0ff0 A: B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23 C: L0 writes DR6 = 0 (arch.dr6) CPU 23/KVM-5046 [001] d.... 3410.005614: : Set DRs, DR6 = 0xffff0ff0 L1 => L2 nested VM-Enter CPU 23/KVM-5046 [001] d.... 3410.005616: kvm_exit: vcpu 23 reason VMRESUME L0 reads DR6, arch.dr6 = 0 Reported-by: John Stultz Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bwi8iZZg%40mail.gmail.com Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT") Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6") Cc: stable@vger.kernel.org Cc: Jim Mattson Tested-by: John Stultz Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 13 ++++++------- arch/x86/kvm/vmx/main.c | 1 + arch/x86/kvm/vmx/vmx.c | 10 ++++++---- arch/x86/kvm/vmx/x86_ops.h | 1 + arch/x86/kvm/x86.c | 3 +++ 7 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index c35550581da0..823c0434bbad 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -48,6 +48,7 @@ KVM_X86_OP(set_idt) KVM_X86_OP(get_gdt) KVM_X86_OP(set_gdt) KVM_X86_OP(sync_dirty_debug_regs) +KVM_X86_OP(set_dr6) KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b15cde0a9b5c..0b7af5902ff7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ 
-1696,6 +1696,7 @@ struct kvm_x86_ops { void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); + void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7640a84e554a..a713c803a3a3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. 
*/ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); @@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 2427f918e763..43ee9ed11291 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f72835e85b6d..6c56d5235f0f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. 
Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3295a67c04..430773a5ef8e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e77e61d4fbd..02159c967d29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } From 34cae91215c6f65bed2a124fb9283da6ec0b8dd9 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 12 Feb 2025 13:45:45 -0700 Subject: [PATCH 138/310] io_uring/uring_cmd: don't assume io_uring_cmd_data layout eaf72f7b414f ("io_uring/uring_cmd: cleanup struct io_uring_cmd_data layout") removed most of the places assuming struct io_uring_cmd_data has sqes as its first field. However, the EAGAIN case in io_uring_cmd() still compares ioucmd->sqe to the struct io_uring_cmd_data pointer using a void * cast. Since fa3595523d72 ("io_uring: get rid of alloc cache init_once handling"), sqes is no longer io_uring_cmd_data's first field. 
As a result, the pointers will always compare unequal and memcpy() may be called with the same source and destination. Replace the incorrect void * cast with the address of the sqes field. Signed-off-by: Caleb Sander Mateos Fixes: eaf72f7b414f ("io_uring/uring_cmd: cleanup struct io_uring_cmd_data layout") Link: https://lore.kernel.org/r/20250212204546.3751645-2-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 1f6a82128b47..cfb22e1de0e7 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -252,7 +252,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN) { struct io_uring_cmd_data *cache = req->async_data; - if (ioucmd->sqe != (void *) cache) + if (ioucmd->sqe != cache->sqes) memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); return -EAGAIN; } else if (ret == -EIOCBQUEUED) { From e663da62ba8672aaa66843f1af8b20e3bb1a0515 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 12 Feb 2025 13:45:46 -0700 Subject: [PATCH 139/310] io_uring/uring_cmd: switch sqe to async_data on EAGAIN 5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") moved the unconditional memcpy() of the uring_cmd SQE to async_data to 2 cases when the request goes async: - If REQ_F_FORCE_ASYNC is set to force the initial issue to go async - If ->uring_cmd() returns -EAGAIN in the initial non-blocking issue Unlike the REQ_F_FORCE_ASYNC case, in the EAGAIN case, io_uring_cmd() copies the SQE to async_data but neglects to update the io_uring_cmd's sqe field to point to async_data. As a result, sqe still points to the slot in the userspace-mapped SQ. At the end of io_submit_sqes(), the kernel advances the SQ head index, allowing userspace to reuse the slot for a new SQE. 
If userspace reuses the slot before the io_uring worker reissues the original SQE, the io_uring_cmd's SQE will be corrupted. Introduce a helper io_uring_cmd_cache_sqes() to copy the original SQE to the io_uring_cmd's async_data and point sqe there. Use it for both the REQ_F_FORCE_ASYNC and EAGAIN cases. This ensures the uring_cmd doesn't read from the SQ slot after it has been returned to userspace. Signed-off-by: Caleb Sander Mateos Fixes: 5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") Link: https://lore.kernel.org/r/20250212204546.3751645-3-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index cfb22e1de0e7..bcfca18395c4 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -168,6 +168,15 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); +static void io_uring_cmd_cache_sqes(struct io_kiocb *req) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct io_uring_cmd_data *cache = req->async_data; + + memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = cache->sqes; +} + static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -179,14 +188,10 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, return -ENOMEM; cache->op_data = NULL; - if (!(req->flags & REQ_F_FORCE_ASYNC)) { - /* defer memcpy until we need it */ - ioucmd->sqe = sqe; - return 0; - } - - memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = cache->sqes; + ioucmd->sqe = sqe; + /* defer memcpy until we need it */ + if (unlikely(req->flags & REQ_F_FORCE_ASYNC)) + io_uring_cmd_cache_sqes(req); return 0; } @@ -253,7 +258,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) struct io_uring_cmd_data *cache = 
req->async_data; if (ioucmd->sqe != cache->sqes) - memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); + io_uring_cmd_cache_sqes(req); return -EAGAIN; } else if (ret == -EIOCBQUEUED) { return -EIOCBQUEUED; From 472ff48e2c09e49f2f90eeb6922f747306559506 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 12 Feb 2025 11:53:32 -0700 Subject: [PATCH 140/310] PCI: Fix BUILD_BUG_ON usage for old gcc As reported in the below link, it seems older versions of gcc cannot determine that the howmany variable is known for all callers. Include a test so that newer compilers can enforce this sanity check and older compilers can still work. Add __always_inline attribute to give the compiler an even better chance to know the inputs. Link: https://lore.kernel.org/r/20250212185337.293023-1-alex.williamson@redhat.com Fixes: 4453f360862e ("PCI: Batch BAR sizing operations") Reported-by: Oleg Nesterov Link: https://lore.kernel.org/all/20250209154512.GA18688@redhat.com Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas Tested-by: Oleg Nesterov Tested-by: Mitchell Augustin --- drivers/pci/probe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b6536ed599c3..246744d8d268 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -339,13 +339,14 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, return (res->flags & IORESOURCE_MEM_64) ? 
1 : 0; } -static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) +static __always_inline void pci_read_bases(struct pci_dev *dev, + unsigned int howmany, int rom) { u32 rombar, stdbars[PCI_STD_NUM_BARS]; unsigned int pos, reg; u16 orig_cmd; - BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS); + BUILD_BUG_ON(statically_true(howmany > PCI_STD_NUM_BARS)); if (dev->non_compliant_bars) return; From cee6f9a9c87b6ecfb51845950c28216b231c3610 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 12 Jan 2025 15:39:51 +0100 Subject: [PATCH 141/310] objtool/rust: add one more `noreturn` Rust function Starting with Rust 1.85.0 (currently in beta, to be released 2025-02-20), under some kernel configurations with `CONFIG_RUST_DEBUG_ASSERTIONS=y`, one may trigger a new `objtool` warning: rust/kernel.o: warning: objtool: _R...securityNtB2_11SecurityCtx8as_bytes() falls through to next function _R...core3ops4drop4Drop4drop() due to a call to the `noreturn` symbol: core::panicking::assert_failed::<usize, usize> Thus add it to the list so that `objtool` knows it is actually `noreturn`. Do so matching with `strstr` since it is a generic. See commit 56d680dd23c3 ("objtool/rust: list `noreturn` Rust functions") for more details. Cc: stable@vger.kernel.org # Needed in 6.12.y and 6.13.y only (Rust is pinned in older LTSs). Fixes: 56d680dd23c3 ("objtool/rust: list `noreturn` Rust functions") Reviewed-by: Gary Guo Link: https://lore.kernel.org/r/20250112143951.751139-1-ojeda@kernel.org [ Updated Cc: stable@ to include 6.13.y.
- Miguel ] Signed-off-by: Miguel Ojeda --- tools/objtool/check.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 753dbc4f8198..a027d1c0bb2b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -227,6 +227,7 @@ static bool is_rust_noreturn(const struct symbol *func) str_ends_with(func->name, "_4core9panicking18panic_bounds_check") || str_ends_with(func->name, "_4core9panicking19assert_failed_inner") || str_ends_with(func->name, "_4core9panicking36panic_misaligned_pointer_dereference") || + strstr(func->name, "_4core9panicking13assert_failed") || strstr(func->name, "_4core9panicking11panic_const24panic_const_") || (strstr(func->name, "_4core5slice5index24slice_") && str_ends_with(func->name, "_fail")); From 2e4f982cf392af2f1282b5537a72144e064799e3 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 7 Feb 2025 00:20:22 +0100 Subject: [PATCH 142/310] rust: rbtree: fix overindented list item Starting with Rust 1.86.0 (to be released 2025-04-03), Clippy will have a new lint, `doc_overindented_list_items` [1], which catches cases of overindented list items. The lint has been added by Yutaro Ohno, based on feedback from the kernel [2] on a patch that fixed a similar case -- commit 0c5928deada1 ("rust: block: fix formatting in GenDisk doc"). Clippy reports a few cases in the kernel, apart from the one already fixed in the commit above. One is this one: error: doc list item overindented --> rust/kernel/rbtree.rs:1152:5 | 1152 | /// null, it is a pointer to the root of the [`RBTree`]. | ^^^^ help: try using ` ` (2 spaces) | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#doc_overindented_list_items = note: `-D clippy::doc-overindented-list-items` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(clippy::doc_overindented_list_items)]` Thus clean it up. 
Cc: Yutaro Ohno Cc: stable@vger.kernel.org # Needed in 6.12.y and 6.13.y only (Rust is pinned in older LTSs). Fixes: a335e9591404 ("rust: rbtree: add `RBTree::entry`") Link: https://github.com/rust-lang/rust-clippy/pull/13711 [1] Link: https://github.com/rust-lang/rust-clippy/issues/13601 [2] Reviewed-by: Alice Ryhl Reviewed-by: Yutaro Ohno Link: https://lore.kernel.org/r/20250206232022.599998-1-ojeda@kernel.org [ There are a few other cases, so updated message. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/rbtree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/rbtree.rs b/rust/kernel/rbtree.rs index ee2731dad72d..0d1e75810664 100644 --- a/rust/kernel/rbtree.rs +++ b/rust/kernel/rbtree.rs @@ -1149,7 +1149,7 @@ pub struct VacantEntry<'a, K, V> { /// # Invariants /// - `parent` may be null if the new node becomes the root. /// - `child_field_of_parent` is a valid pointer to the left-child or right-child of `parent`. If `parent` is -/// null, it is a pointer to the root of the [`RBTree`]. +/// null, it is a pointer to the root of the [`RBTree`]. struct RawVacantEntry<'a, K, V> { rbtree: *mut RBTree<K, V>, /// The node that will become the parent of the new node if we insert one. From 0edf1283a9d1419a2095b4fcdd95c11ac00a191c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Feb 2025 14:05:11 -0700 Subject: [PATCH 143/310] io_uring/uring_cmd: remove dead req_has_async_data() check Any uring_cmd always has async data allocated now, there's no reason to check and clear a cached copy of the SQE.
Fixes: d10f19dff56e ("io_uring/uring_cmd: switch to always allocating async data") Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index bcfca18395c4..8af7780407b7 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -54,9 +54,6 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, continue; if (cmd->flags & IORING_URING_CMD_CANCELABLE) { - /* ->sqe isn't available if no async data */ - if (!req_has_async_data(req)) - cmd->sqe = NULL; file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | IO_URING_F_COMPLETE_DEFER); ret = true; From 5298b7cffa8461009a4410f4e23f1c50ade39182 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 4 Dec 2024 08:48:27 +0100 Subject: [PATCH 144/310] um: add back support for FXSAVE registers It was reported that qemu may not enable the XSTATE CPU extension, which is a requirement after commit 3f17fed21491 ("um: switch to regset API and depend on XSTATE"). Add a fallback to use FXSAVE (FP registers on x86_64 and XFP on i386) which is just a shorter version of the same data. The only difference is that the XSTATE magic should not be set in the signal frame. Note that this still drops support for the older i386 FP register layout as supporting this would require more backward compatibility to build a correct signal frame. 
Fixes: 3f17fed21491 ("um: switch to regset API and depend on XSTATE") Reported-by: SeongJae Park Closes: https://lore.kernel.org/r/20241203070218.240797-1-sj@kernel.org Tested-by: SeongJae Park Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241204074827.1582917-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/x86/um/os-Linux/registers.c | 21 ++++++++++++++++++--- arch/x86/um/signal.c | 5 +++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index 76eaeb93928c..eb1cdadc8a61 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -18,6 +18,7 @@ #include #include +static unsigned long ptrace_regset; unsigned long host_fp_size; int get_fp_registers(int pid, unsigned long *regs) @@ -27,7 +28,7 @@ int get_fp_registers(int pid, unsigned long *regs) .iov_len = host_fp_size, }; - if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0) + if (ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov) < 0) return -errno; return 0; } @@ -39,7 +40,7 @@ int put_fp_registers(int pid, unsigned long *regs) .iov_len = host_fp_size, }; - if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0) + if (ptrace(PTRACE_SETREGSET, pid, ptrace_regset, &iov) < 0) return -errno; return 0; } @@ -58,9 +59,23 @@ int arch_init_registers(int pid) return -ENOMEM; /* GDB has x86_xsave_length, which uses x86_cpuid_count */ - ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); + ptrace_regset = NT_X86_XSTATE; + ret = ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov); if (ret) ret = -errno; + + if (ret == -ENODEV) { +#ifdef CONFIG_X86_32 + ptrace_regset = NT_PRXFPREG; +#else + ptrace_regset = NT_PRFPREG; +#endif + iov.iov_len = 2 * 1024 * 1024; + ret = ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov); + if (ret) + ret = -errno; + } + munmap(iov.iov_base, 2 * 1024 * 1024); host_fp_size = iov.iov_len; diff --git 
a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 75087e85b6fd..ea5b3bcc4245 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -187,7 +187,12 @@ static int copy_sc_to_user(struct sigcontext __user *to, * Put magic/size values for userspace. We do not bother to verify them * later on, however, userspace needs them should it try to read the * XSTATE data. And ptrace does not fill in these parts. + * + * Skip this if we do not have an XSTATE frame. */ + if (host_fp_size <= sizeof(to_fp64->fpstate)) + return 0; + BUILD_BUG_ON(sizeof(int) != FP_XSTATE_MAGIC2_SIZE); #ifdef CONFIG_X86_32 __put_user(offsetof(struct _fpstate_32, _fxsr_env) + From 8891b176d350ec5ea9a39c6ef4c99bd63d68e64c Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 17 Dec 2024 21:27:44 +0100 Subject: [PATCH 145/310] um: avoid copying FP state from init_task The init_task instance of struct task_struct is statically allocated and does not contain the dynamic area for the userspace FP registers. As such, limit the copy to the valid area of init_task and fill the rest with zero. Note that the FP state is only needed for userspace, and as such it is entirely reasonable for init_task to not contain it. 
Reported-by: Brian Norris Closes: https://lore.kernel.org/Z1ySXmjZm-xOqk90@google.com Fixes: 3f17fed21491 ("um: switch to regset API and depend on XSTATE") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241217202745.1402932-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/process.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index e5a2d4d897e0..0cd6fad3d908 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -191,7 +191,15 @@ void initial_thread_cb(void (*proc)(void *), void *arg) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - memcpy(dst, src, arch_task_struct_size); + /* init_task is not dynamically sized (missing FPU state) */ + if (unlikely(src == &init_task)) { + memcpy(dst, src, sizeof(init_task)); + memset((void *)dst + sizeof(init_task), 0, + arch_task_struct_size - sizeof(init_task)); + } else { + memcpy(dst, src, arch_task_struct_size); + } + return 0; } From 3c2fc7434d90338cf4c1b37bc95994208d23bfc6 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 7 Jan 2025 14:35:09 +0100 Subject: [PATCH 146/310] um: properly align signal stack on x86_64 The stack needs to be properly aligned so 16 byte memory accesses on the stack are correct. This was broken when introducing the dynamic math register sizing as the rounding was not moved appropriately. 
Fixes: 3f17fed21491 ("um: switch to regset API and depend on XSTATE") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250107133509.265576-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/x86/um/signal.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index ea5b3bcc4245..2934e170b0fe 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -372,11 +372,13 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, int err = 0, sig = ksig->sig; unsigned long fp_to; - frame = (struct rt_sigframe __user *) - round_down(stack_top - sizeof(struct rt_sigframe), 16); + frame = (void __user *)stack_top - sizeof(struct rt_sigframe); /* Add required space for math frame */ - frame = (struct rt_sigframe __user *)((unsigned long)frame - math_size); + frame = (void __user *)((unsigned long)frame - math_size); + + /* ABI requires 16 byte boundary alignment */ + frame = (void __user *)round_down((unsigned long)frame, 16); /* Subtract 128 for a red zone and 8 for proper alignment */ frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); From f82a9e7b9fa922bb9cccb00aae684a27b79e6df7 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 13 Jan 2025 10:41:07 +0100 Subject: [PATCH 147/310] um: fix execve stub execution on old host OSs The stub execution uses the somewhat new close_range and execveat syscalls. Of these two, the execveat call is essential, but the close_range call is more about stub process hygiene rather than safety (and its result is ignored). Replace both calls with a raw syscall as older machines might not have a recent enough kernel for close_range (with CLOSE_RANGE_CLOEXEC) or a libc that does not yet expose both of the syscalls. 
Fixes: 32e8eaf263d9 ("um: use execveat to create userspace MMs") Reported-by: Glenn Washburn Closes: https://lore.kernel.org/20250108022404.05e0de1e@crass-HP-ZBook-15-G2 Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250113094107.674738-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/os-Linux/skas/process.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index f683cfc9e51a..e2f8f156402f 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -181,6 +181,10 @@ extern char __syscall_stub_start[]; static int stub_exe_fd; +#ifndef CLOSE_RANGE_CLOEXEC +#define CLOSE_RANGE_CLOEXEC (1U << 2) +#endif + static int userspace_tramp(void *stack) { char *const argv[] = { "uml-userspace", NULL }; @@ -202,8 +206,12 @@ static int userspace_tramp(void *stack) init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); init_data.stub_data_offset = MMAP_OFFSET(offset); - /* Set CLOEXEC on all FDs and then unset on all memory related FDs */ - close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); + /* + * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs + * and then unsetting it on all memory related FDs. + * This is not strictly necessary from a safety perspective. 
+ */ + syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); fcntl(init_data.stub_data_fd, F_SETFD, 0); for (iomem = iomem_regions; iomem; iomem = iomem->next) @@ -224,7 +232,9 @@ static int userspace_tramp(void *stack) if (ret != sizeof(init_data)) exit(4); - execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH); + /* Raw execveat for compatibility with older libc versions */ + syscall(__NR_execveat, stub_exe_fd, (unsigned long)"", + (unsigned long)argv, NULL, AT_EMPTY_PATH); exit(5); } From 5b166b782d327f4b66190cc43afd3be36f2b3b7a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jan 2025 13:54:04 +0100 Subject: [PATCH 148/310] um: virt-pci: don't use kmalloc() This code can be called deep in the IRQ handling, for example, and then cannot normally use kmalloc(). Have its own pre-allocated memory and use from there instead so this doesn't occur. Only in the (very rare) case of memcpy_toio() we'd still need to allocate memory. Link: https://patch.msgid.link/20250110125550.32479-6-johannes@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/virt-pci.c | 198 +++++++++++++++++++------------------ 1 file changed, 102 insertions(+), 96 deletions(-) diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 744e7f31e8ef..dd5580f975cc 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -25,8 +25,10 @@ #define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) #define NUM_IRQ_MSGS 10 -#define HANDLE_NO_FREE(ptr) ((void *)((unsigned long)(ptr) | 1)) -#define HANDLE_IS_NO_FREE(ptr) ((unsigned long)(ptr) & 1) +struct um_pci_message_buffer { + struct virtio_pcidev_msg hdr; + u8 data[8]; +}; struct um_pci_device { struct virtio_device *vdev; @@ -36,6 +38,11 @@ struct um_pci_device { struct virtqueue *cmd_vq, *irq_vq; +#define UM_PCI_WRITE_BUFS 20 + struct um_pci_message_buffer bufs[UM_PCI_WRITE_BUFS + 1]; + void *extra_ptrs[UM_PCI_WRITE_BUFS + 1]; + 
DECLARE_BITMAP(used_bufs, UM_PCI_WRITE_BUFS); + #define UM_PCI_STAT_WAITING 0 unsigned long status; @@ -61,12 +68,40 @@ static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; static unsigned int um_pci_max_delay_us = 40000; module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644); -struct um_pci_message_buffer { - struct virtio_pcidev_msg hdr; - u8 data[8]; -}; +static int um_pci_get_buf(struct um_pci_device *dev, bool *posted) +{ + int i; + + for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { + if (!test_and_set_bit(i, dev->used_bufs)) + return i; + } -static struct um_pci_message_buffer __percpu *um_pci_msg_bufs; + *posted = false; + return UM_PCI_WRITE_BUFS; +} + +static void um_pci_free_buf(struct um_pci_device *dev, void *buf) +{ + int i; + + if (buf == &dev->bufs[UM_PCI_WRITE_BUFS]) { + kfree(dev->extra_ptrs[UM_PCI_WRITE_BUFS]); + dev->extra_ptrs[UM_PCI_WRITE_BUFS] = NULL; + return; + } + + for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { + if (buf == &dev->bufs[i]) { + kfree(dev->extra_ptrs[i]); + dev->extra_ptrs[i] = NULL; + WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); + return; + } + } + + WARN_ON(1); +} static int um_pci_send_cmd(struct um_pci_device *dev, struct virtio_pcidev_msg *cmd, @@ -82,7 +117,9 @@ static int um_pci_send_cmd(struct um_pci_device *dev, }; struct um_pci_message_buffer *buf; int delay_count = 0; + bool bounce_out; int ret, len; + int buf_idx; bool posted; if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) @@ -101,26 +138,28 @@ static int um_pci_send_cmd(struct um_pci_device *dev, break; } - buf = get_cpu_var(um_pci_msg_bufs); - if (buf) - memcpy(buf, cmd, cmd_size); + bounce_out = !posted && cmd_size <= sizeof(*cmd) && + out && out_size <= sizeof(buf->data); - if (posted) { - u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC); - - if (ncmd) { - memcpy(ncmd, cmd, cmd_size); - if (extra) - memcpy(ncmd + cmd_size, extra, extra_size); - cmd = (void *)ncmd; - cmd_size += extra_size; - extra = NULL; - extra_size 
= 0; - } else { - /* try without allocating memory */ - posted = false; - cmd = (void *)buf; + buf_idx = um_pci_get_buf(dev, &posted); + buf = &dev->bufs[buf_idx]; + memcpy(buf, cmd, cmd_size); + + if (posted && extra && extra_size > sizeof(buf) - cmd_size) { + dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, + GFP_ATOMIC); + + if (!dev->extra_ptrs[buf_idx]) { + um_pci_free_buf(dev, buf); + return -ENOMEM; } + extra = dev->extra_ptrs[buf_idx]; + } else if (extra && extra_size <= sizeof(buf) - cmd_size) { + memcpy((u8 *)buf + cmd_size, extra, extra_size); + cmd_size += extra_size; + extra_size = 0; + extra = NULL; + cmd = (void *)buf; } else { cmd = (void *)buf; } @@ -128,39 +167,40 @@ static int um_pci_send_cmd(struct um_pci_device *dev, sg_init_one(&out_sg, cmd, cmd_size); if (extra) sg_init_one(&extra_sg, extra, extra_size); - if (out) + /* allow stack for small buffers */ + if (bounce_out) + sg_init_one(&in_sg, buf->data, out_size); + else if (out) sg_init_one(&in_sg, out, out_size); /* add to internal virtio queue */ ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, extra ? 2 : 1, out ? 1 : 0, - posted ? 
cmd : HANDLE_NO_FREE(cmd), - GFP_ATOMIC); + cmd, GFP_ATOMIC); if (ret) { - if (posted) - kfree(cmd); - goto out; + um_pci_free_buf(dev, buf); + return ret; } if (posted) { virtqueue_kick(dev->cmd_vq); - ret = 0; - goto out; + return 0; } /* kick and poll for getting a response on the queue */ set_bit(UM_PCI_STAT_WAITING, &dev->status); virtqueue_kick(dev->cmd_vq); + ret = 0; while (1) { void *completed = virtqueue_get_buf(dev->cmd_vq, &len); - if (completed == HANDLE_NO_FREE(cmd)) + if (completed == buf) break; - if (completed && !HANDLE_IS_NO_FREE(completed)) - kfree(completed); + if (completed) + um_pci_free_buf(dev, completed); if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || ++delay_count > um_pci_max_delay_us, @@ -172,8 +212,11 @@ static int um_pci_send_cmd(struct um_pci_device *dev, } clear_bit(UM_PCI_STAT_WAITING, &dev->status); -out: - put_cpu_var(um_pci_msg_bufs); + if (bounce_out) + memcpy(out, buf->data, out_size); + + um_pci_free_buf(dev, buf); + return ret; } @@ -187,20 +230,13 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, .size = size, .addr = offset, }; - /* buf->data is maximum size - we may only use parts of it */ - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; - size_t bytes = sizeof(buf->data); + /* max 8, we might not use it all */ + u8 data[8]; if (!dev) return ULONG_MAX; - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; - - if (buf) - memset(data, 0xff, bytes); + memset(data, 0xff, sizeof(data)); switch (size) { case 1: @@ -212,34 +248,26 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - goto out; + return ULONG_MAX; } - if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, bytes)) - goto out; + if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) + return ULONG_MAX; switch (size) { case 1: - ret = data[0]; - break; + return data[0]; case 2: - ret = 
le16_to_cpup((void *)data); - break; + return le16_to_cpup((void *)data); case 4: - ret = le32_to_cpup((void *)data); - break; + return le32_to_cpup((void *)data); #ifdef CONFIG_64BIT case 8: - ret = le64_to_cpup((void *)data); - break; + return le64_to_cpup((void *)data); #endif default: - break; + return ULONG_MAX; } - -out: - put_cpu_var(um_pci_msg_bufs); - return ret; } static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, @@ -312,13 +340,8 @@ static void um_pci_bar_copy_from(void *priv, void *buffer, static unsigned long um_pci_bar_read(void *priv, unsigned int offset, int size) { - /* buf->data is maximum size - we may only use parts of it */ - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; - - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; + /* 8 is maximum size - we may only use parts of it */ + u8 data[8]; switch (size) { case 1: @@ -330,33 +353,25 @@ static unsigned long um_pci_bar_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - goto out; + return ULONG_MAX; } um_pci_bar_copy_from(priv, data, offset, size); switch (size) { case 1: - ret = data[0]; - break; + return data[0]; case 2: - ret = le16_to_cpup((void *)data); - break; + return le16_to_cpup((void *)data); case 4: - ret = le32_to_cpup((void *)data); - break; + return le32_to_cpup((void *)data); #ifdef CONFIG_64BIT case 8: - ret = le64_to_cpup((void *)data); - break; + return le64_to_cpup((void *)data); #endif default: - break; + return ULONG_MAX; } - -out: - put_cpu_var(um_pci_msg_bufs); - return ret; } static void um_pci_bar_copy_to(void *priv, unsigned int offset, @@ -523,11 +538,8 @@ static void um_pci_cmd_vq_cb(struct virtqueue *vq) if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) return; - while ((cmd = virtqueue_get_buf(vq, &len))) { - if (WARN_ON(HANDLE_IS_NO_FREE(cmd))) - continue; - kfree(cmd); - } + while ((cmd = virtqueue_get_buf(vq, &len))) + um_pci_free_buf(dev, 
cmd); } static void um_pci_irq_vq_cb(struct virtqueue *vq) @@ -1006,10 +1018,6 @@ static int __init um_pci_init(void) "No virtio device ID configured for PCI - no PCI support\n")) return 0; - um_pci_msg_bufs = alloc_percpu(struct um_pci_message_buffer); - if (!um_pci_msg_bufs) - return -ENOMEM; - bridge = pci_alloc_host_bridge(0); if (!bridge) { err = -ENOMEM; @@ -1070,7 +1078,6 @@ static int __init um_pci_init(void) pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); } - free_percpu(um_pci_msg_bufs); return err; } module_init(um_pci_init); @@ -1082,6 +1089,5 @@ static void __exit um_pci_exit(void) irq_domain_remove(um_pci_inner_domain); pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); - free_percpu(um_pci_msg_bufs); } module_exit(um_pci_exit); From daa1a05ba431540097ec925d4e01d53ef29a98f1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jan 2025 13:54:05 +0100 Subject: [PATCH 149/310] um: virtio_uml: use raw spinlock This is needed because at least in time-travel the code can be called directly from the deep architecture and IRQ handling code. 
Link: https://patch.msgid.link/20250110125550.32479-7-johannes@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/virtio_uml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 65df43fa9be5..ad8d78fb1d9a 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -52,7 +52,7 @@ struct virtio_uml_device { struct platform_device *pdev; struct virtio_uml_platform_data *pdata; - spinlock_t sock_lock; + raw_spinlock_t sock_lock; int sock, req_fd, irq; u64 features; u64 protocol_features; @@ -246,7 +246,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, if (request_ack) msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY; - spin_lock_irqsave(&vu_dev->sock_lock, flags); + raw_spin_lock_irqsave(&vu_dev->sock_lock, flags); rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds); if (rc < 0) goto out; @@ -266,7 +266,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, } out: - spin_unlock_irqrestore(&vu_dev->sock_lock, flags); + raw_spin_unlock_irqrestore(&vu_dev->sock_lock, flags); return rc; } @@ -1239,7 +1239,7 @@ static int virtio_uml_probe(struct platform_device *pdev) goto error_free; vu_dev->sock = rc; - spin_lock_init(&vu_dev->sock_lock); + raw_spin_lock_init(&vu_dev->sock_lock); rc = vhost_user_init(vu_dev); if (rc) From 96178631c3f53398044ed437010f7632ad764bf8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jan 2025 13:54:06 +0100 Subject: [PATCH 150/310] um: convert irq_lock to raw spinlock Since this is deep in the architecture, and the code is called nested into other deep management code, this really needs to be a raw spinlock. Convert it. 
Link: https://patch.msgid.link/20250110125550.32479-8-johannes@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/irq.c | 79 ++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 338450741aac..a4991746f5ea 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -52,7 +52,7 @@ struct irq_entry { bool sigio_workaround; }; -static DEFINE_SPINLOCK(irq_lock); +static DEFINE_RAW_SPINLOCK(irq_lock); static LIST_HEAD(active_fds); static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); static bool irqs_suspended; @@ -257,7 +257,7 @@ static struct irq_entry *get_irq_entry_by_fd(int fd) return NULL; } -static void free_irq_entry(struct irq_entry *to_free, bool remove) +static void remove_irq_entry(struct irq_entry *to_free, bool remove) { if (!to_free) return; @@ -265,7 +265,6 @@ static void free_irq_entry(struct irq_entry *to_free, bool remove) if (remove) os_del_epoll_fd(to_free->fd); list_del(&to_free->list); - kfree(to_free); } static bool update_irq_entry(struct irq_entry *entry) @@ -286,17 +285,19 @@ static bool update_irq_entry(struct irq_entry *entry) return false; } -static void update_or_free_irq_entry(struct irq_entry *entry) +static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry) { - if (!update_irq_entry(entry)) - free_irq_entry(entry, false); + if (update_irq_entry(entry)) + return NULL; + remove_irq_entry(entry, false); + return entry; } static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, void (*timetravel_handler)(int, int, void *, struct time_travel_event *)) { - struct irq_entry *irq_entry; + struct irq_entry *irq_entry, *to_free = NULL; int err, events = os_event_mask(type); unsigned long flags; @@ -304,9 +305,10 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, if (err < 0) goto out; - spin_lock_irqsave(&irq_lock, 
flags); + raw_spin_lock_irqsave(&irq_lock, flags); irq_entry = get_irq_entry_by_fd(fd); if (irq_entry) { +already: /* cannot register the same FD twice with the same type */ if (WARN_ON(irq_entry->reg[type].events)) { err = -EALREADY; @@ -316,11 +318,22 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, /* temporarily disable to avoid IRQ-side locking */ os_del_epoll_fd(fd); } else { - irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); - if (!irq_entry) { - err = -ENOMEM; - goto out_unlock; + struct irq_entry *new; + + /* don't restore interrupts */ + raw_spin_unlock(&irq_lock); + new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!new) { + local_irq_restore(flags); + return -ENOMEM; } + raw_spin_lock(&irq_lock); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + to_free = new; + goto already; + } + irq_entry = new; irq_entry->fd = fd; list_add_tail(&irq_entry->list, &active_fds); maybe_sigio_broken(fd); @@ -339,12 +352,11 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, #endif WARN_ON(!update_irq_entry(irq_entry)); - spin_unlock_irqrestore(&irq_lock, flags); - - return 0; + err = 0; out_unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); out: + kfree(to_free); return err; } @@ -358,19 +370,20 @@ void free_irq_by_fd(int fd) struct irq_entry *to_free; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); to_free = get_irq_entry_by_fd(fd); - free_irq_entry(to_free, true); - spin_unlock_irqrestore(&irq_lock, flags); + remove_irq_entry(to_free, true); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } EXPORT_SYMBOL(free_irq_by_fd); static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_entry *entry; + struct irq_entry *entry, *to_free = NULL; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); 
list_for_each_entry(entry, &active_fds, list) { enum um_irq_type i; @@ -386,12 +399,13 @@ static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) os_del_epoll_fd(entry->fd); reg->events = 0; - update_or_free_irq_entry(entry); + to_free = update_or_remove_irq_entry(entry); goto out; } } out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } void deactivate_fd(int fd, int irqnum) @@ -402,7 +416,7 @@ void deactivate_fd(int fd, int irqnum) os_del_epoll_fd(fd); - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); entry = get_irq_entry_by_fd(fd); if (!entry) goto out; @@ -414,9 +428,10 @@ void deactivate_fd(int fd, int irqnum) entry->reg[i].events = 0; } - update_or_free_irq_entry(entry); + entry = update_or_remove_irq_entry(entry); out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(entry); ignore_sigio_fd(fd); } @@ -546,7 +561,7 @@ void um_irqs_suspend(void) irqs_suspended = true; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; bool clear = true; @@ -579,7 +594,7 @@ void um_irqs_suspend(void) !__ignore_sigio_fd(entry->fd); } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); } void um_irqs_resume(void) @@ -588,7 +603,7 @@ void um_irqs_resume(void) unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { if (entry->suspended) { int err = os_set_fd_async(entry->fd); @@ -602,7 +617,7 @@ void um_irqs_resume(void) } } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); irqs_suspended = false; send_sigio_to_self(); @@ -613,7 +628,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) struct irq_entry *entry; unsigned long flags; - 
spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; @@ -628,7 +643,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) } } unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); return 0; } #else From 2b4fc4cd43f28e9e39179c8702e6ee821258584d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Feb 2025 15:52:54 -0700 Subject: [PATCH 151/310] io_uring/waitid: setup async data in the prep handler This is the idiomatic way that opcodes should setup their async data, so that it's always valid inside ->issue() without issue needing to do that. Fixes: f31ecf671ddc4 ("io_uring: add IORING_OP_WAITID support") Signed-off-by: Jens Axboe --- io_uring/waitid.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index c4096d93a287..15a7daf3ff4f 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -285,10 +285,16 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa; if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; + iwa = io_uring_alloc_async_data(NULL, req); + if (!unlikely(iwa)) + return -ENOMEM; + iwa->req = req; + iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); @@ -299,16 +305,10 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; - struct io_waitid_async *iwa; int ret; - iwa = io_uring_alloc_async_data(NULL, req); - if (!iwa) - 
return -ENOMEM; - - iwa->req = req; - ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) From 960a62877466067adc89bd37fe36d3b6edddb965 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 10 Feb 2025 18:18:29 -0500 Subject: [PATCH 152/310] drm/amdgpu/pm: fix UVD handing in amdgpu_dpm_set_powergating_by_smu() UVD and VCN were split into separate dpm helpers in commit ff69bba05f08 ("drm/amd/pm: add inst to dpm_set_powergating_by_smu") as such, there is no need to include UVD in the is_vcn variable since UVD and VCN are handled by separate dpm helpers now. Fix the check. Fixes: ff69bba05f08 ("drm/amd/pm: add inst to dpm_set_powergating_by_smu") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3959 Link: https://lists.freedesktop.org/archives/amd-gfx/2025-February/119827.html Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: Boyuan Zhang --- drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 6a9e26905edf..7a22aef6e59c 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -78,7 +78,7 @@ int amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev, int ret = 0; const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; enum ip_power_state pwr_state = gate ? 
POWER_STATE_OFF : POWER_STATE_ON; - bool is_vcn = (block_type == AMD_IP_BLOCK_TYPE_UVD || block_type == AMD_IP_BLOCK_TYPE_VCN); + bool is_vcn = block_type == AMD_IP_BLOCK_TYPE_VCN; if (atomic_read(&adev->pm.pwr_state[block_type]) == pwr_state && (!is_vcn || adev->vcn.num_vcn_inst == 1)) { From b35eb9128ebeec534eed1cefd6b9b1b7282cf5ba Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 28 Jan 2025 11:55:22 -0500 Subject: [PATCH 153/310] drm/amdgpu/gfx9: manually control gfxoff for CS on RV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mesa started using compute queues more often we started seeing additional hangs with compute queues. Disabling gfxoff seems to mitigate that. Manually control gfxoff and gfx pg with command submissions to avoid any issues related to gfxoff. KFD already does the same thing for these chips. v2: limit to compute v3: limit to APUs v4: limit to Raven/PCO v5: only update the compute ring_funcs v6: Disable GFX PG v7: adjust order Reviewed-by: Lijo Lazar Suggested-by: Błażej Szczygieł Suggested-by: Sergey Kovalenko Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3861 Link: https://lists.freedesktop.org/archives/amd-gfx/2025-January/119116.html Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.12.x --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 36 +++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index fa572b40989e..0dce4421418c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -7437,6 +7437,38 @@ static void gfx_v9_0_ring_emit_cleaner_shader(struct amdgpu_ring *ring) amdgpu_ring_write(ring, 0); /* RESERVED field, programmed to zero */ } +static void gfx_v9_0_ring_begin_use_compute(struct amdgpu_ring *ring) +{ + struct amdgpu_device *adev = ring->adev; + struct amdgpu_ip_block *gfx_block = + 
amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); + + amdgpu_gfx_enforce_isolation_ring_begin_use(ring); + + /* Raven and PCO APUs seem to have stability issues + * with compute and gfxoff and gfx pg. Disable gfx pg during + * submission and allow again afterwards. + */ + if (gfx_block && amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 1, 0)) + gfx_v9_0_set_powergating_state(gfx_block, AMD_PG_STATE_UNGATE); +} + +static void gfx_v9_0_ring_end_use_compute(struct amdgpu_ring *ring) +{ + struct amdgpu_device *adev = ring->adev; + struct amdgpu_ip_block *gfx_block = + amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); + + /* Raven and PCO APUs seem to have stability issues + * with compute and gfxoff and gfx pg. Disable gfx pg during + * submission and allow again afterwards. + */ + if (gfx_block && amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 1, 0)) + gfx_v9_0_set_powergating_state(gfx_block, AMD_PG_STATE_GATE); + + amdgpu_gfx_enforce_isolation_ring_end_use(ring); +} + static const struct amd_ip_funcs gfx_v9_0_ip_funcs = { .name = "gfx_v9_0", .early_init = gfx_v9_0_early_init, @@ -7613,8 +7645,8 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { .emit_wave_limit = gfx_v9_0_emit_wave_limit, .reset = gfx_v9_0_reset_kcq, .emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader, - .begin_use = amdgpu_gfx_enforce_isolation_ring_begin_use, - .end_use = amdgpu_gfx_enforce_isolation_ring_end_use, + .begin_use = gfx_v9_0_ring_begin_use_compute, + .end_use = gfx_v9_0_ring_end_use_compute, }; static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = { From 55ed2b1b50d029dd7e49a35f6628ca64db6d75d8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 31 Jan 2025 13:53:40 -0500 Subject: [PATCH 154/310] drm/amdgpu: bump version for RV/PCO compute fix Bump the driver version for RV/PCO compute stability fix so mesa can use this check to enable compute queues on RV/PCO. 
Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.12.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index dce9323fb410..95a05b03f799 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -120,9 +120,10 @@ * - 3.58.0 - Add GFX12 DCC support * - 3.59.0 - Cleared VRAM * - 3.60.0 - Add AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE (Vulkan requirement) + * - 3.61.0 - Contains fix for RV/PCO compute queues */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 60 +#define KMS_DRIVER_MINOR 61 #define KMS_DRIVER_PATCHLEVEL 0 /* From a33f7f9660705fb2ecf3467b2c48965564f392ce Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Sun, 26 Jan 2025 17:21:10 +0800 Subject: [PATCH 155/310] amdkfd: properly free gang_ctx_bo when failed to init user queue The destructor of a gtt bo is declared as void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj); which takes void** as the second parameter. GCC allows passing void* to the function because void* can be implicitly converted to any other pointer type, so the code compiles. However, the function expects a void** and dereferences it, so passing a plain void* will result in errors at run time.
Signed-off-by: Zhu Lingshan Reviewed-by: Felix Kuehling Fixes: fb91065851cd ("drm/amdkfd: Refactor queue wptr_bo GART mapping") Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index bcddd989c7f3..bd36a75309e1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -300,7 +300,7 @@ static int init_user_queue(struct process_queue_manager *pqm, return 0; free_gang_ctx_bo: - amdgpu_amdkfd_free_gtt_mem(dev->adev, (*q)->gang_ctx_bo); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &(*q)->gang_ctx_bo); cleanup: uninit_queue(*q); *q = NULL; From a0a455b4bc7483ad60e8b8a50330c1e05bb7bfcf Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Feb 2025 14:28:51 +0800 Subject: [PATCH 156/310] drm/amdgpu: bail out when failed to load fw in psp_init_cap_microcode() In function psp_init_cap_microcode(), it should bail out when failed to load firmware, otherwise it may cause invalid memory access. 
Fixes: 07dbfc6b102e ("drm/amd: Use `amdgpu_ucode_*` helpers for PSP") Reviewed-by: Lijo Lazar Signed-off-by: Jiang Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index babe94ade247..e5fc80ed06ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -3815,9 +3815,10 @@ int psp_init_cap_microcode(struct psp_context *psp, const char *chip_name) if (err == -ENODEV) { dev_warn(adev->dev, "cap microcode does not exist, skip\n"); err = 0; - goto out; + } else { + dev_err(adev->dev, "fail to initialize cap microcode\n"); } - dev_err(adev->dev, "fail to initialize cap microcode\n"); + goto out; } info = &adev->firmware.ucode[AMDGPU_UCODE_ID_CAP]; From d584198a6fe4c51f4aa88ad72f258f8961a0f11c Mon Sep 17 00:00:00 2001 From: Lancelot SIX Date: Tue, 28 Jan 2025 19:16:49 +0000 Subject: [PATCH 157/310] drm/amdkfd: Ensure consistent barrier state saved in gfx12 trap handler It is possible for some waves in a workgroup to finish their save sequence before the group leader has had time to capture the workgroup barrier state. When this happens, having those waves exit does impact the barrier state. As a consequence, the state captured by the group leader is invalid, and is eventually incorrectly restored. This patch proposes to have all waves in a workgroup wait for each other at the end of their save sequence (just before calling s_endpgm_saved).
Signed-off-by: Lancelot SIX Reviewed-by: Jay Cornwall Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.12.x --- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 3 ++- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 984f0e705078..651660958e5b 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -4121,7 +4121,8 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0x0000ffff, 0x8bfe7e7e, 0x8bea6a6a, 0xb97af804, 0xbe804ec2, 0xbf94fffe, - 0xbe804a6c, 0xbfb10000, + 0xbe804a6c, 0xbe804ec2, + 0xbf94fffe, 0xbfb10000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0x00000000, diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm index 1740e98c6719..7b9d36e5fa43 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm @@ -1049,6 +1049,10 @@ L_SKIP_BARRIER_RESTORE: s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution L_END_PGM: + // Make sure that no wave of the workgroup can exit the trap handler + // before the workgroup barrier state is saved. + s_barrier_signal -2 + s_barrier_wait -2 s_endpgm_saved end From 1abb2648698bf10783d2236a6b4a7ca5e8021699 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Feb 2025 14:44:14 +0800 Subject: [PATCH 158/310] drm/amdgpu: avoid buffer overflow attack in smu_sys_set_pp_table() If a malicious user provides a small pptable through sysfs and then a bigger pptable, it may cause a buffer overflow attack in function smu_sys_set_pp_table().
Reviewed-by: Lijo Lazar Signed-off-by: Jiang Liu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 8ca793c222ff..ed9dac00ebfb 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -612,7 +612,8 @@ static int smu_sys_set_pp_table(void *handle, return -EIO; } - if (!smu_table->hardcode_pptable) { + if (!smu_table->hardcode_pptable || smu_table->power_play_table_size < size) { + kfree(smu_table->hardcode_pptable); smu_table->hardcode_pptable = kzalloc(size, GFP_KERNEL); if (!smu_table->hardcode_pptable) return -ENOMEM; From e00a2e5d485faf53c7a24b9d1b575a642227947f Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 12 Feb 2025 18:18:51 +0200 Subject: [PATCH 159/310] drm: Fix DSC BPP increment decoding Starting with DPCD version 2.0 bits 6:3 of the DP_DSC_BITS_PER_PIXEL_INC DPCD register contains the NativeYCbCr422_MAX_bpp_DELTA field, which can be non-zero as opposed to earlier DPCD versions, hence decoding the bit_per_pixel increment value at bits 2:0 in the same register requires applying a mask, do so. 
Cc: Ankit Nautiyal Fixes: 0c2287c96521 ("drm/display/dp: Add helper function to get DSC bpp precision") Reviewed-by: Jani Nikula Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20250212161851.4007005-1-imre.deak@intel.com --- drivers/gpu/drm/display/drm_dp_helper.c | 2 +- include/drm/display/drm_dp.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/display/drm_dp_helper.c b/drivers/gpu/drm/display/drm_dp_helper.c index da3c8521a7fa..61c7c2c588c6 100644 --- a/drivers/gpu/drm/display/drm_dp_helper.c +++ b/drivers/gpu/drm/display/drm_dp_helper.c @@ -2544,7 +2544,7 @@ u8 drm_dp_dsc_sink_bpp_incr(const u8 dsc_dpcd[DP_DSC_RECEIVER_CAP_SIZE]) { u8 bpp_increment_dpcd = dsc_dpcd[DP_DSC_BITS_PER_PIXEL_INC - DP_DSC_SUPPORT]; - switch (bpp_increment_dpcd) { + switch (bpp_increment_dpcd & DP_DSC_BITS_PER_PIXEL_MASK) { case DP_DSC_BITS_PER_PIXEL_1_16: return 16; case DP_DSC_BITS_PER_PIXEL_1_8: diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h index a6f8b098c56f..3bd9f482f0c3 100644 --- a/include/drm/display/drm_dp.h +++ b/include/drm/display/drm_dp.h @@ -359,6 +359,7 @@ # define DP_DSC_BITS_PER_PIXEL_1_4 0x2 # define DP_DSC_BITS_PER_PIXEL_1_2 0x3 # define DP_DSC_BITS_PER_PIXEL_1_1 0x4 +# define DP_DSC_BITS_PER_PIXEL_MASK 0x7 #define DP_PSR_SUPPORT 0x070 /* XXX 1.2? */ # define DP_PSR_IS_SUPPORTED 1 From d923782b041218ef3804b2fed87619b5b1a497f3 Mon Sep 17 00:00:00 2001 From: Beata Michalska Date: Fri, 31 Jan 2025 15:58:42 +0000 Subject: [PATCH 160/310] arm64: amu: Delay allocating cpumask for AMU FIE support For the time being, the amu_fie_cpus cpumask is being exclusively used by the AMU-related internals of FIE support and is guaranteed to be valid on every access currently made. Still the mask is not being invalidated on one of the error handling code paths, which leaves a soft spot with theoretical risk of UAF for CPUMASK_OFFSTACK cases. 
To make things sound, delay allocating said cpumask (for CPUMASK_OFFSTACK) avoiding otherwise nasty sanitising case failing to register the cpufreq policy notifications. Signed-off-by: Beata Michalska Reviewed-by: Prasanna Kumar T S M Reviewed-by: Sumit Gupta Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250131155842.3839098-1-beata.michalska@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 1a2c72f3e7f8..cb180684d10d 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -194,12 +194,19 @@ static void amu_fie_setup(const struct cpumask *cpus) int cpu; /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_available(amu_fie_cpus) && + unlikely(cpumask_subset(cpus, amu_fie_cpus))) return; - for_each_cpu(cpu, cpus) { + for_each_cpu(cpu, cpus) if (!freq_counters_valid(cpu)) return; + + if (!cpumask_available(amu_fie_cpus) && + !zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { + WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n", + cpumask_pr_args(cpus)); + return; } cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); @@ -237,17 +244,8 @@ static struct notifier_block init_amu_fie_notifier = { static int __init init_amu_fie(void) { - int ret; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) - return -ENOMEM; - - ret = cpufreq_register_notifier(&init_amu_fie_notifier, + return cpufreq_register_notifier(&init_amu_fie_notifier, CPUFREQ_POLICY_NOTIFIER); - if (ret) - free_cpumask_var(amu_fie_cpus); - - return ret; } core_initcall(init_amu_fie); From f818227a2f3d1d4f26469347e428323d61cc83f0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 28 Jan 2025 00:17:49 +0000 Subject: [PATCH 161/310] ACPI: GTDT: Relax sanity checking on Platform Timers array count Perhaps 
unsurprisingly there are some platforms where the GTDT isn't quite right and the Platforms Timer array overflows the length of the overall table. While the recently-added sanity checking isn't wrong, it makes it impossible to boot the kernel on offending platforms. Try to hobble along and limit the Platform Timer count to the bounds of the table. Cc: Marc Zyngier Cc: Lorenzo Pieralisi Cc: Zheng Zengkai Cc: stable@vger.kernel.org Fixes: 263e22d6bd1f ("ACPI: GTDT: Tighten the check for the array of platform timer structures") Signed-off-by: Oliver Upton Acked-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20250128001749.3132656-1-oliver.upton@linux.dev Signed-off-by: Will Deacon --- drivers/acpi/arm64/gtdt.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 3561553eff8b..70f8290b659d 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -163,7 +163,7 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, { void *platform_timer; struct acpi_table_gtdt *gtdt; - int cnt = 0; + u32 cnt = 0; gtdt = container_of(table, struct acpi_table_gtdt, header); acpi_gtdt_desc.gtdt = gtdt; @@ -188,13 +188,17 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, cnt++; if (cnt != gtdt->platform_timer_count) { + cnt = min(cnt, gtdt->platform_timer_count); + pr_err(FW_BUG "limiting Platform Timer count to %d\n", cnt); + } + + if (!cnt) { acpi_gtdt_desc.platform_timer = NULL; - pr_err(FW_BUG "invalid timer data.\n"); - return -EINVAL; + return 0; } if (platform_timer_count) - *platform_timer_count = gtdt->platform_timer_count; + *platform_timer_count = cnt; return 0; } From a4cc8494f1d853a0945d2a655b4891935d717355 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 Feb 2025 00:30:42 +0000 Subject: [PATCH 162/310] arm64: Add missing registrations of hwcaps Commit 819935464cb2 ("arm64/hwcap: Describe 2024 dpISA extensions to 
userspace") added definitions for HWCAP_FPRCVT, HWCAP_F8MM8 and HWCAP_F8MM4 but did not include the crucial registration in arm64_elf_hwcaps. Add it. Fixes: 819935464cb2 ("arm64/hwcap: Describe 2024 dpISA extensions to userspace") Reported-by: Mark Rutland Signed-off-by: Mark Brown Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20250212-arm64-fix-2024-dpisa-v2-1-67a1c11d6001@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index f0910f20fbf8..d561cf3b8ac7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3091,6 +3091,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), @@ -3190,6 +3191,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), #ifdef CONFIG_ARM64_POE From 571b69f2f9b1ec7cf7d0e9b79e52115a87a869c4 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Thu, 
13 Feb 2025 15:05:18 +0800 Subject: [PATCH 163/310] ASoC: imx-audmix: remove cpu_mclk which is from cpu dai device When defer probe happens, there may be below error: platform 59820000.sai: Resources present before probing The cpu_mclk clock is from the cpu dai device, if it is not released, then the cpu dai device probe will fail for the second time. The cpu_mclk is used to get rate for rate constraint, rate constraint may be specific for each platform, which is not necessary for machine driver, so remove it. Fixes: b86ef5367761 ("ASoC: fsl: Add Audio Mixer machine driver") Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/20250213070518.547375-1-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/imx-audmix.c | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/sound/soc/fsl/imx-audmix.c b/sound/soc/fsl/imx-audmix.c index 231400661c90..50ecc5f51100 100644 --- a/sound/soc/fsl/imx-audmix.c +++ b/sound/soc/fsl/imx-audmix.c @@ -23,7 +23,6 @@ struct imx_audmix { struct snd_soc_card card; struct platform_device *audmix_pdev; struct platform_device *out_pdev; - struct clk *cpu_mclk; int num_dai; struct snd_soc_dai_link *dai; int num_dai_conf; @@ -32,34 +31,11 @@ struct imx_audmix { struct snd_soc_dapm_route *dapm_routes; }; -static const u32 imx_audmix_rates[] = { - 8000, 12000, 16000, 24000, 32000, 48000, 64000, 96000, -}; - -static const struct snd_pcm_hw_constraint_list imx_audmix_rate_constraints = { - .count = ARRAY_SIZE(imx_audmix_rates), - .list = imx_audmix_rates, -}; - static int imx_audmix_fe_startup(struct snd_pcm_substream *substream) { - struct snd_soc_pcm_runtime *rtd = snd_soc_substream_to_rtd(substream); - struct imx_audmix *priv = snd_soc_card_get_drvdata(rtd->card); struct snd_pcm_runtime *runtime = substream->runtime; - struct device *dev = rtd->card->dev; - unsigned long clk_rate = clk_get_rate(priv->cpu_mclk); int ret; - if (clk_rate % 24576000 == 0) { - ret = snd_pcm_hw_constraint_list(runtime, 
0, - SNDRV_PCM_HW_PARAM_RATE, - &imx_audmix_rate_constraints); - if (ret < 0) - return ret; - } else { - dev_warn(dev, "mclk may be not supported %lu\n", clk_rate); - } - ret = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_CHANNELS, 1, 8); if (ret < 0) @@ -323,13 +299,6 @@ static int imx_audmix_probe(struct platform_device *pdev) } put_device(&cpu_pdev->dev); - priv->cpu_mclk = devm_clk_get(&cpu_pdev->dev, "mclk1"); - if (IS_ERR(priv->cpu_mclk)) { - ret = PTR_ERR(priv->cpu_mclk); - dev_err(&cpu_pdev->dev, "failed to get DAI mclk1: %d\n", ret); - return ret; - } - priv->audmix_pdev = audmix_pdev; priv->out_pdev = cpu_pdev; From ed975485a13d1f6080218aa71c29425ba2dfb332 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 11 Feb 2025 18:22:30 +0000 Subject: [PATCH 164/310] MIPS: Export syscall stack arguments properly for remote use We have several places across the kernel where we want to access another task's syscall arguments, such as ptrace(2), seccomp(2), etc., by making a call to syscall_get_arguments(). This works for register arguments right away by accessing the task's `regs' member of `struct pt_regs', however for stack arguments seen with 32-bit/o32 kernels things are more complicated. Technically they ought to be obtained from the user stack with calls to an access_remote_vm(), but we have an easier way available already. So as to be able to access syscall stack arguments as regular function arguments following the MIPS calling convention we copy them over from the user stack to the kernel stack in arch/mips/kernel/scall32-o32.S, in handle_sys(), to the current stack frame's outgoing argument space at the top of the stack, which is where the handler called expects to see its incoming arguments. This area is also pointed at by the `pt_regs' pointer obtained by task_pt_regs(). 
Make the o32 stack argument space a proper member of `struct pt_regs' then, by renaming the existing member from `pad0' to `args' and using generated offsets to access the space. No functional change though. With the change in place the o32 kernel stack frame layout at the entry to a syscall handler invoked by handle_sys() is therefore as follows: $sp + 68 -> | ... | <- pt_regs.regs[9] +---------------------+ $sp + 64 -> | $t0 | <- pt_regs.regs[8] +---------------------+ $sp + 60 -> | $a3/argument #4 | <- pt_regs.regs[7] +---------------------+ $sp + 56 -> | $a2/argument #3 | <- pt_regs.regs[6] +---------------------+ $sp + 52 -> | $a1/argument #2 | <- pt_regs.regs[5] +---------------------+ $sp + 48 -> | $a0/argument #1 | <- pt_regs.regs[4] +---------------------+ $sp + 44 -> | $v1 | <- pt_regs.regs[3] +---------------------+ $sp + 40 -> | $v0 | <- pt_regs.regs[2] +---------------------+ $sp + 36 -> | $at | <- pt_regs.regs[1] +---------------------+ $sp + 32 -> | $zero | <- pt_regs.regs[0] +---------------------+ $sp + 28 -> | stack argument #8 | <- pt_regs.args[7] +---------------------+ $sp + 24 -> | stack argument #7 | <- pt_regs.args[6] +---------------------+ $sp + 20 -> | stack argument #6 | <- pt_regs.args[5] +---------------------+ $sp + 16 -> | stack argument #5 | <- pt_regs.args[4] +---------------------+ $sp + 12 -> | psABI space for $a3 | <- pt_regs.args[3] +---------------------+ $sp + 8 -> | psABI space for $a2 | <- pt_regs.args[2] +---------------------+ $sp + 4 -> | psABI space for $a1 | <- pt_regs.args[1] +---------------------+ $sp + 0 -> | psABI space for $a0 | <- pt_regs.args[0] +---------------------+ holding user data received and with the first 4 frame slots reserved by the psABI for the compiler to spill the incoming arguments from $a0-$a3 registers (which it sometimes does according to its needs) and the next 4 frame slots designated by the psABI for any stack function arguments that follow. 
This data is also available for other tasks to peek/poke at as required and where permitted. Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/ptrace.h | 4 ++-- arch/mips/kernel/asm-offsets.c | 6 ++++++ arch/mips/kernel/scall32-o32.S | 8 ++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 4a2b40ce39e0..85fa9962266a 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -27,8 +27,8 @@ */ struct pt_regs { #ifdef CONFIG_32BIT - /* Pad bytes for argument save space on the stack. */ - unsigned long pad0[8]; + /* Saved syscall stack arguments; entries 0-3 unused. */ + unsigned long args[8]; #endif /* Saved main processor registers. */ diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index cb1045ebab06..b910ec54a3a1 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -27,6 +27,12 @@ void output_ptreg_defines(void); void output_ptreg_defines(void) { COMMENT("MIPS pt_regs offsets."); +#ifdef CONFIG_32BIT + OFFSET(PT_ARG4, pt_regs, args[4]); + OFFSET(PT_ARG5, pt_regs, args[5]); + OFFSET(PT_ARG6, pt_regs, args[6]); + OFFSET(PT_ARG7, pt_regs, args[7]); +#endif OFFSET(PT_R0, pt_regs, regs[0]); OFFSET(PT_R1, pt_regs, regs[1]); OFFSET(PT_R2, pt_regs, regs[2]); diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 2c604717e630..4947a4f39e37 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -64,10 +64,10 @@ load_a6: user_lw(t7, 24(t0)) # argument #7 from usp load_a7: user_lw(t8, 28(t0)) # argument #8 from usp loads_done: - sw t5, 16(sp) # argument #5 to ksp - sw t6, 20(sp) # argument #6 to ksp - sw t7, 24(sp) # argument #7 to ksp - sw t8, 28(sp) # argument #8 to ksp + sw t5, PT_ARG4(sp) # argument #5 to ksp + sw t6, PT_ARG5(sp) # argument #6 to ksp + sw t7, PT_ARG6(sp) # argument #7 to ksp + sw t8,
PT_ARG7(sp) # argument #8 to ksp .set pop .section __ex_table,"a" From 733a90561ad0a4a74035d2d627098da85d43b592 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 12 Feb 2025 01:02:09 +0200 Subject: [PATCH 165/310] MIPS: fix mips_get_syscall_arg() for o32 This makes ptrace/get_syscall_info selftest pass on mips o32 and mips64 o32 by fixing the following two test assertions: 1. get_syscall_info test assertion on mips o32: # get_syscall_info.c:218:get_syscall_info:Expected exp_args[5] (3134521044) == info.entry.args[4] (4911432) # get_syscall_info.c:219:get_syscall_info:wait #1: entry stop mismatch 2. get_syscall_info test assertion on mips64 o32: # get_syscall_info.c:209:get_syscall_info:Expected exp_args[2] (3134324433) == info.entry.args[1] (18446744072548908753) # get_syscall_info.c:210:get_syscall_info:wait #1: entry stop mismatch The first assertion happens due to mips_get_syscall_arg() trying to access another task's context but failing to do it properly because get_user() it calls just peeks at the current task's context. It usually does not crash because the default user stack always gets assigned the same VMA, but it is pure luck which mips_get_syscall_arg() wouldn't have if e.g. the stack was switched (via setcontext(3) or however) or a non-default process's thread peeked at, and in any case irrelevant data is obtained just as observed with the test case. mips_get_syscall_arg() ought to be using access_remote_vm() instead to retrieve the other task's stack contents, but given that the data has been already obtained and saved in `struct pt_regs' it would be an overkill. The first assertion is fixed for mips o32 by using struct pt_regs.args instead of get_user() to obtain syscall arguments. This approach works due to this piece in arch/mips/kernel/scall32-o32.S: /* * Ok, copy the args from the luser stack to the kernel stack. 
*/ .set push .set noreorder .set nomacro load_a4: user_lw(t5, 16(t0)) # argument #5 from usp load_a5: user_lw(t6, 20(t0)) # argument #6 from usp load_a6: user_lw(t7, 24(t0)) # argument #7 from usp load_a7: user_lw(t8, 28(t0)) # argument #8 from usp loads_done: sw t5, PT_ARG4(sp) # argument #5 to ksp sw t6, PT_ARG5(sp) # argument #6 to ksp sw t7, PT_ARG6(sp) # argument #7 to ksp sw t8, PT_ARG7(sp) # argument #8 to ksp .set pop .section __ex_table,"a" PTR_WD load_a4, bad_stack_a4 PTR_WD load_a5, bad_stack_a5 PTR_WD load_a6, bad_stack_a6 PTR_WD load_a7, bad_stack_a7 .previous arch/mips/kernel/scall64-o32.S has analogous code for mips64 o32 that allows fixing the issue by obtaining syscall arguments from struct pt_regs.regs[4..11] instead of the erroneous use of get_user(). The second assertion is fixed by truncating 64-bit values to 32-bit syscall arguments. Fixes: c0ff3c53d4f9 ("MIPS: Enable HAVE_ARCH_TRACEHOOK.") Signed-off-by: Dmitry V. Levin Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/syscall.h | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index ebdf4d910af2..056aa1b713e2 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -57,37 +57,21 @@ static inline void mips_syscall_update_nr(struct task_struct *task, static inline void mips_get_syscall_arg(unsigned long *arg, struct task_struct *task, struct pt_regs *regs, unsigned int n) { - unsigned long usp __maybe_unused = regs->regs[29]; - +#ifdef CONFIG_32BIT switch (n) { case 0: case 1: case 2: case 3: *arg = regs->regs[4 + n]; - - return; - -#ifdef CONFIG_32BIT - case 4: case 5: case 6: case 7: - get_user(*arg, (int *)usp + n); return; -#endif - -#ifdef CONFIG_64BIT case 4: case 5: case 6: case 7: -#ifdef CONFIG_MIPS32_O32 - if (test_tsk_thread_flag(task, TIF_32BIT_REGS)) - get_user(*arg, (int *)usp + n); - else -#endif - *arg = regs->regs[4 + n]; - 
+ *arg = regs->args[n]; return; -#endif - - default: - BUG(); } - - unreachable(); +#else + *arg = regs->regs[4 + n]; + if ((IS_ENABLED(CONFIG_MIPS32_O32) && + test_tsk_thread_flag(task, TIF_32BIT_REGS))) + *arg = (unsigned int)*arg; +#endif } static inline long syscall_get_error(struct task_struct *task, From 446a8351f160d65a1c5df7097f31c74102ed2bb1 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 10 Feb 2025 17:37:32 +0100 Subject: [PATCH 166/310] arm64: rust: clean Rust 1.85.0 warning using softfloat target Starting with Rust 1.85.0 (to be released 2025-02-20), `rustc` warns [1] about disabling neon in the aarch64 hardfloat target: warning: target feature `neon` cannot be toggled with `-Ctarget-feature`: unsound on hard-float targets because it changes float ABI | = note: this was previously accepted by the compiler but is being phased out; it will become a hard error in a future release! = note: for more information, see issue #116344 Thus, instead, use the softfloat target instead. While trying it out, I found that the kernel sanitizers were not enabled for that built-in target [2]. Upstream Rust agreed to backport the enablement for the current beta so that it is ready for the 1.85.0 release [3] -- thanks! However, that still means that before Rust 1.85.0, we cannot switch since sanitizers could be in use. Thus conditionally do so. Cc: stable@vger.kernel.org # Needed in 6.12.y and 6.13.y only (Rust is pinned in older LTSs). 
Cc: Catalin Marinas Cc: Will Deacon Cc: Matthew Maurer Cc: Alice Ryhl Cc: Ralf Jung Cc: Jubilee Young Link: https://github.com/rust-lang/rust/pull/133417 [1] Link: https://rust-lang.zulipchat.com/#narrow/channel/131828-t-compiler/topic/arm64.20neon.20.60-Ctarget-feature.60.20warning/near/495358442 [2] Link: https://github.com/rust-lang/rust/pull/135905 [3] Link: https://github.com/rust-lang/rust/issues/116344 Signed-off-by: Miguel Ojeda Reviewed-by: Trevor Gross Tested-by: Matthew Maurer Reviewed-by: Ralf Jung Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250210163732.281786-1-ojeda@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 358c68565bfd..2b25d671365f 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -48,7 +48,11 @@ KBUILD_CFLAGS += $(CC_FLAGS_NO_FPU) \ KBUILD_CFLAGS += $(call cc-disable-warning, psabi) KBUILD_AFLAGS += $(compat_vdso) +ifeq ($(call test-ge, $(CONFIG_RUSTC_VERSION), 108500),y) +KBUILD_RUSTFLAGS += --target=aarch64-unknown-none-softfloat +else KBUILD_RUSTFLAGS += --target=aarch64-unknown-none -Ctarget-feature="-neon" +endif KBUILD_CFLAGS += $(call cc-option,-mabi=lp64) KBUILD_AFLAGS += $(call cc-option,-mabi=lp64) From 85fcb57c983f423180ba6ec5d0034242da05cc54 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 10 Feb 2025 08:43:39 +0100 Subject: [PATCH 167/310] xen/swiotlb: relax alignment requirements When mapping a buffer for DMA via .map_page or .map_sg DMA operations, there is no need to check the machine frames to be aligned according to the mapped area's size. All that is needed in these cases is that the buffer is contiguous at machine level. So carve out the alignment check from range_straddles_page_boundary() and move it to a helper called by xen_swiotlb_alloc_coherent() and xen_swiotlb_free_coherent() directly.
Fixes: 9f40ec84a797 ("xen/swiotlb: add alignment check for dma buffers") Reported-by: Jan Vejvalka Tested-by: Jan Vejvalka Signed-off-by: Juergen Gross Reviewed-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index a337edcf8faf..26c62e0d34e9 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -74,19 +74,21 @@ static inline phys_addr_t xen_dma_to_phys(struct device *dev, return xen_bus_to_phys(dev, dma_to_phys(dev, dma_addr)); } +static inline bool range_requires_alignment(phys_addr_t p, size_t size) +{ + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); + phys_addr_t bus_addr = pfn_to_bfn(XEN_PFN_DOWN(p)) << XEN_PAGE_SHIFT; + + return IS_ALIGNED(p, algn) && !IS_ALIGNED(bus_addr, algn); +} + static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); - phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); next_bfn = pfn_to_bfn(xen_pfn); - /* If buffer is physically aligned, ensure DMA alignment. 
*/ - if (IS_ALIGNED(p, algn) && - !IS_ALIGNED((phys_addr_t)next_bfn << XEN_PAGE_SHIFT, algn)) - return 1; - for (i = 1; i < nr_pages; i++) if (pfn_to_bfn(++xen_pfn) != ++next_bfn) return 1; @@ -156,7 +158,8 @@ xen_swiotlb_alloc_coherent(struct device *dev, size_t size, *dma_handle = xen_phys_to_dma(dev, phys); if (*dma_handle + size - 1 > dma_mask || - range_straddles_page_boundary(phys, size)) { + range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size)) { if (xen_create_contiguous_region(phys, order, fls64(dma_mask), dma_handle) != 0) goto out_free_pages; @@ -182,7 +185,8 @@ xen_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, size = ALIGN(size, XEN_PAGE_SIZE); if (WARN_ON_ONCE(dma_handle + size - 1 > dev->coherent_dma_mask) || - WARN_ON_ONCE(range_straddles_page_boundary(phys, size))) + WARN_ON_ONCE(range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size))) return; if (TestClearPageXenRemapped(virt_to_page(vaddr))) From e93ec87286bd1fd30b7389e7a387cfb259f297e3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 11 Feb 2025 11:16:28 +0100 Subject: [PATCH 168/310] x86/xen: allow larger contiguous memory regions in PV guests Today a PV guest (including dom0) can create 2MB contiguous memory regions for DMA buffers at max. This has led to problems at least with the megaraid_sas driver, which wants to allocate a 2.3MB DMA buffer. The limiting factor is the frame array used to do the hypercall for making the memory contiguous, which has 512 entries and is just a static array in mmu_pv.c. In order to not waste memory for non-PV guests, put the initial frame array into .init.data section and dynamically allocate an array from the .init_after_bootmem hook of PV guests. In case a contiguous memory area larger than the initially supported 2MB is requested, allocate a larger buffer for the frame list. 
Note that such an allocation is tried only after memory management has been initialized properly, which is tested via a flag being set in the .init_after_bootmem hook. Fixes: 9f40ec84a797 ("xen/swiotlb: add alignment check for dma buffers") Signed-off-by: Juergen Gross Tested-by: Alan Robinson Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/xen/mmu_pv.c | 71 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2c70cd35e72c..d078de2c952b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -111,6 +111,51 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; */ static DEFINE_SPINLOCK(xen_reservation_lock); +/* Protected by xen_reservation_lock. */ +#define MIN_CONTIG_ORDER 9 /* 2MB */ +static unsigned int discontig_frames_order = MIN_CONTIG_ORDER; +static unsigned long discontig_frames_early[1UL << MIN_CONTIG_ORDER] __initdata; +static unsigned long *discontig_frames __refdata = discontig_frames_early; +static bool discontig_frames_dyn; + +static int alloc_discontig_frames(unsigned int order) +{ + unsigned long *new_array, *old_array; + unsigned int old_order; + unsigned long flags; + + BUG_ON(order < MIN_CONTIG_ORDER); + BUILD_BUG_ON(sizeof(discontig_frames_early) != PAGE_SIZE); + + new_array = (unsigned long *)__get_free_pages(GFP_KERNEL, + order - MIN_CONTIG_ORDER); + if (!new_array) + return -ENOMEM; + + spin_lock_irqsave(&xen_reservation_lock, flags); + + old_order = discontig_frames_order; + + if (order > discontig_frames_order || !discontig_frames_dyn) { + if (!discontig_frames_dyn) + old_array = NULL; + else + old_array = discontig_frames; + + discontig_frames = new_array; + discontig_frames_order = order; + discontig_frames_dyn = true; + } else { + old_array = new_array; + } + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + free_pages((unsigned long)old_array, old_order - MIN_CONTIG_ORDER); + + 
return 0; +} + /* * Note about cr3 (pagetable base) values: * @@ -814,6 +859,9 @@ static void __init xen_after_bootmem(void) SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); + + if (alloc_discontig_frames(MIN_CONTIG_ORDER)) + BUG(); } static void xen_unpin_page(struct mm_struct *mm, struct page *page, @@ -2203,10 +2251,6 @@ void __init xen_init_mmu_ops(void) memset(dummy_mapping, 0xff, PAGE_SIZE); } -/* Protected by xen_reservation_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1< MAX_CONTIG_ORDER)) - return -ENOMEM; + if (unlikely(order > discontig_frames_order)) { + if (!discontig_frames_dyn) + return -ENOMEM; + + if (alloc_discontig_frames(order)) + return -ENOMEM; + } memset((void *) vstart, 0, PAGE_SIZE << order); spin_lock_irqsave(&xen_reservation_lock, flags); + in_frames = discontig_frames; + /* 1. Zap current PTEs, remembering MFNs. */ xen_zap_pfn_range(vstart, order, in_frames, NULL); @@ -2358,12 +2409,12 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) { - unsigned long *out_frames = discontig_frames, in_frame; + unsigned long *out_frames, in_frame; unsigned long flags; int success; unsigned long vstart; - if (unlikely(order > MAX_CONTIG_ORDER)) + if (unlikely(order > discontig_frames_order)) return; vstart = (unsigned long)phys_to_virt(pstart); @@ -2371,6 +2422,8 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); + out_frames = discontig_frames; + /* 1. Find start MFN of contiguous extent. 
*/ in_frame = virt_to_mfn((void *)vstart); From 75ad02318af2e4ae669e26a79f001bd5e1f97472 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 12 Feb 2025 16:14:38 +0100 Subject: [PATCH 169/310] Xen/swiotlb: mark xen_swiotlb_fixup() __init Its sole user (pci_xen_swiotlb_init()) is __init, too. Signed-off-by: Jan Beulich Reviewed-by: Stefano Stabellini Message-ID: Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 26c62e0d34e9..1f65795cf5d7 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -113,7 +113,7 @@ static struct io_tlb_pool *xen_swiotlb_find_pool(struct device *dev, } #ifdef CONFIG_X86 -int xen_swiotlb_fixup(void *buf, unsigned long nslabs) +int __init xen_swiotlb_fixup(void *buf, unsigned long nslabs) { int rc; unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT); From 4cf7d58620bfc2ebe934e3dfa97208f13f14ab8b Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sun, 9 Feb 2025 09:46:50 +0530 Subject: [PATCH 170/310] genirq: Remove unused CONFIG_GENERIC_PENDING_IRQ_CHIPFLAGS CONFIG_GENERIC_PENDING_IRQ_CHIPFLAGS is not used anymore, hence remove it. Fixes: f94a18249b7f ("genirq: Remove IRQ_MOVE_PCNTXT and related code") Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250209041655.331470-7-apatel@ventanamicro.com --- kernel/irq/Kconfig | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5432418c0fea..875f25ed6f71 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,10 +31,6 @@ config GENERIC_IRQ_EFFECTIVE_AFF_MASK config GENERIC_PENDING_IRQ bool -# Deduce delayed migration from top-level interrupt chip flags -config GENERIC_PENDING_IRQ_CHIPFLAGS - bool - # Support for generic irq migrating off cpu before the cpu is offline.
config GENERIC_IRQ_MIGRATION bool From 81f64e925c29fe6e99f04b131fac1935ac931e81 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 12 Feb 2025 13:35:16 -0600 Subject: [PATCH 171/310] PCI: Avoid FLR for Mediatek MT7922 WiFi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Mediatek MT7922 WiFi device advertises FLR support, but it apparently does not work, and all subsequent config reads return ~0: pci 0000:01:00.0: [14c3:0616] type 00 class 0x028000 PCIe Endpoint pciback 0000:01:00.0: not ready 65535ms after FLR; giving up After an FLR, pci_dev_wait() waits for the device to become ready. Prior to d591f6804e7e ("PCI: Wait for device readiness with Configuration RRS"), it polls PCI_COMMAND until it is something other than PCI_POSSIBLE_ERROR (~0). If it times out, pci_dev_wait() returns -ENOTTY and __pci_reset_function_locked() tries the next available reset method. Typically this is Secondary Bus Reset, which does work, so the MT7922 is eventually usable. After d591f6804e7e, if Configuration Request Retry Status Software Visibility (RRS SV) is enabled, pci_dev_wait() polls PCI_VENDOR_ID until it is something other than the special 0x0001 Vendor ID that indicates a completion with RRS status. When RRS SV is enabled, reads of PCI_VENDOR_ID should return either 0x0001, i.e., the config read was completed with RRS, or a valid Vendor ID. On the MT7922, it seems that all config reads after FLR return ~0 indefinitely. When pci_dev_wait() reads PCI_VENDOR_ID and gets 0xffff, it assumes that's a valid Vendor ID and the device is now ready, so it returns with success. After pci_dev_wait() returns success, we restore config space and continue. Since the MT7922 is not actually ready after the FLR, the restore fails and the device is unusable. We considered changing pci_dev_wait() to continue polling if a PCI_VENDOR_ID read returns either 0x0001 or 0xffff.
This "works" as it did before d591f6804e7e, although we have to wait for the timeout and then fall back to SBR. But it doesn't work for SR-IOV VFs, which *always* return 0xffff as the Vendor ID. Mark Mediatek MT7922 WiFi devices to avoid the use of FLR completely. This will cause fallback to another reset method, such as SBR. Link: https://lore.kernel.org/r/20250212193516.88741-1-helgaas@kernel.org Fixes: d591f6804e7e ("PCI: Wait for device readiness with Configuration RRS") Link: https://github.com/QubesOS/qubes-issues/issues/9689#issuecomment-2582927149 Link: https://lore.kernel.org/r/Z4pHll_6GX7OUBzQ@mail-itl Signed-off-by: Bjorn Helgaas Tested-by: Marek Marczykowski-Górecki Cc: stable@vger.kernel.org --- drivers/pci/quirks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index b84ff7bade82..82b21e34c545 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5522,7 +5522,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); * AMD Matisse USB 3.0 Host Controller 0x149c * Intel 82579LM Gigabit Ethernet Controller 0x1502 * Intel 82579V Gigabit Ethernet Controller 0x1503 - * + * Mediatek MT7922 802.11ax PCI Express Wireless Network Adapter */ static void quirk_no_flr(struct pci_dev *dev) { @@ -5534,6 +5534,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x149c, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x7901, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503, quirk_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_MEDIATEK, 0x0616, quirk_no_flr); /* FLR may cause the SolidRun SNET DPU (rev 0x1) to hang */ static void quirk_no_flr_snet(struct pci_dev *dev) From 1d0013962d220b166d9f7c9fe2746f1542e459a3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:23:59 +0000 Subject: [PATCH 172/310] netfs: Fix a number of read-retry hangs Fix a number of hangs 
in the netfslib read-retry code, including: (1) netfs_reissue_read() doubles up the getting of references on subrequests, thereby leaking the subrequest and causing inode eviction to wait indefinitely. This can lead to the kernel reporting a hang in the filesystem's evict_inode(). Fix this by removing the get from netfs_reissue_read() and adding one to netfs_retry_read_subrequests() to deal with the one place that didn't double up. (2) The loop in netfs_retry_read_subrequests() that retries a sequence of failed subrequests doesn't record whether or not it retried the one that the "subreq" pointer points to when it leaves the loop. It may not if renegotiation/repreparation of the subrequests means that fewer subrequests are needed to span the cumulative range of the sequence. Because it doesn't record this, the piece of code that discards now-superfluous subrequests doesn't know whether it should discard the one "subreq" points to - and so it doesn't. Fix this by noting whether the last subreq it examines is superfluous and if it is, then getting rid of it and all subsequent subrequests. If that one wasn't superfluous, then we would have tried to go round the previous loop again and so there can be no further unretried subrequests in the sequence. (3) netfs_retry_read_subrequests() gets yet an extra ref on any additional subrequests it has to get because it ran out of ones it could reuse due to renegotiation/repreparation shrinking the subrequests. Fix this by removing that extra ref. (4) In netfs_retry_reads(), it was using wait_on_bit() to wait for NETFS_SREQ_IN_PROGRESS to be cleared on all subrequests in the sequence - but netfs_read_subreq_terminated() is now using a wait queue on the request instead and so this wait will never finish. Fix this by waiting on the wait queue instead.
To make this work, a new flag, NETFS_RREQ_RETRYING, is now set around the wait loop to tell the wake-up code to wake up the wait queue rather than requeuing the request's work item. Note that this flag replaces the NETFS_RREQ_NEED_RETRY flag which is no longer used. (5) Whilst not strictly anything to do with the hang, netfs_retry_read_subrequests() was also doubly incrementing the subreq_counter and re-setting the debug index, leaving a gap in the trace. This is also fixed. One of these hangs was observed with 9p and with cifs. Others were forced by manual code injection into fs/afs/file.c. Firstly, afs_prepare_read() was created to provide a changing pattern of maximum subrequest sizes: static int afs_prepare_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; if (!S_ISREG(subreq->rreq->inode->i_mode)) return 0; if (subreq->retry_count < 20) rreq->io_streams[0].sreq_max_len = umax(200, 2222 - subreq->retry_count * 40); else rreq->io_streams[0].sreq_max_len = 3333; return 0; } and pointed to by afs_req_ops. Then the following: struct netfs_io_subrequest *subreq = op->fetch.subreq; if (subreq->error == 0 && S_ISREG(subreq->rreq->inode->i_mode) && subreq->retry_count < 20) { subreq->transferred = subreq->already_done; __clear_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); afs_fetch_data_notify(op); return; } was inserted into afs_fetch_data_success() at the beginning and struct netfs_io_subrequest given an extra field, "already_done" that was set to the value in "subreq->transferred" by netfs_reissue_read(). When reading a 4K file, the subrequests would get gradually smaller, a new subrequest would be allocated around the 3rd retry and then eventually be rendered superfluous when the 20th retry was hit and the limit on the first subrequest was eased.
Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-2-dhowells@redhat.com Tested-by: Marc Dionne Tested-by: Steve French cc: Ihor Solodrai cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Paulo Alcantara cc: Jeff Layton cc: v9fs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 6 ++++-- fs/netfs/read_retry.c | 40 +++++++++++++++++++++++++++--------- include/linux/netfs.h | 2 +- include/trace/events/netfs.h | 4 +++- 4 files changed, 38 insertions(+), 14 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index f65affa5a9e4..636cc5a98ef5 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -470,7 +470,8 @@ void netfs_read_collection_worker(struct work_struct *work) */ void netfs_wake_read_collector(struct netfs_io_request *rreq) { - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { if (!work_pending(&rreq->work)) { netfs_get_request(rreq, netfs_rreq_trace_get_work); if (!queue_work(system_unbound_wq, &rreq->work)) @@ -586,7 +587,8 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ /* If we are at the head of the queue, wake up the collector. 
*/ - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) netfs_wake_read_collector(rreq); netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 2290af0d51ac..8316c4533a51 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -14,7 +14,6 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, { __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); subreq->rreq->netfs_ops->issue_read(subreq); } @@ -48,6 +47,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); subreq->retry_count++; netfs_reset_iter(subreq); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); } } @@ -75,7 +75,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) struct iov_iter source; unsigned long long start, len; size_t part; - bool boundary = false; + bool boundary = false, subreq_superfluous = false; /* Go through the subreqs and find the next span of contiguous * buffer that we then rejig (cifs, for example, needs the @@ -116,8 +116,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) /* Work through the sublist. 
*/ subreq = from; list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { - if (!len) + if (!len) { + subreq_superfluous = true; break; + } subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start - subreq->transferred; subreq->len = len + subreq->transferred; @@ -154,19 +156,21 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); - if (subreq == to) + if (subreq == to) { + subreq_superfluous = false; break; + } } /* If we managed to use fewer subreqs, we can discard the * excess; if we used the same number, then we're done. */ if (!len) { - if (subreq == to) + if (!subreq_superfluous) continue; list_for_each_entry_safe_from(subreq, tmp, &stream->subrequests, rreq_link) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); list_del(&subreq->rreq_link); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); if (subreq == to) @@ -187,14 +191,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start; subreq->len = len; - subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); subreq->stream_nr = stream->stream_nr; subreq->retry_count = 1; trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); list_add(&subreq->rreq_link, &to->rreq_link); to = list_next_entry(to, rreq_link); @@ -256,14 +258,32 @@ void netfs_retry_reads(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. 
*/ list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + continue; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + break; + + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); } + clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); netfs_retry_read_subrequests(rreq); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 071d05d81d38..c86a11cfc4a3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -278,7 +278,7 @@ struct netfs_io_request { #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ -#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ +#define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 6e699cadcb29..f880835f7695 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -99,7 +99,7 @@ EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_need_clear, "N-CLR") \ EM(netfs_sreq_trace_partial_read, "PARTR") \ - EM(netfs_sreq_trace_need_retry, "NRTRY") \ + EM(netfs_sreq_trace_need_retry, "ND-RT") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \ EM(netfs_sreq_trace_progress, "PRGRS") \ @@ -108,7 +108,9 @@ EM(netfs_sreq_trace_short, 
"SHORT") \ EM(netfs_sreq_trace_split, "SPLIT") \ EM(netfs_sreq_trace_submit, "SUBMT") \ + EM(netfs_sreq_trace_superfluous, "SPRFL") \ EM(netfs_sreq_trace_terminated, "TERM ") \ + EM(netfs_sreq_trace_wait_for, "_WAIT") \ EM(netfs_sreq_trace_write, "WRITE") \ EM(netfs_sreq_trace_write_skip, "SKIP ") \ E_(netfs_sreq_trace_write_term, "WTERM") From d01c495f432ce34df8bfd092e71720a2cf169a90 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:24:00 +0000 Subject: [PATCH 173/310] netfs: Add retry stat counters Add stat counters to count the number of request and subrequest retries and display them in /proc/fs/netfs/stats. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-3-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/internal.h | 4 ++++ fs/netfs/read_retry.c | 3 +++ fs/netfs/stats.c | 9 +++++++++ fs/netfs/write_issue.c | 1 + fs/netfs/write_retry.c | 2 ++ 5 files changed, 19 insertions(+) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index eb76f98c894b..1c4f953c3d68 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -135,6 +135,8 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_rh_retry_read_req; +extern atomic_t netfs_n_rh_retry_read_subreq; extern atomic_t netfs_n_wh_buffered_write; extern atomic_t netfs_n_wh_writethrough; extern atomic_t netfs_n_wh_dio_write; @@ -147,6 +149,8 @@ extern atomic_t netfs_n_wh_upload_failed; extern atomic_t netfs_n_wh_write; extern atomic_t netfs_n_wh_write_done; extern atomic_t netfs_n_wh_write_failed; +extern atomic_t netfs_n_wh_retry_write_req; +extern atomic_t netfs_n_wh_retry_write_subreq; extern atomic_t netfs_n_wb_lock_skip; extern atomic_t netfs_n_wb_lock_wait; extern atomic_t netfs_n_folioq; diff --git a/fs/netfs/read_retry.c 
b/fs/netfs/read_retry.c index 8316c4533a51..0f294b26e08c 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -14,6 +14,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, { __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_rh_retry_read_subreq); subreq->rreq->netfs_ops->issue_read(subreq); } @@ -260,6 +261,8 @@ void netfs_retry_reads(struct netfs_io_request *rreq) struct netfs_io_stream *stream = &rreq->io_streams[0]; DEFINE_WAIT(myself); + netfs_stat(&netfs_n_rh_retry_read_req); + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); /* Wait for all outstanding I/O to quiesce before performing retries as diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index f1af344266cc..ab6b916addc4 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -29,6 +29,8 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_rh_retry_read_req; +atomic_t netfs_n_rh_retry_read_subreq; atomic_t netfs_n_wh_buffered_write; atomic_t netfs_n_wh_writethrough; atomic_t netfs_n_wh_dio_write; @@ -41,6 +43,8 @@ atomic_t netfs_n_wh_upload_failed; atomic_t netfs_n_wh_write; atomic_t netfs_n_wh_write_done; atomic_t netfs_n_wh_write_failed; +atomic_t netfs_n_wh_retry_write_req; +atomic_t netfs_n_wh_retry_write_subreq; atomic_t netfs_n_wb_lock_skip; atomic_t netfs_n_wb_lock_wait; atomic_t netfs_n_folioq; @@ -81,6 +85,11 @@ int netfs_stats_show(struct seq_file *m, void *v) atomic_read(&netfs_n_wh_write), atomic_read(&netfs_n_wh_write_done), atomic_read(&netfs_n_wh_write_failed)); + seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n", + atomic_read(&netfs_n_rh_retry_read_req), + atomic_read(&netfs_n_rh_retry_read_subreq), + atomic_read(&netfs_n_wh_retry_write_req), + atomic_read(&netfs_n_wh_retry_write_subreq)); seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", atomic_read(&netfs_n_rh_rreq), 
atomic_read(&netfs_n_rh_sreq), diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 69727411683e..77279fc5b5a7 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -253,6 +253,7 @@ void netfs_reissue_write(struct netfs_io_stream *stream, subreq->retry_count++; __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); netfs_do_issue_write(stream, subreq); } diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index c841a851dd73..545d33079a77 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -203,6 +203,8 @@ void netfs_retry_writes(struct netfs_io_request *wreq) struct netfs_io_stream *stream; int s; + netfs_stat(&netfs_n_wh_retry_write_req); + /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ From 5de0219a9bb9dacc4ce6e8f2745540dcce786983 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:24:01 +0000 Subject: [PATCH 174/310] netfs: Fix setting NETFS_RREQ_ALL_QUEUED to be after all subreqs queued Due to the code that queues a subreq on the active subrequest list getting moved to netfs_issue_read(), the NETFS_RREQ_ALL_QUEUED flag may now get set before the list-add actually happens. This is not a problem if the collection worker happens after the list-add, but it's a race - and, for 9P, where the read from the server is synchronous and done in the submitting thread, this is a lot more likely. The result is that, if the timing is wrong, a ref gets leaked because the collector thinks that all the subreqs have completed (because it can't see the last one yet) and clears NETFS_RREQ_IN_PROGRESS - at which point, the collection worker no longer goes into the collector. This can be provoked with AFS by injecting an msleep() right before the final subreq is queued. 
Fix this by splitting the queuing part out of netfs_issue_read() into a new function, netfs_queue_read(), and calling it separately. The setting of NETFS_RREQ_ALL_QUEUED is then done by netfs_queue_read() whilst it is holding the spinlock (that's probably unnecessary, but shouldn't hurt). It might be better to set a flag on the final subreq, but this could be a problem if an error occurs and we can't queue it. Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Reported-by: Ihor Solodrai Closes: https://lore.kernel.org/r/a7x33d4dnMdGTtRivptq6S1i8btK70SNBP2XyX_xwDAhLvgQoPox6FVBOkifq4eBinfFfbZlIkMZBe3QarlWTxoEtHZwJCZbNKtaqrR7PvI=@pm.me/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-4-dhowells@redhat.com Tested-by: Ihor Solodrai cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Marc Dionne cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: v9fs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index f761d44b3436..0d1b6d35ff3b 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, netfs_cache_read_terminated, subreq); } -static void netfs_issue_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) +static void netfs_queue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq, + bool last_subreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; @@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq, } } + if (last_subreq) { + smp_wmb(); /* Write lists before ALL_QUEUED. 
*/ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } + spin_unlock(&rreq->lock); +} +static void netfs_issue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ switch (subreq->source) { case NETFS_DOWNLOAD_FROM_SERVER: rreq->netfs_ops->issue_read(subreq); @@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) } size -= slice; start += slice; - if (size <= 0) { - smp_wmb(); /* Write lists before ALL_QUEUED. */ - set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - } + netfs_queue_read(rreq, subreq, size <= 0); netfs_issue_read(rreq, subreq); cond_resched(); } while (size > 0); From 1f47ed294a2bd577d5ae43e6e28e1c9a3be4a833 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Feb 2025 08:18:46 -0700 Subject: [PATCH 175/310] block: cleanup and fix batch completion adding conditions The conditions for whether or not a request is allowed adding to a completion batch are a bit hard to read, and they also have a few issues. One is that ioerror may indeed be a random value on passthrough, and it's being checked unconditionally of whether or not the given request is a passthrough request or not. Rewrite the conditions to be separate for easier reading, and only check ioerror for non-passthrough requests. This fixes an issue with bio unmapping on passthrough, where it fails getting added to a batch. This both leads to suboptimal performance, and may trigger a potential schedule-under-atomic condition for polled passthrough IO. 
Fixes: f794f3351f26 ("block: add support for blk_mq_end_request_batch()") Link: https://lore.kernel.org/r/20575f0a-656e-4bb3-9d82-dec6c7e3a35c@kernel.dk Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9ebb53f031cd..fa2a76cc2f73 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -861,12 +861,22 @@ static inline bool blk_mq_add_to_batch(struct request *req, void (*complete)(struct io_comp_batch *)) { /* - * blk_mq_end_request_batch() can't end request allocated from - * sched tags + * Check various conditions that exclude batch processing: + * 1) No batch container + * 2) Has scheduler data attached + * 3) Not a passthrough request and end_io set + * 4) Not a passthrough request and an ioerror */ - if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || - (req->end_io && !blk_rq_is_passthrough(req))) + if (!iob) return false; + if (req->rq_flags & RQF_SCHED_TAGS) + return false; + if (!blk_rq_is_passthrough(req)) { + if (req->end_io) + return false; + if (ioerror < 0) + return false; + } if (!iob->complete) iob->complete = complete; From 6fe9116dd6bebee570406ec3f00a50388a62ccb3 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 4 Feb 2025 13:52:36 +0200 Subject: [PATCH 176/310] MAINTAINERS: Use my kernel.org address for I2C ACPI work Switch to use my kernel.org address for I2C ACPI work. 
Signed-off-by: Mika Westerberg Signed-off-by: Wolfram Sang --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 25c86f47353d..1d200adbdcea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10822,7 +10822,7 @@ S: Odd Fixes F: drivers/tty/hvc/ I2C ACPI SUPPORT -M: Mika Westerberg +M: Mika Westerberg L: linux-i2c@vger.kernel.org L: linux-acpi@vger.kernel.org S: Maintained From 35fa2d88ca9481e5caf533d58b99ca259c63b2fe Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 10 Feb 2025 13:30:25 +0100 Subject: [PATCH 177/310] driver core: add a faux bus for use when a simple device/bus is needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many drivers abuse the platform driver/bus system as it provides a simple way to create and bind a device to a driver-specific set of probe/release functions. Instead of doing that, and wasting all of the memory associated with a platform device, here is a "faux" bus that can be used instead. Reviewed-by: Jonathan Cameron Reviewed-by: Danilo Krummrich Reviewed-by: Lyude Paul Reviewed-by: Thomas Weißschuh Reviewed-by: Zijun Hu Link: https://lore.kernel.org/r/2025021026-atlantic-gibberish-3f0c@gregkh Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/infrastructure.rst | 6 + drivers/base/Makefile | 2 +- drivers/base/base.h | 1 + drivers/base/faux.c | 232 ++++++++++++++++++++ drivers/base/init.c | 1 + include/linux/device/faux.h | 69 ++++++ 6 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 drivers/base/faux.c create mode 100644 include/linux/device/faux.h diff --git a/Documentation/driver-api/infrastructure.rst b/Documentation/driver-api/infrastructure.rst index 3d52dfdfa9fd..35e36fee4238 100644 --- a/Documentation/driver-api/infrastructure.rst +++ b/Documentation/driver-api/infrastructure.rst @@ -41,6 +41,12 @@ Device Drivers Base .. kernel-doc:: drivers/base/class.c :export: +.. 
kernel-doc:: include/linux/device/faux.h + :internal: + +.. kernel-doc:: drivers/base/faux.c + :export: + .. kernel-doc:: drivers/base/node.c :internal: diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 7fb21768ca36..8074a10183dc 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -6,7 +6,7 @@ obj-y := component.o core.o bus.o dd.o syscore.o \ cpu.o firmware.o init.o map.o devres.o \ attribute_container.o transport_class.o \ topology.o container.o property.o cacheinfo.o \ - swnode.o + swnode.o faux.o obj-$(CONFIG_AUXILIARY_BUS) += auxiliary.o obj-$(CONFIG_DEVTMPFS) += devtmpfs.o obj-y += power/ diff --git a/drivers/base/base.h b/drivers/base/base.h index 8cf04a557bdb..0042e4774b0c 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -137,6 +137,7 @@ int hypervisor_init(void); static inline int hypervisor_init(void) { return 0; } #endif int platform_bus_init(void); +int faux_bus_init(void); void cpu_dev_init(void); void container_dev_init(void); #ifdef CONFIG_AUXILIARY_BUS diff --git a/drivers/base/faux.c b/drivers/base/faux.c new file mode 100644 index 000000000000..531e9d789ee0 --- /dev/null +++ b/drivers/base/faux.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025 Greg Kroah-Hartman + * Copyright (c) 2025 The Linux Foundation + * + * A "simple" faux bus that allows devices to be created and added + * automatically to it. This is to be used whenever you need to create a + * device that is not associated with any "real" system resources, and do + * not want to have to deal with a bus/driver binding logic. It is + * intended to be very simple, with only a create and a destroy function + * available. + */ +#include +#include +#include +#include +#include +#include +#include "base.h" + +/* + * Internal wrapper structure so we can hold a pointer to the + * faux_device_ops for this device. 
+ */ +struct faux_object { + struct faux_device faux_dev; + const struct faux_device_ops *faux_ops; +}; +#define to_faux_object(dev) container_of_const(dev, struct faux_object, faux_dev.dev) + +static struct device faux_bus_root = { + .init_name = "faux", +}; + +static int faux_match(struct device *dev, const struct device_driver *drv) +{ + /* Match always succeeds, we only have one driver */ + return 1; +} + +static int faux_probe(struct device *dev) +{ + struct faux_object *faux_obj = to_faux_object(dev); + struct faux_device *faux_dev = &faux_obj->faux_dev; + const struct faux_device_ops *faux_ops = faux_obj->faux_ops; + int ret = 0; + + if (faux_ops && faux_ops->probe) + ret = faux_ops->probe(faux_dev); + + return ret; +} + +static void faux_remove(struct device *dev) +{ + struct faux_object *faux_obj = to_faux_object(dev); + struct faux_device *faux_dev = &faux_obj->faux_dev; + const struct faux_device_ops *faux_ops = faux_obj->faux_ops; + + if (faux_ops && faux_ops->remove) + faux_ops->remove(faux_dev); +} + +static const struct bus_type faux_bus_type = { + .name = "faux", + .match = faux_match, + .probe = faux_probe, + .remove = faux_remove, +}; + +static struct device_driver faux_driver = { + .name = "faux_driver", + .bus = &faux_bus_type, + .probe_type = PROBE_FORCE_SYNCHRONOUS, +}; + +static void faux_device_release(struct device *dev) +{ + struct faux_object *faux_obj = to_faux_object(dev); + + kfree(faux_obj); +} + +/** + * faux_device_create_with_groups - Create and register with the driver + * core a faux device and populate the device with an initial + * set of sysfs attributes. + * @name: The name of the device we are adding, must be unique for + * all faux devices. + * @parent: Pointer to a potential parent struct device. If set to + * NULL, the device will be created in the "root" of the faux + * device tree in sysfs. + * @faux_ops: struct faux_device_ops that the new device will call back + * into, can be NULL. 
+ * @groups: The set of sysfs attributes that will be created for this + * device when it is registered with the driver core. + * + * Create a new faux device and register it in the driver core properly. + * If present, callbacks in @faux_ops will be called with the device that + * for the caller to do something with at the proper time given the + * device's lifecycle. + * + * Note, when this function is called, the functions specified in struct + * faux_ops can be called before the function returns, so be prepared for + * everything to be properly initialized before that point in time. + * + * Return: + * * NULL if an error happened with creating the device + * * pointer to a valid struct faux_device that is registered with sysfs + */ +struct faux_device *faux_device_create_with_groups(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops, + const struct attribute_group **groups) +{ + struct faux_object *faux_obj; + struct faux_device *faux_dev; + struct device *dev; + int ret; + + faux_obj = kzalloc(sizeof(*faux_obj), GFP_KERNEL); + if (!faux_obj) + return NULL; + + /* Save off the callbacks so we can use them in the future */ + faux_obj->faux_ops = faux_ops; + + /* Initialize the device portion and register it with the driver core */ + faux_dev = &faux_obj->faux_dev; + dev = &faux_dev->dev; + + device_initialize(dev); + dev->release = faux_device_release; + if (parent) + dev->parent = parent; + else + dev->parent = &faux_bus_root; + dev->bus = &faux_bus_type; + dev->groups = groups; + dev_set_name(dev, "%s", name); + + ret = device_add(dev); + if (ret) { + pr_err("%s: device_add for faux device '%s' failed with %d\n", + __func__, name, ret); + put_device(dev); + return NULL; + } + + return faux_dev; +} +EXPORT_SYMBOL_GPL(faux_device_create_with_groups); + +/** + * faux_device_create - create and register with the driver core a faux device + * @name: The name of the device we are adding, must be unique for all + * faux devices. 
+ * @parent: Pointer to a potential parent struct device. If set to + * NULL, the device will be created in the "root" of the faux + * device tree in sysfs. + * @faux_ops: struct faux_device_ops that the new device will call back + * into, can be NULL. + * + * Create a new faux device and register it in the driver core properly. + * If present, callbacks in @faux_ops will be called with the device that + * for the caller to do something with at the proper time given the + * device's lifecycle. + * + * Note, when this function is called, the functions specified in struct + * faux_ops can be called before the function returns, so be prepared for + * everything to be properly initialized before that point in time. + * + * Return: + * * NULL if an error happened with creating the device + * * pointer to a valid struct faux_device that is registered with sysfs + */ +struct faux_device *faux_device_create(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops) +{ + return faux_device_create_with_groups(name, parent, faux_ops, NULL); +} +EXPORT_SYMBOL_GPL(faux_device_create); + +/** + * faux_device_destroy - destroy a faux device + * @faux_dev: faux device to destroy + * + * Unregisters and cleans up a device that was created with a call to + * faux_device_create() + */ +void faux_device_destroy(struct faux_device *faux_dev) +{ + struct device *dev = &faux_dev->dev; + + if (!faux_dev) + return; + + device_del(dev); + + /* The final put_device() will clean up the memory we allocated for this device. 
*/ + put_device(dev); +} +EXPORT_SYMBOL_GPL(faux_device_destroy); + +int __init faux_bus_init(void) +{ + int ret; + + ret = device_register(&faux_bus_root); + if (ret) { + put_device(&faux_bus_root); + return ret; + } + + ret = bus_register(&faux_bus_type); + if (ret) + goto error_bus; + + ret = driver_register(&faux_driver); + if (ret) + goto error_driver; + + return ret; + +error_driver: + bus_unregister(&faux_bus_type); + +error_bus: + device_unregister(&faux_bus_root); + return ret; +} diff --git a/drivers/base/init.c b/drivers/base/init.c index c4954835128c..9d2b06d65dfc 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -32,6 +32,7 @@ void __init driver_init(void) /* These are also core pieces, but must come after the * core core pieces. */ + faux_bus_init(); of_core_init(); platform_bus_init(); auxiliary_bus_init(); diff --git a/include/linux/device/faux.h b/include/linux/device/faux.h new file mode 100644 index 000000000000..9f43c0e46aa4 --- /dev/null +++ b/include/linux/device/faux.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Greg Kroah-Hartman + * Copyright (c) 2025 The Linux Foundation + * + * A "simple" faux bus that allows devices to be created and added + * automatically to it. This is to be used whenever you need to create a + * device that is not associated with any "real" system resources, and do + * not want to have to deal with a bus/driver binding logic. It is + * intended to be very simple, with only a create and a destroy function + * available. + */ +#ifndef _FAUX_DEVICE_H_ +#define _FAUX_DEVICE_H_ + +#include +#include + +/** + * struct faux_device - a "faux" device + * @dev: internal struct device of the object + * + * A simple faux device that can be created/destroyed. To be used when a + * driver only needs to have a device to "hang" something off. This can be + * used for downloading firmware or other basic tasks. 
Use this instead of + * a struct platform_device if the device has no resources assigned to + * it at all. + */ +struct faux_device { + struct device dev; +}; +#define to_faux_device(x) container_of_const((x), struct faux_device, dev) + +/** + * struct faux_device_ops - a set of callbacks for a struct faux_device + * @probe: called when a faux device is probed by the driver core + * before the device is fully bound to the internal faux bus + * code. If probe succeeds, return 0, otherwise return a + * negative error number to stop the probe sequence from + * succeeding. + * @remove: called when a faux device is removed from the system + * + * Both @probe and @remove are optional, if not needed, set to NULL. + */ +struct faux_device_ops { + int (*probe)(struct faux_device *faux_dev); + void (*remove)(struct faux_device *faux_dev); +}; + +struct faux_device *faux_device_create(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops); +struct faux_device *faux_device_create_with_groups(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops, + const struct attribute_group **groups); +void faux_device_destroy(struct faux_device *faux_dev); + +static inline void *faux_device_get_drvdata(const struct faux_device *faux_dev) +{ + return dev_get_drvdata(&faux_dev->dev); +} + +static inline void faux_device_set_drvdata(struct faux_device *faux_dev, void *data) +{ + dev_set_drvdata(&faux_dev->dev, data); +} + +#endif /* _FAUX_DEVICE_H_ */ From 78418f300d3999f1cf8a9ac71065bf2eca61f4dd Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Mon, 10 Feb 2025 13:30:26 +0100 Subject: [PATCH 178/310] rust/kernel: Add faux device bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduces a module for working with faux devices in rust, along with adding sample code to show how the API is used. 
Unlike other types of devices, we don't provide any hooks for device probe/removal - since these are optional for the faux API and are unnecessary in rust. Signed-off-by: Lyude Paul Cc: Maíra Canal Cc: Danilo Krummrich Cc: Miguel Ojeda Acked-by: Danilo Krummrich Link: https://lore.kernel.org/r/2025021026-exert-accent-b4c6@gregkh Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 2 + rust/bindings/bindings_helper.h | 1 + rust/kernel/faux.rs | 67 ++++++++++++++++++++++++++++++++ rust/kernel/lib.rs | 1 + samples/rust/Kconfig | 10 +++++ samples/rust/Makefile | 1 + samples/rust/rust_driver_faux.rs | 29 ++++++++++++++ 7 files changed, 111 insertions(+) create mode 100644 rust/kernel/faux.rs create mode 100644 samples/rust/rust_driver_faux.rs diff --git a/MAINTAINERS b/MAINTAINERS index 25c86f47353d..19ea159b2309 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7116,8 +7116,10 @@ F: rust/kernel/device.rs F: rust/kernel/device_id.rs F: rust/kernel/devres.rs F: rust/kernel/driver.rs +F: rust/kernel/faux.rs F: rust/kernel/platform.rs F: samples/rust/rust_driver_platform.rs +F: samples/rust/rust_driver_faux.rs DRIVERS FOR OMAP ADAPTIVE VOLTAGE SCALING (AVS) M: Nishanth Menon diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 55354e4dec14..f46cf3bb7069 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/rust/kernel/faux.rs b/rust/kernel/faux.rs new file mode 100644 index 000000000000..5acc0c02d451 --- /dev/null +++ b/rust/kernel/faux.rs @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only + +//! Abstractions for the faux bus. +//! +//! This module provides bindings for working with faux devices in kernel modules. +//! +//! 
C header: [`include/linux/device/faux.h`] + +use crate::{bindings, device, error::code::*, prelude::*}; +use core::ptr::{addr_of_mut, null, null_mut, NonNull}; + +/// The registration of a faux device. +/// +/// This type represents the registration of a [`struct faux_device`]. When an instance of this type +/// is dropped, its respective faux device will be unregistered from the system. +/// +/// # Invariants +/// +/// `self.0` always holds a valid pointer to an initialized and registered [`struct faux_device`]. +/// +/// [`struct faux_device`]: srctree/include/linux/device/faux.h +#[repr(transparent)] +pub struct Registration(NonNull<bindings::faux_device>); + +impl Registration { + /// Create and register a new faux device with the given name. + pub fn new(name: &CStr) -> Result<Self> { + // SAFETY: + // - `name` is copied by this function into its own storage + // - `faux_ops` is safe to leave NULL according to the C API + let dev = unsafe { bindings::faux_device_create(name.as_char_ptr(), null_mut(), null()) }; + + // The above function will return either a valid device, or NULL on failure + // INVARIANT: The device will remain registered until faux_device_destroy() is called, which + // happens in our Drop implementation. + Ok(Self(NonNull::new(dev).ok_or(ENODEV)?)) + } + + fn as_raw(&self) -> *mut bindings::faux_device { + self.0.as_ptr() + } +} + +impl AsRef<device::Device> for Registration { + fn as_ref(&self) -> &device::Device { + // SAFETY: The underlying `device` in `faux_device` is guaranteed by the C API to be + // a valid initialized `device`. + unsafe { device::Device::as_ref(addr_of_mut!((*self.as_raw()).dev)) } + } +} + +impl Drop for Registration { + fn drop(&mut self) { + // SAFETY: `self.0` is a valid registered faux_device via our type invariants.
+ unsafe { bindings::faux_device_destroy(self.as_raw()) } + } +} + +// SAFETY: The faux device API is thread-safe as guaranteed by the device core, as long as +// faux_device_destroy() is guaranteed to only be called once - which is guaranteed by our type not +// having Copy/Clone. +unsafe impl Send for Registration {} + +// SAFETY: The faux device API is thread-safe as guaranteed by the device core, as long as +// faux_device_destroy() is guaranteed to only be called once - which is guaranteed by our type not +// having Copy/Clone. +unsafe impl Sync for Registration {} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 496ed32b0911..398242f92a96 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -46,6 +46,7 @@ pub mod device_id; pub mod devres; pub mod driver; pub mod error; +pub mod faux; #[cfg(CONFIG_RUST_FW_LOADER_ABSTRACTIONS)] pub mod firmware; pub mod fs; diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig index 918dbead2c0b..3b6eae84b297 100644 --- a/samples/rust/Kconfig +++ b/samples/rust/Kconfig @@ -61,6 +61,16 @@ config SAMPLE_RUST_DRIVER_PLATFORM If unsure, say N. +config SAMPLE_RUST_DRIVER_FAUX + tristate "Faux Driver" + help + This option builds the Rust Faux driver sample. + + To compile this as a module, choose M here: + the module will be called rust_driver_faux. + + If unsure, say N. 
+ config SAMPLE_RUST_HOSTPROGS bool "Host programs" help diff --git a/samples/rust/Makefile b/samples/rust/Makefile index 5a8ab0df0567..0dbc6d90f1ef 100644 --- a/samples/rust/Makefile +++ b/samples/rust/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_SAMPLE_RUST_MISC_DEVICE) += rust_misc_device.o obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PCI) += rust_driver_pci.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PLATFORM) += rust_driver_platform.o +obj-$(CONFIG_SAMPLE_RUST_DRIVER_FAUX) += rust_driver_faux.o rust_print-y := rust_print_main.o rust_print_events.o diff --git a/samples/rust/rust_driver_faux.rs b/samples/rust/rust_driver_faux.rs new file mode 100644 index 000000000000..048c6cb98b29 --- /dev/null +++ b/samples/rust/rust_driver_faux.rs @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only + +//! Rust faux device sample. + +use kernel::{c_str, faux, prelude::*, Module}; + +module! { + type: SampleModule, + name: "rust_faux_driver", + author: "Lyude Paul", + description: "Rust faux device sample", + license: "GPL", +} + +struct SampleModule { + _reg: faux::Registration, +} + +impl Module for SampleModule { + fn init(_module: &'static ThisModule) -> Result<Self> { + pr_info!("Initialising Rust Faux Device Sample\n"); + + let reg = faux::Registration::new(c_str!("rust-faux-sample-device"))?; + + dev_info!(reg.as_ref(), "Hello from faux device!\n"); + + Ok(Self { _reg: reg }) + } +} From 0760d62dad5d3e76c2aa175d9bc42b5f664967c2 Mon Sep 17 00:00:00 2001 From: Devaansh Kumar Date: Tue, 11 Feb 2025 22:48:48 +0530 Subject: [PATCH 179/310] sched_ext: selftests: Fix grammar in tests description Fixed grammar for a few tests of sched_ext.
Signed-off-by: Devaansh Kumar Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/init_enable_count.c | 2 +- tools/testing/selftests/sched_ext/maybe_null.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index 0f3eddc7a17a..eddf9e0e26e7 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -150,7 +150,7 @@ static enum scx_test_status run(void *ctx) struct scx_test init_enable_count = { .name = "init_enable_count", - .description = "Verify we do the correct amount of counting of init, " + .description = "Verify we correctly count the occurrences of init, " "enable, etc callbacks.", .run = run, }; diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c index 31cfafb0cf65..aacf0c58ca4f 100644 --- a/tools/testing/selftests/sched_ext/maybe_null.c +++ b/tools/testing/selftests/sched_ext/maybe_null.c @@ -43,7 +43,7 @@ static enum scx_test_status run(void *ctx) struct scx_test maybe_null = { .name = "maybe_null", - .description = "Verify if PTR_MAYBE_NULL work for .dispatch", + .description = "Verify if PTR_MAYBE_NULL works for .dispatch", .run = run, }; REGISTER_SCX_TEST(&maybe_null) From 2e2006c91c842c551521434466f9b4324719c9a7 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Wed, 12 Feb 2025 15:19:36 +0800 Subject: [PATCH 180/310] sched_ext: Fix the incorrect bpf_list kfunc API in common.bpf.h. Now BPF only supports bpf_list_push_{front,back}_impl kfunc, not bpf_list_push_{front,back}. This patch fixes this issue.
Without this patch, if we use bpf_list kfunc in scx, the BPF verifier would complain: libbpf: extern (func ksym) 'bpf_list_push_back': not found in kernel or module BTFs libbpf: failed to load object 'scx_foo' libbpf: failed to load BPF skeleton 'scx_foo': -EINVAL With this patch, the bpf list kfunc will work as expected. Signed-off-by: Chuyi Zhou Fixes: 2a52ca7c98960 ("sched_ext: Add scx_simple and scx_example_qmap example schedulers") Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index d72b60a0c582..7849405614b1 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -270,8 +270,16 @@ void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; #define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) #define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) -void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +int bpf_list_push_front_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; +#define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0) + +int bpf_list_push_back_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; +#define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0) + struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, From f5717c93a1b999970f3a64d771a1a9ee68cc37d0 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Wed, 12 Feb 2025 21:09:35 +0800 Subject: [PATCH 181/310] 
sched_ext: Use SCX_CALL_OP_TASK in task_tick_scx Now when we use scx_bpf_task_cgroup() in ops.tick() to get the cgroup of the current task, the following error will occur: scx_foo[3795244] triggered exit kind 1024: runtime error (called on a task not being operated on) The reason is that we are using SCX_CALL_OP() instead of SCX_CALL_OP_TASK() when calling ops.tick(), which triggers the error during the subsequent scx_kf_allowed_on_arg_tasks() check. SCX_CALL_OP_TASK() was first introduced in commit 36454023f50b ("sched_ext: Track tasks that are subjects of the in-flight SCX operation") to ensure task's rq lock is held when accessing task's sched_group. Since ops.tick() is marked as SCX_KF_TERMINAL and task_tick_scx() is protected by the rq lock, we can use SCX_CALL_OP_TASK() to avoid the above issue. Similarly, the same changes should be made for ops.disable() and ops.exit_task(), as they are also protected by task_rq_lock() and it's safe to access the task's task_group. Fixes: 36454023f50b ("sched_ext: Track tasks that are subjects of the in-flight SCX operation") Signed-off-by: Chuyi Zhou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 54edd0e2132a..5a81d9a1e31f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3899,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP(SCX_KF_REST, tick, curr); + SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); } if (!curr->scx.slice) @@ -4046,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p) WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); if (SCX_HAS_OP(disable)) - SCX_CALL_OP(SCX_KF_REST, disable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); scx_set_task_state(p, SCX_TASK_READY); } @@ -4075,7 +4075,7 @@ static void scx_ops_exit_task(struct 
task_struct *p) } if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); scx_set_task_state(p, SCX_TASK_NONE); } From d6211ebbdaa541af197b50b8dd8f22642ce0b87f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Feb 2025 08:24:23 -0700 Subject: [PATCH 182/310] io_uring/uring_cmd: unconditionally copy SQEs at prep time This isn't generally necessary, but conditions have been observed where SQE data is accessed from the original SQE after prep has been done and outside of the initial issue. Opcode prep handlers must ensure that any SQE related data is stable beyond the prep phase, but uring_cmd is a bit special in how it handles the SQE which makes it susceptible to reading stale data. If the application has reused the SQE before the original completes, then that can lead to data corruption. Down the line we can relax this again once uring_cmd has been sanitized a bit, and avoid unnecessarily copying the SQE. Fixes: 5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") Reported-by: Caleb Sander Mateos Reviewed-by: Caleb Sander Mateos Reviewed-by: Li Zetao Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8af7780407b7..e6701b7aa147 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -165,15 +165,6 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -static void io_uring_cmd_cache_sqes(struct io_kiocb *req) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_uring_cmd_data *cache = req->async_data; - - memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = cache->sqes; -} - static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -185,10 +176,15 @@ 
static int io_uring_cmd_prep_setup(struct io_kiocb *req, return -ENOMEM; cache->op_data = NULL; - ioucmd->sqe = sqe; - /* defer memcpy until we need it */ - if (unlikely(req->flags & REQ_F_FORCE_ASYNC)) - io_uring_cmd_cache_sqes(req); + /* + * Unconditionally cache the SQE for now - this is only needed for + * requests that go async, but prep handlers must ensure that any + * sqe data is stable beyond prep. Since uring_cmd is special in + * that it doesn't read in per-op data, play it safe and ensure that + * any SQE data is stable beyond prep. This can later get relaxed. + */ + memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = cache->sqes; return 0; } @@ -251,16 +247,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - struct io_uring_cmd_data *cache = req->async_data; - - if (ioucmd->sqe != cache->sqes) - io_uring_cmd_cache_sqes(req); - return -EAGAIN; - } else if (ret == -EIOCBQUEUED) { - return -EIOCBQUEUED; - } - + if (ret == -EAGAIN || ret == -EIOCBQUEUED) + return ret; if (ret < 0) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); From 7b4aebeecbbd5b5fe73e35fad3f62ed21aa7ef44 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 17:56:46 +0200 Subject: [PATCH 183/310] gpiolib: Fix crash on error in gpiochip_get_ngpios() The gpiochip_get_ngpios() uses chip_*() macros to print messages. However these macros rely on gpiodev to be initialised and set, which is not the case when called via bgpio_init(). In such a case the printing messages will crash on NULL pointer dereference. Replace chip_*() macros by the respective dev_*() ones to avoid such crash. 
Fixes: 55b2395e4e92 ("gpio: mmio: handle "ngpios" properly in bgpio_init()") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250213155646.2882324-1-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 679ed764cb14..ca2f58a2cd45 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -904,13 +904,13 @@ int gpiochip_get_ngpios(struct gpio_chip *gc, struct device *dev) } if (gc->ngpio == 0) { - chip_err(gc, "tried to insert a GPIO chip with zero lines\n"); + dev_err(dev, "tried to insert a GPIO chip with zero lines\n"); return -EINVAL; } if (gc->ngpio > FASTPATH_NGPIO) - chip_warn(gc, "line cnt %u is greater than fast path cnt %u\n", - gc->ngpio, FASTPATH_NGPIO); + dev_warn(dev, "line cnt %u is greater than fast path cnt %u\n", + gc->ngpio, FASTPATH_NGPIO); return 0; } From fbc7e61195e23f744814e78524b73b59faa54ab4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:19 +0000 Subject: [PATCH 184/310] KVM: arm64: Unconditionally save+flush host FPSIMD/SVE/SME state There are several problems with the way hyp code lazily saves the host's FPSIMD/SVE state, including: * Host SVE being discarded unexpectedly due to inconsistent configuration of TIF_SVE and CPACR_ELx.ZEN. This has been seen to result in QEMU crashes where SVE is used by memmove(), as reported by Eric Auger: https://issues.redhat.com/browse/RHEL-68997 * Host SVE state is discarded *after* modification by ptrace, which was an unintentional ptrace ABI change introduced with lazy discarding of SVE state. * The host FPMR value can be discarded when running a non-protected VM, where FPMR support is not exposed to a VM, and that VM uses FPSIMD/SVE. In these cases the hyp code does not save the host's FPMR before unbinding the host's FPSIMD/SVE/SME state, leaving a stale value in memory. 
Avoid these by eagerly saving and "flushing" the host's FPSIMD/SVE/SME state when loading a vCPU such that KVM does not need to save any of the host's FPSIMD/SVE/SME state. For clarity, fpsimd_kvm_prepare() is removed and the necessary call to fpsimd_save_and_flush_cpu_state() is placed in kvm_arch_vcpu_load_fp(). As 'fpsimd_state' and 'fpmr_ptr' should not be used, they are set to NULL; all uses of these will be removed in subsequent patches. Historical problems go back at least as far as v5.17, e.g. erroneous assumptions about TIF_SVE being clear in commit: 8383741ab2e773a9 ("KVM: arm64: Get rid of host SVE tracking/saving") ... and so this eager save+flush probably needs to be backported to ALL stable trees. Fixes: 93ae6b01bafee8fa ("KVM: arm64: Discard any SVE state when entering KVM guests") Fixes: 8c845e2731041f0f ("arm64/sve: Leave SVE enabled on syscall if we don't context switch") Fixes: ef3be86021c3bdf3 ("KVM: arm64: Add save/restore support for FPMR") Reported-by: Eric Auger Reported-by: Wilco Dijkstra Reviewed-by: Mark Brown Tested-by: Mark Brown Tested-by: Eric Auger Acked-by: Will Deacon Cc: Catalin Marinas Cc: Florian Weimer Cc: Fuad Tabba Cc: Jeremy Linton Cc: Marc Zyngier Cc: Oliver Upton Cc: Paolo Bonzini Signed-off-by: Mark Rutland Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-2-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kernel/fpsimd.c | 25 ------------------------- arch/arm64/kvm/fpsimd.c | 35 ++++++++++------------------------- 2 files changed, 10 insertions(+), 50 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 8c4c1a2186cc..ec68d520b7ca 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1694,31 +1694,6 @@ void fpsimd_signal_preserve_current_state(void) sve_to_fpsimd(current); } -/* - * Called by KVM when entering the guest. 
- */ -void fpsimd_kvm_prepare(void) -{ - if (!system_supports_sve()) - return; - - /* - * KVM does not save host SVE state since we can only enter - * the guest from a syscall so the ABI means that only the - * non-saved SVE state needs to be saved. If we have left - * SVE enabled for performance reasons then update the task - * state to be FPSIMD only. - */ - get_cpu_fpsimd_context(); - - if (test_and_clear_thread_flag(TIF_SVE)) { - sve_to_fpsimd(current); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - put_cpu_fpsimd_context(); -} - /* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 4d3d1a2eb157..ceeb0a4893aa 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -54,16 +54,18 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (!system_supports_fpsimd()) return; - fpsimd_kvm_prepare(); - /* - * We will check TIF_FOREIGN_FPSTATE just before entering the - * guest in kvm_arch_vcpu_ctxflush_fp() and override this to - * FP_STATE_FREE if the flag set. + * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such + * that the host kernel is responsible for restoring this state upon + * return to userspace, and the hyp code doesn't need to save anything. + * + * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures + * that PSTATE.{SM,ZA} == {0,0}. 
*/ - *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; - *host_data_ptr(fpsimd_state) = kern_hyp_va(&current->thread.uw.fpsimd_state); - *host_data_ptr(fpmr_ptr) = kern_hyp_va(&current->thread.uw.fpmr); + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; + *host_data_ptr(fpsimd_state) = NULL; + *host_data_ptr(fpmr_ptr) = NULL; host_data_clear_flag(HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) @@ -73,23 +75,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) host_data_clear_flag(HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) host_data_set_flag(HOST_SME_ENABLED); - - /* - * If PSTATE.SM is enabled then save any pending FP - * state and disable PSTATE.SM. If we leave PSTATE.SM - * enabled and the guest does not enable SME via - * CPACR_EL1.SMEN then operations that should be valid - * may generate SME traps from EL1 to EL1 which we - * can't intercept and which would confuse the guest. - * - * Do the same for PSTATE.ZA in the case where there - * is state in the registers which has not already - * been saved, this is very unlikely to happen. - */ - if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { - *host_data_ptr(fp_owner) = FP_STATE_FREE; - fpsimd_save_and_flush_cpu_state(); - } } /* From 8eca7f6d5100b6997df4f532090bc3f7e0203bef Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:20 +0000 Subject: [PATCH 185/310] KVM: arm64: Remove host FPSIMD saving for non-protected KVM Now that the host eagerly saves its own FPSIMD/SVE/SME state, non-protected KVM never needs to save the host FPSIMD/SVE/SME state, and the code to do this is never used. Protected KVM still needs to save/restore the host FPSIMD/SVE state to avoid leaking guest state to the host (and to avoid revealing to the host whether the guest used FPSIMD/SVE/SME), and that code needs to be retained. Remove the unused code and data structures.
To avoid the need for a stub copy of kvm_hyp_save_fpsimd_host() in the VHE hyp code, the nVHE/hVHE version is moved into the shared switch header, where it is only invoked when KVM is in protected mode. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-3-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 20 +++++------------- arch/arm64/kvm/arm.c | 8 ------- arch/arm64/kvm/fpsimd.c | 2 -- arch/arm64/kvm/hyp/include/hyp/switch.h | 25 ++++++++++++++++++++-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 +- arch/arm64/kvm/hyp/nvhe/switch.c | 28 ------------------------- arch/arm64/kvm/hyp/vhe/switch.c | 8 ------- 7 files changed, 29 insertions(+), 64 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7cfa024de4e3..f56c07568591 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -624,23 +624,13 @@ struct kvm_host_data { struct kvm_cpu_context host_ctxt; /* - * All pointers in this union are hyp VA. + * Hyp VA. * sve_state is only used in pKVM and if system_supports_sve(). */ - union { - struct user_fpsimd_state *fpsimd_state; - struct cpu_sve_state *sve_state; - }; - - union { - /* HYP VA pointer to the host storage for FPMR */ - u64 *fpmr_ptr; - /* - * Used by pKVM only, as it needs to provide storage - * for the host - */ - u64 fpmr; - }; + struct cpu_sve_state *sve_state; + + /* Used by pKVM only. 
*/ + u64 fpmr; /* Ownership of the FP regs */ enum { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 62c650c2f7b6..4b7389ad94f5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2481,14 +2481,6 @@ static void finalize_init_hyp_mode(void) per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = kern_hyp_va(sve_state); } - } else { - for_each_possible_cpu(cpu) { - struct user_fpsimd_state *fpsimd_state; - - fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs; - per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state = - kern_hyp_va(fpsimd_state); - } } } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index ceeb0a4893aa..332cb3904e68 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -64,8 +64,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) */ fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - *host_data_ptr(fpsimd_state) = NULL; - *host_data_ptr(fpmr_ptr) = NULL; host_data_clear_flag(HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f838a45665f2..c5b8a11ac4f5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -375,7 +375,28 @@ static inline void __hyp_sve_save_host(void) true); } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (system_supports_sve()) { + __hyp_sve_save_host(); + + /* Re-enable SVE traps if not supported for the guest vcpu. 
*/ + if (!vcpu_has_sve(vcpu)) + cpacr_clear_set(CPACR_EL1_ZEN, 0); + + } else { + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); + } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); +} + /* * We trap the first access to the FP/SIMD to save the host context and @@ -425,7 +446,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) isb(); /* Write out the host state if it's in the registers */ - if (host_owns_fp_regs()) + if (is_protected_kvm_enabled() && host_owns_fp_regs()) kvm_hyp_save_fpsimd_host(vcpu); /* Restore the guest state */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 6e12c070832f..1a334a38d8fd 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -83,7 +83,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (system_supports_sve()) __hyp_sve_restore_host(); else - __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); if (has_fpmr) write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6c846d033d24..7a2d18917624 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -192,34 +192,6 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_handle_pvm_sysreg(vcpu, exit_code)); } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) -{ - /* - * Non-protected kvm relies on the host restoring its sve state. - * Protected kvm restores the host's sve state as not to reveal that - * fpsimd was used by a guest nor leak upper sve bits. - */ - if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) { - __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. 
*/ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_EL1_ZEN, 0); - - } else { - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); - } - - if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { - u64 val = read_sysreg_s(SYS_FPMR); - - if (unlikely(is_protected_kvm_enabled())) - *host_data_ptr(fpmr) = val; - else - **host_data_ptr(fpmr_ptr) = val; - } -} - static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b5b9dbaf1fdd..e8a07d4bb546 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -413,14 +413,6 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) return true; } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) -{ - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); - - if (kvm_has_fpmr(vcpu->kvm)) - **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); -} - static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) { int ret = -EINVAL; From 459f059be702056d91537b99a129994aa6ccdd35 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:21 +0000 Subject: [PATCH 186/310] KVM: arm64: Remove VHE host restore of CPACR_EL1.ZEN When KVM is in VHE mode, the host kernel tries to save and restore the configuration of CPACR_EL1.ZEN (i.e. CPTR_EL2.ZEN when HCR_EL2.E2H=1) across kvm_arch_vcpu_load_fp() and kvm_arch_vcpu_put_fp(), since the configuration may be clobbered by hyp when running a vCPU. This logic is currently redundant. The VHE hyp code unconditionally configures CPTR_EL2.ZEN to 0b01 when returning to the host, permitting host kernel usage of SVE. Now that the host eagerly saves and unbinds its own FPSIMD/SVE/SME state, there's no need to save/restore the state of the EL0 SVE trap. 
The kernel can safely save/restore state without trapping, as described above, and will restore userspace state (including trap controls) before returning to userspace. Remove the redundant logic. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-4-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/fpsimd.c | 16 ---------------- 2 files changed, 17 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f56c07568591..ed6841bf21b2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -615,7 +615,6 @@ struct cpu_sve_state { struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_SPE 0 #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 -#define KVM_HOST_DATA_FLAG_HOST_SVE_ENABLED 2 #define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED 3 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 332cb3904e68..4ff0dee1a403 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -65,10 +65,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - host_data_clear_flag(HOST_SVE_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - host_data_set_flag(HOST_SVE_ENABLED); - if (system_supports_sme()) { host_data_clear_flag(HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) @@ -202,18 +198,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) * when needed. 
*/ fpsimd_save_and_flush_cpu_state(); - } else if (has_vhe() && system_supports_sve()) { - /* - * The FPSIMD/SVE state in the CPU has not been touched, and we - * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been - * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE - * for EL0. To avoid spurious traps, restore the trap state - * seen by kvm_arch_vcpu_load_fp(): - */ - if (host_data_test_flag(HOST_SVE_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); - else - sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); } local_irq_restore(flags); From 407a99c4654e8ea65393f412c421a55cac539f5b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:22 +0000 Subject: [PATCH 187/310] KVM: arm64: Remove VHE host restore of CPACR_EL1.SMEN When KVM is in VHE mode, the host kernel tries to save and restore the configuration of CPACR_EL1.SMEN (i.e. CPTR_EL2.SMEN when HCR_EL2.E2H=1) across kvm_arch_vcpu_load_fp() and kvm_arch_vcpu_put_fp(), since the configuration may be clobbered by hyp when running a vCPU. This logic has historically been broken, and is currently redundant. This logic was originally introduced in commit: 861262ab86270206 ("KVM: arm64: Handle SME host state when running guests") At the time, the VHE hyp code would reset CPTR_EL2.SMEN to 0b00 when returning to the host, trapping host access to SME state. Unfortunately, this was unsafe as the host could take a softirq before calling kvm_arch_vcpu_put_fp(), and if a softirq handler were to use kernel mode NEON the resulting attempt to save the live FPSIMD/SVE/SME state would result in a fatal trap. That issue was limited to VHE mode. For nVHE/hVHE modes, KVM always saved/restored the host kernel's CPACR_EL1 value, and configured CPTR_EL2.TSM to 0b0, ensuring that host usage of SME would not be trapped. 
The issue above was incidentally fixed by commit: 375110ab51dec5dc ("KVM: arm64: Fix resetting SME trap values on reset for (h)VHE") That commit changed the VHE hyp code to configure CPTR_EL2.SMEN to 0b01 when returning to the host, permitting host kernel usage of SME, avoiding the issue described above. At the time, this was not identified as a fix for commit 861262ab86270206. Now that the host eagerly saves and unbinds its own FPSIMD/SVE/SME state, there's no need to save/restore the state of the EL0 SME trap. The kernel can safely save/restore state without trapping, as described above, and will restore userspace state (including trap controls) before returning to userspace. Remove the redundant logic. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-5-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/fpsimd.c | 21 --------------------- 2 files changed, 22 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed6841bf21b2..c77acc990457 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -615,7 +615,6 @@ struct cpu_sve_state { struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_SPE 0 #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 -#define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED 3 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 unsigned long flags; diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 4ff0dee1a403..f64724197958 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -65,12 +65,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - if (system_supports_sme()) { - 
host_data_clear_flag(HOST_SME_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - host_data_set_flag(HOST_SME_ENABLED); - } - /* * If normal guests gain SME support, maintain this behavior for pKVM * guests, which don't support SME. @@ -141,21 +135,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); - /* - * If we have VHE then the Hyp code will reset CPACR_EL1 to - * the default value and we need to reenable SME. - */ - if (has_vhe() && system_supports_sme()) { - /* Also restore EL0 state seen on entry */ - if (host_data_test_flag(HOST_SME_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_SMEN); - else - sysreg_clear_set(CPACR_EL1, - CPACR_EL1_SMEN_EL0EN, - CPACR_EL1_SMEN_EL1EN); - isb(); - } - if (guest_owns_fp_regs()) { if (vcpu_has_sve(vcpu)) { u64 zcr = read_sysreg_el1(SYS_ZCR); From ee14db31a9c84e65f5adfd45598760d851f1d817 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:23 +0000 Subject: [PATCH 188/310] KVM: arm64: Refactor CPTR trap deactivation For historical reasons, the VHE and nVHE/hVHE implementations of __activate_cptr_traps() pair with a common implementation of __kvm_reset_cptr_el2(), which ideally would be named __deactivate_cptr_traps(). Rename __kvm_reset_cptr_el2() to __deactivate_cptr_traps(), and split it into separate VHE and nVHE/hVHE variants so that each can be paired with its corresponding implementation of __activate_cptr_traps(). At the same time, fold kvm_write_cptr_el2() into its callers. This makes it clear in-context whether a write is made to the CPACR_EL1 encoding or the CPTR_EL2 encoding, and removes the possibility of confusion as to whether kvm_write_cptr_el2() reformats the sysreg fields as cpacr_clear_set() does. 
In the nVHE/hVHE implementation of __activate_cptr_traps(), placing the sysreg writes within the if-else blocks requires that the call to __activate_traps_fpsimd32() is moved earlier, but as this was always called before writing to CPTR_EL2/CPACR_EL1, this should not result in a functional change. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-6-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 42 ---------------------------- arch/arm64/kvm/hyp/nvhe/switch.c | 35 ++++++++++++++++++++--- arch/arm64/kvm/hyp/vhe/switch.c | 12 +++++++- 3 files changed, 42 insertions(+), 47 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 47f2cf408eed..78ec1ef2cfe8 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -605,48 +605,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) __cpacr_to_cptr_set(clr, set));\ } while (0) -static __always_inline void kvm_write_cptr_el2(u64 val) -{ - if (has_vhe() || has_hvhe()) - write_sysreg(val, cpacr_el1); - else - write_sysreg(val, cptr_el2); -} - -/* Resets the value of cptr_el2 when returning to the host. 
*/ -static __always_inline void __kvm_reset_cptr_el2(struct kvm *kvm) -{ - u64 val; - - if (has_vhe()) { - val = (CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN); - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN; - } else if (has_hvhe()) { - val = CPACR_EL1_FPEN; - - if (!kvm_has_sve(kvm) || !guest_owns_fp_regs()) - val |= CPACR_EL1_ZEN; - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN; - } else { - val = CPTR_NVHE_EL2_RES1; - - if (kvm_has_sve(kvm) && guest_owns_fp_regs()) - val |= CPTR_EL2_TZ; - if (!cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; - } - - kvm_write_cptr_el2(val); -} - -#ifdef __KVM_NVHE_HYPERVISOR__ -#define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2(kern_hyp_va((v)->kvm)) -#else -#define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2((v)->kvm) -#endif - /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 7a2d18917624..5d79f63a4f86 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -39,6 +39,9 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + if (has_hvhe()) { val |= CPACR_EL1_TTA; @@ -47,6 +50,8 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) if (vcpu_has_sve(vcpu)) val |= CPACR_EL1_ZEN; } + + write_sysreg(val, cpacr_el1); } else { val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; @@ -61,12 +66,34 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); } +} - if (!guest_owns_fp_regs()) - __activate_traps_fpsimd32(vcpu); +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = kern_hyp_va(vcpu->kvm); - kvm_write_cptr_el2(val); + if (has_hvhe()) { + u64 val = CPACR_EL1_FPEN; + + if (!kvm_has_sve(kvm) 
|| !guest_owns_fp_regs()) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); + } else { + u64 val = CPTR_NVHE_EL2_RES1; + + if (kvm_has_sve(kvm) && guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); + } } static void __activate_traps(struct kvm_vcpu *vcpu) @@ -119,7 +146,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index e8a07d4bb546..4748b1947ffa 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -136,6 +136,16 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) write_sysreg(val, cpacr_el1); } +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; + + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN_EL1EN; + + write_sysreg(val, cpacr_el1); +} + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -207,7 +217,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); From 9b66195063c5a145843547b1d692bd189be85287 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:24 +0000 Subject: [PATCH 189/310] KVM: arm64: Refactor exit handlers The hyp exit handling logic is largely shared between VHE and nVHE/hVHE, with common logic in arch/arm64/kvm/hyp/include/hyp/switch.h. 
The code in the header depends on function definitions provided by arch/arm64/kvm/hyp/vhe/switch.c and arch/arm64/kvm/hyp/nvhe/switch.c when they include the header. This is an unusual header dependency, and prevents the use of arch/arm64/kvm/hyp/include/hyp/switch.h in other files as this would result in compiler warnings regarding missing definitions, e.g. | In file included from arch/arm64/kvm/hyp/nvhe/hyp-main.c:8: | ./arch/arm64/kvm/hyp/include/hyp/switch.h:733:31: warning: 'kvm_get_exit_handler_array' used but never defined | 733 | static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); | | ^~~~~~~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:735:13: warning: 'early_exit_filter' used but never defined | 735 | static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); | | ^~~~~~~~~~~~~~~~~ Refactor the logic such that the header doesn't depend on anything from the C files. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-7-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 30 +++++-------------------- arch/arm64/kvm/hyp/nvhe/switch.c | 28 +++++++++++++---------- arch/arm64/kvm/hyp/vhe/switch.c | 9 ++++---- 3 files changed, 26 insertions(+), 41 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index c5b8a11ac4f5..46df5c2eeaf5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -679,23 +679,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); - -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); - /* * Allow the hypervisor to handle the exit with an exit handler if it has one. * * Returns true if the hypervisor handled the exit, and control should go back * to the guest, or false if it hasn't. */ -static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); - exit_handler_fn fn; - - fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; - + exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; if (fn) return fn(vcpu, exit_code); @@ -725,20 +718,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code * the guest, false when we should restore the host state and return to the * main run loop. 
*/ -static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - /* - * Save PSTATE early so that we can evaluate the vcpu mode - * early on. - */ - synchronize_vcpu_pstate(vcpu, exit_code); - - /* - * Check whether we want to repaint the state one way or - * another. - */ - early_exit_filter(vcpu, exit_code); - if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); @@ -768,7 +750,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) goto exit; /* Check if there's an exit handler and allow it to handle the exit. */ - if (kvm_hyp_handle_exit(vcpu, exit_code)) + if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) goto guest; exit: /* Return to the host kernel and handle the exit */ diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 5d79f63a4f86..69d7d3b4294a 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -250,19 +250,21 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) return hyp_exit_handlers; } -/* - * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. - * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a - * guest from dropping to AArch32 EL0 if implemented by the CPU. If the - * hypervisor spots a guest in such a state ensure it is handled, and don't - * trust the host to spot or fix it. The check below is based on the one in - * kvm_arch_vcpu_ioctl_run(). - * - * Returns false if the guest ran in AArch32 when it shouldn't have, and - * thus should exit to the host, or true if a the guest run loop can continue. 
- */ -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + + synchronize_vcpu_pstate(vcpu, exit_code); + + /* + * Some guests (e.g., protected VMs) are not be allowed to run in + * AArch32. The ARMv8 architecture does not give the hypervisor a + * mechanism to prevent a guest from dropping to AArch32 EL0 if + * implemented by the CPU. If the hypervisor spots a guest in such a + * state ensure it is handled, and don't trust the host to spot or fix + * it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + */ if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { /* * As we have caught the guest red-handed, decide that it isn't @@ -275,6 +277,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); *exit_code |= ARM_EXCEPTION_IL; } + + return __fixup_guest_exit(vcpu, exit_code, handlers); } /* Switch to the guest for legacy non-VHE systems */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 4748b1947ffa..c854d8445889 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -540,13 +540,10 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, }; -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - return hyp_exit_handlers; -} + synchronize_vcpu_pstate(vcpu, exit_code); -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) -{ /* * If we were in HYP context on entry, adjust the PSTATE view * so that the usual helpers work correctly. 
@@ -566,6 +563,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); *vcpu_cpsr(vcpu) |= mode; } + + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } /* Switch to the guest for VHE systems running in EL2 */ From f9dd00de1e53a47763dfad601635d18542c3836d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:25 +0000 Subject: [PATCH 190/310] KVM: arm64: Mark some header functions as inline The shared hyp switch header has a number of static functions which might not be used by all files that include the header, and when unused they will provoke compiler warnings, e.g. | In file included from arch/arm64/kvm/hyp/nvhe/hyp-main.c:8: | ./arch/arm64/kvm/hyp/include/hyp/switch.h:703:13: warning: 'kvm_hyp_handle_dabt_low' defined but not used [-Wunused-function] | 703 | static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:682:13: warning: 'kvm_hyp_handle_cp15_32' defined but not used [-Wunused-function] | 682 | static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:662:13: warning: 'kvm_hyp_handle_sysreg' defined but not used [-Wunused-function] | 662 | static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:458:13: warning: 'kvm_hyp_handle_fpsimd' defined but not used [-Wunused-function] | 458 | static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:329:13: warning: 'kvm_hyp_handle_mops' defined but not used [-Wunused-function] | 329 | static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~ Mark these functions as 'inline' to suppress this warning. 
This shouldn't result in any functional change. At the same time, avoid the use of __alias() in the header and alias kvm_hyp_handle_iabt_low() and kvm_hyp_handle_watchpt_low() to kvm_hyp_handle_memory_fault() using CPP, matching the style in the rest of the kernel. For consistency, kvm_hyp_handle_memory_fault() is also marked as 'inline'. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-8-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 46df5c2eeaf5..163867f7f7c5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -326,7 +326,7 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } -static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) { *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); @@ -404,7 +404,7 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) * If FP/SIMD is not implemented, handle the trap and inject an undefined * instruction exception to the guest. Similarly for trapped SVE accesses. 
*/ -static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; @@ -608,7 +608,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) return true; } -static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && handle_tx2_tvm(vcpu)) @@ -628,7 +628,7 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) { if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) @@ -637,19 +637,18 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, + u64 *exit_code) { if (!__populate_fault_info(vcpu)) return true; return false; } -static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); -static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); +#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault +#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault -static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) { if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) return true; From 59419f10045bc955d2229819c7cf7a8b0b9c5b59 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:26 +0000 Subject: [PATCH 191/310] KVM: arm64: Eagerly switch 
ZCR_EL{1,2} In non-protected KVM modes, while the guest FPSIMD/SVE/SME state is live on the CPU, the host's active SVE VL may differ from the guest's maximum SVE VL: * For VHE hosts, when a VM uses NV, ZCR_EL2 contains a value constrained by the guest hypervisor, which may be less than or equal to that guest's maximum VL. Note: in this case the value of ZCR_EL1 is immaterial due to E2H. * For nVHE/hVHE hosts, ZCR_EL1 contains a value written by the guest, which may be less than or greater than the guest's maximum VL. Note: in this case hyp code traps host SVE usage and lazily restores ZCR_EL2 to the host's maximum VL, which may be greater than the guest's maximum VL. This can be the case between exiting a guest and kvm_arch_vcpu_put_fp(). If a softirq is taken during this period and the softirq handler tries to use kernel-mode NEON, then the kernel will fail to save the guest's FPSIMD/SVE state, and will pend a SIGKILL for the current thread. This happens because kvm_arch_vcpu_ctxsync_fp() binds the guest's live FPSIMD/SVE state with the guest's maximum SVE VL, and fpsimd_save_user_state() verifies that the live SVE VL is as expected before attempting to save the register state: | if (WARN_ON(sve_get_vl() != vl)) { | force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); | return; | } Fix this and make this a bit easier to reason about by always eagerly switching ZCR_EL{1,2} at hyp during guest<->host transitions. With this happening, there's no need to trap host SVE usage, and the nVHE/hVHE __deactivate_cptr_traps() logic can be simplified to enable host access to all present FPSIMD/SVE/SME features. In protected nVHE/hVHE modes, the host's state is always saved/restored by hyp, and the guest's state is saved prior to exit to the host, so from the host's PoV the guest never has live FPSIMD/SVE/SME state, and the host's ZCR_EL1 is never clobbered by hyp. 
Fixes: 8c8010d69c132273 ("KVM: arm64: Save/restore SVE state for nVHE") Fixes: 2e3cf82063a00ea0 ("KVM: arm64: nv: Ensure correct VL is loaded before saving SVE state") Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-9-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/fpsimd.c | 30 ------------- arch/arm64/kvm/hyp/entry.S | 5 +++ arch/arm64/kvm/hyp/include/hyp/switch.h | 59 +++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 13 +++--- arch/arm64/kvm/hyp/nvhe/switch.c | 6 +-- arch/arm64/kvm/hyp/vhe/switch.c | 4 ++ 6 files changed, 76 insertions(+), 41 deletions(-) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index f64724197958..3cbb999419af 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -136,36 +136,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); if (guest_owns_fp_regs()) { - if (vcpu_has_sve(vcpu)) { - u64 zcr = read_sysreg_el1(SYS_ZCR); - - /* - * If the vCPU is in the hyp context then ZCR_EL1 is - * loaded with its vEL2 counterpart. - */ - __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr; - - /* - * Restore the VL that was saved when bound to the CPU, - * which is the maximum VL for the guest. Because the - * layout of the data when saving the sve state depends - * on the VL, we need to use a consistent (i.e., the - * maximum) VL. - * Note that this means that at guest exit ZCR_EL1 is - * not necessarily the same as on guest entry. - * - * ZCR_EL2 holds the guest hypervisor's VL when running - * a nested guest, which could be smaller than the - * max for the vCPU. Similar to above, we first need to - * switch to a VL consistent with the layout of the - * vCPU's SVE state. KVM support for NV implies VHE, so - * using the ZCR_EL1 alias is safe. 
- */ - if (!has_vhe() || (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, - SYS_ZCR_EL1); - } - /* * Flush (save and invalidate) the fpsimd/sve state so that if * the host tries to use fpsimd/sve, it's not using stale data diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 4433a234aa9b..9f4e8d68ab50 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN alternative_else_nop_endif mrs x1, isr_el1 cbz x1, 1f + + // Ensure that __guest_enter() always provides a context + // synchronization event so that callers don't need ISBs for anything + // that would usually be synchonized by the ERET. + isb mov x0, #ARM_EXCEPTION_IRQ ret diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 163867f7f7c5..f5e882a358e2 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -375,6 +375,65 @@ static inline void __hyp_sve_save_host(void) true); } +static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + if (vcpu_has_sve(vcpu)) { + /* A guest hypervisor may restrict the effective max VL. */ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) + zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); + else + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); + write_sysreg_el1(zcr_el1, SYS_ZCR); + } +} + +static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + /* + * When the guest owns the FP regs, we know that guest+hyp traps for + * any FPSIMD/SVE/SME features exposed to the guest have been disabled + * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * prior to __guest_entry(). 
As __guest_entry() guarantees a context + * synchronization event, we don't need an ISB here to avoid taking + * traps for anything that was exposed to the guest. + */ + if (vcpu_has_sve(vcpu)) { + zcr_el1 = read_sysreg_el1(SYS_ZCR); + __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; + + /* + * The guest's state is always saved using the guest's max VL. + * Ensure that the host has the guest's max VL active such that + * the host can save the guest's state lazily, but don't + * artificially restrict the host to the guest's max VL. + */ + if (has_vhe()) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + } else { + zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el1(zcr_el1, SYS_ZCR); + } + } +} + static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) { /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 1a334a38d8fd..2c37680d954c 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -5,6 +5,7 @@ */ #include +#include #include #include @@ -224,8 +225,12 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) sync_hyp_vcpu(hyp_vcpu); } else { + struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); + /* The host is fully trusted, run its vCPU directly. 
*/ - ret = __kvm_vcpu_run(kern_hyp_va(host_vcpu)); + fpsimd_lazy_switch_to_guest(vcpu); + ret = __kvm_vcpu_run(vcpu); + fpsimd_lazy_switch_to_host(vcpu); } out: cpu_reg(host_ctxt, 1) = ret; @@ -675,12 +680,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; - case ESR_ELx_EC_SVE: - cpacr_clear_set(0, CPACR_EL1_ZEN); - isb(); - sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, - SYS_ZCR_EL2); - break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: handle_host_mem_abort(host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 69d7d3b4294a..7d2ba6ef0261 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -73,12 +73,10 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) { - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - if (has_hvhe()) { u64 val = CPACR_EL1_FPEN; - if (!kvm_has_sve(kvm) || !guest_owns_fp_regs()) + if (cpus_have_final_cap(ARM64_SVE)) val |= CPACR_EL1_ZEN; if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_EL1_SMEN; @@ -87,7 +85,7 @@ static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) } else { u64 val = CPTR_NVHE_EL2_RES1; - if (kvm_has_sve(kvm) && guest_owns_fp_regs()) + if (!cpus_have_final_cap(ARM64_SVE)) val |= CPTR_EL2_TZ; if (!cpus_have_final_cap(ARM64_SME)) val |= CPTR_EL2_TSM; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c854d8445889..647737d6e8d0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -579,6 +579,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_save_host_state_vhe(host_ctxt); + fpsimd_lazy_switch_to_guest(vcpu); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -603,6 +605,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) 
__deactivate_traps(vcpu); + fpsimd_lazy_switch_to_host(vcpu); + sysreg_restore_host_state_vhe(host_ctxt); if (guest_owns_fp_regs()) From 332b7e6d62b7a3a988017f5184e547aa20e3a19a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 13 Feb 2025 09:15:31 +0000 Subject: [PATCH 192/310] KVM: arm64: Simplify warning in kvm_arch_vcpu_load_fp() At the end of kvm_arch_vcpu_load_fp() we check that no bits are set in SVCR. We only check this for protected mode despite this mattering equally for non-protected mode, and the comment above this is confusing. Remove the comment and simplify the check, moving from WARN_ON() to WARN_ON_ONCE() to avoid spamming the log. Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/kvm/fpsimd.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 3cbb999419af..7f6e43d25691 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -65,12 +65,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - /* - * If normal guests gain SME support, maintain this behavior for pKVM - * guests, which don't support SME. - */ - WARN_ON(is_protected_kvm_enabled() && system_supports_sme() && - read_sysreg_s(SYS_SVCR)); + WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR)); } /* From 65729da9ce37f5a2c62e2542ef03bc9ac6775a7d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 12 Feb 2025 17:34:54 +0000 Subject: [PATCH 193/310] KVM: arm64: Convert timer offset VA when accessed in HYP code Now that EL2 has gained some early timer emulation, it accesses the offsets pointed to by the timer structure, both of which live in the KVM structure. Of course, these are *kernel* pointers, so the dereferencing of these pointers in non-kernel code must itself be offset. Give switch.h its own version of timer_get_offset() and use that instead. 
Fixes: b86fc215dc26d ("KVM: arm64: Handle counter access early in non-HYP context") Reported-by: Linux Kernel Functional Testing Reviewed-by: Oliver Upton Tested-by: Anders Roxell Link: https://lore.kernel.org/r/20250212173454.2864462-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f5e882a358e2..23bbe28eaaf9 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -581,9 +581,22 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } +/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ +static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) +{ + u64 offset = 0; + + if (ctxt->offset.vm_offset) + offset += *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); + + return offset; +} + static inline u64 compute_counter_value(struct arch_timer_context *ctxt) { - return arch_timer_read_cntpct_el0() - timer_get_offset(ctxt); + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); } static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) From b938731ed2d4eea8e268a27bfc600581fedae2a9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 13 Feb 2025 15:36:14 +0000 Subject: [PATCH 194/310] KVM: arm64: Fix alignment of kvm_hyp_memcache allocations When allocating guest stage-2 page-table pages at EL2, pKVM can consume pages from the host-provided kvm_hyp_memcache. As pgtable.c expects zeroed pages, guest_s2_zalloc_page() actively implements this zeroing with a PAGE_SIZE memset. Unfortunately, we don't check the page alignment of the host-provided address before doing so, which could lead to the memset overrunning the page if the host was malicious. 
Fix this by simply force-aligning all kvm_hyp_memcache allocations to page boundaries. Fixes: 60dfe093ec13 ("KVM: arm64: Instantiate guest stage-2 page-tables at EL2") Reported-by: Ben Simner Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20250213153615.3642515-1-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c77acc990457..3a7ec98ef123 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -100,7 +100,7 @@ static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, void *(*to_va)(phys_addr_t phys)) { - phys_addr_t *p = to_va(mc->head); + phys_addr_t *p = to_va(mc->head & PAGE_MASK); if (!mc->nr_pages) return NULL; From e6e3e0022ef8f1d584ee4d5b89dca02472c5eb1f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 12 Feb 2025 18:25:57 +0000 Subject: [PATCH 195/310] KVM: arm64: timer: Drop warning on failed interrupt signalling We currently spit out a warning if making a timer interrupt pending fails. But not only this is loud and easy to trigger from userspace, we also fail to do anything useful with that information. Dropping the warning is the easiest thing to do for now. We can always add error reporting if we really want in the future. 
Reported-by: Alexander Potapenko Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250212182558.2865232-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 231c0cd9c7b4..70802e4c91cf 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -447,21 +447,19 @@ static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { - int ret; - kvm_timer_update_status(timer_ctx, new_level); timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); - if (!userspace_irqchip(vcpu->kvm)) { - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, - timer_irq(timer_ctx), - timer_ctx->irq.level, - timer_ctx); - WARN_ON(ret); - } + if (userspace_irqchip(vcpu->kvm)) + return; + + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + timer_irq(timer_ctx), + timer_ctx->irq.level, + timer_ctx); } /* Only called for a fully emulated timer */ From b3aa9283c0c505b5cfd25f7d6cfd720de2adc807 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 12 Feb 2025 18:25:58 +0000 Subject: [PATCH 196/310] KVM: arm64: vgic: Hoist SGI/PPI alloc from vgic_init() to kvm_create_vgic() If userspace creates vcpus, then a vgic, we end-up in a situation where irqchip_in_kernel() will return true, but no private interrupt has been allocated for these vcpus. This situation will continue until userspace initialises the vgic, at which point we fix the early vcpus. Should a vcpu run or be initialised in the interval, bad things may happen. An obvious solution is to move this fix-up phase to the point where the vgic is created. This ensures that from that point onwards, all vcpus have their private interrupts, as new vcpus will directly allocate them. 
With that, we have the invariant that when irqchip_in_kernel() is true, all vcpus have their private interrupts. Reported-by: Alexander Potapenko Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250212182558.2865232-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 74 ++++++++++++++++----------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index bc7e22ab5d81..775461cf2d2d 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -34,9 +34,9 @@ * * CPU Interface: * - * - kvm_vgic_vcpu_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. + * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend + * on any sizing information. Private interrupts are allocated if not + * already allocated at vgic-creation time. */ /* EARLY INIT */ @@ -58,6 +58,8 @@ void kvm_vgic_early_init(struct kvm *kvm) /* CREATION */ +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); + /** * kvm_vgic_create: triggered by the instantiation of the VGIC device by * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) @@ -112,6 +114,22 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) goto out_unlock; } + kvm_for_each_vcpu(i, vcpu, kvm) { + ret = vgic_allocate_private_irqs_locked(vcpu, type); + if (ret) + break; + } + + if (ret) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + } + + goto out_unlock; + } + kvm->arch.vgic.in_kernel = true; kvm->arch.vgic.vgic_model = type; @@ -180,7 +198,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) return 0; } -static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) +static int 
vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; int i; @@ -218,17 +236,28 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) /* PPIs */ irq->config = VGIC_CONFIG_LEVEL; } + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->group = 1; + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->group = 0; + irq->targets = BIT(vcpu->vcpu_id); + break; + } } return 0; } -static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu) +static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type) { int ret; mutex_lock(&vcpu->kvm->arch.config_lock); - ret = vgic_allocate_private_irqs_locked(vcpu); + ret = vgic_allocate_private_irqs_locked(vcpu, type); mutex_unlock(&vcpu->kvm->arch.config_lock); return ret; @@ -258,7 +287,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) if (!irqchip_in_kernel(vcpu->kvm)) return 0; - ret = vgic_allocate_private_irqs(vcpu); + ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); if (ret) return ret; @@ -295,7 +324,7 @@ int vgic_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0; unsigned long idx; lockdep_assert_held(&kvm->arch.config_lock); @@ -315,35 +344,6 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; - /* Initialize groups on CPUs created before the VGIC type was known */ - kvm_for_each_vcpu(idx, vcpu, kvm) { - ret = vgic_allocate_private_irqs_locked(vcpu); - if (ret) - goto out; - - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); - - switch (dist->vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - irq->group = 1; - irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - irq->group = 0; - irq->targets = 1U << idx; - break; - default: - ret = -EINVAL; - } - - vgic_put_irq(kvm, irq); - - if (ret) - goto out; - } - } - /* * If we have GICv4.1 
enabled, unconditionally request enable the * v4 support so that we get HW-accelerated vSGIs. Otherwise, only From 320702a76186222426e5dc8efb9d68ba9d4ed0ab Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 13 Feb 2025 17:29:51 +0100 Subject: [PATCH 197/310] MAINTAINERS: delete entry for AXXIA I2C The maintainer's email address bounced and he wasn't active for 4 years. Delete this entry and fall back to the generic I2C host drivers entry. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250213162950.45596-2-wsa+renesas@sang-engineering.com Signed-off-by: Andi Shyti --- MAINTAINERS | 7 ------- 1 file changed, 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 25c86f47353d..fc332fbf3958 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3859,13 +3859,6 @@ W: https://ez.analog.com/linux-software-drivers F: Documentation/devicetree/bindings/pwm/adi,axi-pwmgen.yaml F: drivers/pwm/pwm-axi-pwmgen.c -AXXIA I2C CONTROLLER -M: Krzysztof Adamski -L: linux-i2c@vger.kernel.org -S: Maintained -F: Documentation/devicetree/bindings/i2c/i2c-axxia.txt -F: drivers/i2c/busses/i2c-axxia.c - AZ6007 DVB DRIVER M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org From 7422c319fd805b956aab5ba93e0274517a8e3650 Mon Sep 17 00:00:00 2001 From: Mukesh Kumar Savaliya Date: Thu, 23 Jan 2025 14:11:47 +0530 Subject: [PATCH 198/310] MAINTAINERS: Add maintainer for Qualcomm's I2C GENI driver Add a new entry for the I2C QCOM GENI driver to the MAINTAINERS file. This entry includes the maintainer's name and contact information, ensuring proper maintainership and communication for the i2c-qcom-geni driver file. 
Signed-off-by: Mukesh Kumar Savaliya Link: https://lore.kernel.org/r/20250123084147.3632023-1-quic_msavaliy@quicinc.com Signed-off-by: Andi Shyti --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fc332fbf3958..db7e533c466c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19503,6 +19503,15 @@ L: dmaengine@vger.kernel.org S: Supported F: drivers/dma/qcom/hidma* +QUALCOMM I2C QCOM GENI DRIVER +M: Mukesh Kumar Savaliya +M: Viken Dadhaniya +L: linux-i2c@vger.kernel.org +L: linux-arm-msm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/i2c/qcom,i2c-geni-qcom.yaml +F: drivers/i2c/busses/i2c-qcom-geni.c + QUALCOMM I2C CCI DRIVER M: Loic Poulain M: Robert Foss From 325735e83d7d0016e7b61069df2570e910898466 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Fri, 14 Feb 2025 09:30:21 +0800 Subject: [PATCH 199/310] ALSA: hda/tas2781: Fix index issue in tas2781 hda SPI driver Correct wrong mask for device index. Signed-off-by: Baojun Xu Fixes: bb5f86ea50ff ("ALSA: hda/tas2781: Add tas2781 hda SPI driver") Link: https://patch.msgid.link/20250214013021.6072-1-baojun.xu@ti.com Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_spi_fwlib.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/pci/hda/tas2781_spi_fwlib.c b/sound/pci/hda/tas2781_spi_fwlib.c index 0e2acbc3c900..131d9a77d140 100644 --- a/sound/pci/hda/tas2781_spi_fwlib.c +++ b/sound/pci/hda/tas2781_spi_fwlib.c @@ -2,7 +2,7 @@ // // TAS2781 HDA SPI driver // -// Copyright 2024 Texas Instruments, Inc. +// Copyright 2024-2025 Texas Instruments, Inc. 
// // Author: Baojun Xu @@ -771,19 +771,19 @@ static int tasdevice_process_block(void *context, unsigned char *data, switch (subblk_typ) { case TASDEVICE_CMD_SING_W: subblk_offset = tasdevice_single_byte_wr(tas_priv, - dev_idx & 0x4f, data, sublocksize); + dev_idx & 0x3f, data, sublocksize); break; case TASDEVICE_CMD_BURST: subblk_offset = tasdevice_burst_wr(tas_priv, - dev_idx & 0x4f, data, sublocksize); + dev_idx & 0x3f, data, sublocksize); break; case TASDEVICE_CMD_DELAY: subblk_offset = tasdevice_delay(tas_priv, - dev_idx & 0x4f, data, sublocksize); + dev_idx & 0x3f, data, sublocksize); break; case TASDEVICE_CMD_FIELD_W: subblk_offset = tasdevice_field_wr(tas_priv, - dev_idx & 0x4f, data, sublocksize); + dev_idx & 0x3f, data, sublocksize); break; default: subblk_offset = 2; From 822b7ec657e99b44b874e052d8540d8b54fe8569 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Thu, 13 Feb 2025 15:45:43 +0800 Subject: [PATCH 200/310] ALSA: hda: Add error check for snd_ctl_rename_id() in snd_hda_create_dig_out_ctls() Check the return value of snd_ctl_rename_id() in snd_hda_create_dig_out_ctls(). Ensure that failures are properly handled. [ Note: the error cannot happen practically because the only error condition in snd_ctl_rename_id() is the missing ID, but this is a rename, hence it must be present. 
But for the code consistency, it's safer to have always the proper return check -- tiwai ] Fixes: 5c219a340850 ("ALSA: hda: Fix kctl->id initialization") Cc: stable@vger.kernel.org # 6.4+ Signed-off-by: Wentao Liang Link: https://patch.msgid.link/20250213074543.1620-1-vulab@iscas.ac.cn Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_codec.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 14763c0f31ad..46a220404999 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -2470,7 +2470,9 @@ int snd_hda_create_dig_out_ctls(struct hda_codec *codec, break; id = kctl->id; id.index = spdif_index; - snd_ctl_rename_id(codec->card, &kctl->id, &id); + err = snd_ctl_rename_id(codec->card, &kctl->id, &id); + if (err < 0) + return err; } bus->primary_dig_out_type = HDA_PCM_TYPE_HDMI; } From ef75966abf950c0539534effa4960caa29fb7167 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 27 Jan 2025 09:44:11 +0000 Subject: [PATCH 201/310] iommu/amd: Expicitly enable CNTRL.EPHEn bit in resume path With recent kernel, AMDGPU failed to resume after suspend on certain laptop. Sample log: ----------- Nov 14 11:52:19 Thinkbook kernel: iommu ivhd0: AMD-Vi: Event logged [ILLEGAL_DEV_TABLE_ENTRY device=0000:06:00.0 pasid=0x00000 address=0x135300000 flags=0x0080] Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[0]: 7d90000000000003 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[1]: 0000100103fc0009 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[2]: 2000000117840013 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[3]: 0000000000000000 This is because in resume path, CNTRL[EPHEn] is not set. Fix this by setting CNTRL[EPHEn] to 1 in resume path if EFR[EPHSUP] is set. Note: a better approach may be to save the control register in suspend path and restore it in resume path instead of trying to set individual bits. We will have a separate patch for that. 
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219499 Fixes: c4cb23111103 ("iommu/amd: Add support for enable/disable IOPF") Tested-by: Hamish McIntyre-Bhatty Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20250127094411.5931-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/init.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 0bbda60d3cdc..23caea22f8dc 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -175,6 +175,7 @@ #define CONTROL_GAM_EN 25 #define CONTROL_GALOG_EN 28 #define CONTROL_GAINT_EN 29 +#define CONTROL_EPH_EN 45 #define CONTROL_XT_EN 50 #define CONTROL_INTCAPXT_EN 51 #define CONTROL_IRTCACHEDIS 59 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c5cd92edada0..438848b0682f 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2653,6 +2653,10 @@ static void iommu_init_flags(struct amd_iommu *iommu) /* Set IOTLB invalidation timeout to 1s */ iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); + + /* Enable Enhanced Peripheral Page Request Handling */ + if (check_feature(FEATURE_EPHSUP)) + iommu_feature_enable(iommu, CONTROL_EPH_EN); } static void iommu_apply_resume_quirks(struct amd_iommu *iommu) From 78be7f04537fa35f6cc694879e9a475ca1984936 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 28 Jan 2025 19:05:21 +0000 Subject: [PATCH 202/310] iommu: Fix a spelling error Fix spelling error IDENITY -> IDENTITY in drivers/iommu/iommu.c. 
Signed-off-by: Easwar Hariharan Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250128190522.70800-1-eahariha@linux.microsoft.com [ joro: Add commit message ] Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 870c3cdbd0f6..60aed01e54f2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1756,7 +1756,7 @@ static int iommu_get_def_domain_type(struct iommu_group *group, group->id); /* - * Try to recover, drivers are allowed to force IDENITY or DMA, IDENTITY + * Try to recover, drivers are allowed to force IDENTITY or DMA, IDENTITY * takes precedence. */ if (type == IOMMU_DOMAIN_IDENTITY) From 4a8991fe9cd0b6a509bab3d056700d3520601d86 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Mon, 10 Feb 2025 13:20:04 +0200 Subject: [PATCH 203/310] iommu/exynos: Fix typos There are some typos in comments/messages: - modyfying -> modifying - Unabled -> Unable Fix them via codespell. 
Signed-off-by: Andrew Kreimer Link: https://lore.kernel.org/r/20250210112027.29791-1-algonell@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/exynos-iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index c666ecab955d..69e23e017d9e 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -249,7 +249,7 @@ struct exynos_iommu_domain { struct list_head clients; /* list of sysmmu_drvdata.domain_node */ sysmmu_pte_t *pgtable; /* lv1 page table, 16KB */ short *lv2entcnt; /* free lv2 entry counter for each section */ - spinlock_t lock; /* lock for modyfying list of clients */ + spinlock_t lock; /* lock for modifying list of clients */ spinlock_t pgtablelock; /* lock for modifying page table @ pgtable */ struct iommu_domain domain; /* generic domain data structure */ }; @@ -292,7 +292,7 @@ struct sysmmu_drvdata { struct clk *aclk; /* SYSMMU's aclk clock */ struct clk *pclk; /* SYSMMU's pclk clock */ struct clk *clk_master; /* master's device clock */ - spinlock_t lock; /* lock for modyfying state */ + spinlock_t lock; /* lock for modifying state */ bool active; /* current status */ struct exynos_iommu_domain *domain; /* domain we belong to */ struct list_head domain_node; /* node for domain clients list */ @@ -746,7 +746,7 @@ static int exynos_sysmmu_probe(struct platform_device *pdev) ret = devm_request_irq(dev, irq, exynos_sysmmu_irq, 0, dev_name(dev), data); if (ret) { - dev_err(dev, "Unabled to register handler of irq %d\n", irq); + dev_err(dev, "Unable to register handler of irq %d\n", irq); return ret; } From add43c4fbc92f8b48c1acd64e953af3b1be4cd9c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 11 Feb 2025 08:55:12 +0800 Subject: [PATCH 204/310] iommu/vt-d: Make intel_iommu_drain_pasid_prq() cover faults for RID This driver supports page faults on PCI RID since commit <9f831c16c69e> ("iommu/vt-d: Remove the pasid present check in 
prq_event_thread") by allowing the reporting of page faults with the pasid_present field cleared to the upper layer for further handling. The fundamental assumption here is that the detach or replace operations act as a fence for page faults. This implies that all pending page faults associated with a specific RID or PASID are flushed when a domain is detached or replaced from a device RID or PASID. However, the intel_iommu_drain_pasid_prq() helper does not correctly handle faults for RID. This leads to faults potentially remaining pending in the iommu hardware queue even after the domain is detached, thereby violating the aforementioned assumption. Fix this issue by extending intel_iommu_drain_pasid_prq() to cover faults for RID. Fixes: 9f831c16c69e ("iommu/vt-d: Remove the pasid present check in prq_event_thread") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20250121023150.815972-1-baolu.lu@linux.intel.com Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20250211005512.985563-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/prq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index c2d792db52c3..064194399b38 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -87,7 +87,9 @@ void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) struct page_req_dsc *req; req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { + if (req->rid != sid || + (req->pasid_present && pasid != req->pasid) || + (!req->pasid_present && pasid != IOMMU_NO_PASID)) { head = (head + sizeof(*req)) & PRQ_RING_MASK; continue; } From e71f7f42e3c874ac3314b8f250e8416a706165af Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 2 Feb 2025 20:49:35 +0800 Subject: [PATCH 205/310] USB: pci-quirks: Fix HCCPARAMS register error for LS7A EHCI LS7A 
EHCI controller doesn't have extended capabilities, so the EECP (EHCI Extended Capabilities Pointer) field of HCCPARAMS register should be 0x0, but it reads as 0xa0 now. This is a hardware flaw and will be fixed in future, now just clear the EECP field to avoid error messages on boot: ...... [ 0.581675] pci 0000:00:04.1: EHCI: unrecognized capability ff [ 0.581699] pci 0000:00:04.1: EHCI: unrecognized capability ff [ 0.581716] pci 0000:00:04.1: EHCI: unrecognized capability ff [ 0.581851] pci 0000:00:04.1: EHCI: unrecognized capability ff ...... [ 0.581916] pci 0000:00:05.1: EHCI: unrecognized capability ff [ 0.581951] pci 0000:00:05.1: EHCI: unrecognized capability ff [ 0.582704] pci 0000:00:05.1: EHCI: unrecognized capability ff [ 0.582799] pci 0000:00:05.1: EHCI: unrecognized capability ff ...... Cc: stable Signed-off-by: Baoqi Zhang Signed-off-by: Huacai Chen Link: https://lore.kernel.org/r/20250202124935.480500-1-chenhuacai@loongson.cn Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 1f9c1b1435d8..0404489c2f6a 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -958,6 +958,15 @@ static void quirk_usb_disable_ehci(struct pci_dev *pdev) * booting from USB disk or using a usb keyboard */ hcc_params = readl(base + EHCI_HCC_PARAMS); + + /* LS7A EHCI controller doesn't have extended capabilities, the + * EECP (EHCI Extended Capabilities Pointer) field of HCCPARAMS + * register should be 0x0 but it reads as 0xa0. So clear it to + * avoid error messages on boot. 
+ */ + if (pdev->vendor == PCI_VENDOR_ID_LOONGSON && pdev->device == 0x7a14) + hcc_params &= ~(0xffL << 8); + offset = (hcc_params >> 8) & 0xff; while (offset && --count) { pci_read_config_dword(pdev, offset, &cap); From c81d9fcd5b9402166048f377d4e5e0ee6f9ef26d Mon Sep 17 00:00:00 2001 From: Michal Pecio Date: Tue, 28 Jan 2025 10:45:29 +0100 Subject: [PATCH 206/310] usb: xhci: Restore xhci_pci support for Renesas HCs Some Renesas HCs require firmware upload to work, this is handled by the xhci_pci_renesas driver. Other variants of those chips load firmware from a SPI flash and are ready to work with xhci_pci alone. A refactor merged in v6.12 broke the latter configuration so that users are finding their hardware ignored by the normal driver and are forced to enable the firmware loader which isn't really necessary on their systems. Let xhci_pci work with those chips as before when the firmware loader is disabled by kernel configuration. Fixes: 25f51b76f90f ("xhci-pci: Make xhci-pci-renesas a proper modular driver") Cc: stable Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219616 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219726 Signed-off-by: Michal Pecio Tested-by: Nicolai Buchwitz Link: https://lore.kernel.org/r/20250128104529.58a79bfc@foxbook Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 2d1e205c14c6..ad0ff356f6fa 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -653,8 +653,8 @@ int xhci_pci_common_probe(struct pci_dev *dev, const struct pci_device_id *id) } EXPORT_SYMBOL_NS_GPL(xhci_pci_common_probe, "xhci"); -static const struct pci_device_id pci_ids_reject[] = { - /* handled by xhci-pci-renesas */ +/* handled by xhci-pci-renesas if enabled */ +static const struct pci_device_id pci_ids_renesas[] = { { PCI_DEVICE(PCI_VENDOR_ID_RENESAS, 0x0014) }, { 
PCI_DEVICE(PCI_VENDOR_ID_RENESAS, 0x0015) }, { /* end: all zeroes */ } @@ -662,7 +662,8 @@ static const struct pci_device_id pci_ids_reject[] = { static int xhci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) { - if (pci_match_id(pci_ids_reject, dev)) + if (IS_ENABLED(CONFIG_USB_XHCI_PCI_RENESAS) && + pci_match_id(pci_ids_renesas, dev)) return -ENODEV; return xhci_pci_common_probe(dev, id); From e563b01208f4d1f609bcab13333b6c0e24ce6a01 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 12 Feb 2025 19:15:15 +0100 Subject: [PATCH 207/310] usb: cdc-acm: Check control transfer buffer size before access If the first fragment is shorter than struct usb_cdc_notification, we can't calculate an expected_size. Log an error and discard the notification instead of reading lengths from memory outside the received data, which can lead to memory corruption when the expected_size decreases between fragments, causing `expected_size - acm->nb_index` to wrap. This issue has been present since the beginning of git history; however, it only leads to memory corruption since commit ea2583529cd1 ("cdc-acm: reassemble fragmented notifications"). A mitigating factor is that acm_ctrl_irq() can only execute after userspace has opened /dev/ttyACM*; but if ModemManager is running, ModemManager will do that automatically depending on the USB device's vendor/product IDs and its other interfaces. 
Cc: stable Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jann Horn Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 6b37d1c47fce..39c7db7bcd21 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -371,7 +371,7 @@ static void acm_process_notification(struct acm *acm, unsigned char *buf) static void acm_ctrl_irq(struct urb *urb) { struct acm *acm = urb->context; - struct usb_cdc_notification *dr = urb->transfer_buffer; + struct usb_cdc_notification *dr; unsigned int current_size = urb->actual_length; unsigned int expected_size, copy_size, alloc_size; int retval; @@ -398,9 +398,20 @@ static void acm_ctrl_irq(struct urb *urb) usb_mark_last_busy(acm->dev); - if (acm->nb_index) + if (acm->nb_index == 0) { + /* + * The first chunk of a message must contain at least the + * notification header with the length field, otherwise we + * can't get an expected_size. 
+ */ + if (current_size < sizeof(struct usb_cdc_notification)) { + dev_dbg(&acm->control->dev, "urb too short\n"); + goto exit; + } + dr = urb->transfer_buffer; + } else { dr = (struct usb_cdc_notification *)acm->notification_buffer; - + } /* size = notification-header + (optional) data */ expected_size = sizeof(struct usb_cdc_notification) + le16_to_cpu(dr->wLength); From 12e712964f41d05ae034989892de445781c46730 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 12 Feb 2025 19:15:16 +0100 Subject: [PATCH 208/310] usb: cdc-acm: Fix handling of oversized fragments If we receive an initial fragment of size 8 bytes which specifies a wLength of 1 byte (so the reassembled message is supposed to be 9 bytes long), and we then receive a second fragment of size 9 bytes (which is not supposed to happen), we currently wrongly bypass the fragment reassembly code but still pass the pointer to the acm->notification_buffer to acm_process_notification(). Make this less wrong by always going through fragment reassembly when we expect more fragments. Before this patch, receiving an overlong fragment could lead to `newctrl` in acm_process_notification() being uninitialized data (instead of data coming from the device). 
Cc: stable Fixes: ea2583529cd1 ("cdc-acm: reassemble fragmented notifications") Signed-off-by: Jann Horn Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 39c7db7bcd21..c70f34993623 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -416,7 +416,7 @@ static void acm_ctrl_irq(struct urb *urb) expected_size = sizeof(struct usb_cdc_notification) + le16_to_cpu(dr->wLength); - if (current_size < expected_size) { + if (acm->nb_index != 0 || current_size < expected_size) { /* notification is transmitted fragmented, reassemble */ if (acm->nb_size < expected_size) { u8 *new_buffer; From 7284922f3e4fa285dff1b8bb593aa9a0b8458f30 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 9 Feb 2025 15:56:11 +0100 Subject: [PATCH 209/310] USB: cdc-acm: Fill in Renesas R-Car D3 USB Download mode quirk Add Renesas R-Car D3 USB Download mode quirk and update comments on all the other Renesas R-Car USB Download mode quirks to discern them from each other. This follows R-Car Series, 3rd Generation reference manual Rev.2.00 chapter 19.2.8 USB download mode . 
Fixes: 6d853c9e4104 ("usb: cdc-acm: Add DISABLE_ECHO for Renesas USB Download mode") Cc: stable Signed-off-by: Marek Vasut Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250209145708.106914-1-marek.vasut+renesas@mailbox.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index c70f34993623..c2ecfa3c8349 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1738,13 +1738,16 @@ static const struct usb_device_id acm_ids[] = { { USB_DEVICE(0x0870, 0x0001), /* Metricom GS Modem */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, - { USB_DEVICE(0x045b, 0x023c), /* Renesas USB Download mode */ + { USB_DEVICE(0x045b, 0x023c), /* Renesas R-Car H3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, - { USB_DEVICE(0x045b, 0x0248), /* Renesas USB Download mode */ + { USB_DEVICE(0x045b, 0x0247), /* Renesas R-Car D3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, - { USB_DEVICE(0x045b, 0x024D), /* Renesas USB Download mode */ + { USB_DEVICE(0x045b, 0x0248), /* Renesas R-Car M3-N USB Download mode */ + .driver_info = DISABLE_ECHO, /* Don't echo banner */ + }, + { USB_DEVICE(0x045b, 0x024D), /* Renesas R-Car E3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, { USB_DEVICE(0x0e8d, 0x0003), /* FIREFLY, MediaTek Inc; andrey.arapov@gmail.com */ From 159daf1258227f44b26b5d38f4aa8f37b8cca663 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 6 Feb 2025 17:18:36 +0200 Subject: [PATCH 210/310] USB: Add USB_QUIRK_NO_LPM quirk for sony xperia xz1 smartphone The fastboot tool for communicating with Android bootloaders does not work reliably with this device if USB 2 Link Power Management (LPM) is enabled. 
Various fastboot commands are affected, including the following, which usually reproduces the problem within two tries: fastboot getvar kernel getvar:kernel FAILED (remote: 'GetVar Variable Not found') This issue was hidden on many systems up until commit 63a1f8454962 ("xhci: stored cached port capability values in one place") as the xhci driver failed to detect USB 2 LPM support if USB 3 ports were listed before USB 2 ports in the "supported protocol capabilities". Adding the quirk resolves the issue. No drawbacks are expected since the device uses different USB product IDs outside of fastboot mode, and since fastboot commands worked before, until LPM was enabled on the tested system by the aforementioned commit. Based on a patch from Forest from which most of the code and commit message is taken. Cc: stable Reported-by: Forest Closes: https://lore.kernel.org/hk8umj9lv4l4qguftdq1luqtdrpa1gks5l@sonic.net Tested-by: Forest Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250206151836.51742-1-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 67732c791c93..59ed9768dae1 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -435,6 +435,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0c45, 0x7056), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, + /* Sony Xperia XZ1 Compact (lilac) smartphone in fastboot mode */ + { USB_DEVICE(0x0fce, 0x0dde), .driver_info = USB_QUIRK_NO_LPM }, + /* Action Semiconductor flash disk */ { USB_DEVICE(0x10d6, 0x2200), .driver_info = USB_QUIRK_STRING_FETCH_255 }, From d3a8c28426fc1fb3252753a9f1db0d691ffc21b0 Mon Sep 17 00:00:00 2001 From: Selvarasu Ganesan Date: Sat, 1 Feb 2025 22:09:02 +0530 Subject: [PATCH 211/310] usb: dwc3: Fix timeout issue during controller enter/exit from halt state MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a frequent timeout during controller enter/exit from halt state after toggling the run_stop bit by SW. This timeout occurs when performing frequent role switches between host and device, causing device enumeration issues due to the timeout. This issue was not present when USB2 suspend PHY was disabled by passing the SNPS quirks (snps,dis_u2_susphy_quirk and snps,dis_enblslpm_quirk) from the DTS. However, there is a requirement to enable USB2 suspend PHY by setting of GUSB2PHYCFG.ENBLSLPM and GUSB2PHYCFG.SUSPHY bits when controller starts in gadget or host mode results in the timeout issue. This commit addresses this timeout issue by ensuring that the bits GUSB2PHYCFG.ENBLSLPM and GUSB2PHYCFG.SUSPHY are cleared before starting the dwc3_gadget_run_stop sequence and restoring them after the dwc3_gadget_run_stop sequence is completed. Fixes: 72246da40f37 ("usb: Introduce DesignWare USB3 DRD Driver") Cc: stable Signed-off-by: Selvarasu Ganesan Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20250201163903.459-1-selvarasu.g@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index d27af65eb08a..ddd6b2ce5710 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2629,10 +2629,38 @@ static int dwc3_gadget_run_stop(struct dwc3 *dwc, int is_on) { u32 reg; u32 timeout = 2000; + u32 saved_config = 0; if (pm_runtime_suspended(dwc->dev)) return 0; + /* + * When operating in USB 2.0 speeds (HS/FS), ensure that + * GUSB2PHYCFG.ENBLSLPM and GUSB2PHYCFG.SUSPHY are cleared before starting + * or stopping the controller. This resolves timeout issues that occur + * during frequent role switches between host and device modes. 
+ * + * Save and clear these settings, then restore them after completing the + * controller start or stop sequence. + * + * This solution was discovered through experimentation as it is not + * mentioned in the dwc3 programming guide. It has been tested on an + * Exynos platforms. + */ + reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); + if (reg & DWC3_GUSB2PHYCFG_SUSPHY) { + saved_config |= DWC3_GUSB2PHYCFG_SUSPHY; + reg &= ~DWC3_GUSB2PHYCFG_SUSPHY; + } + + if (reg & DWC3_GUSB2PHYCFG_ENBLSLPM) { + saved_config |= DWC3_GUSB2PHYCFG_ENBLSLPM; + reg &= ~DWC3_GUSB2PHYCFG_ENBLSLPM; + } + + if (saved_config) + dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg); + reg = dwc3_readl(dwc->regs, DWC3_DCTL); if (is_on) { if (DWC3_VER_IS_WITHIN(DWC3, ANY, 187A)) { @@ -2660,6 +2688,12 @@ static int dwc3_gadget_run_stop(struct dwc3 *dwc, int is_on) reg &= DWC3_DSTS_DEVCTRLHLT; } while (--timeout && !(!is_on ^ !reg)); + if (saved_config) { + reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); + reg |= saved_config; + dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg); + } + if (!timeout) return -ETIMEDOUT; From 4aac0db5a0ebc599d4ad9bf5ebab78afa1f33e10 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Mon, 3 Feb 2025 11:58:24 +0100 Subject: [PATCH 212/310] usb: core: fix pipe creation for get_bMaxPacketSize0 When usb_control_msg is used in the get_bMaxPacketSize0 function, the USB pipe does not include the endpoint device number. This can cause failures when a usb hub port is reinitialized after encountering a bad cable connection. As a result, the system logs the following error messages: usb usb2-port1: cannot reset (err = -32) usb usb2-port1: Cannot enable. Maybe the USB cable is bad? usb usb2-port1: attempt power cycle usb 2-1: new high-speed USB device number 5 using ci_hdrc usb 2-1: device descriptor read/8, error -71 The problem began after commit 85d07c556216 ("USB: core: Unite old scheme and new scheme descriptor reads"). 
There usb_get_device_descriptor was replaced with get_bMaxPacketSize0. Unlike usb_get_device_descriptor, the get_bMaxPacketSize0 function uses the macro usb_rcvaddr0pipe, which does not include the endpoint device number. usb_get_device_descriptor, on the other hand, used the macro usb_rcvctrlpipe, which includes the endpoint device number. By modifying the get_bMaxPacketSize0 function to use usb_rcvctrlpipe instead of usb_rcvaddr0pipe, the issue can be resolved. This change will ensure that the endpoint device number is included in the USB pipe, preventing reinitialization failures. If the endpoint has not set the device number yet, it will still work because the device number is 0 in udev. Cc: stable Fixes: 85d07c556216 ("USB: core: Unite old scheme and new scheme descriptor reads") Signed-off-by: Stefan Eichenberger Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20250203105840.17539-1-eichest@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 0cd44f1fd56d..a76bb50b6202 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -4709,7 +4709,6 @@ void usb_ep0_reinit(struct usb_device *udev) EXPORT_SYMBOL_GPL(usb_ep0_reinit); #define usb_sndaddr0pipe() (PIPE_CONTROL << 30) -#define usb_rcvaddr0pipe() ((PIPE_CONTROL << 30) | USB_DIR_IN) static int hub_set_address(struct usb_device *udev, int devnum) { @@ -4815,7 +4814,7 @@ static int get_bMaxPacketSize0(struct usb_device *udev, for (i = 0; i < GET_MAXPACKET0_TRIES; ++i) { /* Start with invalid values in case the transfer fails */ buf->bDescriptorType = buf->bMaxPacketSize0 = 0; - rc = usb_control_msg(udev, usb_rcvaddr0pipe(), + rc = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, USB_DT_DEVICE << 8, 0, buf, size, From 4ab37fcb42832cdd3e9d5e50653285ca84d6686f Mon Sep 17 00:00:00 2001 From: Jill Donahue Date: Tue, 11 Feb 
2025 10:48:05 -0700 Subject: [PATCH 213/310] USB: gadget: f_midi: f_midi_complete to call queue_work When using USB MIDI, a lock is attempted to be acquired twice through a re-entrant call to f_midi_transmit, causing a deadlock. Fix it by using queue_work() to schedule the inner f_midi_transmit() via a high priority work queue from the completion handler. Link: https://lore.kernel.org/all/CAArt=LjxU0fUZOj06X+5tkeGT+6RbXzpWg1h4t4Fwa_KGVAX6g@mail.gmail.com/ Fixes: d5daf49b58661 ("USB: gadget: midi: add midi function driver") Cc: stable Signed-off-by: Jill Donahue Link: https://lore.kernel.org/r/20250211174805.1369265-1-jdonahue@fender.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 47260d65066a..da82598fcef8 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -283,7 +283,7 @@ f_midi_complete(struct usb_ep *ep, struct usb_request *req) /* Our transmit completed. See if there's more to go. * f_midi_transmit eats req, don't queue it again. */ req->length = 0; - f_midi_transmit(midi); + queue_work(system_highpri_wq, &midi->work); return; } break; From 399a45e5237ca14037120b1b895bd38a3b4492ea Mon Sep 17 00:00:00 2001 From: Roy Luo Date: Tue, 4 Feb 2025 23:36:42 +0000 Subject: [PATCH 214/310] usb: gadget: core: flush gadget workqueue after device removal device_del() can lead to new work being scheduled in gadget->work workqueue. This is observed, for example, with the dwc3 driver with the following call stack: device_del() gadget_unbind_driver() usb_gadget_disconnect_locked() dwc3_gadget_pullup() dwc3_gadget_soft_disconnect() usb_gadget_set_state() schedule_work(&gadget->work) Move flush_work() after device_del() to ensure the workqueue is cleaned up. 
Fixes: 5702f75375aa9 ("usb: gadget: udc-core: move sysfs_notify() to a workqueue") Cc: stable Signed-off-by: Roy Luo Reviewed-by: Alan Stern Reviewed-by: Thinh Nguyen Link: https://lore.kernel.org/r/20250204233642.666991-1-royluo@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index a6f46364be65..4b3d5075621a 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1543,8 +1543,8 @@ void usb_del_gadget(struct usb_gadget *gadget) kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); sysfs_remove_link(&udc->dev.kobj, "gadget"); - flush_work(&gadget->work); device_del(&gadget->dev); + flush_work(&gadget->work); ida_free(&gadget_id_numbers, gadget->id_number); cancel_work_sync(&udc->vbus_work); device_unregister(&udc->dev); From e169d96eecd447ff7fd7542ca5fa0911f5622054 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Wed, 12 Feb 2025 17:38:29 +0800 Subject: [PATCH 215/310] USB: quirks: add USB_QUIRK_NO_LPM quirk for Teclast disk Teclast disk used on Huawei hisi platforms doesn't work well, losing connectivity intermittently if LPM is enabled. Add a quirk to disable LPM to resolve the issue. 
Signed-off-by: Lei Huang Cc: stable Link: https://lore.kernel.org/r/20250212093829.7379-1-huanglei814@163.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 59ed9768dae1..dfcfc142bd5e 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -528,6 +528,9 @@ static const struct usb_device_id usb_quirk_list[] = { /* Blackmagic Design UltraStudio SDI */ { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, + /* Teclast disk */ + { USB_DEVICE(0x1f75, 0x0917), .driver_info = USB_QUIRK_NO_LPM }, + /* Hauppauge HVR-950q */ { USB_DEVICE(0x2040, 0x7200), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, From e5644be4079750a0a0a5a7068fd90b97bf6fac55 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 12 Feb 2025 14:55:14 +0100 Subject: [PATCH 216/310] usb: gadget: uvc: Fix unstarted kthread worker The behaviour of kthread_create_worker() was recently changed to align with the one of kthread_create(). The kthread worker is created but not awaken by default. This is to allow the use of kthread_affine_preferred() and kthread_bind[_mask]() with kthread workers. In order to keep the old behaviour and wake the kthread up, kthread_run_worker() must be used. All the pre-existing users have been converted, except for UVC that was introduced in the same merge window as the API change. This results in hangs: INFO: task UVCG:82 blocked for more than 491 seconds. Tainted: G T 6.13.0-rc2-00014-gb04e317b5226 #1 task:UVCG state:D stack:0 pid:82 Call Trace: __schedule schedule schedule_preempt_disabled kthread ? kthread_flush_work ret_from_fork ret_from_fork_asm entry_INT80_32 Fix this with converting UVCG kworker to the new API. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202502121025.55bfa801-lkp@intel.com Fixes: f0bbfbd16b3b ("usb: gadget: uvc: rework to enqueue in pump worker from encoded queue") Cc: stable Cc: Michael Grzeschik Signed-off-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20250212135514.30539-1-frederic@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/uvc_video.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/uvc_video.c b/drivers/usb/gadget/function/uvc_video.c index 79e223713d8b..fb77b0b21790 100644 --- a/drivers/usb/gadget/function/uvc_video.c +++ b/drivers/usb/gadget/function/uvc_video.c @@ -818,7 +818,7 @@ int uvcg_video_init(struct uvc_video *video, struct uvc_device *uvc) return -EINVAL; /* Allocate a kthread for asynchronous hw submit handler. */ - video->kworker = kthread_create_worker(0, "UVCG"); + video->kworker = kthread_run_worker(0, "UVCG"); if (IS_ERR(video->kworker)) { uvcg_err(&video->uvc->func, "failed to create UVCG kworker\n"); return PTR_ERR(video->kworker); From 634775a752a86784511018a108f3b530cc3399a7 Mon Sep 17 00:00:00 2001 From: Elson Roy Serrao Date: Thu, 6 Feb 2025 11:39:50 -0800 Subject: [PATCH 217/310] usb: roles: set switch registered flag early on The role switch registration and set_role() can happen in parallel as they are invoked independent of each other. There is a possibility that a driver might spend significant amount of time in usb_role_switch_register() API due to the presence of time intensive operations like component_add() which operate under common mutex. This leads to a time window after allocating the switch and before setting the registered flag where the set role notifications are dropped. 
Below timeline summarizes this behavior Thread1 | Thread2 usb_role_switch_register() | | | ---> allocate switch | | | ---> component_add() | usb_role_switch_set_role() | | | | | --> Drop role notifications | | since sw->registered | | flag is not set. | | --->Set registered flag.| To avoid this, set the registered flag early on in the switch register API. Fixes: b787a3e78175 ("usb: roles: don't get/set_role() when usb_role_switch is unregistered") Cc: stable Signed-off-by: Elson Roy Serrao Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250206193950.22421-1-quic_eserrao@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/roles/class.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c index c58a12c147f4..30482d4cf826 100644 --- a/drivers/usb/roles/class.c +++ b/drivers/usb/roles/class.c @@ -387,8 +387,11 @@ usb_role_switch_register(struct device *parent, dev_set_name(&sw->dev, "%s-role-switch", desc->name ? desc->name : dev_name(parent)); + sw->registered = true; + ret = device_register(&sw->dev); if (ret) { + sw->registered = false; put_device(&sw->dev); return ERR_PTR(ret); } @@ -399,8 +402,6 @@ usb_role_switch_register(struct device *parent, dev_warn(&sw->dev, "failed to add component\n"); } - sw->registered = true; - /* TODO: Symlinks for the host port and the device controller. */ return sw; From 659f5d55feb75782bd46cf130da3c1f240afe9ba Mon Sep 17 00:00:00 2001 From: Jos Wang Date: Thu, 13 Feb 2025 21:49:21 +0800 Subject: [PATCH 218/310] usb: typec: tcpm: PSSourceOffTimer timeout in PR_Swap enters ERROR_RECOVERY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As PD2.0 spec ("6.5.6.2 PSSourceOffTimer"),the PSSourceOffTimer is used by the Policy Engine in Dual-Role Power device that is currently acting as a Sink to timeout on a PS_RDY Message during a Power Role Swap sequence. 
This condition leads to a Hard Reset for USB Type-A and Type-B Plugs and Error Recovery for Type-C plugs and return to USB Default Operation. Therefore, after PSSourceOffTimer timeout, the tcpm state machine should switch from PR_SWAP_SNK_SRC_SINK_OFF to ERROR_RECOVERY. This can also solve the test items in the USB power delivery compliance test: TEST.PD.PROT.SNK.12 PR_Swap – PSSourceOffTimer Timeout [1] https://usb.org/document-library/usb-power-delivery-compliance-test-specification-0/USB_PD3_CTS_Q4_2025_OR.zip Fixes: f0690a25a140 ("staging: typec: USB Type-C Port Manager (tcpm)") Cc: stable Signed-off-by: Jos Wang Reviewed-by: Heikki Krogerus Tested-by: Amit Sunil Dhamne Link: https://lore.kernel.org/r/20250213134921.3798-1-joswang1221@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 47be450d2be3..6bf1a22c785a 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5591,8 +5591,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, port->pps_data.active, 0); tcpm_set_charge(port, false); - tcpm_set_state(port, hard_reset_state(port), - port->timings.ps_src_off_time); + tcpm_set_state(port, ERROR_RECOVERY, port->timings.ps_src_off_time); break; case PR_SWAP_SNK_SRC_SOURCE_ON: tcpm_enable_auto_vbus_discharge(port, true); From 66314e9a57a050f95cb0ebac904f5ab047a8926e Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sun, 2 Feb 2025 16:50:14 -0800 Subject: [PATCH 219/310] xfs: fix online repair probing when CONFIG_XFS_ONLINE_REPAIR=n I received a report from the release engineering side of the house that xfs_scrub without the -n flag (aka fix it mode) would try to fix a broken filesystem even on a kernel that doesn't have online repair built into it: # xfs_scrub -dTvn /mnt/test EXPERIMENTAL xfs_scrub program in use! Use at your own risk! Phase 1: Find filesystem geometry. /mnt/test: using 1 threads to scrub. Phase 1: Memory used: 132k/0k (108k/25k), time: 0.00/ 0.00/ 0.00s Phase 4: Repair filesystem. Info: /mnt/test/some/victimdir directory entries: Attempting repair. (repair.c line 351) Corruption: /mnt/test/some/victimdir directory entries: Repair unsuccessful; offline repair required. (repair.c line 204) Source: https://blogs.oracle.com/linux/post/xfs-online-filesystem-repair It is strange that xfs_scrub doesn't refuse to run, because the kernel is supposed to return EOPNOTSUPP if we actually needed to run a repair, and xfs_io's repair subcommand will perror that. And yet: # xfs_io -x -c 'repair probe' /mnt/test # The first problem is commit dcb660f9222fd9 (4.15) which should have had xchk_probe set the CORRUPT OFLAG so that any of the repair machinery will get called at all. It turns out that some refactoring that happened in the 6.6-6.8 era broke the operation of this corner case. What we *really* want to happen is that all the predicates that would steer xfs_scrub_metadata() towards calling xrep_attempt() should function the same way that they do when repair is compiled in; and then xrep_attempt gets to return the fatal EOPNOTSUPP error code that causes the probe to fail. Instead, commit 8336a64eb75cba (6.6) started the failwhale swimming by hoisting OFLAG checking logic into a helper whose non-repair stub always returns false, causing scrub to return "repair not needed" when in fact the repair is not supported. 
Prior to that commit, the oflag checking that was open-coded in scrub.c worked correctly. Similarly, in commit 4bdfd7d15747b1 (6.8) we hoisted the IFLAG_REPAIR and ALREADY_FIXED logic into a helper whose non-repair stub always returns false, so we never enter the if test body that would have called xrep_attempt, let alone fail to decode the OFLAGs correctly. The final insult (yes, we're doing The Naked Gun now) is commit 48a72f60861f79 (6.8) in which we hoisted the "are we going to try a repair?" predicate into yet another function with a non-repair stub always returns false. Fix xchk_probe to trigger xrep_probe if repair is enabled, or return EOPNOTSUPP directly if it is not. For all the other scrub types, we need to fix the header predicates so that the ->repair functions (which are all xrep_notsupported) get called to return EOPNOTSUPP. Commit 48a72 is tagged here because the scrub code prior to LTS 6.12 are incomplete and not worth patching. Reported-by: David Flynn Cc: # v6.8 Fixes: 8336a64eb75c ("xfs: don't complain about unfixed metadata when repairs were injected") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/common.h | 5 ----- fs/xfs/scrub/repair.h | 11 ++++++++++- fs/xfs/scrub/scrub.c | 12 ++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index bdcd40f0ec74..19877d99f255 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -224,7 +224,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) bool xchk_dir_looks_zapped(struct xfs_inode *dp); bool xchk_pptr_looks_zapped(struct xfs_inode *ip); -#ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. 
*/ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) { @@ -244,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc) return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED); } -#else -# define xchk_needs_repair(sc) (false) -# define xchk_could_repair(sc) (false) -#endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 823c00d1a502..af0a3a9e5ed9 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -191,7 +191,16 @@ int xrep_reset_metafile_resv(struct xfs_scrub *sc); #else #define xrep_ino_dqattach(sc) (0) -#define xrep_will_attempt(sc) (false) + +/* + * When online repair is not built into the kernel, we still want to attempt + * the repair so that the stub xrep_attempt below will return EOPNOTSUPP. + */ +static inline bool xrep_will_attempt(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + xchk_needs_repair(sc->sm); +} static inline int xrep_attempt( diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7567dd5cad14..6fa9e3e5bab7 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -149,6 +149,18 @@ xchk_probe( if (xchk_should_terminate(sc, &error)) return error; + /* + * If the caller is probing to see if repair works but repair isn't + * built into the kernel, return EOPNOTSUPP because that's the signal + * that userspace expects. If online repair is built in, set the + * CORRUPT flag (without any of the usual tracing/logging) to force us + * into xrep_probe. + */ + if (xchk_could_repair(sc)) { + if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)) + return -EOPNOTSUPP; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + } return 0; } From 6e33017c3276e3af7f79f61f3b3648e4a4c03d34 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sun, 2 Feb 2025 16:50:14 -0800 Subject: [PATCH 220/310] xfs: fix data fork format filtering during inode repair Coverity noticed that xrep_dinode_bad_metabt_fork never runs because XFS_DINODE_FMT_META_BTREE is always filtered out in the mode selection switch of xrep_dinode_check_dfork. Metadata btrees are allowed only in the data forks of regular files, so add this case explicitly. I guess this got fubard during a refactoring prior to 6.13 and I didn't notice until now. :/ Coverity-id: 1617714 Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/inode_repair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 2f641b6d663e..13ff1c933cb8 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1055,9 +1055,17 @@ xrep_dinode_check_dfork( return true; break; case S_IFREG: - if (fmt == XFS_DINODE_FMT_LOCAL) + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: return true; - fallthrough; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: + break; + default: + return true; + } + break; case S_IFLNK: case S_IFDIR: switch (fmt) { From 9e00163c31676c6b43d2334fdf5b406232f42dee Mon Sep 17 00:00:00 2001 From: Lukas Herbolt Date: Mon, 3 Feb 2025 09:55:13 +0100 Subject: [PATCH 221/310] xfs: do not check NEEDSREPAIR if ro,norecovery mount. If there is corruption on the filesystem and xfs_repair fails to repair it, the last resort for getting the data is to use a norecovery,ro mount. But if NEEDSREPAIR is set, the filesystem cannot be mounted. The flag must be cleared manually using xfs_db to get access to what is left of the corrupted fs. 
Signed-off-by: Lukas Herbolt Reviewed-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d92d7a07ea89..0055066fb1d9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1661,8 +1661,12 @@ xfs_fs_fill_super( #endif } - /* Filesystem claims it needs repair, so refuse the mount. */ - if (xfs_has_needsrepair(mp)) { + /* + * Filesystem claims it needs repair, so refuse the mount unless + * norecovery is also specified, in which case the filesystem can + * be mounted with no risk of further damage. + */ + if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); error = -EFSCORRUPTED; goto out_free_sb; From 9f0902091c332b2665951cfb970f60ae7cbdc0f3 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 3 Feb 2025 14:04:57 +0100 Subject: [PATCH 222/310] xfs: Do not allow norecovery mount with quotacheck Mounting a filesystem that requires quota state changing will generate a transaction. We already check for a read-only device; we should do that for norecovery too. 
A quotacheck on a norecovery mount, and with the right log size, will cause the mount process to hang on: [<0>] xlog_grant_head_wait+0x5d/0x2a0 [xfs] [<0>] xlog_grant_head_check+0x112/0x180 [xfs] [<0>] xfs_log_reserve+0xe3/0x260 [xfs] [<0>] xfs_trans_reserve+0x179/0x250 [xfs] [<0>] xfs_trans_alloc+0x101/0x260 [xfs] [<0>] xfs_sync_sb+0x3f/0x80 [xfs] [<0>] xfs_qm_mount_quotas+0xe3/0x2f0 [xfs] [<0>] xfs_mountfs+0x7ad/0xc20 [xfs] [<0>] xfs_fs_fill_super+0x762/0xa50 [xfs] [<0>] get_tree_bdev_flags+0x131/0x1d0 [<0>] vfs_get_tree+0x26/0xd0 [<0>] vfs_cmd_create+0x59/0xe0 [<0>] __do_sys_fsconfig+0x4e3/0x6b0 [<0>] do_syscall_64+0x82/0x160 [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e This is caused by a transaction running with bogus initialized head/tail I initially hit this while running generic/050, with random log sizes, but I managed to reproduce it reliably here with the steps below: mkfs.xfs -f -lsize=1025M -f -b size=4096 -m crc=1,reflink=1,rmapbt=1, -i sparse=1 /dev/vdb2 > /dev/null mount -o usrquota,grpquota,prjquota /dev/vdb2 /mnt xfs_io -x -c 'shutdown -f' /mnt umount /mnt mount -o ro,norecovery,usrquota,grpquota,prjquota /dev/vdb2 /mnt Last mount hangs up As we add yet another validation if quota state is changing, this also add a new helper named xfs_qm_validate_state_change(), factoring the quota state changes out of xfs_qm_newmount() to reduce cluttering within it. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_qm_bhv.c | 55 ++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 37f1230e7584..245d754f382a 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -78,6 +78,28 @@ xfs_qm_statvfs( } } +STATIC int +xfs_qm_validate_state_change( + struct xfs_mount *mp, + uint uqd, + uint gqd, + uint pqd) +{ + int state; + + /* Is quota state changing? 
*/ + state = ((uqd && !XFS_IS_UQUOTA_ON(mp)) || + (!uqd && XFS_IS_UQUOTA_ON(mp)) || + (gqd && !XFS_IS_GQUOTA_ON(mp)) || + (!gqd && XFS_IS_GQUOTA_ON(mp)) || + (pqd && !XFS_IS_PQUOTA_ON(mp)) || + (!pqd && XFS_IS_PQUOTA_ON(mp))); + + return state && + (xfs_dev_is_read_only(mp, "changing quota state") || + xfs_has_norecovery(mp)); +} + int xfs_qm_newmount( xfs_mount_t *mp, @@ -97,24 +119,25 @@ xfs_qm_newmount( } /* - * If the device itself is read-only, we can't allow - * the user to change the state of quota on the mount - - * this would generate a transaction on the ro device, - * which would lead to an I/O error and shutdown + * If the device itself is read-only and/or in norecovery + * mode, we can't allow the user to change the state of + * quota on the mount - this would generate a transaction + * on the ro device, which would lead to an I/O error and + * shutdown. */ - if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || - (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || - (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || - (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || - (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || - (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && - xfs_dev_is_read_only(mp, "changing quota state")) { - xfs_warn(mp, "please mount with%s%s%s%s.", - (!quotaondisk ? "out quota" : ""), - (uquotaondisk ? " usrquota" : ""), - (gquotaondisk ? " grpquota" : ""), - (pquotaondisk ? " prjquota" : "")); + if (xfs_qm_validate_state_change(mp, uquotaondisk, + gquotaondisk, pquotaondisk)) { + + if (xfs_has_metadir(mp)) + xfs_warn(mp, + "metadir enabled, please mount without any quota mount options"); + else + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? 
" prjquota" : "")); return -EPERM; } From 3cd6a8056f5a2e794c42fc2114ee2611e358b357 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:15:01 +0100 Subject: [PATCH 223/310] xfs: rename xfs_iomap_swapfile_activate to xfs_vm_swap_activate Match the method name and the naming convention or address_space operations. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_aops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 67877c36ed11..a80608e82c9b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -528,7 +528,7 @@ xfs_vm_readahead( } static int -xfs_iomap_swapfile_activate( +xfs_vm_swap_activate( struct swap_info_struct *sis, struct file *swap_file, sector_t *span) @@ -549,11 +549,11 @@ const struct address_space_operations xfs_address_space_operations = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .dirty_folio = noop_dirty_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; From 2d873efd174bae9005776937d5ac6a96050266db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:15:00 +0100 Subject: [PATCH 224/310] xfs: flush inodegc before swapon Fix the brand new xfstest that tries to swapon on a recently unshared file and use the chance to document the other bit of magic in this function. The big comment is taken from a mailinglist post by Dave Chinner. Fixes: 5e672cd69f0a53 ("xfs: introduce xfs_inodegc_push()") Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_aops.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a80608e82c9b..6d9965b546cb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -19,6 +19,7 @@ #include "xfs_reflink.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_icache.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -533,7 +534,39 @@ xfs_vm_swap_activate( struct file *swap_file, sector_t *span) { - sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev; + struct xfs_inode *ip = XFS_I(file_inode(swap_file)); + + /* + * Swap file activation can race against concurrent shared extent + * removal in files that have been cloned. If this happens, + * iomap_swapfile_iter() can fail because it encountered a shared + * extent even though an operation is in progress to remove those + * shared extents. + * + * This race becomes problematic when we defer extent removal + * operations beyond the end of a syscall (i.e. use async background + * processing algorithms). Users think the extents are no longer + * shared, but iomap_swapfile_iter() still sees them as shared + * because the refcountbt entries for the extents being removed have + * not yet been updated. Hence the swapon call fails unexpectedly. + * + * The race condition is currently most obvious from the unlink() + * operation as extent removal is deferred until after the last + * reference to the inode goes away. We then process the extent + * removal asynchronously, hence triggers the "syscall completed but + * work not done" condition mentioned above. To close this race + * window, we need to flush any pending inodegc operations to ensure + * they have updated the refcountbt records before we try to map the + * swapfile. 
+ */ + xfs_inodegc_flush(ip->i_mount); + + /* + * Direct the swap code to the correct block device when this file + * sits on the RT device. + */ + sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; + return iomap_swapfile_activate(sis, swap_file, span, &xfs_read_iomap_ops); } From 9e512eaaf8f4008c44ede3dfc0fbc9d9c5118583 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Sat, 8 Feb 2025 12:41:44 +0000 Subject: [PATCH 225/310] serial: 8250: Fix fifo underflow on flush When flushing the serial port's buffer, uart_flush_buffer() calls kfifo_reset() but if there is an outstanding DMA transfer then the completion function will consume data from the kfifo via uart_xmit_advance(), underflowing and leading to ongoing DMA as the driver tries to transmit another 2^32 bytes. This is readily reproduced with serial-generic and amidi sending even short messages as closing the device on exit will wait for the fifo to drain and in the underflow case amidi hangs for 30 seconds on exit in tty_wait_until_sent(). A trace of that gives: kworker/1:1-84 [001] 51.769423: bprint: serial8250_tx_dma: tx_size=3 fifo_len=3 amidi-763 [001] 51.769460: bprint: uart_flush_buffer: resetting fifo irq/21-fe530000-76 [000] 51.769474: bprint: __dma_tx_complete: tx_size=3 irq/21-fe530000-76 [000] 51.769479: bprint: serial8250_tx_dma: tx_size=4096 fifo_len=4294967293 irq/21-fe530000-76 [000] 51.781295: bprint: __dma_tx_complete: tx_size=4096 irq/21-fe530000-76 [000] 51.781301: bprint: serial8250_tx_dma: tx_size=4096 fifo_len=4294963197 irq/21-fe530000-76 [000] 51.793131: bprint: __dma_tx_complete: tx_size=4096 irq/21-fe530000-76 [000] 51.793135: bprint: serial8250_tx_dma: tx_size=4096 fifo_len=4294959101 irq/21-fe530000-76 [000] 51.804949: bprint: __dma_tx_complete: tx_size=4096 Since the port lock is held in when the kfifo is reset in uart_flush_buffer() and in __dma_tx_complete(), adding a flush_buffer hook to adjust the outstanding DMA byte count is sufficient to avoid the kfifo underflow. 
Fixes: 9ee4b83e51f74 ("serial: 8250: Add support for dmaengine") Cc: stable Signed-off-by: John Keeping Link: https://lore.kernel.org/r/20250208124148.1189191-1-jkeeping@inmusicbrands.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250.h | 2 ++ drivers/tty/serial/8250/8250_dma.c | 16 ++++++++++++++++ drivers/tty/serial/8250/8250_port.c | 9 +++++++++ 3 files changed, 27 insertions(+) diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h index 11e05aa014e5..b861585ca02a 100644 --- a/drivers/tty/serial/8250/8250.h +++ b/drivers/tty/serial/8250/8250.h @@ -374,6 +374,7 @@ static inline int is_omap1510_8250(struct uart_8250_port *pt) #ifdef CONFIG_SERIAL_8250_DMA extern int serial8250_tx_dma(struct uart_8250_port *); +extern void serial8250_tx_dma_flush(struct uart_8250_port *); extern int serial8250_rx_dma(struct uart_8250_port *); extern void serial8250_rx_dma_flush(struct uart_8250_port *); extern int serial8250_request_dma(struct uart_8250_port *); @@ -406,6 +407,7 @@ static inline int serial8250_tx_dma(struct uart_8250_port *p) { return -1; } +static inline void serial8250_tx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_rx_dma(struct uart_8250_port *p) { return -1; diff --git a/drivers/tty/serial/8250/8250_dma.c b/drivers/tty/serial/8250/8250_dma.c index d215c494ee24..f245a84f4a50 100644 --- a/drivers/tty/serial/8250/8250_dma.c +++ b/drivers/tty/serial/8250/8250_dma.c @@ -149,6 +149,22 @@ int serial8250_tx_dma(struct uart_8250_port *p) return ret; } +void serial8250_tx_dma_flush(struct uart_8250_port *p) +{ + struct uart_8250_dma *dma = p->dma; + + if (!dma->tx_running) + return; + + /* + * kfifo_reset() has been called by the serial core, avoid + * advancing and underflowing in __dma_tx_complete(). 
+ */ + dma->tx_size = 0; + + dmaengine_terminate_async(dma->txchan); +} + int serial8250_rx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index d7976a21cca9..442967a6cd52 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -2555,6 +2555,14 @@ static void serial8250_shutdown(struct uart_port *port) serial8250_do_shutdown(port); } +static void serial8250_flush_buffer(struct uart_port *port) +{ + struct uart_8250_port *up = up_to_u8250p(port); + + if (up->dma) + serial8250_tx_dma_flush(up); +} + static unsigned int serial8250_do_get_divisor(struct uart_port *port, unsigned int baud, unsigned int *frac) @@ -3244,6 +3252,7 @@ static const struct uart_ops serial8250_pops = { .break_ctl = serial8250_break_ctl, .startup = serial8250_startup, .shutdown = serial8250_shutdown, + .flush_buffer = serial8250_flush_buffer, .set_termios = serial8250_set_termios, .set_ldisc = serial8250_set_ldisc, .pm = serial8250_pm, From 362ff1e7c6c20f8d6ebe20682870d471373c608b Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 13 Feb 2025 17:18:25 +0100 Subject: [PATCH 226/310] virtio_snd.h: clarify that `controls` depends on VIRTIO_SND_F_CTLS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As defined in the specification, the `controls` field in the configuration space is only valid/present if VIRTIO_SND_F_CTLS is negotiated. From https://docs.oasis-open.org/virtio/virtio/v1.3/virtio-v1.3.html: 5.14.4 Device Configuration Layout ... controls (driver-read-only) indicates a total number of all available control elements if VIRTIO_SND_F_CTLS has been negotiated. Let's use the same style used in virtio_blk.h to clarify this and to avoid confusion as happened in QEMU (see link).
Link: https://gitlab.com/qemu-project/qemu/-/issues/2805 Signed-off-by: Stefano Garzarella Acked-by: Eugenio Pérez Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250213161825.139952-1-sgarzare@redhat.com --- include/uapi/linux/virtio_snd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/virtio_snd.h b/include/uapi/linux/virtio_snd.h index 5f4100c2cf04..a4cfb9f6561a 100644 --- a/include/uapi/linux/virtio_snd.h +++ b/include/uapi/linux/virtio_snd.h @@ -25,7 +25,7 @@ struct virtio_snd_config { __le32 streams; /* # of available channel maps */ __le32 chmaps; - /* # of available control elements */ + /* # of available control elements (if VIRTIO_SND_F_CTLS) */ __le32 controls; }; From 80e648042e512d5a767da251d44132553fe04ae0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 14 Feb 2025 02:39:50 +0100 Subject: [PATCH 227/310] partitions: mac: fix handling of bogus partition table Fix several issues in partition probing: - The bailout for a bad partoffset must use put_dev_sector(), since the preceding read_part_sector() succeeded. - If the partition table claims a silly sector size like 0xfff bytes (which results in partition table entries straddling sector boundaries), bail out instead of accessing out-of-bounds memory. - We must not assume that the partition table contains proper NUL termination - use strnlen() and strncmp() instead of strlen() and strcmp(). 
Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20250214-partition-mac-v1-1-c1c626dffbd5@google.com Signed-off-by: Jens Axboe --- block/partitions/mac.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/block/partitions/mac.c b/block/partitions/mac.c index c80183156d68..b02530d98629 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); + + /* + * If the "block size" is not a power of 2, things get weird - we might + * end up with a partition straddling a sector boundary, so we wouldn't + * be able to read a partition entry with read_part_sector(). + * Real block sizes are probably (?) powers of two, so just require + * that. + */ + if (!is_power_of_2(secsize)) + return -1; datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, &sect); if (!data) return -1; partoffset = secsize % 512; - if (partoffset + sizeof(*part) > datasize) + if (partoffset + sizeof(*part) > datasize) { + put_dev_sector(sect); return -1; + } part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); @@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state) int i, l; goodness++; - l = strlen(part->name); - if (strcmp(part->name, "/") == 0) + l = strnlen(part->name, sizeof(part->name)); + if (strncmp(part->name, "/", sizeof(part->name)) == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root", From 9ba0e1755a40f9920ad0f4168031291b3eb58d7b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Feb 2025 13:19:57 -0500 Subject: [PATCH 228/310] ring-buffer: Unlock resize on mmap error Memory mapping the tracing ring buffer will disable resizing the buffer.
But if there's an error in the memory mapping like an invalid parameter, the function exits out without re-enabling the resizing of the ring buffer, preventing the ring buffer from being resized after that. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250213131957.530ec3c5@gandalf.local.home Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b8e0ae15ca5b..07b421115692 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7126,6 +7126,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); } unlock: From 60b8f711143de7cd9c0f55be0fe7eb94b19eb5c7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Feb 2025 13:41:32 -0500 Subject: [PATCH 229/310] tracing: Have the error of __tracing_resize_ring_buffer() passed to user Currently if __tracing_resize_ring_buffer() returns an error, the tracing_resize_ringbuffer() returns -ENOMEM. But it may not be a memory issue that caused the function to fail. If the ring buffer is memory mapped, then the resizing of the ring buffer will be disabled. But if the user tries to resize the buffer, it will get an -ENOMEM returned, which is confusing because there is plenty of memory. The actual error returned was -EBUSY, which would make much more sense to the user. 
Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250213134132.7e4505d7@gandalf.local.home Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1496a5ac33ae..25ff37aab00f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5977,8 +5977,6 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { - int ret; - guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { @@ -5987,11 +5985,7 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return -EINVAL; } - ret = __tracing_resize_ring_buffer(tr, size, cpu_id); - if (ret < 0) - ret = -ENOMEM; - - return ret; + return __tracing_resize_ring_buffer(tr, size, cpu_id); } static void update_last_data(struct trace_array *tr) From f5b95f1fa2ef3a03f49eeec658ba97e721412b32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 10:28:20 -0500 Subject: [PATCH 230/310] ring-buffer: Validate the persistent meta data subbuf array The meta data for a mapped ring buffer contains an array of indexes of all the subbuffers. The first entry is the reader page, and the rest of the entries lay out the order of the subbuffers in how the ring buffer link list is to be created. The validator currently makes sure that all the entries are within the range of 0 and nr_subbufs. But it does not check if there are any duplicates. While working on the ring buffer, I corrupted this array, where I added duplicates. The validator did not catch it and created the ring buffer link list on top of it. 
Luckily, the corruption was only that the reader page was also in the writer path and only presented corrupted data but did not crash the kernel. But if there were duplicates in the writer side, then it could corrupt the ring buffer link list and cause a crash. Create a bitmask array with the size of the number of subbuffers. Then clear it. When walking through the subbuf array checking to see if the entries are within the range, test if its bit is already set in the subbuf_mask. If it is, then there is duplicates and fail the validation. If not, set the corresponding bit and continue. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214102820.7509ddea@gandalf.local.home Fixes: c76883f18e59b ("ring-buffer: Add test if range of boot buffer is valid") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 07b421115692..0419d41a2060 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1672,7 +1672,8 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) * must be the same. 
*/ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages) + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1680,6 +1681,9 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, unsigned long buffers_end; int i; + if (!subbuf_mask) + return false; + /* Check the meta magic and meta struct size */ if (meta->magic != RING_BUFFER_META_MAGIC || meta->struct_size != sizeof(*meta)) { @@ -1712,6 +1716,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, subbuf = rb_subbufs_from_meta(meta); + bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); + /* Is the meta buffers and the subbufs themselves have correct data? */ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || @@ -1725,6 +1731,12 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return false; } + if (test_bit(meta->buffers[i], subbuf_mask)) { + pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); + return false; + } + + set_bit(meta->buffers[i], subbuf_mask); subbuf = (void *)subbuf + subbuf_size; } @@ -1889,17 +1901,22 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; + unsigned long *subbuf_mask; unsigned long delta; void *subbuf; int cpu; int i; + /* Create a mask to test the subbuf array */ + subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); + /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { + if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - 
meta->first_buffer; @@ -1943,6 +1960,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf += meta->subbuf_size; } } + bitmap_free(subbuf_mask); } static void *rbm_start(struct seq_file *m, loff_t *pos) From 77b823fa619f97d16409ca37ad4f7936e28c5f83 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 4 Feb 2025 23:35:22 +0100 Subject: [PATCH 231/310] alpha: replace hardcoded stack offsets with autogenerated ones This allows the assembly in entry.S to automatically keep in sync with changes in the stack layout (struct pt_regs and struct switch_stack). Cc: stable@vger.kernel.org Tested-by: Maciej W. Rozycki Tested-by: Matt Turner Reviewed-by: Maciej W. Rozycki Signed-off-by: Ivan Kokshaysky Signed-off-by: Matt Turner --- arch/alpha/kernel/asm-offsets.c | 4 ++++ arch/alpha/kernel/entry.S | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index 4cfeae42c79a..e9dad60b147f 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -19,9 +19,13 @@ static void __used foo(void) DEFINE(TI_STATUS, offsetof(struct thread_info, status)); BLANK(); + DEFINE(SP_OFF, offsetof(struct pt_regs, ps)); DEFINE(SIZEOF_PT_REGS, sizeof(struct pt_regs)); BLANK(); + DEFINE(SWITCH_STACK_SIZE, sizeof(struct switch_stack)); + BLANK(); + DEFINE(HAE_CACHE, offsetof(struct alpha_machine_vector, hae_cache)); DEFINE(HAE_REG, offsetof(struct alpha_machine_vector, hae_register)); } diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index dd26062d75b3..6fb38365539d 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -15,10 +15,6 @@ .set noat .cfi_sections .debug_frame -/* Stack offsets. 
*/ -#define SP_OFF 184 -#define SWITCH_STACK_SIZE 64 - .macro CFI_START_OSF_FRAME func .align 4 .globl \func From 0a0f7362b0367634a2d5cb7c96226afc116f19c9 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 4 Feb 2025 23:35:23 +0100 Subject: [PATCH 232/310] alpha: make stack 16-byte aligned (most cases) The problem is that GCC expects 16-byte alignment of the incoming stack since early 2004, as Maciej found out [1]: Having actually dug speculatively I can see that the psABI was changed in GCC 3.5 with commit e5e10fb4a350 ("re PR target/14539 (128-bit long double improperly aligned)") back in Mar 2004, when the stack pointer alignment was increased from 8 bytes to 16 bytes, and arch/alpha/kernel/entry.S has various suspicious stack pointer adjustments, starting with SP_OFF which is not a whole multiple of 16. Also, as Magnus noted, "ALPHA Calling Standard" [2] required the same: D.3.1 Stack Alignment This standard requires that stacks be octaword aligned at the time a new procedure is invoked. However: - the "normal" kernel stack is always misaligned by 8 bytes, thanks to the odd number of 64-bit words in 'struct pt_regs', which is the very first thing pushed onto the kernel thread stack; - syscall, fault, interrupt etc. handlers may, or may not, receive aligned stack depending on numerous factors. Somehow we got away with it until recently, when we ended up with a stack corruption in kernel/smp.c:smp_call_function_single() due to its use of 32-byte aligned local data and the compiler doing clever things allocating it on the stack. This adds padding between the PAL-saved and kernel-saved registers so that 'struct pt_regs' has an even number of 64-bit words. This makes the stack properly aligned for most of the kernel code, except two handlers which need special treatment. Note: struct pt_regs doesn't belong in uapi/asm; this should be fixed, but let's put this off until later.
Link: https://lore.kernel.org/rcu/alpine.DEB.2.21.2501130248010.18889@angie.orcam.me.uk/ [1] Link: https://bitsavers.org/pdf/dec/alpha/Alpha_Calling_Standard_Rev_2.0_19900427.pdf [2] Cc: stable@vger.kernel.org Tested-by: Maciej W. Rozycki Tested-by: Magnus Lindholm Tested-by: Matt Turner Reviewed-by: Maciej W. Rozycki Signed-off-by: Ivan Kokshaysky Signed-off-by: Matt Turner --- arch/alpha/include/uapi/asm/ptrace.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/alpha/include/uapi/asm/ptrace.h b/arch/alpha/include/uapi/asm/ptrace.h index 5ca45934fcbb..72ed913a910f 100644 --- a/arch/alpha/include/uapi/asm/ptrace.h +++ b/arch/alpha/include/uapi/asm/ptrace.h @@ -42,6 +42,8 @@ struct pt_regs { unsigned long trap_a0; unsigned long trap_a1; unsigned long trap_a2; +/* This makes the stack 16-byte aligned as GCC expects */ + unsigned long __pad0; /* These are saved by PAL-code: */ unsigned long ps; unsigned long pc; From 3b35a171060f846b08b48646b38c30b5d57d17ff Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 4 Feb 2025 23:35:24 +0100 Subject: [PATCH 233/310] alpha: align stack for page fault and user unaligned trap handlers do_page_fault() and do_entUna() are special because they use non-standard stack frame layout. Fix them manually. Cc: stable@vger.kernel.org Tested-by: Maciej W. Rozycki Tested-by: Magnus Lindholm Tested-by: Matt Turner Reviewed-by: Maciej W. Rozycki Suggested-by: Maciej W. Rozycki Signed-off-by: Ivan Kokshaysky Signed-off-by: Matt Turner --- arch/alpha/kernel/entry.S | 20 ++++++++++---------- arch/alpha/kernel/traps.c | 2 +- arch/alpha/mm/fault.c | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index 6fb38365539d..f4d41b4538c2 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -194,8 +194,8 @@ CFI_END_OSF_FRAME entArith CFI_START_OSF_FRAME entMM SAVE_ALL /* save $9 - $15 so the inline exception code can manipulate them. 
*/ - subq $sp, 56, $sp - .cfi_adjust_cfa_offset 56 + subq $sp, 64, $sp + .cfi_adjust_cfa_offset 64 stq $9, 0($sp) stq $10, 8($sp) stq $11, 16($sp) @@ -210,7 +210,7 @@ CFI_START_OSF_FRAME entMM .cfi_rel_offset $13, 32 .cfi_rel_offset $14, 40 .cfi_rel_offset $15, 48 - addq $sp, 56, $19 + addq $sp, 64, $19 /* handle the fault */ lda $8, 0x3fff bic $sp, $8, $8 @@ -223,7 +223,7 @@ CFI_START_OSF_FRAME entMM ldq $13, 32($sp) ldq $14, 40($sp) ldq $15, 48($sp) - addq $sp, 56, $sp + addq $sp, 64, $sp .cfi_restore $9 .cfi_restore $10 .cfi_restore $11 @@ -231,7 +231,7 @@ CFI_START_OSF_FRAME entMM .cfi_restore $13 .cfi_restore $14 .cfi_restore $15 - .cfi_adjust_cfa_offset -56 + .cfi_adjust_cfa_offset -64 /* finish up the syscall as normal. */ br ret_from_sys_call CFI_END_OSF_FRAME entMM @@ -378,8 +378,8 @@ entUnaUser: .cfi_restore $0 .cfi_adjust_cfa_offset -256 SAVE_ALL /* setup normal kernel stack */ - lda $sp, -56($sp) - .cfi_adjust_cfa_offset 56 + lda $sp, -64($sp) + .cfi_adjust_cfa_offset 64 stq $9, 0($sp) stq $10, 8($sp) stq $11, 16($sp) @@ -395,7 +395,7 @@ entUnaUser: .cfi_rel_offset $14, 40 .cfi_rel_offset $15, 48 lda $8, 0x3fff - addq $sp, 56, $19 + addq $sp, 64, $19 bic $sp, $8, $8 jsr $26, do_entUnaUser ldq $9, 0($sp) @@ -405,7 +405,7 @@ entUnaUser: ldq $13, 32($sp) ldq $14, 40($sp) ldq $15, 48($sp) - lda $sp, 56($sp) + lda $sp, 64($sp) .cfi_restore $9 .cfi_restore $10 .cfi_restore $11 @@ -413,7 +413,7 @@ entUnaUser: .cfi_restore $13 .cfi_restore $14 .cfi_restore $15 - .cfi_adjust_cfa_offset -56 + .cfi_adjust_cfa_offset -64 br ret_from_sys_call CFI_END_OSF_FRAME entUna diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index a9a38c80c4a7..7004397937cf 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -649,7 +649,7 @@ s_reg_to_mem (unsigned long s_reg) static int unauser_reg_offsets[32] = { R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8), /* r9 ... r15 are stored in front of regs. 
*/ - -56, -48, -40, -32, -24, -16, -8, + -64, -56, -48, -40, -32, -24, -16, /* padding at -8 */ R(r16), R(r17), R(r18), R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26), R(r27), R(r28), R(gp), diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 8c9850437e67..a9816bbc9f34 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -78,8 +78,8 @@ __load_new_mm_context(struct mm_struct *next_mm) /* Macro for exception fixup code to access integer registers. */ #define dpf_reg(r) \ - (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \ - (r) <= 18 ? (r)+10 : (r)-10]) + (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-17 : \ + (r) <= 18 ? (r)+11 : (r)-10]) asmlinkage void do_page_fault(unsigned long address, unsigned long mmcsr, From 757f051a506198186d796dff4ba696adb7bda54c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 7 Jan 2025 11:43:42 +0100 Subject: [PATCH 234/310] alpha: Replace one-element array with flexible array member Replace the deprecated one-element array with a modern flexible array member in the struct crb_struct. Reviewed-by: Kees Cook Signed-off-by: Thorsten Blum Signed-off-by: Matt Turner --- arch/alpha/include/asm/hwrpb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/include/asm/hwrpb.h b/arch/alpha/include/asm/hwrpb.h index fc76f36265ad..db831cf8de10 100644 --- a/arch/alpha/include/asm/hwrpb.h +++ b/arch/alpha/include/asm/hwrpb.h @@ -135,7 +135,7 @@ struct crb_struct { /* virtual->physical map */ unsigned long map_entries; unsigned long map_pages; - struct vf_map_struct map[1]; + struct vf_map_struct map[]; }; struct memclust_struct { From 1523226edda566057bdd3264ceb56631ddf5f6f7 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 12 Feb 2025 12:14:47 +0100 Subject: [PATCH 235/310] alpha: Use str_yes_no() helper in pci_dac_dma_supported() Remove hard-coded strings by using the str_yes_no() helper function. 
Reviewed-by: Geert Uytterhoeven Signed-off-by: Thorsten Blum Signed-off-by: Matt Turner --- arch/alpha/kernel/pci_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 681f56089d9c..dc91de50f906 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -212,7 +213,7 @@ static int pci_dac_dma_supported(struct pci_dev *dev, u64 mask) /* If both conditions above are met, we are fine. */ DBGA("pci_dac_dma_supported %s from %ps\n", - ok ? "yes" : "no", __builtin_return_address(0)); + str_yes_no(ok), __builtin_return_address(0)); return ok; } From 054e61bb1de4fa02d148344152007facbcb28583 Mon Sep 17 00:00:00 2001 From: Jeroen de Borst Date: Thu, 13 Feb 2025 10:45:23 -0800 Subject: [PATCH 236/310] gve: Update MAINTAINERS Updating MAINTAINERS to include active contributors. Signed-off-by: Jeroen de Borst Link: https://patch.msgid.link/20250213184523.2002582-1-jeroendb@google.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 10893c91b1c1..988b0ff94fda 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9834,8 +9834,8 @@ F: drivers/input/touchscreen/goodix* GOOGLE ETHERNET DRIVERS M: Jeroen de Borst -M: Praveen Kaligineedi -R: Shailend Chand +M: Joshua Washington +M: Harshitha Ramamurthy L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/device_drivers/ethernet/google/gve.rst From 0d1fac6d26aff5df21bb4ec980d9b7a11c410b96 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 12 Feb 2025 12:15:35 +0100 Subject: [PATCH 237/310] net: wwan: mhi_wwan_mbim: Silence sequence number glitch errors When using the Qualcomm X55 modem on the ThinkPad X13s, the kernel log is constantly being filled with errors related to a "sequence number glitch", e.g.: [ 1903.284538] sequence number glitch
prev=16 curr=0 [ 1913.812205] sequence number glitch prev=50 curr=0 [ 1923.698219] sequence number glitch prev=142 curr=0 [ 2029.248276] sequence number glitch prev=1555 curr=0 [ 2046.333059] sequence number glitch prev=70 curr=0 [ 2076.520067] sequence number glitch prev=272 curr=0 [ 2158.704202] sequence number glitch prev=2655 curr=0 [ 2218.530776] sequence number glitch prev=2349 curr=0 [ 2225.579092] sequence number glitch prev=6 curr=0 Internet connectivity is working fine, so this error seems harmless. It looks like modem does not preserve the sequence number when entering low power state; the amount of errors depends on how actively the modem is being used. A similar issue has also been seen on USB-based MBIM modems [1]. However, in cdc_ncm.c the "sequence number glitch" message is a debug message instead of an error. Apply the same to the mhi_wwan_mbim.c driver to silence these errors when using the modem. [1]: https://lists.freedesktop.org/archives/libmbim-devel/2016-November/000781.html Signed-off-by: Stephan Gerhold Reviewed-by: Loic Poulain Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250212-mhi-wwan-mbim-sequence-glitch-v1-1-503735977cbd@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/wwan/mhi_wwan_mbim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c index d5a9360323d2..8755c5e6a65b 100644 --- a/drivers/net/wwan/mhi_wwan_mbim.c +++ b/drivers/net/wwan/mhi_wwan_mbim.c @@ -220,7 +220,7 @@ static int mbim_rx_verify_nth16(struct mhi_mbim_context *mbim, struct sk_buff *s if (mbim->rx_seq + 1 != le16_to_cpu(nth16->wSequence) && (mbim->rx_seq || le16_to_cpu(nth16->wSequence)) && !(mbim->rx_seq == 0xffff && !le16_to_cpu(nth16->wSequence))) { - net_err_ratelimited("sequence number glitch prev=%d curr=%d\n", + net_dbg_ratelimited("sequence number glitch prev=%d curr=%d\n", mbim->rx_seq, le16_to_cpu(nth16->wSequence)); } mbim->rx_seq = 
le16_to_cpu(nth16->wSequence); From 435b344a7042e91fb4719d589f18310e8919e39f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Feb 2025 22:53:47 +0000 Subject: [PATCH 238/310] crypto: ccp: Add external API interface for PSP module initialization KVM is dependent on the PSP SEV driver and PSP SEV driver needs to be loaded before KVM module. In case of module loading any dependent modules are automatically loaded but in case of built-in modules there is no inherent mechanism available to specify dependencies between modules and ensure that any dependent modules are loaded implicitly. Add a new external API interface for PSP module initialization which allows PSP SEV driver to be loaded explicitly if KVM is built-in. Signed-off-by: Sean Christopherson Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Reviewed-by: Tom Lendacky Message-ID: <15279ca0cad56a07cf12834ec544310f85ff5edc.1739226950.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- drivers/crypto/ccp/sp-dev.c | 14 ++++++++++++++ include/linux/psp-sev.h | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/crypto/ccp/sp-dev.c b/drivers/crypto/ccp/sp-dev.c index 7eb3e4668286..3467f6db4f50 100644 --- a/drivers/crypto/ccp/sp-dev.c +++ b/drivers/crypto/ccp/sp-dev.c @@ -19,6 +19,7 @@ #include #include +#include "sev-dev.h" #include "ccp-dev.h" #include "sp-dev.h" @@ -253,8 +254,12 @@ struct sp_device *sp_get_psp_master_device(void) static int __init sp_mod_init(void) { #ifdef CONFIG_X86 + static bool initialized; int ret; + if (initialized) + return 0; + ret = sp_pci_init(); if (ret) return ret; @@ -263,6 +268,8 @@ static int __init sp_mod_init(void) psp_pci_init(); #endif + initialized = true; + return 0; #endif @@ -279,6 +286,13 @@ static int __init sp_mod_init(void) return -ENODEV; } +#if IS_BUILTIN(CONFIG_KVM_AMD) && IS_ENABLED(CONFIG_KVM_AMD_SEV) +int __init sev_module_init(void) +{ + return sp_mod_init(); +} +#endif + static void __exit sp_mod_exit(void) { #ifdef 
CONFIG_X86 diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 903ddfea8585..f3cad182d4ef 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -814,6 +814,15 @@ struct sev_data_snp_commit { #ifdef CONFIG_CRYPTO_DEV_SP_PSP +/** + * sev_module_init - perform PSP SEV module initialization + * + * Returns: + * 0 if the PSP module is successfully initialized + * negative value if the PSP module initialization fails + */ +int sev_module_init(void); + /** * sev_platform_init - perform SEV INIT command * From 44e70718df4fc2fadf1665eb9374df71aeda1f03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Feb 2025 22:54:02 +0000 Subject: [PATCH 239/310] KVM: SVM: Ensure PSP module is initialized if KVM module is built-in The kernel's initcall infrastructure lacks the ability to express dependencies between initcalls, whereas the modules infrastructure automatically handles dependencies via symbol loading. Ensure the PSP SEV driver is initialized before proceeding in sev_hardware_setup() if KVM is built-in as the dependency isn't handled by the initcall infrastructure. Signed-off-by: Sean Christopherson Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Message-ID: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a2a794c32050..0dbb25442ec1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2972,6 +2972,16 @@ void __init sev_hardware_setup(void) WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. 
+ */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); From 409f45387c937145adeeeebc6d6032c2ec232b35 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 10 Feb 2025 22:54:18 +0000 Subject: [PATCH 240/310] x86/sev: Fix broken SNP support with KVM module built-in Fix issues with enabling SNP host support and effectively SNP support which is broken with respect to the KVM module being built-in. SNP host support is enabled in snp_rmptable_init() which is invoked as device_initcall(). SNP check on IOMMU is done during IOMMU PCI init (IOMMU_PCI_INIT stage). And for that reason snp_rmptable_init() is currently invoked via device_initcall() and cannot be invoked via subsys_initcall() as core IOMMU subsystem gets initialized via subsys_initcall(). Now, if kvm_amd module is built-in, it gets initialized before SNP host support is enabled in snp_rmptable_init() : [ 10.131811] kvm_amd: TSC scaling supported [ 10.136384] kvm_amd: Nested Virtualization enabled [ 10.141734] kvm_amd: Nested Paging enabled [ 10.146304] kvm_amd: LBR virtualization supported [ 10.151557] kvm_amd: SEV enabled (ASIDs 100 - 509) [ 10.156905] kvm_amd: SEV-ES enabled (ASIDs 1 - 99) [ 10.162256] kvm_amd: SEV-SNP enabled (ASIDs 1 - 99) [ 10.171508] kvm_amd: Virtual VMLOAD VMSAVE supported [ 10.177052] kvm_amd: Virtual GIF supported ... ... [ 10.201648] kvm_amd: in svm_enable_virtualization_cpu And then svm_x86_ops->enable_virtualization_cpu() (svm_enable_virtualization_cpu) programs MSR_VM_HSAVE_PA as following: wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); So VM_HSAVE_PA is non-zero before SNP support is enabled on all CPUs. snp_rmptable_init() gets invoked after svm_enable_virtualization_cpu() as following : ... [ 11.256138] kvm_amd: in svm_enable_virtualization_cpu ... 
[ 11.264918] SEV-SNP: in snp_rmptable_init This triggers a #GP exception in snp_rmptable_init() when snp_enable() is invoked to set SNP_EN in SYSCFG MSR: [ 11.294289] unchecked MSR access error: WRMSR to 0xc0010010 (tried to write 0x0000000003fc0000) at rIP: 0xffffffffaf5d5c28 (native_write_msr+0x8/0x30) ... [ 11.294404] Call Trace: [ 11.294482] [ 11.294513] ? show_stack_regs+0x26/0x30 [ 11.294522] ? ex_handler_msr+0x10f/0x180 [ 11.294529] ? search_extable+0x2b/0x40 [ 11.294538] ? fixup_exception+0x2dd/0x340 [ 11.294542] ? exc_general_protection+0x14f/0x440 [ 11.294550] ? asm_exc_general_protection+0x2b/0x30 [ 11.294557] ? __pfx_snp_enable+0x10/0x10 [ 11.294567] ? native_write_msr+0x8/0x30 [ 11.294570] ? __snp_enable+0x5d/0x70 [ 11.294575] snp_enable+0x19/0x20 [ 11.294578] __flush_smp_call_function_queue+0x9c/0x3a0 [ 11.294586] generic_smp_call_function_single_interrupt+0x17/0x20 [ 11.294589] __sysvec_call_function+0x20/0x90 [ 11.294596] sysvec_call_function+0x80/0xb0 [ 11.294601] [ 11.294603] [ 11.294605] asm_sysvec_call_function+0x1f/0x30 ... [ 11.294631] arch_cpu_idle+0xd/0x20 [ 11.294633] default_idle_call+0x34/0xd0 [ 11.294636] do_idle+0x1f1/0x230 [ 11.294643] ? complete+0x71/0x80 [ 11.294649] cpu_startup_entry+0x30/0x40 [ 11.294652] start_secondary+0x12d/0x160 [ 11.294655] common_startup_64+0x13e/0x141 [ 11.294662] This #GP exception is getting triggered due to the following errata for AMD family 19h Models 10h-1Fh Processors: Processor may generate spurious #GP(0) Exception on WRMSR instruction: Description: The Processor will generate a spurious #GP(0) Exception on a WRMSR instruction if the following conditions are all met: - the target of the WRMSR is a SYSCFG register. - the write changes the value of SYSCFG.SNPEn from 0 to 1. - One of the threads that share the physical core has a non-zero value in the VM_HSAVE_PA MSR. 
The document being referred to above: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/revision-guides/57095-PUB_1_01.pdf To summarize, with kvm_amd module being built-in, KVM/SVM initialization happens before host SNP is enabled and this SVM initialization sets VM_HSAVE_PA to non-zero, which then triggers a #GP when SYSCFG.SNPEn is being set and this will subsequently cause SNP_INIT(_EX) to fail with INVALID_CONFIG error as SYSCFG[SnpEn] is not set on all CPUs. Essentially SNP host enabling code should be invoked before KVM initialization, which is currently not the case when KVM is built-in. Add fix to call snp_rmptable_init() early from iommu_snp_enable() directly and not invoked via device_initcall() which enables SNP host support before KVM initialization with kvm_amd module built-in. Add additional handling for `iommu=off` or `amd_iommu=off` options. Note that IOMMUs need to be enabled for SNP initialization, therefore, if host SNP support is enabled but late IOMMU initialization fails then that will cause PSP driver's SNP_INIT to fail as IOMMU SNP sanity checks in SNP firmware will fail with invalid configuration error as below: [ 9.723114] ccp 0000:23:00.1: sev enabled [ 9.727602] ccp 0000:23:00.1: psp enabled [ 9.732527] ccp 0000:a2:00.1: enabling device (0000 -> 0002) [ 9.739098] ccp 0000:a2:00.1: no command queues available [ 9.745167] ccp 0000:a2:00.1: psp enabled [ 9.805337] ccp 0000:23:00.1: SEV-SNP: failed to INIT rc -5, error 0x3 [ 9.866426] ccp 0000:23:00.1: SEV API:1.53 build:5 Fixes: c3b86e61b756 ("x86/cpufeatures: Enable/unmask SEV-SNP CPU feature") Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Co-developed-by: Vasant Hegde Signed-off-by: Vasant Hegde Cc: Signed-off-by: Ashish Kalra Acked-by: Joerg Roedel Message-ID: <138b520fb83964782303b43ade4369cd181fdd9c.1739226950.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/sev.h | 2 ++ arch/x86/virt/svm/sev.c | 23 
+++++++---------------- drivers/iommu/amd/init.c | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 5d9685f92e5c..1581246491b5 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -531,6 +531,7 @@ static inline void __init snp_secure_tsc_init(void) { } #ifdef CONFIG_KVM_AMD_SEV bool snp_probe_rmptable_info(void); +int snp_rmptable_init(void); int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level); void snp_dump_hva_rmpentry(unsigned long address); int psmash(u64 pfn); @@ -541,6 +542,7 @@ void kdump_sev_callback(void); void snp_fixup_e820_tables(void); #else static inline bool snp_probe_rmptable_info(void) { return false; } +static inline int snp_rmptable_init(void) { return -ENOSYS; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } static inline void snp_dump_hva_rmpentry(unsigned long address) {} static inline int psmash(u64 pfn) { return -ENODEV; } diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 1dcc027ec77e..42e74a5a7d78 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -505,19 +505,19 @@ static bool __init setup_rmptable(void) * described in the SNP_INIT_EX firmware command description in the SNP * firmware ABI spec. 
*/ -static int __init snp_rmptable_init(void) +int __init snp_rmptable_init(void) { unsigned int i; u64 val; - if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) - return 0; + if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP))) + return -ENOSYS; - if (!amd_iommu_snp_en) - goto nosnp; + if (WARN_ON_ONCE(!amd_iommu_snp_en)) + return -ENOSYS; if (!setup_rmptable()) - goto nosnp; + return -ENOSYS; /* * Check if SEV-SNP is already enabled, this can happen in case of @@ -530,7 +530,7 @@ static int __init snp_rmptable_init(void) /* Zero out the RMP bookkeeping area */ if (!clear_rmptable_bookkeeping()) { free_rmp_segment_table(); - goto nosnp; + return -ENOSYS; } /* Zero out the RMP entries */ @@ -562,17 +562,8 @@ static int __init snp_rmptable_init(void) crash_kexec_post_notifiers = true; return 0; - -nosnp: - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return -ENOSYS; } -/* - * This must be called after the IOMMU has been initialized. - */ -device_initcall(snp_rmptable_init); - static void set_rmp_segment_info(unsigned int segment_shift) { rmp_segment_shift = segment_shift; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c5cd92edada0..2fecfed75e54 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3194,7 +3194,7 @@ static bool __init detect_ivrs(void) return true; } -static void iommu_snp_enable(void) +static __init void iommu_snp_enable(void) { #ifdef CONFIG_KVM_AMD_SEV if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) @@ -3219,6 +3219,14 @@ static void iommu_snp_enable(void) goto disable_snp; } + /* + * Enable host SNP support once SNP support is checked on IOMMU. + */ + if (snp_rmptable_init()) { + pr_warn("SNP: RMP initialization failed, SNP cannot be supported.\n"); + goto disable_snp; + } + pr_info("IOMMU SNP support enabled.\n"); return; @@ -3318,6 +3326,19 @@ static int __init iommu_go_to_state(enum iommu_init_state state) ret = state_next(); } + /* + * SNP platform initilazation requires IOMMUs to be fully configured. 
+ * If the SNP support on IOMMUs has NOT been checked, simply mark SNP + * as unsupported. If the SNP support on IOMMUs has been checked and + * host SNP support enabled but RMP enforcement has not been enabled + * in IOMMUs, then the system is in a half-baked state, but can limp + * along as all memory should be Hypervisor-Owned in the RMP. WARN, + * but leave SNP as "supported" to avoid confusing the kernel. + */ + if (ret && cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !WARN_ON_ONCE(amd_iommu_snp_en)) + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + return ret; } @@ -3426,18 +3447,23 @@ void __init amd_iommu_detect(void) int ret; if (no_iommu || (iommu_detected && !gart_iommu_aperture)) - return; + goto disable_snp; if (!amd_iommu_sme_check()) - return; + goto disable_snp; ret = iommu_go_to_state(IOMMU_IVRS_DETECTED); if (ret) - return; + goto disable_snp; amd_iommu_detected = true; iommu_detected = 1; x86_init.iommu.iommu_init = amd_iommu_init; + return; + +disable_snp: + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); } /**************************************************************************** From 55eff109e76a14e5ed10c8c3c3978d20a35e2a4d Mon Sep 17 00:00:00 2001 From: Junnan Wu Date: Fri, 14 Feb 2025 09:22:00 +0800 Subject: [PATCH 241/310] vsock/virtio: fix variables initialization during resuming When executing suspend to ram twice in a row, the `rx_buf_nr` and `rx_buf_max_nr` increase to three times vq->num_free. Then after virtqueue_get_buf and `rx_buf_nr` decreased in function virtio_transport_rx_work, the condition to fill rx buffer (rx_buf_nr < rx_buf_max_nr / 2) will never be met. It is because that `rx_buf_nr` and `rx_buf_max_nr` are initialized only in virtio_vsock_probe(), but they should be reset whenever virtqueues are recreated, like after a suspend/resume. 
Move the `rx_buf_nr` and `rx_buf_max_nr` initialization in virtio_vsock_vqs_init(), so we are sure that they are properly initialized, every time we initialize the virtqueues, either when we load the driver or after a suspend/resume. To prevent erroneous atomic load operations on the `queued_replies` in the virtio_transport_send_pkt_work() function which may disrupt the scheduling of vsock->rx_work when transmitting reply-required socket packets, this atomic variable must undergo synchronized initialization alongside the preceding two variables after a suspend/resume. Fixes: bd50c5dc182b ("vsock/virtio: add support for device suspend/resume") Link: https://lore.kernel.org/virtualization/20250207052033.2222629-1-junnan01.wu@samsung.com/ Co-developed-by: Ying Gao Signed-off-by: Ying Gao Signed-off-by: Junnan Wu Reviewed-by: Luigi Leonardi Acked-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20250214012200.1883896-1-junnan01.wu@samsung.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index b58c3818f284..f0e48e6911fc 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -670,6 +670,13 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) }; int ret; + mutex_lock(&vsock->rx_lock); + vsock->rx_buf_nr = 0; + vsock->rx_buf_max_nr = 0; + mutex_unlock(&vsock->rx_lock); + + atomic_set(&vsock->queued_replies, 0); + ret = virtio_find_vqs(vdev, VSOCK_VQ_MAX, vsock->vqs, vqs_info, NULL); if (ret < 0) return ret; @@ -779,9 +786,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->vdev = vdev; - vsock->rx_buf_nr = 0; - vsock->rx_buf_max_nr = 0; - atomic_set(&vsock->queued_replies, 0); mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); From 9593172d93b9f91c362baec4643003dc29802929 Mon Sep 17 00:00:00 2001 From: 
Kuniyuki Iwashima Date: Thu, 13 Feb 2025 13:33:54 +0900 Subject: [PATCH 242/310] geneve: Fix use-after-free in geneve_find_dev(). syzkaller reported a use-after-free in geneve_find_dev() [0] without repro. geneve_configure() links struct geneve_dev.next to net_generic(net, geneve_net_id)->geneve_list. The net here could differ from dev_net(dev) if IFLA_NET_NS_PID, IFLA_NET_NS_FD, or IFLA_TARGET_NETNSID is set. When dev_net(dev) is dismantled, geneve_exit_batch_rtnl() finally calls unregister_netdevice_queue() for each dev in the netns, and later the dev is freed. However, its geneve_dev.next is still linked to the backend UDP socket netns. Then, use-after-free will occur when another geneve dev is created in the netns. Let's call geneve_dellink() instead in geneve_destroy_tunnels(). [0]: BUG: KASAN: slab-use-after-free in geneve_find_dev drivers/net/geneve.c:1295 [inline] BUG: KASAN: slab-use-after-free in geneve_configure+0x234/0x858 drivers/net/geneve.c:1343 Read of size 2 at addr ffff000054d6ee24 by task syz.1.4029/13441 CPU: 1 UID: 0 PID: 13441 Comm: syz.1.4029 Not tainted 6.13.0-g0ad9617c78ac #24 dc35ca22c79fb82e8e7bc5c9c9adafea898b1e3d Hardware name: linux,dummy-virt (DT) Call trace: show_stack+0x38/0x50 arch/arm64/kernel/stacktrace.c:466 (C) __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xbc/0x108 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x16c/0x6f0 mm/kasan/report.c:489 kasan_report+0xc0/0x120 mm/kasan/report.c:602 __asan_report_load2_noabort+0x20/0x30 mm/kasan/report_generic.c:379 geneve_find_dev drivers/net/geneve.c:1295 [inline] geneve_configure+0x234/0x858 drivers/net/geneve.c:1343 geneve_newlink+0xb8/0x128 drivers/net/geneve.c:1634 rtnl_newlink_create+0x23c/0x868 net/core/rtnetlink.c:3795 __rtnl_newlink net/core/rtnetlink.c:3906 [inline] rtnl_newlink+0x1054/0x1630 net/core/rtnetlink.c:4021 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6911 netlink_rcv_skb+0x1dc/0x398 
net/netlink/af_netlink.c:2543 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6938 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:713 [inline] __sock_sendmsg net/socket.c:728 [inline] ____sys_sendmsg+0x410/0x6f8 net/socket.c:2568 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2622 __sys_sendmsg net/socket.c:2654 [inline] __do_sys_sendmsg net/socket.c:2659 [inline] __se_sys_sendmsg net/socket.c:2657 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2657 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 Allocated by task 13247: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_alloc_info+0x44/0x58 mm/kasan/generic.c:568 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x84/0xa0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4298 [inline] __kmalloc_node_noprof+0x2a0/0x560 mm/slub.c:4304 __kvmalloc_node_noprof+0x9c/0x230 mm/util.c:645 alloc_netdev_mqs+0xb8/0x11a0 net/core/dev.c:11470 rtnl_create_link+0x2b8/0xb50 net/core/rtnetlink.c:3604 rtnl_newlink_create+0x19c/0x868 net/core/rtnetlink.c:3780 __rtnl_newlink net/core/rtnetlink.c:3906 [inline] rtnl_newlink+0x1054/0x1630 net/core/rtnetlink.c:4021 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6911 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2543 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6938 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] 
netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:713 [inline] __sock_sendmsg net/socket.c:728 [inline] ____sys_sendmsg+0x410/0x6f8 net/socket.c:2568 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2622 __sys_sendmsg net/socket.c:2654 [inline] __do_sys_sendmsg net/socket.c:2659 [inline] __se_sys_sendmsg net/socket.c:2657 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2657 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 Freed by task 45: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_free_info+0x58/0x70 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x48/0x68 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2353 [inline] slab_free mm/slub.c:4613 [inline] kfree+0x140/0x420 mm/slub.c:4761 kvfree+0x4c/0x68 mm/util.c:688 netdev_release+0x94/0xc8 net/core/net-sysfs.c:2065 device_release+0x98/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x2b0/0x438 lib/kobject.c:737 netdev_run_todo+0xe5c/0xfc8 net/core/dev.c:11185 rtnl_unlock+0x20/0x38 net/core/rtnetlink.c:151 cleanup_net+0x4fc/0x8c0 net/core/net_namespace.c:648 process_one_work+0x700/0x1398 kernel/workqueue.c:3236 process_scheduled_works kernel/workqueue.c:3317 [inline] worker_thread+0x8c4/0xe10 kernel/workqueue.c:3398 kthread+0x4bc/0x608 kernel/kthread.c:464 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:862 The buggy 
address belongs to the object at ffff000054d6e000 which belongs to the cache kmalloc-cg-4k of size 4096 The buggy address is located 3620 bytes inside of freed 4096-byte region [ffff000054d6e000, ffff000054d6f000) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x94d68 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff000016276181 flags: 0x3fffe0000000040(head|node=0|zone=0|lastcpupid=0x1ffff) page_type: f5(slab) raw: 03fffe0000000040 ffff0000c000f500 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000040004 00000001f5000000 ffff000016276181 head: 03fffe0000000040 ffff0000c000f500 dead000000000122 0000000000000000 head: 0000000000000000 0000000000040004 00000001f5000000 ffff000016276181 head: 03fffe0000000003 fffffdffc1535a01 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff000054d6ed00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff000054d6ed80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff000054d6ee00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff000054d6ee80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff000054d6ef00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 2d07dc79fe04 ("geneve: add initial netdev driver for GENEVE tunnels") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250213043354.91368-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 642155cb8315..a1f674539965 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1907,16 +1907,11 @@ static void geneve_destroy_tunnels(struct net *net, struct list_head *head) /* gather any geneve devices that were moved 
into this ns */ for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &geneve_link_ops) - unregister_netdevice_queue(dev, head); + geneve_dellink(dev, head); /* now gather any other geneve devices that were created in this ns */ - list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) { - /* If geneve->dev is in the same netns, it was already added - * to the list by the previous loop. - */ - if (!net_eq(dev_net(geneve->dev), net)) - unregister_netdevice_queue(geneve->dev, head); - } + list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) + geneve_dellink(geneve->dev, head); } static void __net_exit geneve_exit_batch_rtnl(struct list_head *net_list, From 08b613b9e2ba431db3bd15cb68ca72472a50ef5c Mon Sep 17 00:00:00 2001 From: Vitaly Rodionov Date: Fri, 14 Feb 2025 21:07:28 +0000 Subject: [PATCH 243/310] ALSA: hda/cirrus: Correct the full scale volume set logic This patch corrects the full-scale volume setting logic. On certain platforms, the full-scale volume bit is required. The current logic mistakenly sets this bit and incorrectly clears reserved bit 0, causing the headphone output to be muted. 
Fixes: 342b6b610ae2 ("ALSA: hda/cs8409: Fix Full Scale Volume setting for all variants") Signed-off-by: Vitaly Rodionov Link: https://patch.msgid.link/20250214210736.30814-1-vitalyr@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_cs8409-tables.c | 6 +++--- sound/pci/hda/patch_cs8409.c | 20 +++++++++++--------- sound/pci/hda/patch_cs8409.h | 5 +++-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sound/pci/hda/patch_cs8409-tables.c b/sound/pci/hda/patch_cs8409-tables.c index 759f48038273..621f947e3817 100644 --- a/sound/pci/hda/patch_cs8409-tables.c +++ b/sound/pci/hda/patch_cs8409-tables.c @@ -121,7 +121,7 @@ static const struct cs8409_i2c_param cs42l42_init_reg_seq[] = { { CS42L42_MIXER_CHA_VOL, 0x3F }, { CS42L42_MIXER_CHB_VOL, 0x3F }, { CS42L42_MIXER_ADC_VOL, 0x3f }, - { CS42L42_HP_CTL, 0x03 }, + { CS42L42_HP_CTL, 0x0D }, { CS42L42_MIC_DET_CTL1, 0xB6 }, { CS42L42_TIPSENSE_CTL, 0xC2 }, { CS42L42_HS_CLAMP_DISABLE, 0x01 }, @@ -315,7 +315,7 @@ static const struct cs8409_i2c_param dolphin_c0_init_reg_seq[] = { { CS42L42_ASP_TX_SZ_EN, 0x01 }, { CS42L42_PWR_CTL1, 0x0A }, { CS42L42_PWR_CTL2, 0x84 }, - { CS42L42_HP_CTL, 0x03 }, + { CS42L42_HP_CTL, 0x0D }, { CS42L42_MIXER_CHA_VOL, 0x3F }, { CS42L42_MIXER_CHB_VOL, 0x3F }, { CS42L42_MIXER_ADC_VOL, 0x3f }, @@ -371,7 +371,7 @@ static const struct cs8409_i2c_param dolphin_c1_init_reg_seq[] = { { CS42L42_ASP_TX_SZ_EN, 0x00 }, { CS42L42_PWR_CTL1, 0x0E }, { CS42L42_PWR_CTL2, 0x84 }, - { CS42L42_HP_CTL, 0x01 }, + { CS42L42_HP_CTL, 0x0D }, { CS42L42_MIXER_CHA_VOL, 0x3F }, { CS42L42_MIXER_CHB_VOL, 0x3F }, { CS42L42_MIXER_ADC_VOL, 0x3f }, diff --git a/sound/pci/hda/patch_cs8409.c b/sound/pci/hda/patch_cs8409.c index 614327218634..b760332a4e35 100644 --- a/sound/pci/hda/patch_cs8409.c +++ b/sound/pci/hda/patch_cs8409.c @@ -876,7 +876,7 @@ static void cs42l42_resume(struct sub_codec *cs42l42) { CS42L42_DET_INT_STATUS2, 0x00 }, { CS42L42_TSRS_PLUG_STATUS, 0x00 }, }; - int fsv_old, fsv_new; + 
unsigned int fsv; /* Bring CS42L42 out of Reset */ spec->gpio_data = snd_hda_codec_read(codec, CS8409_PIN_AFG, 0, AC_VERB_GET_GPIO_DATA, 0); @@ -893,13 +893,15 @@ static void cs42l42_resume(struct sub_codec *cs42l42) /* Clear interrupts, by reading interrupt status registers */ cs8409_i2c_bulk_read(cs42l42, irq_regs, ARRAY_SIZE(irq_regs)); - fsv_old = cs8409_i2c_read(cs42l42, CS42L42_HP_CTL); - if (cs42l42->full_scale_vol == CS42L42_FULL_SCALE_VOL_0DB) - fsv_new = fsv_old & ~CS42L42_FULL_SCALE_VOL_MASK; - else - fsv_new = fsv_old & CS42L42_FULL_SCALE_VOL_MASK; - if (fsv_new != fsv_old) - cs8409_i2c_write(cs42l42, CS42L42_HP_CTL, fsv_new); + fsv = cs8409_i2c_read(cs42l42, CS42L42_HP_CTL); + if (cs42l42->full_scale_vol) { + // Set the full scale volume bit + fsv |= CS42L42_FULL_SCALE_VOL_MASK; + cs8409_i2c_write(cs42l42, CS42L42_HP_CTL, fsv); + } + // Unmute analog channels A and B + fsv = (fsv & ~CS42L42_ANA_MUTE_AB); + cs8409_i2c_write(cs42l42, CS42L42_HP_CTL, fsv); /* we have to explicitly allow unsol event handling even during the * resume phase so that the jack event is processed properly @@ -920,7 +922,7 @@ static void cs42l42_suspend(struct sub_codec *cs42l42) { CS42L42_MIXER_CHA_VOL, 0x3F }, { CS42L42_MIXER_ADC_VOL, 0x3F }, { CS42L42_MIXER_CHB_VOL, 0x3F }, - { CS42L42_HP_CTL, 0x0F }, + { CS42L42_HP_CTL, 0x0D }, { CS42L42_ASP_RX_DAI0_EN, 0x00 }, { CS42L42_ASP_CLK_CFG, 0x00 }, { CS42L42_PWR_CTL1, 0xFE }, diff --git a/sound/pci/hda/patch_cs8409.h b/sound/pci/hda/patch_cs8409.h index 5e48115caf09..14645d25e70f 100644 --- a/sound/pci/hda/patch_cs8409.h +++ b/sound/pci/hda/patch_cs8409.h @@ -230,9 +230,10 @@ enum cs8409_coefficient_index_registers { #define CS42L42_PDN_TIMEOUT_US (250000) #define CS42L42_PDN_SLEEP_US (2000) #define CS42L42_INIT_TIMEOUT_MS (45) +#define CS42L42_ANA_MUTE_AB (0x0C) #define CS42L42_FULL_SCALE_VOL_MASK (2) -#define CS42L42_FULL_SCALE_VOL_0DB (1) -#define CS42L42_FULL_SCALE_VOL_MINUS6DB (0) +#define CS42L42_FULL_SCALE_VOL_0DB (0) 
+#define CS42L42_FULL_SCALE_VOL_MINUS6DB (1) /* Dell BULLSEYE / WARLOCK / CYBORG Specific Definitions */ From 6a7ed7ee16a963f0ca028861eca8f8b365861dd1 Mon Sep 17 00:00:00 2001 From: Vitaly Rodionov Date: Fri, 14 Feb 2025 16:23:26 +0000 Subject: [PATCH 244/310] ALSA: hda/cirrus: Reduce codec resume time This patch reduces the resume time by half and introduces an option to include a delay after a single write operation before continuing. Signed-off-by: Vitaly Rodionov Link: https://patch.msgid.link/20250214162354.2675652-2-vitalyr@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_cs8409-tables.c | 6 +++--- sound/pci/hda/patch_cs8409.c | 6 +++++- sound/pci/hda/patch_cs8409.h | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sound/pci/hda/patch_cs8409-tables.c b/sound/pci/hda/patch_cs8409-tables.c index 621f947e3817..09240138e087 100644 --- a/sound/pci/hda/patch_cs8409-tables.c +++ b/sound/pci/hda/patch_cs8409-tables.c @@ -131,7 +131,7 @@ static const struct cs8409_i2c_param cs42l42_init_reg_seq[] = { { CS42L42_RSENSE_CTL3, 0x00 }, { CS42L42_TSENSE_CTL, 0x80 }, { CS42L42_HS_BIAS_CTL, 0xC0 }, - { CS42L42_PWR_CTL1, 0x02 }, + { CS42L42_PWR_CTL1, 0x02, 10000 }, { CS42L42_ADC_OVFL_INT_MASK, 0xff }, { CS42L42_MIXER_INT_MASK, 0xff }, { CS42L42_SRC_INT_MASK, 0xff }, @@ -328,7 +328,7 @@ static const struct cs8409_i2c_param dolphin_c0_init_reg_seq[] = { { CS42L42_RSENSE_CTL3, 0x00 }, { CS42L42_TSENSE_CTL, 0x80 }, { CS42L42_HS_BIAS_CTL, 0xC0 }, - { CS42L42_PWR_CTL1, 0x02 }, + { CS42L42_PWR_CTL1, 0x02, 10000 }, { CS42L42_ADC_OVFL_INT_MASK, 0xff }, { CS42L42_MIXER_INT_MASK, 0xff }, { CS42L42_SRC_INT_MASK, 0xff }, @@ -384,7 +384,7 @@ static const struct cs8409_i2c_param dolphin_c1_init_reg_seq[] = { { CS42L42_RSENSE_CTL3, 0x00 }, { CS42L42_TSENSE_CTL, 0x80 }, { CS42L42_HS_BIAS_CTL, 0xC0 }, - { CS42L42_PWR_CTL1, 0x06 }, + { CS42L42_PWR_CTL1, 0x06, 10000 }, { CS42L42_ADC_OVFL_INT_MASK, 0xff }, { CS42L42_MIXER_INT_MASK, 0xff }, { 
CS42L42_SRC_INT_MASK, 0xff }, diff --git a/sound/pci/hda/patch_cs8409.c b/sound/pci/hda/patch_cs8409.c index b760332a4e35..e50006757a2c 100644 --- a/sound/pci/hda/patch_cs8409.c +++ b/sound/pci/hda/patch_cs8409.c @@ -346,6 +346,11 @@ static int cs8409_i2c_bulk_write(struct sub_codec *scodec, const struct cs8409_i if (cs8409_i2c_wait_complete(codec) < 0) goto error; + /* Certain use cases may require a delay + * after a write operation before proceeding. + */ + if (seq[i].delay) + fsleep(seq[i].delay); } mutex_unlock(&spec->i2c_mux); @@ -888,7 +893,6 @@ static void cs42l42_resume(struct sub_codec *cs42l42) /* Initialize CS42L42 companion codec */ cs8409_i2c_bulk_write(cs42l42, cs42l42->init_seq, cs42l42->init_seq_num); - msleep(CS42L42_INIT_TIMEOUT_MS); /* Clear interrupts, by reading interrupt status registers */ cs8409_i2c_bulk_read(cs42l42, irq_regs, ARRAY_SIZE(irq_regs)); diff --git a/sound/pci/hda/patch_cs8409.h b/sound/pci/hda/patch_cs8409.h index 14645d25e70f..e4bd2e12110b 100644 --- a/sound/pci/hda/patch_cs8409.h +++ b/sound/pci/hda/patch_cs8409.h @@ -229,7 +229,6 @@ enum cs8409_coefficient_index_registers { #define CS42L42_I2C_SLEEP_US (2000) #define CS42L42_PDN_TIMEOUT_US (250000) #define CS42L42_PDN_SLEEP_US (2000) -#define CS42L42_INIT_TIMEOUT_MS (45) #define CS42L42_ANA_MUTE_AB (0x0C) #define CS42L42_FULL_SCALE_VOL_MASK (2) #define CS42L42_FULL_SCALE_VOL_0DB (0) @@ -291,6 +290,7 @@ enum { struct cs8409_i2c_param { unsigned int addr; unsigned int value; + unsigned int delay; }; struct cs8409_cir_param { From d1d0963121769d8d16150b913fe886e48efefa51 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 11 Feb 2025 09:29:06 +0900 Subject: [PATCH 245/310] tools: fix annoying "mkdir -p ..." logs when building tools in parallel When CONFIG_OBJTOOL=y or CONFIG_DEBUG_INFO_BTF=y, parallel builds show awkward "mkdir -p ..." logs. 
$ make -j16 [ snip ] mkdir -p /home/masahiro/ref/linux/tools/objtool && make O=/home/masahiro/ref/linux subdir=tools/objtool --no-print-directory -C objtool mkdir -p /home/masahiro/ref/linux/tools/bpf/resolve_btfids && make O=/home/masahiro/ref/linux subdir=tools/bpf/resolve_btfids --no-print-directory -C bpf/resolve_btfids Defining MAKEFLAGS= on the command line wipes out command line switches from the resultant MAKEFLAGS definition, even though the command line switches are active. [1] MAKEFLAGS puts all single-letter options into the first word, and that word will be empty if no single-letter options were given. [2] However, this breaks if MAKEFLAGS= is given on the command line. The tools/ and tools/% targets set MAKEFLAGS= on the command line, which breaks the following code in tools/scripts/Makefile.include: short-opts := $(firstword -$(MAKEFLAGS)) If MAKEFLAGS really needs modification, it should be done through the environment variable, as follows: MAKEFLAGS= $(MAKE) ... That said, I question whether modifying MAKEFLAGS is necessary here. The only flag we might want to exclude is --no-print-directory, as the tools build system changes the working directory. However, people might find the "Entering/Leaving directory" logs annoying. I simply removed the offending MAKEFLAGS=. 
[1]: https://savannah.gnu.org/bugs/?62469 [2]: https://www.gnu.org/software/make/manual/make.html#Testing-Flags Fixes: ea01fa9f63ae ("tools: Connect to the kernel build system") Fixes: a50e43332756 ("perf tools: Honor parallel jobs") Signed-off-by: Masahiro Yamada Tested-by: Daniel Xu --- Makefile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 89628e354ca7..52207bcb1a9d 100644 --- a/Makefile +++ b/Makefile @@ -1421,18 +1421,13 @@ ifneq ($(wildcard $(resolve_btfids_O)),) $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean endif -# Clear a bunch of variables before executing the submake -ifeq ($(quiet),silent_) -tools_silent=s -endif - tools/: FORCE $(Q)mkdir -p $(objtree)/tools - $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(tools_silent) $(filter --j% -j,$(MAKEFLAGS))" O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ + $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ tools/%: FORCE $(Q)mkdir -p $(objtree)/tools - $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(tools_silent) $(filter --j% -j,$(MAKEFLAGS))" O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* + $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* # --------------------------------------------------------------------------- # Kernel selftest From 140332b6ed727a4ec2e5722a1ccda28b52d45771 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Feb 2025 15:26:44 +0900 Subject: [PATCH 246/310] kbuild: fix linux-headers package build when $(CC) cannot link userspace Since commit 5f73e7d0386d ("kbuild: refactor cross-compiling linux-headers package"), the linux-headers Debian package fails to build when $(CC) cannot build userspace applications, for example, when using toolchains installed by the 0day bot. The host programs in the linux-headers package should be rebuilt using the distro's cross-compiler, ${DEB_HOST_GNU_TYPE}-gcc instead of $(CC). 
Hence, the variable 'CC' must be expanded in this shell script instead of in the top-level Makefile. Commit f354fc88a72a ("kbuild: install-extmod-build: add missing quotation marks for CC variable") was not a correct fix because CC="ccache gcc" should be unrelated when rebuilding userspace tools. Fixes: 5f73e7d0386d ("kbuild: refactor cross-compiling linux-headers package") Reported-by: Jeff Johnson Closes: https://lore.kernel.org/linux-kbuild/CAK7LNARb3xO3ptBWOMpwKcyf3=zkfhMey5H2KnB1dOmUwM79dA@mail.gmail.com/T/#t Signed-off-by: Masahiro Yamada Tested-by: Jeff Johnson --- scripts/package/install-extmod-build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index b724626ea0ca..2966473b4660 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -62,8 +62,8 @@ if [ "${CC}" != "${HOSTCC}" ]; then # # Clear VPATH and srcroot because the source files reside in the output # directory. - # shellcheck disable=SC2016 # $(MAKE), $(CC), and $(build) will be expanded by Make - "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC="$(CC)" VPATH= srcroot=. $(build)='"${destdir}"/scripts + # shellcheck disable=SC2016 # $(MAKE) and $(build) will be expanded by Make + "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC='"${CC}"' VPATH= srcroot=. $(build)='"${destdir}"/scripts rm -f "${destdir}/scripts/Kbuild" fi From 071ed42cff4fcdd89025d966d48eabef59913bf2 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Thu, 13 Feb 2025 23:36:10 +0100 Subject: [PATCH 247/310] net/sched: cls_api: fix error handling causing NULL dereference tcf_exts_miss_cookie_base_alloc() calls xa_alloc_cyclic() which can return 1 if the allocation succeeded after wrapping. This was treated as an error, with value 1 returned to caller tcf_exts_init_ex() which sets exts->actions to NULL and returns 1 to caller fl_change(). 
fl_change() treats err == 1 as success, calling tcf_exts_validate_ex() which calls tcf_action_init() with exts->actions as argument, where it is dereferenced. Example trace: BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 114 PID: 16151 Comm: handler114 Kdump: loaded Not tainted 5.14.0-503.16.1.el9_5.x86_64 #1 RIP: 0010:tcf_action_init+0x1f8/0x2c0 Call Trace: tcf_action_init+0x1f8/0x2c0 tcf_exts_validate_ex+0x175/0x190 fl_change+0x537/0x1120 [cls_flower] Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action") Signed-off-by: Pierre Riteau Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250213223610.320278-1-pierre@stackhpc.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8e47e5355be6..4f648af8cfaa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -97,7 +97,7 @@ tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n, xa_limit_32b, &next, GFP_KERNEL); - if (err) + if (err < 0) goto err_xa_alloc; exts->miss_cookie_node = n; From d440148418f4816b4973ec6723bf63821793a0a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Feb 2025 09:28:55 -0800 Subject: [PATCH 248/310] tegra210-adma: fix 32-bit x86 build The Tegra210 Audio DMA controller driver did a plain divide: page_no = (res_page->start - res_base->start) / cdata->ch_base_offset; which causes problems on 32-bit x86 configurations that have 64-bit resource sizes: x86_64-linux-ld: drivers/dma/tegra210-adma.o: in function `tegra_adma_probe': tegra210-adma.c:(.text+0x1322): undefined reference to `__udivdi3' because gcc doesn't generate the trivial code for a 64-by-32 divide, turning it into a function call to do a full 64-by-64 divide. 
And the kernel intentionally doesn't provide that helper function, because 99% of the time all you want is the narrower version. Of course, tegra210 is a 64-bit architecture and the 32-bit x86 build is purely for build testing, so this really is just about build coverage failure. But build coverage is good. Side note: div_u64() would be suboptimal if you actually have a 32-bit resource_t, so our "helper" for divides are admittedly making it harder than it should be to generate good code for all the possible cases. At some point, I'll consider 32-bit x86 so entirely legacy that I can't find it in myself to care any more, and we'll just add the __udivdi3 library function. But for now, the right thing to do is to use "div_u64()" to show that you know that you are doing the simpler divide with a 32-bit number. And the build error enforces that. While fixing the build issue, also check for division-by-zero, and for overflow. Which hopefully cannot happen on real production hardware, but the value of 'ch_base_offset' can definitely be zero in other places. 
Reported-by: Guenter Roeck Signed-off-by: Linus Torvalds --- drivers/dma/tegra210-adma.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/dma/tegra210-adma.c b/drivers/dma/tegra210-adma.c index 6896da8ac7ef..5c6a5b358987 100644 --- a/drivers/dma/tegra210-adma.c +++ b/drivers/dma/tegra210-adma.c @@ -887,7 +887,7 @@ static int tegra_adma_probe(struct platform_device *pdev) const struct tegra_adma_chip_data *cdata; struct tegra_adma *tdma; struct resource *res_page, *res_base; - int ret, i, page_no; + int ret, i; cdata = of_device_get_match_data(&pdev->dev); if (!cdata) { @@ -914,9 +914,20 @@ static int tegra_adma_probe(struct platform_device *pdev) res_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "global"); if (res_base) { - page_no = (res_page->start - res_base->start) / cdata->ch_base_offset; - if (page_no <= 0) + resource_size_t page_offset, page_no; + unsigned int ch_base_offset; + + if (res_page->start < res_base->start) + return -EINVAL; + page_offset = res_page->start - res_base->start; + ch_base_offset = cdata->ch_base_offset; + if (!ch_base_offset) return -EINVAL; + + page_no = div_u64(page_offset, ch_base_offset); + if (!page_no || page_no > INT_MAX) + return -EINVAL; + tdma->ch_page_no = page_no - 1; tdma->base_addr = devm_ioremap_resource(&pdev->dev, res_base); if (IS_ERR(tdma->base_addr)) From 1b71c2fb04e7a713abc6edde4a412416ff3158f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 13 Feb 2025 15:55:17 +0100 Subject: [PATCH 249/310] kbuild: userprogs: fix bitsize and target detection on clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/Makefile.clang was changed in the linked commit to move --target from KBUILD_CFLAGS to KBUILD_CPPFLAGS, as that generally has a broader scope. However that variable is not inspected by the userprogs logic, breaking cross compilation on clang. 
Use both variables to detect bitsize and target arguments for userprogs. Fixes: feb843a469fb ("kbuild: add $(CLANG_FLAGS) to KBUILD_CPPFLAGS") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 52207bcb1a9d..272db408be5c 100644 --- a/Makefile +++ b/Makefile @@ -1120,8 +1120,8 @@ LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) endif # Align the bit size of userspace programs with the kernel -KBUILD_USERCFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CFLAGS)) -KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CFLAGS)) +KBUILD_USERCFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) +KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) # make the checker run with the right architecture CHECKFLAGS += --arch=$(ARCH) From b28fb1f2ef45eeef1cd2c23149b50d184d545a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 13 Feb 2025 17:04:29 +0100 Subject: [PATCH 250/310] modpost: Fix a few typos in a comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Namely: s/becasue/because/ and s/wiht/with/ plus an added article. Signed-off-by: Uwe Kleine-König Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 36b28987a2f0..c35d22607978 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -190,8 +190,8 @@ static struct module *new_module(const char *name, size_t namelen) /* * Set mod->is_gpl_compatible to true by default. If MODULE_LICENSE() - * is missing, do not check the use for EXPORT_SYMBOL_GPL() becasue - * modpost will exit wiht error anyway. 
+ * is missing, do not check the use for EXPORT_SYMBOL_GPL() because + * modpost will exit with an error anyway. */ mod->is_gpl_compatible = true; From 129fe718819cc5e24ea2f489db9ccd4371f0c6f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 11:55:47 -0500 Subject: [PATCH 251/310] tracing: Do not allow mmap() of persistent ring buffer When trying to mmap a trace instance buffer that is attached to reserve_mem, it would crash: BUG: unable to handle page fault for address: ffffe97bd00025c8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 2862f3067 P4D 2862f3067 PUD 0 Oops: Oops: 0000 [#1] PREEMPT_RT SMP PTI CPU: 4 UID: 0 PID: 981 Comm: mmap-rb Not tainted 6.14.0-rc2-test-00003-g7f1a5e3fbf9e-dirty #233 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:validate_page_before_insert+0x5/0xb0 Code: e2 01 89 d0 c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 <48> 8b 46 08 a8 01 75 67 66 90 48 89 f0 8b 50 34 85 d2 74 76 48 89 RSP: 0018:ffffb148c2f3f968 EFLAGS: 00010246 RAX: ffff9fa5d3322000 RBX: ffff9fa5ccff9c08 RCX: 00000000b879ed29 RDX: ffffe97bd00025c0 RSI: ffffe97bd00025c0 RDI: ffff9fa5ccff9c08 RBP: ffffb148c2f3f9f0 R08: 0000000000000004 R09: 0000000000000004 R10: 0000000000000000 R11: 0000000000000200 R12: 0000000000000000 R13: 00007f16a18d5000 R14: ffff9fa5c48db6a8 R15: 0000000000000000 FS: 00007f16a1b54740(0000) GS:ffff9fa73df00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffe97bd00025c8 CR3: 00000001048c6006 CR4: 0000000000172ef0 Call Trace: ? __die_body.cold+0x19/0x1f ? __die+0x2e/0x40 ? page_fault_oops+0x157/0x2b0 ? search_module_extables+0x53/0x80 ? validate_page_before_insert+0x5/0xb0 ? kernelmode_fixup_or_oops.isra.0+0x5f/0x70 ? __bad_area_nosemaphore+0x16e/0x1b0 ? bad_area_nosemaphore+0x16/0x20 ? do_kern_addr_fault+0x77/0x90 ? 
exc_page_fault+0x22b/0x230 ? asm_exc_page_fault+0x2b/0x30 ? validate_page_before_insert+0x5/0xb0 ? vm_insert_pages+0x151/0x400 __rb_map_vma+0x21f/0x3f0 ring_buffer_map+0x21b/0x2f0 tracing_buffers_mmap+0x70/0xd0 __mmap_region+0x6f0/0xbd0 mmap_region+0x7f/0x130 do_mmap+0x475/0x610 vm_mmap_pgoff+0xf2/0x1d0 ksys_mmap_pgoff+0x166/0x200 __x64_sys_mmap+0x37/0x50 x64_sys_call+0x1670/0x1d70 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f The reason was that the code that maps the ring buffer pages to user space has: page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); And uses that in: vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); But virt_to_page() does not work with vmap()'d memory which is what the persistent ring buffer has. It is rather trivial to allow this, but for now just disable mmap() of instances that have their ring buffer from the reserve_mem option. If an mmap() is performed on a persistent buffer it will return -ENODEV just like it would if the .mmap field wasn't defined in the file_operations structure. 
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214115547.0d7287d3@gandalf.local.home Fixes: 9b7bdf6f6ece6 ("tracing: Have trace_printk not use binary prints if boot buffer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 25ff37aab00f..0e6d517e74e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8279,6 +8279,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* Currently the boot mapped buffer is not supported for mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) + return -ENODEV; + ret = get_snapshot_map(iter->tr); if (ret) return ret; From 97937834ae876f29565415ab15f1284666dc6be3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 12:35:12 -0500 Subject: [PATCH 252/310] ring-buffer: Update pages_touched to reflect persistent buffer content The pages_touched field represents the number of subbuffers in the ring buffer that have content that can be read. This is used in accounting of "dirty_pages" and "buffer_percent" to allow the user to wait for the buffer to be filled to a certain amount before it reads the buffer in blocking mode. The persistent buffer never updated this value so it was set to zero, and this accounting would take it as it had no content. This would cause user space to wait for content even though there's enough content in the ring buffer that satisfies the buffer_percent. 
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214123512.0631436e@gandalf.local.home Fixes: 5f3b6e839f3ce ("ring-buffer: Validate boot range memory events") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0419d41a2060..bb6089c2951e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1850,6 +1850,11 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->cpu); goto invalid; } + + /* If the buffer has content, update pages_touched */ + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; entry_bytes += local_read(&head_page->page->commit); local_set(&cpu_buffer->head_page->entries, ret); From 0ad2507d5d93f39619fc42372c347d6006b64319 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Feb 2025 14:02:44 -0800 Subject: [PATCH 253/310] Linux 6.14-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 272db408be5c..96407c1d6be1 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* From 654292a0b264e9b8c51b98394146218a21612aa1 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 16 Feb 2025 18:02:47 -0300 Subject: [PATCH 254/310] smb: client: fix chmod(2) regression with ATTR_READONLY When the user sets a file or directory as read-only (e.g. ~S_IWUGO), the client will set the ATTR_READONLY attribute by sending an SMB2_SET_INFO request to the server in cifs_setattr_{,nounix}(), but cifsInodeInfo::cifsAttrs will be left unchanged as the client will only update the new file attributes in the next call to {smb311_posix,cifs}_get_inode_info() with the new metadata filled in @data parameter. 
Commit a18280e7fdea ("smb: cilent: set reparse mount points as automounts") mistakenly removed the @data NULL check when calling is_inode_cache_good(), which broke the above case as the new ATTR_READONLY attribute would end up not being updated on files with a read lease. Fix this by updating the inode whenever we have cached metadata in @data parameter. Reported-by: Horst Reiterer Closes: https://lore.kernel.org/r/85a16504e09147a195ac0aac1c801280@fabasoft.com Fixes: a18280e7fdea ("smb: cilent: set reparse mount points as automounts") Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 9cc31cf6ebd0..3261190e6f90 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1408,7 +1408,7 @@ int cifs_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {}; int rc; - if (is_inode_cache_good(*inode)) { + if (!data && is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); return 0; } @@ -1507,7 +1507,7 @@ int smb311_posix_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {}; int rc; - if (is_inode_cache_good(*inode)) { + if (!data && is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); return 0; } From 6d1f86610f23b0bc334d6506a186f21a98f51392 Mon Sep 17 00:00:00 2001 From: John Veness Date: Mon, 17 Feb 2025 12:15:50 +0000 Subject: [PATCH 255/310] ALSA: hda/conexant: Add quirk for HP ProBook 450 G4 mute LED Allows the LED on the dedicated mute button on the HP ProBook 450 G4 laptop to change colour correctly. 
Signed-off-by: John Veness Cc: Link: https://patch.msgid.link/2fb55d48-6991-4a42-b591-4c78f2fad8d7@pelago.org.uk Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 4985e72b9094..34874039ad45 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -1090,6 +1090,7 @@ static const struct hda_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x103c, 0x814f, "HP ZBook 15u G3", CXT_FIXUP_MUTE_LED_GPIO), SND_PCI_QUIRK(0x103c, 0x8174, "HP Spectre x360", CXT_FIXUP_HP_SPECTRE), SND_PCI_QUIRK(0x103c, 0x822e, "HP ProBook 440 G4", CXT_FIXUP_MUTE_LED_GPIO), + SND_PCI_QUIRK(0x103c, 0x8231, "HP ProBook 450 G4", CXT_FIXUP_MUTE_LED_GPIO), SND_PCI_QUIRK(0x103c, 0x828c, "HP EliteBook 840 G4", CXT_FIXUP_HP_DOCK), SND_PCI_QUIRK(0x103c, 0x8299, "HP 800 G3 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x829a, "HP 800 G3 DM", CXT_FIXUP_HP_MIC_NO_PRESENCE), From e77aa4b2eaa7fb31b2a7a50214ecb946b2a8b0f6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 17 Feb 2025 18:00:30 +0100 Subject: [PATCH 256/310] ALSA: seq: Drop UMP events when no UMP-conversion is set When a destination client is a user client in the legacy MIDI mode and it sets the no-UMP-conversion flag, currently all UMP events are still passed as-is. But this may confuse user space, because the event packet size is different from the legacy mode. Since we cannot handle UMP events in user clients unless it's running in the UMP client mode, we should filter out those events instead of accepting blindly. This patch addresses it by slightly adjusting the conditions for UMP event handling at the event delivery time. 
Fixes: 329ffe11a014 ("ALSA: seq: Allow suppressing UMP conversions") Link: https://lore.kernel.org/b77a2cd6-7b59-4eb0-a8db-22d507d3af5f@gmail.com Link: https://patch.msgid.link/20250217170034.21930-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 073b56dc2225..cb66ec42a3f8 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -678,12 +678,18 @@ static int snd_seq_deliver_single_event(struct snd_seq_client *client, dest_port->time_real); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) - if (!(dest->filter & SNDRV_SEQ_FILTER_NO_CONVERT)) { - if (snd_seq_ev_is_ump(event)) { + if (snd_seq_ev_is_ump(event)) { + if (!(dest->filter & SNDRV_SEQ_FILTER_NO_CONVERT)) { result = snd_seq_deliver_from_ump(client, dest, dest_port, event, atomic, hop); goto __skip; - } else if (snd_seq_client_is_ump(dest)) { + } else if (dest->type == USER_CLIENT && + !snd_seq_client_is_ump(dest)) { + result = 0; // drop the event + goto __skip; + } + } else if (snd_seq_client_is_ump(dest)) { + if (!(dest->filter & SNDRV_SEQ_FILTER_NO_CONVERT)) { result = snd_seq_deliver_to_ump(client, dest, dest_port, event, atomic, hop); goto __skip; From 07b598c0e6f06a0f254c88dafb4ad50f8a8c6eea Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Thu, 13 Feb 2025 15:20:55 +0000 Subject: [PATCH 257/310] drop_monitor: fix incorrect initialization order Syzkaller reports the following bug: BUG: spinlock bad magic on CPU#1, syz-executor.0/7995 lock: 0xffff88805303f3e0, .magic: 00000000, .owner: /-1, .owner_cpu: 0 CPU: 1 PID: 7995 Comm: syz-executor.0 Tainted: G E 5.10.209+ #1 Hardware name: VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x119/0x179 lib/dump_stack.c:118 debug_spin_lock_before kernel/locking/spinlock_debug.c:83 [inline] do_raw_spin_lock+0x1f6/0x270 kernel/locking/spinlock_debug.c:112 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:117 [inline] _raw_spin_lock_irqsave+0x50/0x70 kernel/locking/spinlock.c:159 reset_per_cpu_data+0xe6/0x240 [drop_monitor] net_dm_cmd_trace+0x43d/0x17a0 [drop_monitor] genl_family_rcv_msg_doit+0x22f/0x330 net/netlink/genetlink.c:739 genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] genl_rcv_msg+0x341/0x5a0 net/netlink/genetlink.c:800 netlink_rcv_skb+0x14d/0x440 net/netlink/af_netlink.c:2497 genl_rcv+0x29/0x40 net/netlink/genetlink.c:811 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0x54b/0x800 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x914/0xe00 net/netlink/af_netlink.c:1916 sock_sendmsg_nosec net/socket.c:651 [inline] __sock_sendmsg+0x157/0x190 net/socket.c:663 ____sys_sendmsg+0x712/0x870 net/socket.c:2378 ___sys_sendmsg+0xf8/0x170 net/socket.c:2432 __sys_sendmsg+0xea/0x1b0 net/socket.c:2461 do_syscall_64+0x30/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x62/0xc7 RIP: 0033:0x7f3f9815aee9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f3f972bf0c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f3f9826d050 RCX: 00007f3f9815aee9 RDX: 0000000020000000 RSI: 0000000020001300 RDI: 0000000000000007 RBP: 00007f3f981b63bd R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007f3f9826d050 R15: 00007ffe01ee6768 If drop_monitor is built as a kernel module, syzkaller may have time 
to send a netlink NET_DM_CMD_START message during the module loading. This will call the net_dm_monitor_start() function that uses a spinlock that has not yet been initialized. To fix this, let's place resource initialization above the registration of a generic netlink family. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 9a8afc8d3962 ("Network Drop Monitor: Adding drop monitor implementation & Netlink protocol") Cc: stable@vger.kernel.org Signed-off-by: Ilia Gavrilov Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250213152054.2785669-1-Ilia.Gavrilov@infotecs.ru Signed-off-by: Jakub Kicinski --- net/core/drop_monitor.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 6efd4cccc9dd..212f0a048cab 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1734,30 +1734,30 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family(&net_drop_monitor_family); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); + return rc; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; - for_each_possible_cpu(cpu) { - net_dm_cpu_data_init(cpu); - net_dm_hw_cpu_data_init(cpu); - } - goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } @@ -1766,19 +1766,18 @@ static void 
exit_net_drop_monitor(void) { int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); From 915e34d5ad35a6a9e56113f852ade4a730fb88f0 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Fri, 14 Feb 2025 13:01:37 +0100 Subject: [PATCH 258/310] s390/ism: add release function for struct device According to device_release() in /drivers/base/core.c, a device without a release function is a broken device and must be fixed. The current code directly frees the device after calling device_add() without waiting for other kernel parts to release their references. Thus, a reference could still be held to a struct device, e.g., by sysfs, leading to potential use-after-free issues if a proper release function is not set. 
Fixes: 8c81ba20349d ("net/smc: De-tangle ism and smc device initialization") Reviewed-by: Alexandra Winter Reviewed-by: Wenjia Zhang Signed-off-by: Julian Ruess Signed-off-by: Alexandra Winter Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250214120137.563409-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/s390/net/ism_drv.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index e36e3ea165d3..2f34761e6413 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -588,6 +588,15 @@ static int ism_dev_init(struct ism_dev *ism) return ret; } +static void ism_dev_release(struct device *dev) +{ + struct ism_dev *ism; + + ism = container_of(dev, struct ism_dev, dev); + + kfree(ism); +} + static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct ism_dev *ism; @@ -601,6 +610,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; ism->dev.parent = &pdev->dev; + ism->dev.release = ism_dev_release; device_initialize(&ism->dev); dev_set_name(&ism->dev, dev_name(&pdev->dev)); ret = device_add(&ism->dev); @@ -637,7 +647,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) device_del(&ism->dev); err_dev: dev_set_drvdata(&pdev->dev, NULL); - kfree(ism); + put_device(&ism->dev); return ret; } @@ -682,7 +692,7 @@ static void ism_remove(struct pci_dev *pdev) pci_disable_device(pdev); device_del(&ism->dev); dev_set_drvdata(&pdev->dev, NULL); - kfree(ism); + put_device(&ism->dev); } static struct pci_driver ism_driver = { From bdf5d13aa05ec314d4385b31ac974d6c7e0997c9 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Fri, 14 Feb 2025 09:52:33 -0600 Subject: [PATCH 259/310] ibmvnic: Don't reference skb after sending to VIOS Previously, after successfully flushing the xmit buffer to VIOS, the tx_bytes stat was incremented by the 
length of the skb. It is invalid to access the skb memory after sending the buffer to the VIOS because, at any point after sending, the VIOS can trigger an interrupt to free this memory. A race between reading skb->len and freeing the skb is possible (especially during LPM) and will result in use-after-free: ================================================================== BUG: KASAN: slab-use-after-free in ibmvnic_xmit+0x75c/0x1808 [ibmvnic] Read of size 4 at addr c00000024eb48a70 by task hxecom/14495 <...> Call Trace: [c000000118f66cf0] [c0000000018cba6c] dump_stack_lvl+0x84/0xe8 (unreliable) [c000000118f66d20] [c0000000006f0080] print_report+0x1a8/0x7f0 [c000000118f66df0] [c0000000006f08f0] kasan_report+0x128/0x1f8 [c000000118f66f00] [c0000000006f2868] __asan_load4+0xac/0xe0 [c000000118f66f20] [c0080000046eac84] ibmvnic_xmit+0x75c/0x1808 [ibmvnic] [c000000118f67340] [c0000000014be168] dev_hard_start_xmit+0x150/0x358 <...> Freed by task 0: kasan_save_stack+0x34/0x68 kasan_save_track+0x2c/0x50 kasan_save_free_info+0x64/0x108 __kasan_mempool_poison_object+0x148/0x2d4 napi_skb_cache_put+0x5c/0x194 net_tx_action+0x154/0x5b8 handle_softirqs+0x20c/0x60c do_softirq_own_stack+0x6c/0x88 <...> The buggy address belongs to the object at c00000024eb48a00 which belongs to the cache skbuff_head_cache of size 224 ================================================================== Fixes: 032c5e82847a ("Driver for IBM System i/p VNIC protocol") Signed-off-by: Nick Child Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250214155233.235559-1-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index e95ae0d39948..0676fc547b6f 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2408,6 +2408,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev) dma_addr_t data_dma_addr; struct netdev_queue *txq; unsigned long lpar_rc; + unsigned int skblen; union sub_crq tx_crq; unsigned int offset; bool use_scrq_send_direct = false; @@ -2522,6 +2523,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_buff->skb = skb; tx_buff->index = bufidx; tx_buff->pool_index = queue_num; + skblen = skb->len; memset(&tx_crq, 0, sizeof(tx_crq)); tx_crq.v1.first = IBMVNIC_CRQ_CMD; @@ -2614,7 +2616,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) netif_stop_subqueue(netdev, queue_num); } - tx_bytes += skb->len; + tx_bytes += skblen; txq_trans_cond_update(txq); ret = NETDEV_TX_OK; goto out; From 0a4f598c84fc0eeb143ba03cdd3fc3d857061c3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 15 Feb 2025 14:52:00 -0800 Subject: [PATCH 260/310] MAINTAINERS: create entry for ethtool MAC merge Vladimir implemented the MAC merge support and reviews all the new driver implementations. Acked-by: Vladimir Oltean Link: https://patch.msgid.link/20250215225200.2652212-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 988b0ff94fda..1405ebe703a8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16477,6 +16477,12 @@ F: net/ethtool/cabletest.c F: tools/testing/selftests/drivers/net/*/ethtool* K: cable_test +NETWORKING [ETHTOOL MAC MERGE] +M: Vladimir Oltean +F: net/ethtool/mm.c +F: tools/testing/selftests/drivers/net/hw/ethtool_mm.sh +K: ethtool_mm + NETWORKING [GENERAL] M: "David S. Miller" M: Eric Dumazet From c8a3e63ff9d75b9f3f031c90d218876051dea0ba Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 29 Jan 2025 14:20:03 -0800 Subject: [PATCH 261/310] procfs: fix a locking bug in a vmcore_add_device_dump() error path Unlock vmcore_mutex when returning -EBUSY. 
Link: https://lkml.kernel.org/r/20250129222003.1495713-1-bvanassche@acm.org Fixes: 0f3b1c40c652 ("fs/proc/vmcore: disallow vmcore modifications while the vmcore is open") Signed-off-by: Bart Van Assche Acked-by: Michael S. Tsirkin Acked-by: David Hildenbrand Cc: Baoquan he Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a00120a3c099..10d01eb09c43 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1524,7 +1524,7 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) pr_warn_once("Unexpected adding of device dump\n"); if (vmcore_open) { ret = -EBUSY; - goto out_err; + goto unlock; } list_add_tail(&dump->list, &vmcoredd_list); @@ -1532,6 +1532,9 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) mutex_unlock(&vmcore_mutex); return 0; +unlock: + mutex_unlock(&vmcore_mutex); + out_err: vfree(buf); vfree(dump); From f4b78260fc678ccd7169f32dc9f3bfa3b93931c7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 31 Jan 2025 14:13:15 +0000 Subject: [PATCH 262/310] lib/iov_iter: fix import_iovec_ubuf iovec management import_iovec() says that it should always be fine to kfree the iovec returned in @iovp regardless of the error code. __import_iovec_ubuf() never reallocates it and thus should clear the pointer even in cases when copy_iovec_*() fail. 
Link: https://lkml.kernel.org/r/378ae26923ffc20fd5e41b4360d673bf47b1775b.1738332461.git.asml.silence@gmail.com Fixes: 3b2deb0e46da ("iov_iter: import single vector iovecs as ITER_UBUF") Signed-off-by: Pavel Begunkov Reviewed-by: Jens Axboe Cc: Al Viro Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- lib/iov_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 9ec806f989f2..65f550cb5081 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1428,6 +1428,8 @@ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec *iov = *iovp; ssize_t ret; + *iovp = NULL; + if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else @@ -1438,7 +1440,6 @@ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; - *iovp = NULL; return i->count; } From 63895d20d63b446f5049a963983489319c2ea3e2 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 29 Jan 2025 19:08:44 +0900 Subject: [PATCH 263/310] mm/zswap: fix inconsistency when zswap_store_page() fails Commit b7c0ccdfbafd ("mm: zswap: support large folios in zswap_store()") skips charging any zswap entries when it failed to zswap the entire folio. However, when some base pages are zswapped but it failed to zswap the entire folio, the zswap operation is rolled back. When freeing zswap entries for those pages, zswap_entry_free() uncharges the zswap entries that were not previously charged, causing zswap charging to become inconsistent. 
This inconsistency triggers two warnings with the following steps: # On a machine with 64GiB of RAM and 36GiB of zswap $ stress-ng --bigheap 2 # wait until the OOM-killer kills stress-ng $ sudo reboot The two warnings are: in mm/memcontrol.c:163, function obj_cgroup_release(): WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); in mm/page_counter.c:60, function page_counter_cancel(): if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) zswap_stored_pages also becomes inconsistent in the same way. As suggested by Kanchana, increment zswap_stored_pages and charge zswap entries within zswap_store_page() when it succeeds. This way, zswap_entry_free() will decrement the counter and uncharge the entries when it fails to zswap the entire folio. While this could potentially be optimized by batching objcg charging and incrementing the counter, let's focus on fixing the bug this time and leave the optimization for later after some evaluation. After resolving the inconsistency, the warnings disappear. 
[42.hyeyoo@gmail.com: refactor zswap_store_page()] Link: https://lkml.kernel.org/r/20250131082037.2426-1-42.hyeyoo@gmail.com Link: https://lkml.kernel.org/r/20250129100844.2935-1-42.hyeyoo@gmail.com Fixes: b7c0ccdfbafd ("mm: zswap: support large folios in zswap_store()") Co-developed-by: Kanchana P Sridhar Signed-off-by: Kanchana P Sridhar Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Yosry Ahmed Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 6504174fbc6a..ac9d299e7d0c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1445,9 +1445,9 @@ static void shrink_worker(struct work_struct *w) * main API **********************************/ -static ssize_t zswap_store_page(struct page *page, - struct obj_cgroup *objcg, - struct zswap_pool *pool) +static bool zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; @@ -1456,7 +1456,7 @@ static ssize_t zswap_store_page(struct page *page, entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; - return -EINVAL; + return false; } if (!zswap_compress(page, entry, pool)) @@ -1483,13 +1483,17 @@ static ssize_t zswap_store_page(struct page *page, /* * The entry is successfully compressed and stored in the tree, there is - * no further possibility of failure. Grab refs to the pool and objcg. - * These refs will be dropped by zswap_entry_free() when the entry is - * removed from the tree. + * no further possibility of failure. Grab refs to the pool and objcg, + * charge zswap memory, and increment zswap_stored_pages. + * The opposite actions will be performed by zswap_entry_free() + * when the entry is removed from the tree. 
*/ zswap_pool_get(pool); - if (objcg) + if (objcg) { obj_cgroup_get(objcg); + obj_cgroup_charge_zswap(objcg, entry->length); + } + atomic_long_inc(&zswap_stored_pages); /* * We finish initializing the entry while it's already in xarray. @@ -1510,13 +1514,13 @@ static ssize_t zswap_store_page(struct page *page, zswap_lru_add(&zswap_list_lru, entry); } - return entry->length; + return true; store_failed: zpool_free(pool->zpool, entry->handle); compress_failed: zswap_entry_cache_free(entry); - return -EINVAL; + return false; } bool zswap_store(struct folio *folio) @@ -1526,7 +1530,6 @@ bool zswap_store(struct folio *folio) struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - size_t compressed_bytes = 0; bool ret = false; long index; @@ -1564,20 +1567,14 @@ bool zswap_store(struct folio *folio) for (index = 0; index < nr_pages; ++index) { struct page *page = folio_page(folio, index); - ssize_t bytes; - bytes = zswap_store_page(page, objcg, pool); - if (bytes < 0) + if (!zswap_store_page(page, objcg, pool)) goto put_pool; - compressed_bytes += bytes; } - if (objcg) { - obj_cgroup_charge_zswap(objcg, compressed_bytes); + if (objcg) count_objcg_events(objcg, ZSWPOUT, nr_pages); - } - atomic_long_add(nr_pages, &zswap_stored_pages); count_vm_events(ZSWPOUT, nr_pages); ret = true; From 2ede647a6fde3e54a6bfda7cf01c716649655900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Ca=C3=B1uelo=20Navarro?= Date: Mon, 3 Feb 2025 08:52:06 +0100 Subject: [PATCH 264/310] mm,madvise,hugetlb: check for 0-length range after end address adjustment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a sanity check to madvise_dontneed_free() to address a corner case in madvise where a race condition causes the current vma being processed to be backed by a different page size. 
During a madvise(MADV_DONTNEED) call on a memory region registered with a userfaultfd, there's a period of time where the process mm lock is temporarily released in order to send a UFFD_EVENT_REMOVE and let userspace handle the event. During this time, the vma covering the current address range may change due to an explicit mmap done concurrently by another thread. If, after that change, the memory region, which was originally backed by 4KB pages, is now backed by hugepages, the end address is rounded down to a hugepage boundary to avoid data loss (see "Fixes" below). This rounding may cause the end address to be truncated to the same address as the start. Make this corner case follow the same semantics as in other similar cases where the requested region has zero length (i.e. return 0). This will make madvise_walk_vmas() continue to the next vma in the range (this time holding the process mm lock) which, due to the prev pointer becoming stale because of the vma change, will be the same hugepage-backed vma that was just checked before. The next time madvise_dontneed_free() runs for this vma, if the start address isn't aligned to a hugepage boundary, it'll return -EINVAL, which is also in line with the madvise api. From the userspace perspective, madvise() will return EINVAL because the start address isn't aligned according to the new vma alignment requirements (hugepage), even though it was correctly page-aligned when the call was issued. 
Link: https://lkml.kernel.org/r/20250203075206.1452208-1-rcn@igalia.com Fixes: 8ebe0a5eaaeb ("mm,madvise,hugetlb: fix unexpected data loss with MADV_DONTNEED on hugetlbfs") Signed-off-by: Ricardo Cañuelo Navarro Reviewed-by: Oscar Salvador Cc: Florent Revest Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton --- mm/madvise.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 49f3a75046f6..08b207f8e61e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -933,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ end = vma->vm_end; } - VM_WARN_ON(start >= end); + /* + * If the memory region between start and end was + * originally backed by 4kB pages and then remapped to + * be backed by hugepages while mmap_lock was dropped, + * the adjustment for hugetlb vma above may have rounded + * end down to the start address. + */ + if (start == end) + return 0; + VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) From 639375b0aa4323fe59b5fe2a6ebc68b022c36f50 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Wed, 5 Feb 2025 12:01:00 -0800 Subject: [PATCH 265/310] .mailmap: add entries for Jeff Johnson Map past iterations of my e-mail addresses to the current one. Link: https://lkml.kernel.org/r/20250205-jjohnson-mailmap-v1-1-269cb7b1710d@oss.qualcomm.com Signed-off-by: Jeff Johnson Signed-off-by: Andrew Morton --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index fedebf86640a..c96dbe259c22 100644 --- a/.mailmap +++ b/.mailmap @@ -317,6 +317,8 @@ Jayachandran C Jean Tourrilhes Jeevan Shriram Jeff Garzik +Jeff Johnson +Jeff Johnson Jeff Layton Jeff Layton Jeff Layton From 3219585e894c12cbffd4ac93d3e6783d236f146e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 5 Feb 2025 14:04:57 +0800 Subject: [PATCH 266/310] mailmap: add entry for Feng Tang Map my old business email to personal email. 
Link: https://lkml.kernel.org/r/20250205060457.53667-1-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index c96dbe259c22..f34af946180f 100644 --- a/.mailmap +++ b/.mailmap @@ -226,6 +226,7 @@ Fangrui Song Felipe W Damasio Felix Kuhling Felix Moeller +Feng Tang Fenglin Wu Filipe Lautert Finn Thain From 035d3c778709680288b3954ee896043132bc3f8d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 10 Feb 2025 12:05:18 -0800 Subject: [PATCH 267/310] tools/mm: fix build warnings with musl-libc musl-libc warns about the following: /home/florian/dev/buildroot/output/arm64/rpi4-b/host/aarch64-buildroot-linux-musl/sysroot/usr/include/sys/errno.h:1:2: attention: #warning redirecting incorrect #include <sys/errno.h> to <errno.h> [-Wcpp] 1 | #warning redirecting incorrect #include <sys/errno.h> to <errno.h> | ^~~~~~~ /home/florian/dev/buildroot/output/arm64/rpi4-b/host/aarch64-buildroot-linux-musl/sysroot/usr/include/sys/fcntl.h:1:2: attention: #warning redirecting incorrect #include <sys/fcntl.h> to <fcntl.h> [-Wcpp] 1 | #warning redirecting incorrect #include <sys/fcntl.h> to <fcntl.h> | ^~~~~~~ include errno.h and fcntl.h directly. 
Link: https://lkml.kernel.org/r/20250210200518.1137295-1-florian.fainelli@broadcom.com Signed-off-by: Florian Fainelli Signed-off-by: Andrew Morton --- tools/mm/page-types.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index bcac7ebfb51f..d7e5e8902af8 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -24,8 +24,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include From 41cddf83d8b00f29fd105e7a0777366edc69a5cf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 10 Feb 2025 17:13:17 +0100 Subject: [PATCH 268/310] mm/migrate_device: don't add folio to be freed to LRU in migrate_device_finalize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If migration succeeded, we called folio_migrate_flags()->mem_cgroup_migrate() to migrate the memcg from the old to the new folio. This will set memcg_data of the old folio to 0. Similarly, if migration failed, memcg_data of the dst folio is left unset. If we call folio_putback_lru() on such folios (memcg_data == 0), we will add the folio to be freed to the LRU, making memcg code unhappy. Running the hmm selftests: # ./hmm-tests ... # RUN hmm.hmm_device_private.migrate ... 
[ 102.078007][T14893] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x7ff27d200 pfn:0x13cc00 [ 102.079974][T14893] anon flags: 0x17ff00000020018(uptodate|dirty|swapbacked|node=0|zone=2|lastcpupid=0x7ff) [ 102.082037][T14893] raw: 017ff00000020018 dead000000000100 dead000000000122 ffff8881353896c9 [ 102.083687][T14893] raw: 00000007ff27d200 0000000000000000 00000001ffffffff 0000000000000000 [ 102.085331][T14893] page dumped because: VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled()) [ 102.087230][T14893] ------------[ cut here ]------------ [ 102.088279][T14893] WARNING: CPU: 0 PID: 14893 at ./include/linux/memcontrol.h:726 folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.090478][T14893] Modules linked in: [ 102.091244][T14893] CPU: 0 UID: 0 PID: 14893 Comm: hmm-tests Not tainted 6.13.0-09623-g6c216bc522fd #151 [ 102.093089][T14893] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014 [ 102.094848][T14893] RIP: 0010:folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.096104][T14893] Code: ... [ 102.099908][T14893] RSP: 0018:ffffc900236c37b0 EFLAGS: 00010293 [ 102.101152][T14893] RAX: 0000000000000000 RBX: ffffea0004f30000 RCX: ffffffff8183f426 [ 102.102684][T14893] RDX: ffff8881063cb880 RSI: ffffffff81b8117f RDI: ffff8881063cb880 [ 102.104227][T14893] RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000000 [ 102.105757][T14893] R10: 0000000000000001 R11: 0000000000000002 R12: ffffc900236c37d8 [ 102.107296][T14893] R13: ffff888277a2bcb0 R14: 000000000000001f R15: 0000000000000000 [ 102.108830][T14893] FS: 00007ff27dbdd740(0000) GS:ffff888277a00000(0000) knlGS:0000000000000000 [ 102.110643][T14893] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 102.111924][T14893] CR2: 00007ff27d400000 CR3: 000000010866e000 CR4: 0000000000750ef0 [ 102.113478][T14893] PKRU: 55555554 [ 102.114172][T14893] Call Trace: [ 102.114805][T14893] [ 102.115397][T14893] ? folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.116547][T14893] ? 
__warn.cold+0x110/0x210 [ 102.117461][T14893] ? folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.118667][T14893] ? report_bug+0x1b9/0x320 [ 102.119571][T14893] ? handle_bug+0x54/0x90 [ 102.120494][T14893] ? exc_invalid_op+0x17/0x50 [ 102.121433][T14893] ? asm_exc_invalid_op+0x1a/0x20 [ 102.122435][T14893] ? __wake_up_klogd.part.0+0x76/0xd0 [ 102.123506][T14893] ? dump_page+0x4f/0x60 [ 102.124352][T14893] ? folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.125500][T14893] folio_batch_move_lru+0xd4/0x200 [ 102.126577][T14893] ? __pfx_lru_add+0x10/0x10 [ 102.127505][T14893] __folio_batch_add_and_move+0x391/0x720 [ 102.128633][T14893] ? __pfx_lru_add+0x10/0x10 [ 102.129550][T14893] folio_putback_lru+0x16/0x80 [ 102.130564][T14893] migrate_device_finalize+0x9b/0x530 [ 102.131640][T14893] dmirror_migrate_to_device.constprop.0+0x7c5/0xad0 [ 102.133047][T14893] dmirror_fops_unlocked_ioctl+0x89b/0xc80 Likely, nothing else goes wrong: putting the last folio reference will remove the folio from the LRU again. So besides memcg complaining, adding the folio to be freed to the LRU is just an unnecessary step. The new flow resembles what we have in migrate_folio_move(): add the dst to the lru, remove migration ptes, unlock and unref dst. 
Link: https://lkml.kernel.org/r/20250210161317.717936-1-david@redhat.com Fixes: 8763cb45ab96 ("mm/migrate: new memory migration helper for use with device memory") Signed-off-by: David Hildenbrand Cc: Jérôme Glisse Cc: John Hubbard Cc: Alistair Popple Cc: Signed-off-by: Andrew Morton --- mm/migrate_device.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 9cf26592ac93..5bd888223cc8 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -840,20 +840,15 @@ void migrate_device_finalize(unsigned long *src_pfns, dst = src; } + if (!folio_is_zone_device(dst)) + folio_add_lru(dst); remove_migration_ptes(src, dst, 0); folio_unlock(src); - - if (folio_is_zone_device(src)) - folio_put(src); - else - folio_putback_lru(src); + folio_put(src); if (dst != src) { folio_unlock(dst); - if (folio_is_zone_device(dst)) - folio_put(dst); - else - folio_putback_lru(dst); + folio_put(dst); } } } From 2272dbc471037b78f308b44351ab1b9f88d32628 Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Sat, 8 Feb 2025 14:44:00 +0800 Subject: [PATCH 269/310] getdelays: fix error format characters getdelays had a compilation issue because the format string was not updated when the "delay min" was added. For example, after adding the "delay min" in printf, there were 7 strings but only 6 "%s" format specifiers. Similarly, after adding the 't->cpu_delay_total', there were 7 variables but only 6 format characters specifiers, causing compilation issues as follows. This commit fixes these issues to ensure that getdelays compiles correctly. root@xx:~/linux-next/tools/accounting$ make getdelays.c:199:9: warning: format `%llu' expects argument of type `long long unsigned int', but argument 8 has type `char *' [-Wformat=] 199 | printf("\n\nCPU %15s%15s%15s%15s%15s%15s\n" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ..... 
216 | "delay total", "delay average", "delay max", "delay min", | ~~~~~~~~~~~ | | | char * getdelays.c:200:21: note: format string is defined here 200 | " %15llu%15llu%15llu%15llu%15.3fms%13.6fms\n" | ~~~~~^ | | | long long unsigned int | %15s getdelays.c:199:9: warning: format `%f' expects argument of type `double', but argument 12 has type `long long unsigned int' [-Wformat=] 199 | printf("\n\nCPU %15s%15s%15s%15s%15s%15s\n" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ..... 220 | (unsigned long long)t->cpu_delay_total, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | long long unsigned int ..... Link: https://lkml.kernel.org/r/20250208144400544RduNRhwIpT3m2JyRBqskZ@zte.com.cn Fixes: f65c64f311ee ("delayacct: add delay min to record delay peak") Reviewed-by: xu xin Signed-off-by: Wang Yaxin Signed-off-by: Kun Jiang Cc: Balbir Singh Cc: David Hildenbrand Cc: Fan Yu Cc: Peilin He Cc: Qiang Tu Cc: wangyong Cc: ye xingchen Cc: Yunkai Zhang Signed-off-by: Andrew Morton --- tools/accounting/getdelays.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 100ad3dc091a..3feac0482fe9 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -196,22 +196,22 @@ static int get_family_id(int sd) static void print_delayacct(struct taskstats *t) { - printf("\n\nCPU %15s%15s%15s%15s%15s%15s\n" - " %15llu%15llu%15llu%15llu%15.3fms%13.6fms\n" - "IO %15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "SWAP %15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "RECLAIM %12s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "THRASHING%12s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "COMPACT %12s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "WPCOPY %12s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "IRQ %15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n", + printf("\n\nCPU %15s%15s%15s%15s%15s%15s%15s\n" + " 
%15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "IO %15s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "SWAP %15s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "RECLAIM %12s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "THRASHING%12s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "COMPACT %12s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "WPCOPY %12s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "IRQ %15s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n", "count", "real total", "virtual total", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->cpu_count, From b016d0873777462e55af4c615104cc684fce086d Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Sat, 8 Feb 2025 14:49:01 +0800 Subject: [PATCH 270/310] taskstats: modify taskstats version After adding "delay max" and "delay min" to the taskstats structure, the taskstats version needs to be updated. 
Link: https://lkml.kernel.org/r/20250208144901218Q5ptVpqsQkb2MOEmW4Ujn@zte.com.cn Fixes: f65c64f311ee ("delayacct: add delay min to record delay peak") Signed-off-by: Wang Yaxin Signed-off-by: Kun Jiang Reviewed-by: xu xin Signed-off-by: Andrew Morton --- include/uapi/linux/taskstats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 934e20ef7f79..95762232e018 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 14 +#define TASKSTATS_VERSION 15 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ From f39edcf6349abb2ca2df96acc8645f4d2631d0a7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 11 Feb 2025 15:26:25 +0800 Subject: [PATCH 271/310] mm: pgtable: fix incorrect reclaim of non-empty PTE pages In zap_pte_range(), if the pte lock was released midway, the pte entries may be refilled with physical pages by another thread, which may cause a non-empty PTE page to be reclaimed and eventually cause the system to crash. To fix it, fall back to the slow path in this case to recheck if all pte entries are still none. Link: https://lkml.kernel.org/r/20250211072625.89188-1-zhengqi.arch@bytedance.com Fixes: 6375e95f381e ("mm: pgtable: reclaim empty PTE page in madvise(MADV_DONTNEED)") Signed-off-by: Qi Zheng Reported-by: Christian Brauner Closes: https://lore.kernel.org/all/20250207-anbot-bankfilialen-acce9d79a2c7@brauner/ Reported-by: Qu Wenruo Closes: https://lore.kernel.org/all/152296f3-5c81-4a94-97f3-004108fba7be@gmx.com/ Tested-by: Zi Yan Cc: Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: David Hildenbrand Cc: Jann Horn Cc: Matthew Wilcox Cc: Muchun Song Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 539c0f7c6d54..b4d3d4893267 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1719,7 +1719,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pmd_t pmdval; unsigned long start = addr; bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); - bool direct_reclaim = false; + bool direct_reclaim = true; int nr; retry: @@ -1734,8 +1734,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, do { bool any_skipped = false; - if (need_resched()) + if (need_resched()) { + direct_reclaim = false; break; + } nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, &force_flush, &force_break, &any_skipped); @@ -1743,11 +1745,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, can_reclaim_pt = false; if (unlikely(force_break)) { addr += nr * PAGE_SIZE; + direct_reclaim = false; break; } } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); - if (can_reclaim_pt && addr == end) + /* + * Fast path: try to hold the pmd lock and unmap the PTE page. + * + * If the pte lock was released midway (retry case), or if the attempt + * to hold the pmd lock failed, then we need to recheck all pte entries + * to ensure they are still none, thereby preventing the pte entries + * from being repopulated by another thread. 
+ */ + if (can_reclaim_pt && direct_reclaim && addr == end) direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); From 8648ee2622aefa5b567ebea71609822373995f37 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 11 Feb 2025 13:21:17 -0800 Subject: [PATCH 272/310] mailmap: update Nick's entry Link: https://lkml.kernel.org/r/20250211212117.3195265-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index f34af946180f..a897c16d3bae 100644 --- a/.mailmap +++ b/.mailmap @@ -534,6 +534,7 @@ Nicholas Piggin Nicholas Piggin Nicholas Piggin Nicholas Piggin +Nick Desaulniers Nicolas Ferre Nicolas Pitre Nicolas Pitre From 99333229dee41b992f3b0493f6aa2e3528138384 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 11 Feb 2025 08:18:19 +0000 Subject: [PATCH 273/310] memcg: avoid dead loop when setting memory.max A softlockup issue was found with stress test: watchdog: BUG: soft lockup - CPU#27 stuck for 26s! 
[migration/27:181] CPU: 27 UID: 0 PID: 181 Comm: migration/27 6.14.0-rc2-next-20250210 #1 Stopper: multi_cpu_stop <- stop_machine_from_inactive_cpu RIP: 0010:stop_machine_yield+0x2/0x10 RSP: 0000:ff4a0dcecd19be48 EFLAGS: 00000246 RAX: ffffffff89c0108f RBX: ff4a0dcec03afe44 RCX: 0000000000000000 RDX: ff1cdaaf6eba5808 RSI: 0000000000000282 RDI: ff1cda80c1775a40 RBP: 0000000000000001 R08: 00000011620096c6 R09: 7fffffffffffffff R10: 0000000000000001 R11: 0000000000000100 R12: ff1cda80c1775a40 R13: 0000000000000000 R14: 0000000000000001 R15: ff4a0dcec03afe20 FS: 0000000000000000(0000) GS:ff1cdaaf6eb80000(0000) CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000025e2c2a001 CR4: 0000000000773ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: multi_cpu_stop+0x8f/0x100 cpu_stopper_thread+0x90/0x140 smpboot_thread_fn+0xad/0x150 kthread+0xc2/0x100 ret_from_fork+0x2d/0x50 The stress test involves CPU hotplug operations and memory control group (memcg) operations. The scenario can be described as follows: echo xx > memory.max cache_ap_online oom_reaper (CPU23) (CPU50) xx < usage stop_machine_from_inactive_cpu for(;;) // all active cpus trigger OOM queue_stop_cpus_work // waiting oom_reaper multi_cpu_stop(migration/xx) // sync all active cpus ack // waiting cpu23 ack // CPU50 loops in multi_cpu_stop waiting cpu50 Detailed explanation: 1. When the usage is larger than xx, an OOM may be triggered. If the process does not handle the kill signal immediately, it will loop in memory_max_write. 2. When cache_ap_online is triggered, the multi_cpu_stop is queued to the active cpus. Within the multi_cpu_stop function, it attempts to synchronize the CPU states. However, the CPU23 didn't acknowledge because it is stuck in a loop within the for(;;). 3.
The oom_reaper process is blocked because CPU50 is in a loop, waiting for CPU23 to acknowledge the synchronization request. 4. Finally, this forms a cyclic dependency and leads to a soft lockup and dead loop. To fix this issue, add cond_resched() in the memory_max_write, so that it will not block migration task. Link: https://lkml.kernel.org/r/20250211081819.33307-1-chenridong@huaweicloud.com Fixes: b6e6edcfa405 ("mm: memcontrol: reclaim and OOM kill when shrinking memory.max below usage") Signed-off-by: Chen Ridong Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Cc: Wang Weiyang Signed-off-by: Andrew Morton --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46f8b372d212..4de6acb9b8ec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4166,6 +4166,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; + cond_resched(); } memcg_wb_domain_size_changed(memcg); From 6d7bc938adca9024a6b51cf55d9b0542b653b69c Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 10 Feb 2025 22:48:56 -0500 Subject: [PATCH 274/310] mm: hugetlb: avoid fallback for specific node allocation of 1G pages When using the HugeTLB kernel command-line to allocate 1G pages from a specific node, such as: default_hugepagesz=1G hugepages=1:1 If node 1 happens to not have enough memory for the requested number of 1G pages, the allocation falls back to other nodes. A quick way to reproduce this is by creating a KVM guest with a memory-less node and trying to allocate 1 1G page from it. Instead of failing, the allocation will fall back to other nodes. This defeats the purpose of node specific allocation. Also, specific node allocation for 2M pages doesn't have this behavior: the allocation will just fail for the pages it can't satisfy.
This issue happens because HugeTLB calls memblock_alloc_try_nid_raw() for 1G boot-time allocation as this function falls back to other nodes if the allocation can't be satisfied. Use memblock_alloc_exact_nid_raw() instead, which ensures that the allocation will only be satisfied from the specified node. Link: https://lkml.kernel.org/r/20250211034856.629371-1-luizcap@redhat.com Fixes: b5389086ad7b ("hugetlbfs: extend the definition of hugepages parameter to support node allocation") Signed-off-by: Luiz Capitulino Acked-by: Oscar Salvador Acked-by: David Hildenbrand Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Zhenguo Yao Cc: Frank van der Linden Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65068671e460..163190e89ea1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3145,7 +3145,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; From 5dcf52e2ce0fe3c4516b1e494c1af6d3a69e30e7 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 Feb 2025 17:44:25 +0000 Subject: [PATCH 275/310] selftests/mm: fix check for running THP tests When testing if we should try to compact memory or drop caches before we run the THP or HugeTLB tests we use | as an or operator. This doesn't work since run_vmtests.sh is written in shell where this is used to pipe the output of the first argument into the second. Instead use the shell's -o operator. 
Link: https://lkml.kernel.org/r/20250212-kselftest-mm-no-hugepages-v1-1-44702f538522@kernel.org Fixes: b433ffa8dbac ("selftests: mm: perform some system cleanup before using hugepages") Signed-off-by: Mark Brown Reviewed-by: Nico Pache Cc: Mariano Pache Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 333c468c2699..da7e26668103 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -220,7 +220,7 @@ run_test() { if test_selected ${CATEGORY}; then # On memory constrainted systems some tests can fail to allocate hugepages. # perform some cleanup before the test for a higher success rate. - if [ ${CATEGORY} == "thp" ] | [ ${CATEGORY} == "hugetlb" ]; then + if [ ${CATEGORY} == "thp" -o ${CATEGORY} == "hugetlb" ]; then echo 3 > /proc/sys/vm/drop_caches sleep 2 echo 1 > /proc/sys/vm/compact_memory From 4998a6fa2a31176d0882bdfa27d5d03b665ba19b Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 12 Feb 2025 09:35:20 -0800 Subject: [PATCH 276/310] MAINTAINERS: update Nick's contact info Updated .mailmap, but forgot these other places. Link: https://lkml.kernel.org/r/20250212173523.3979840-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton --- Documentation/process/embargoed-hardware-issues.rst | 2 +- .../translations/sp_SP/process/embargoed-hardware-issues.rst | 2 +- MAINTAINERS | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index daebce49cfdf..0e19d2f0d6bb 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -308,7 +308,7 @@ an involved disclosed party. 
The current ambassadors list: Google Kees Cook - LLVM Nick Desaulniers + LLVM Nick Desaulniers ============= ======================================================== If you want your organization to be added to the ambassadors list, please diff --git a/Documentation/translations/sp_SP/process/embargoed-hardware-issues.rst b/Documentation/translations/sp_SP/process/embargoed-hardware-issues.rst index 7d4d694967c7..9d444b9c46d3 100644 --- a/Documentation/translations/sp_SP/process/embargoed-hardware-issues.rst +++ b/Documentation/translations/sp_SP/process/embargoed-hardware-issues.rst @@ -287,7 +287,7 @@ revelada involucrada. La lista de embajadores actuales: Google Kees Cook - LLVM Nick Desaulniers + LLVM Nick Desaulniers ============= ======================================================== Si quiere que su organización se añada a la lista de embajadores, por diff --git a/MAINTAINERS b/MAINTAINERS index efee40ea589f..4e17764cb6ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5655,7 +5655,7 @@ F: .clang-format CLANG/LLVM BUILD SUPPORT M: Nathan Chancellor -R: Nick Desaulniers +R: Nick Desaulniers R: Bill Wendling R: Justin Stitt L: llvm@lists.linux.dev From ac7af1f57acd1e1d112b36e036584ca4bc4c284a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 17 Feb 2025 15:44:02 -0500 Subject: [PATCH 277/310] kasan: don't call find_vm_area() in a PREEMPT_RT kernel The following bug report was found when running a PREEMPT_RT debug kernel. BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 140605, name: kunit_try_catch preempt_count: 1, expected: 0 Call trace: rt_spin_lock+0x70/0x140 find_vmap_area+0x84/0x168 find_vm_area+0x1c/0x50 print_address_description.constprop.0+0x2a0/0x320 print_report+0x108/0x1f8 kasan_report+0x90/0xc8 Since commit e30a0361b851 ("kasan: make report_lock a raw spinlock"), report_lock was changed to raw_spinlock_t to fix another similar PREEMPT_RT problem. 
That alone isn't enough to cover other corner cases. print_address_description() is always invoked under the report_lock. The context under this lock is always atomic even on PREEMPT_RT. find_vm_area() acquires vmap_node::busy.lock which is a spinlock_t, becoming a sleeping lock on PREEMPT_RT and must not be acquired in atomic context. Don't invoke find_vm_area() on PREEMPT_RT and just print the address. Non-PREEMPT_RT builds remain unchanged. Add a DEFINE_WAIT_OVERRIDE_MAP() macro to tell lockdep that this lock nesting is allowed because the PREEMPT_RT part (which is invalid) has been taken care of. This macro was first introduced in commit 0cce06ba859a ("debugobjects,locking: Annotate debug_object_fill_pool() wait type violation"). Link: https://lkml.kernel.org/r/20250217204402.60533-1-longman@redhat.com Fixes: e30a0361b851 ("kasan: make report_lock a raw spinlock") Signed-off-by: Waiman Long Suggested-by: Andrey Konovalov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitriy Vyukov Cc: Steven Rostedt Cc: Mariano Pache Cc: Sebastian Andrzej Siewior Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/report.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3fe77a360f1c..8357e1a33699 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +/* + * This function is invoked with report_lock (a raw_spinlock) held. A + * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping + * rt_spinlock. + * + * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a + * lockdep warning for this raw_spinlock -> spinlock dependency. This config + * option is enabled by default to ensure better test coverage to expose this + * kind of RT kernel problem. 
This lockdep splat, however, can be suppressed + * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the + * invalid PREEMPT_RT case has been taken care of. + */ +static inline struct vm_struct *kasan_find_vm_area(void *addr) +{ + static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP); + struct vm_struct *va; + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return NULL; + + /* + * Suppress lockdep warning and fetch vmalloc area of the + * offending address. + */ + lock_map_acquire_try(&vmalloc_map); + va = find_vm_area(addr); + lock_map_release(&vmalloc_map); + return va; +} + static void print_address_description(void *addr, u8 tag, struct kasan_report_info *info) { @@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag, } if (is_vmalloc_addr(addr)) { - struct vm_struct *va = find_vm_area(addr); + struct vm_struct *va = kasan_find_vm_area(addr); if (va) { pr_err("The buggy address belongs to the virtual mapping at\n" @@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); page = vmalloc_to_page(addr); + } else { + pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr); } } From 8344017aaf32a7532cff293eb3df7fd2265ebafd Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Feb 2025 00:36:59 +0800 Subject: [PATCH 278/310] test_xarray: fix failure in check_pause when CONFIG_XARRAY_MULTI is not defined In case CONFIG_XARRAY_MULTI is not defined, xa_store_order can store a multi-index entry but xas_for_each can't tell a sibling entry from a valid entry. So check_pause fails when we store a multi-index entry and expect xas_for_each to handle it normally. Avoid storing multi-index entries when CONFIG_XARRAY_MULTI is disabled to fix the failure.
Link: https://lkml.kernel.org/r/20250213163659.414309-1-shikemeng@huaweicloud.com Fixes: c9ba5249ef8b ("Xarray: move forward index correctly in xas_pause()") Signed-off-by: Kemeng Shi Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/r/CAMuHMdU_bfadUO=0OZ=AoQ9EAmQPA4wsLCBqohXR+QCeCKRn4A@mail.gmail.com Tested-by: Geert Uytterhoeven Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/test_xarray.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 6932a26f4927..0e865bab4a10 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1418,7 +1418,7 @@ static noinline void check_pause(struct xarray *xa) { XA_STATE(xas, xa, 0); void *entry; - unsigned int order; + int order; unsigned long index = 1; unsigned int count = 0; @@ -1450,7 +1450,7 @@ static noinline void check_pause(struct xarray *xa) xa_destroy(xa); index = 0; - for (order = XA_CHUNK_SHIFT; order > 0; order--) { + for (order = order_limit - 1; order >= 0; order--) { XA_BUG_ON(xa, xa_store_order(xa, index, order, xa_mk_index(index), GFP_KERNEL)); index += 1UL << order; @@ -1462,24 +1462,25 @@ static noinline void check_pause(struct xarray *xa) rcu_read_lock(); xas_for_each(&xas, entry, ULONG_MAX) { XA_BUG_ON(xa, entry != xa_mk_index(index)); - index += 1UL << (XA_CHUNK_SHIFT - count); + index += 1UL << (order_limit - count - 1); count++; } rcu_read_unlock(); - XA_BUG_ON(xa, count != XA_CHUNK_SHIFT); + XA_BUG_ON(xa, count != order_limit); index = 0; count = 0; - xas_set(&xas, XA_CHUNK_SIZE / 2 + 1); + /* test unaligned index */ + xas_set(&xas, 1 % (1UL << (order_limit - 1))); rcu_read_lock(); xas_for_each(&xas, entry, ULONG_MAX) { XA_BUG_ON(xa, entry != xa_mk_index(index)); - index += 1UL << (XA_CHUNK_SHIFT - count); + index += 1UL << (order_limit - count - 1); count++; xas_pause(&xas); } rcu_read_unlock(); - XA_BUG_ON(xa, count != XA_CHUNK_SHIFT); + XA_BUG_ON(xa, count != order_limit); xa_destroy(xa); From 
8fb5bb169d17cdd12c2dcc2e96830ed487d77a0f Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:49 +0100 Subject: [PATCH 279/310] sockmap, vsock: For connectible sockets allow only connected sockmap expects all vsocks to have a transport assigned, which is expressed in vsock_proto::psock_update_sk_prot(). However, there is an edge case where an unconnected (connectible) socket may lose its previously assigned transport. This is handled with a NULL check in the vsock/BPF recv path. Another design detail is that listening vsocks are not supposed to have any transport assigned at all. Which implies they are not supported by the sockmap. But this is complicated by the fact that a socket, before switching to TCP_LISTEN, may have had some transport assigned during a failed connect() attempt. Hence, we may end up with a listening vsock in a sockmap, which blows up quickly: KASAN: null-ptr-deref in range [0x0000000000000120-0x0000000000000127] CPU: 7 UID: 0 PID: 56 Comm: kworker/7:0 Not tainted 6.14.0-rc1+ Workqueue: vsock-loopback vsock_loopback_work RIP: 0010:vsock_read_skb+0x4b/0x90 Call Trace: sk_psock_verdict_data_ready+0xa4/0x2e0 virtio_transport_recv_pkt+0x1ca8/0x2acc vsock_loopback_work+0x27d/0x3f0 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x35a/0x700 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 For connectible sockets, instead of relying solely on the state of vsk->transport, tell sockmap to only allow those representing established connections. This aligns with the behaviour for AF_INET and AF_UNIX. 
Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Acked-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/core/sock_map.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f1b9b3958792..2f1be9baad05 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -541,6 +541,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } From 857ae05549ee2542317e7084ecaa5f8536634dd9 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:50 +0100 Subject: [PATCH 280/310] vsock/bpf: Warn on socket without transport In the spirit of commit 91751e248256 ("vsock: prevent null-ptr-deref in vsock_*[has_data|has_space]"), armorize the "impossible" cases with a warning. 
Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 3 +++ net/vmw_vsock/vsock_bpf.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 53a081d49d28..7e3db87ae433 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1189,6 +1189,9 @@ static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor) { struct vsock_sock *vsk = vsock_sk(sk); + if (WARN_ON_ONCE(!vsk->transport)) + return -ENODEV; + return vsk->transport->read_skb(vsk, read_actor); } diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index f201d9eca1df..07b96d56f3a5 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -87,7 +87,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, lock_sock(sk); vsk = vsock_sk(sk); - if (!vsk->transport) { + if (WARN_ON_ONCE(!vsk->transport)) { copied = -ENODEV; goto out; } From 8350695bfb169b1924626a68f76b369ad01f18f2 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:51 +0100 Subject: [PATCH 281/310] selftest/bpf: Adapt vsock_delete_on_close to sockmap rejecting unconnected Commit 515745445e92 ("selftest/bpf: Add test for vsock removal from sockmap on close()") added test that checked if proto::close() callback was invoked on AF_VSOCK socket release. I.e. it verified that a close()d vsock does indeed get removed from the sockmap. It was done simply by creating a socket pair and attempting to replace a close()d one with its peer. Since, due to a recent change, sockmap does not allow updating index with a non-established connectible vsock, redo it with a freshly established one. 
Signed-off-by: Michal Luczaj Acked-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- .../selftests/bpf/prog_tests/sockmap_basic.c | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 884ad87783d5..21793d8c79e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -111,31 +111,35 @@ static void test_sockmap_create_update_free(enum bpf_map_type map_type) static void test_sockmap_vsock_delete_on_close(void) { - int err, c, p, map; - const int zero = 0; - - err = create_pair(AF_VSOCK, SOCK_STREAM, &c, &p); - if (!ASSERT_OK(err, "create_pair(AF_VSOCK)")) - return; + int map, c, p, err, zero = 0; map = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(int), sizeof(int), 1, NULL); - if (!ASSERT_GE(map, 0, "bpf_map_create")) { - close(c); - goto out; - } + if (!ASSERT_OK_FD(map, "bpf_map_create")) + return; - err = bpf_map_update_elem(map, &zero, &c, BPF_NOEXIST); - close(c); - if (!ASSERT_OK(err, "bpf_map_update")) - goto out; + err = create_pair(AF_VSOCK, SOCK_STREAM, &c, &p); + if (!ASSERT_OK(err, "create_pair")) + goto close_map; - err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST); + if (xbpf_map_update_elem(map, &zero, &c, BPF_NOEXIST)) + goto close_socks; + + xclose(c); + xclose(p); + + err = create_pair(AF_VSOCK, SOCK_STREAM, &c, &p); + if (!ASSERT_OK(err, "create_pair")) + goto close_map; + + err = bpf_map_update_elem(map, &zero, &c, BPF_NOEXIST); ASSERT_OK(err, "after close(), bpf_map_update"); -out: - close(p); - close(map); +close_socks: + xclose(c); + xclose(p); +close_map: + xclose(map); } static void test_skmsg_helpers(enum bpf_map_type map_type) From 85928e9c436398abcac32a9afa2f591895dd497d Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:52 +0100 Subject: [PATCH 282/310] selftest/bpf: Add vsock 
test for sockmap rejecting unconnected Verify that for a connectible AF_VSOCK socket, merely having a transport assigned is insufficient; socket must be connected for the sockmap to accept. This does not test datagram vsocks. Even though it hardly matters. VMCI is the only transport that features VSOCK_TRANSPORT_F_DGRAM, but it has an unimplemented vsock_transport::readskb() callback, making it unsupported by BPF/sockmap. Signed-off-by: Michal Luczaj Acked-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- .../selftests/bpf/prog_tests/sockmap_basic.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 21793d8c79e1..05eb37935c3e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1065,6 +1065,34 @@ static void test_sockmap_skb_verdict_vsock_poll(void) test_sockmap_pass_prog__destroy(skel); } +static void test_sockmap_vsock_unconnected(void) +{ + struct sockaddr_storage addr; + int map, s, zero = 0; + socklen_t alen; + + map = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(int), + sizeof(int), 1, NULL); + if (!ASSERT_OK_FD(map, "bpf_map_create")) + return; + + s = xsocket(AF_VSOCK, SOCK_STREAM, 0); + if (s < 0) + goto close_map; + + /* Fail connect(), but trigger transport assignment. 
*/ + init_addr_loopback(AF_VSOCK, &addr, &alen); + if (!ASSERT_ERR(connect(s, sockaddr(&addr), alen), "connect")) + goto close_sock; + + ASSERT_ERR(bpf_map_update_elem(map, &zero, &s, BPF_ANY), "map_update"); + +close_sock: + xclose(s); +close_map: + xclose(map); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -1131,4 +1159,6 @@ void test_sockmap_basic(void) test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKHASH); if (test__start_subtest("sockmap skb_verdict vsock poll")) test_sockmap_skb_verdict_vsock_poll(); + if (test__start_subtest("sockmap vsock unconnected")) + test_sockmap_vsock_unconnected(); } From f5da7c45188eea71394bf445655cae2df88a7788 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Feb 2025 15:29:05 -0800 Subject: [PATCH 283/310] tcp: adjust rcvq_space after updating scaling ratio Since commit under Fixes we set the window clamp in accordance to newly measured rcvbuf scaling_ratio. If the scaling_ratio decreased significantly we may put ourselves in a situation where windows become smaller than rcvq_space, preventing tcp_rcv_space_adjust() from increasing rcvbuf. The significant decrease of scaling_ratio is far more likely since commit 697a6c8cec03 ("tcp: increase the default TCP scaling ratio"), which increased the "default" scaling ratio from ~30% to 50%. Hitting the bad condition depends a lot on TCP tuning, and drivers at play. One of Meta's workloads hits it reliably under following conditions: - default rcvbuf of 125k - sender MTU 1500, receiver MTU 5000 - driver settles on scaling_ratio of 78 for the config above. Initial rcvq_space gets calculated as TCP_INIT_CWND * tp->advmss (10 * 5k = 50k). Once we find out the true scaling ratio and MSS we clamp the windows to 38k. Triggering the condition also depends on the message sequence of this workload. I can't repro the problem with simple iperf or TCP_RR-style tests. 
Fixes: a2cbb1603943 ("tcp: Update window clamping condition") Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250217232905.3162187-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb82e01da911..98b8cc740392 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -243,9 +243,15 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; - if (old_ratio != tcp_sk(sk)->scaling_ratio) - WRITE_ONCE(tcp_sk(sk)->window_clamp, - tcp_win_from_space(sk, sk->sk_rcvbuf)); + if (old_ratio != tcp_sk(sk)->scaling_ratio) { + struct tcp_sock *tp = tcp_sk(sk); + + val = tcp_win_from_space(sk, sk->sk_rcvbuf); + tcp_set_window_clamp(sk, val); + + if (tp->window_clamp < tp->rcvq_space.space) + tp->rcvq_space.space = tp->window_clamp; + } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); From 415cadd505464d9a11ff5e0f6e0329c127849da5 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Fri, 14 Feb 2025 14:43:59 -0800 Subject: [PATCH 284/310] gve: set xdp redirect target only when it is available Before this patch the NETDEV_XDP_ACT_NDO_XMIT XDP feature flag is set by default as part of driver initialization, and is never cleared. However, this flag differs from others in that it is used as an indicator for whether the driver is ready to perform the ndo_xdp_xmit operation as part of an XDP_REDIRECT. Kernel helpers xdp_features_(set|clear)_redirect_target exist to convey this meaning. This patch ensures that the netdev is only reported as a redirect target when XDP queues exist to forward traffic. 
Fixes: 39a7f4aa3e4a ("gve: Add XDP REDIRECT support for GQI-QPL format") Cc: stable@vger.kernel.org Reviewed-by: Praveen Kaligineedi Reviewed-by: Jeroen de Borst Signed-off-by: Joshua Washington Link: https://patch.msgid.link/20250214224417.1237818-1-joshwash@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve.h | 10 ++++++++++ drivers/net/ethernet/google/gve/gve_main.c | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 8167cc5fb0df..78d2a19593d1 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1116,6 +1116,16 @@ static inline u32 gve_xdp_tx_start_queue_id(struct gve_priv *priv) return gve_xdp_tx_queue_id(priv, 0); } +static inline bool gve_supports_xdp_xmit(struct gve_priv *priv) +{ + switch (priv->queue_format) { + case GVE_GQI_QPL_FORMAT: + return true; + default: + return false; + } +} + /* gqi napi handler defined in gve_main.c */ int gve_napi_poll(struct napi_struct *napi, int budget); diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 533e659b15b3..92237fb0b60c 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1903,6 +1903,8 @@ static void gve_turndown(struct gve_priv *priv) /* Stop tx queues */ netif_tx_disable(priv->dev); + xdp_features_clear_redirect_target(priv->dev); + gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); @@ -1972,6 +1974,9 @@ static void gve_turnup(struct gve_priv *priv) napi_schedule(&block->napi); } + if (priv->num_xdp_queues && gve_supports_xdp_xmit(priv)) + xdp_features_set_redirect_target(priv->dev, false); + gve_set_napi_enabled(priv); } @@ -2246,7 +2251,6 @@ static void gve_set_netdev_xdp_features(struct gve_priv *priv) if (priv->queue_format == GVE_GQI_QPL_FORMAT) { xdp_features = NETDEV_XDP_ACT_BASIC; xdp_features |= 
NETDEV_XDP_ACT_REDIRECT; - xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; } else { xdp_features = 0; From 2f56be7f52ece7fc8c16a58ca9683f0a73e288e1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 15 Feb 2025 08:26:46 -0800 Subject: [PATCH 285/310] MAINTAINERS: trim the GVE entry We requested in the past that GVE patches coming out of Google should be submitted only by GVE maintainers. There were too many patches posted which didn't follow the subsystem guidance. Recently Joshua was added to maintainers, but even tho he was asked to follow the netdev "FAQ" in the past [1] he does not follow the local customs. It is not reasonable for a person who hasn't read the maintainer entry for the subsystem to be a driver maintainer. We can re-add once Joshua does some on-list reviews to prove the fluency with the upstream process. Link: https://lore.kernel.org/20240610172720.073d5912@kernel.org # [1] Link: https://patch.msgid.link/20250215162646.2446559-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1405ebe703a8..0bfcbe6a74ea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9834,7 +9834,6 @@ F: drivers/input/touchscreen/goodix* GOOGLE ETHERNET DRIVERS M: Jeroen de Borst -M: Joshua Washington M: Harshitha Ramamurthy L: netdev@vger.kernel.org S: Maintained From f6093c5ec74d5cc495f89bd359253d9c738d04d9 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 17 Feb 2025 14:48:11 +0100 Subject: [PATCH 286/310] net: pse-pd: pd692x0: Fix power limit retrieval Fix incorrect data offset read in the pd692x0_pi_get_pw_limit callback. The issue was previously unnoticed as it was only used by the regulator API and not thoroughly tested, since the PSE is mainly controlled via ethtool. 
The function became actively used by ethtool after commit 3e9dbfec4998 ("net: pse-pd: Split ethtool_get_status into multiple callbacks"), which led to the discovery of this issue. Fix it by using the correct data offset. Fixes: a87e699c9d33 ("net: pse-pd: pd692x0: Enhance with new current limit and voltage read callbacks") Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250217134812.1925345-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pd692x0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c index fc9e23927b3b..7d60a714ca53 100644 --- a/drivers/net/pse-pd/pd692x0.c +++ b/drivers/net/pse-pd/pd692x0.c @@ -1047,7 +1047,7 @@ static int pd692x0_pi_get_pw_limit(struct pse_controller_dev *pcdev, if (ret < 0) return ret; - return pd692x0_pi_get_pw_from_table(buf.data[2], buf.data[3]); + return pd692x0_pi_get_pw_from_table(buf.data[0], buf.data[1]); } static int pd692x0_pi_set_pw_limit(struct pse_controller_dev *pcdev, From e57a6320215c3967f51ab0edeff87db2095440e4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:27 -0800 Subject: [PATCH 287/310] net: Add net_passive_inc() and net_passive_dec(). net_drop_ns() is NULL when CONFIG_NET_NS is disabled. The next patch introduces a function that increments and decrements net->passive. As a prep, let's rename and export net_free() to net_passive_dec() and add net_passive_inc(). 
Suggested-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89i+oUCt2VGvrbrweniTendZFEh+nwS=uonc004-aPkWy-Q@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 11 +++++++++++ net/core/net_namespace.c | 8 ++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 7ba1402ca779..f467a66abc6b 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -297,6 +297,7 @@ static inline int check_net(const struct net *net) } void net_drop_ns(void *); +void net_passive_dec(struct net *net); #else @@ -326,8 +327,18 @@ static inline int check_net(const struct net *net) } #define net_drop_ns NULL + +static inline void net_passive_dec(struct net *net) +{ + refcount_dec(&net->passive); +} #endif +static inline void net_passive_inc(struct net *net) +{ + refcount_inc(&net->passive); +} + /* Returns true if the netns initialization is completed successfully */ static inline bool net_initialized(const struct net *net) { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb39a12b2f82..4303f2a49262 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -464,7 +464,7 @@ static void net_complete_free(void) } -static void net_free(struct net *net) +void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); @@ -482,7 +482,7 @@ void net_drop_ns(void *p) struct net *net = (struct net *)p; if (net) - net_free(net); + net_passive_dec(net); } struct net *copy_net_ns(unsigned long flags, @@ -523,7 +523,7 @@ struct net *copy_net_ns(unsigned long flags, key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); - net_free(net); + net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -672,7 +672,7 @@ 
static void cleanup_net(struct work_struct *work) key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); - net_free(net); + net_passive_dec(net); } cleanup_net_task = NULL; } From 65161fb544aada499c912b6010a8f7d8e04f6130 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:28 -0800 Subject: [PATCH 288/310] net: Fix dev_net(dev) race in unregister_netdevice_notifier_dev_net(). After the cited commit, dev_net(dev) is fetched before holding RTNL and passed to __unregister_netdevice_notifier_net(). However, dev_net(dev) might be different after holding RTNL. In the reported case [0], while removing a VF device, its netns was being dismantled and the VF was moved to init_net. So the following sequence is basically illegal when dev was fetched without lookup: net = dev_net(dev); rtnl_net_lock(net); Let's use a new helper rtnl_net_dev_lock() to fix the race. It fetches dev_net_rcu(dev), bumps its net->passive, and checks if dev_net_rcu(dev) is changed after rtnl_net_lock(). 
[0]: BUG: KASAN: slab-use-after-free in notifier_call_chain (kernel/notifier.c:75 (discriminator 2)) Read of size 8 at addr ffff88810cefb4c8 by task test-bridge-lag/21127 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:123) print_report (mm/kasan/report.c:379 mm/kasan/report.c:489) kasan_report (mm/kasan/report.c:604) notifier_call_chain (kernel/notifier.c:75 (discriminator 2)) call_netdevice_notifiers_info (net/core/dev.c:2011) unregister_netdevice_many_notify (net/core/dev.c:11551) unregister_netdevice_queue (net/core/dev.c:11487) unregister_netdev (net/core/dev.c:11635) mlx5e_remove (drivers/net/ethernet/mellanox/mlx5/core/en_main.c:6552 drivers/net/ethernet/mellanox/mlx5/core/en_main.c:6579) mlx5_core auxiliary_bus_remove (drivers/base/auxiliary.c:230) device_release_driver_internal (drivers/base/dd.c:1275 drivers/base/dd.c:1296) bus_remove_device (./include/linux/kobject.h:193 drivers/base/base.h:73 drivers/base/bus.c:583) device_del (drivers/base/power/power.h:142 drivers/base/core.c:3855) mlx5_rescan_drivers_locked (./include/linux/auxiliary_bus.h:241 drivers/net/ethernet/mellanox/mlx5/core/dev.c:333 drivers/net/ethernet/mellanox/mlx5/core/dev.c:535 drivers/net/ethernet/mellanox/mlx5/core/dev.c:549) mlx5_core mlx5_unregister_device (drivers/net/ethernet/mellanox/mlx5/core/dev.c:468) mlx5_core mlx5_uninit_one (./include/linux/instrumented.h:68 ./include/asm-generic/bitops/instrumented-non-atomic.h:141 drivers/net/ethernet/mellanox/mlx5/core/main.c:1563) mlx5_core remove_one (drivers/net/ethernet/mellanox/mlx5/core/main.c:965 drivers/net/ethernet/mellanox/mlx5/core/main.c:2019) mlx5_core pci_device_remove (./include/linux/pm_runtime.h:129 drivers/pci/pci-driver.c:475) device_release_driver_internal (drivers/base/dd.c:1275 drivers/base/dd.c:1296) unbind_store (drivers/base/bus.c:245) kernfs_fop_write_iter (fs/kernfs/file.c:338) vfs_write 
(fs/read_write.c:587 (discriminator 1) fs/read_write.c:679 (discriminator 1)) ksys_write (fs/read_write.c:732) do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f6a4d5018b7 Fixes: 7fb1073300a2 ("net: Hold rtnl_net_lock() in (un)?register_netdevice_notifier_dev_net().") Reported-by: Yael Chemla Closes: https://lore.kernel.org/netdev/146eabfe-123c-4970-901e-e961b4c09bc3@nvidia.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index b91658e8aedb..19e268568282 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2070,6 +2070,42 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. 
*/ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -2077,6 +2113,11 @@ int register_netdevice_notifier_dev_net(struct net_device *dev, struct net *net = dev_net(dev); int err; + /* rtnl_net_lock() assumes dev is not yet published by + * register_netdevice(). + */ + DEBUG_NET_WARN_ON_ONCE(!list_empty(&dev->dev_list)); + rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); if (!err) { @@ -2093,13 +2134,12 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { - struct net *net = dev_net(dev); int err; - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); list_del(&nn->list); - err = __unregister_netdevice_notifier_net(net, nb); - rtnl_net_unlock(net); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_net_dev_unlock(dev); return err; } From d4c6bfc83936cb61fac99e9891c406fbdd40f964 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:29 -0800 Subject: [PATCH 289/310] dev: Use rtnl_net_dev_lock() in unregister_netdev(). The following sequence is basically illegal when dev was fetched without lookup because dev_net(dev) might be different after holding rtnl_net_lock(): net = dev_net(dev); rtnl_net_lock(net); Let's use rtnl_net_dev_lock() in unregister_netdev(). Note that there is no real bug in unregister_netdev() for now because RTNL protects the scope even if dev_net(dev) is changed before/after RTNL. 
Fixes: 00fb9823939e ("dev: Hold per-netns RTNL in (un)?register_netdev().") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 19e268568282..fafd2f4b5d5d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11920,11 +11920,9 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - struct net *net = dev_net(dev); - - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_net_unlock(net); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); From 7330195e6018ece3e886177ffbc9349a0b6585e6 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 11 Feb 2025 20:51:25 +1030 Subject: [PATCH 290/310] smb: client, common: Avoid multiple -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. So, in order to avoid ending up with flexible-array members in the middle of other structs, we use the `__struct_group()` helper to separate the flexible arrays from the rest of the members in the flexible structures. We then use the newly created tagged `struct smb2_file_link_info_hdr` and `struct smb2_file_rename_info_hdr` to replace the type of the objects causing trouble: `rename_info` and `link_info` in `struct smb2_compound_vars`. We also want to ensure that when new members need to be added to the flexible structures, they are always included within the newly created tagged structs. For this, we use `static_assert()`. This ensures that the memory layout for both the flexible structure and the new tagged struct is the same after any changes. 
So, with these changes, fix 86 of the following warnings: fs/smb/client/cifsglob.h:2335:36: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] fs/smb/client/cifsglob.h:2334:38: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 4 ++-- fs/smb/common/smb2pdu.h | 30 ++++++++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 4bdd6a43e521..bc06b8ae2ebd 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2324,8 +2324,8 @@ struct smb2_compound_vars { struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; - struct smb2_file_rename_info rename_info; - struct smb2_file_link_info link_info; + struct smb2_file_rename_info_hdr rename_info; + struct smb2_file_link_info_hdr link_info; struct kvec ea_iov; }; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 3336df2ea5d4..c7a0efda4403 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1707,23 +1707,33 @@ struct smb2_file_internal_info { } __packed; /* level 6 Query */ struct smb2_file_rename_info { /* encoding of request for level 10 */ - __u8 ReplaceIfExists; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; + /* New members MUST be added within the struct_group() macro below. 
*/ + __struct_group(smb2_file_rename_info_hdr, __hdr, __packed, + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + ); char FileName[]; /* New name to be assigned */ /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ } __packed; /* level 10 Set */ +static_assert(offsetof(struct smb2_file_rename_info, FileName) == sizeof(struct smb2_file_rename_info_hdr), + "struct member likely outside of __struct_group()"); struct smb2_file_link_info { /* encoding of request for level 11 */ - __u8 ReplaceIfExists; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; + /* New members MUST be added within the struct_group() macro below. */ + __struct_group(smb2_file_link_info_hdr, __hdr, __packed, + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + ); char FileName[]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ +static_assert(offsetof(struct smb2_file_link_info, FileName) == sizeof(struct smb2_file_link_info_hdr), + "struct member likely outside of __struct_group()"); /* * This level 18, although with struct with same name is different from cifs From 9df23801c83d3e12b4c09be39d37d2be385e52f9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 16 Feb 2025 22:17:54 -0600 Subject: [PATCH 291/310] smb311: failure to open files of length 1040 when mounting with SMB3.1.1 POSIX extensions If a file size has bits 0x410 = ATTR_DIRECTORY | ATTR_REPARSE set then during queryinfo (stat) the file is regarded as a directory and subsequent opens can fail. 
A simple test example is trying to open any file 1040 bytes long when mounting with "posix" (SMB3.1.1 POSIX/Linux Extensions). The cause of this bug is that Attributes field in smb2_file_all_info struct occupies the same place that EndOfFile field in smb311_posix_qinfo, and sometimes the latter struct is incorrectly processed as if it was the first one. Reported-by: Oleh Nykyforchyn Tested-by: Oleh Nykyforchyn Acked-by: Paulo Alcantara (Red Hat) Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/reparse.h | 28 ++++++++++++++++++++++------ fs/smb/client/smb2inode.c | 4 ++++ fs/smb/client/smb2ops.c | 3 ++- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index bc06b8ae2ebd..cddeb2adbf4a 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -253,6 +253,7 @@ struct cifs_cred { struct cifs_open_info_data { bool adjust_tz; bool reparse_point; + bool contains_posix_file_info; struct { /* ioctl response buffer */ struct { diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 5a753fec7e2c..c0be5ab45a78 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -99,14 +99,30 @@ static inline bool reparse_inode_match(struct inode *inode, static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) { - struct smb2_file_all_info *fi = &data->fi; - u32 attrs = le32_to_cpu(fi->Attributes); + u32 attrs; bool ret; - ret = data->reparse_point || (attrs & ATTR_REPARSE); - if (ret) - attrs |= ATTR_REPARSE; - fi->Attributes = cpu_to_le32(attrs); + if (data->contains_posix_file_info) { + struct smb311_posix_qinfo *fi = &data->posix_fi; + + attrs = le32_to_cpu(fi->DosAttributes); + if (data->reparse_point) { + attrs |= ATTR_REPARSE; + fi->DosAttributes = cpu_to_le32(attrs); + } + + } else { + struct smb2_file_all_info *fi = &data->fi; + + attrs = le32_to_cpu(fi->Attributes); + if (data->reparse_point) { + attrs 
|= ATTR_REPARSE; + fi->Attributes = cpu_to_le32(attrs); + } + } + + ret = attrs & ATTR_REPARSE; + return ret; } diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 5dfb30b0a852..826b57a5a2a8 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -650,6 +650,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, switch (cmds[i]) { case SMB2_OP_QUERY_INFO: idata = in_iov[i].iov_base; + idata->contains_posix_file_info = false; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) @@ -673,6 +674,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, break; case SMB2_OP_POSIX_QUERY_INFO: idata = in_iov[i].iov_base; + idata->contains_posix_file_info = true; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) @@ -770,6 +772,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, idata = in_iov[i].iov_base; idata->reparse.io.iov = *iov; idata->reparse.io.buftype = resp_buftype[i + 1]; + idata->contains_posix_file_info = false; /* BB VERIFY */ rbuf = reparse_buf_ptr(iov); if (IS_ERR(rbuf)) { rc = PTR_ERR(rbuf); @@ -791,6 +794,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, case SMB2_OP_QUERY_WSL_EA: if (!rc) { idata = in_iov[i].iov_base; + idata->contains_posix_file_info = false; qi_rsp = rsp_iov[i + 1].iov_base; data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset); size[0] = le32_to_cpu(qi_rsp->OutputBufferLength); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index ec36bed54b0b..23e0c8be7fb5 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1001,6 +1001,7 @@ static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, if (!data->symlink_target) return -ENOMEM; } + 
data->contains_posix_file_info = false; return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi); } @@ -5146,7 +5147,7 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, FILE_CREATE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, ACL_NO_MODE); oparms.fid = &fid; - + idata.contains_posix_file_info = false; rc = server->ops->open(xid, &oparms, &oplock, &idata); if (rc) goto out; From cad3fc0a4c8cef07b07ceddc137f582267577250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 18 Sep 2024 00:16:05 +0200 Subject: [PATCH 292/310] cifs: Throw -EOPNOTSUPP error on unsupported reparse point type from parse_reparse_point() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This would help to track and detect by caller if the reparse point type was processed or not. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/reparse.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 0a5a52a8a7dd..2b9e9885dc42 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -1088,13 +1088,12 @@ int parse_reparse_point(struct reparse_data_buffer *buf, le32_to_cpu(buf->ReparseTag)); return -EIO; } - break; + return 0; default: cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", le32_to_cpu(buf->ReparseTag)); - break; + return -EOPNOTSUPP; } - return 0; } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, From b587fd128660d48cd2122f870f720ff8e2b4abb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 18 Sep 2024 00:28:25 +0200 Subject: [PATCH 293/310] cifs: Treat unhandled directory name surrogate reparse points as mount directory nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the reparse point was not handled (indicated by the -EOPNOTSUPP from ops->parse_reparse_point() call) but reparse tag is of type 
name surrogate directory type, then treat it as a new mount point. Name surrogate reparse point represents another named entity in the system. From the SMB client's point of view, this other entity is resolved on the SMB server, and the server serves its content automatically. Therefore, from the Linux client's point of view, this name surrogate reparse point of directory type crosses a mount point. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/inode.c | 13 +++++++++++++ fs/smb/common/smbfsctl.h | 3 +++ 2 files changed, 16 insertions(+) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 3261190e6f90..616149c7f0a5 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1215,6 +1215,19 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, rc = server->ops->parse_reparse_point(cifs_sb, full_path, iov, data); + /* + * If the reparse point was not handled but it is the + * name surrogate which points to directory, then treat + * is as a new mount point. Name surrogate reparse point + * represents another named entity in the system. + */ + if (rc == -EOPNOTSUPP && + IS_REPARSE_TAG_NAME_SURROGATE(data->reparse.tag) && + (le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY)) { + rc = 0; + cifs_create_junction_fattr(fattr, sb); + goto out; + } } if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h index 4b379e84c46b..3253a18ecb5c 100644 --- a/fs/smb/common/smbfsctl.h +++ b/fs/smb/common/smbfsctl.h @@ -159,6 +159,9 @@ #define IO_REPARSE_TAG_LX_CHR 0x80000025 #define IO_REPARSE_TAG_LX_BLK 0x80000026 +/* If Name Surrogate Bit is set, the file or directory represents another named entity in the system.
*/ +#define IS_REPARSE_TAG_NAME_SURROGATE(tag) (!!((tag) & 0x20000000)) + /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 From b9ddb3e1a8aa86c61c4a93e27cf66414f5fa7b6e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Feb 2025 12:43:42 -0500 Subject: [PATCH 294/310] bcachefs: Fix fsck directory i_size checking Error handling was wrong, causing unhandled transaction restart errors. check_directory_size() was also inefficient, since keys in multiple snapshots would be iterated over once for every snapshot. Convert it to the same scheme used for i_sectors and subdir count checking. Cc: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 78 +++++++++++++++----------------------- fs/bcachefs/sb-downgrade.c | 2 +- 2 files changed, 32 insertions(+), 48 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 53a421ff136d..9bf316e7b845 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -823,6 +823,7 @@ struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; u64 count; + u64 i_size; }; struct inode_walker { @@ -910,8 +911,9 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_ if (k.k->p.snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - new.snapshot = k.k->p.snapshot; - new.count = 0; + new.snapshot = k.k->p.snapshot; + new.count = 0; + new.i_size = 0; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); @@ -1116,37 +1118,6 @@ static int get_snapshot_root_inode(struct btree_trans *trans, return ret; } -static int check_directory_size(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - struct bkey_s_c inode_k, bool *write_inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 new_size = 0; - int ret; - - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, - SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot), - 
POS(inode_k.k->p.offset, U64_MAX), - 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; - - struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_name(dirent); - - new_size += dirent_occupied_size(&name); - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && inode_u->bi_size != new_size) { - inode_u->bi_size = new_size; - *write_inode = true; - } - - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1335,16 +1306,6 @@ static int check_inode(struct btree_trans *trans, u.bi_journal_seq = journal_cur_seq(&c->journal); do_update = true; } - - if (S_ISDIR(u.bi_mode)) { - ret = check_directory_size(trans, &u, k, &do_update); - - fsck_err_on(ret, - trans, directory_size_mismatch, - "directory inode %llu:%u with the mismatch directory size", - u.bi_inum, k.k->p.snapshot); - ret = 0; - } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); @@ -2017,10 +1978,31 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ return ret; } -static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + darray_for_each(w->inodes, i) + if (fsck_err_on(i->inode.bi_size != i->i_size, + trans, inode_dir_wrong_nlink, + "directory %llu:%u with wrong i_size: got %llu, should be %llu", + w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) { + i->inode.bi_size = i->i_size; + ret = bch2_fsck_write_inode(trans, &i->inode); + if (ret) + break; + } +fsck_err: + bch_err_fn(c, ret); + return ret; +} + +static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) { u32 restart_count = trans->restart_count; return check_subdir_count_notnested(trans, w) ?: + check_dir_i_size_notnested(trans, w) ?: trans_was_restarted(trans, restart_count); } @@ 
-2367,7 +2349,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { - ret = check_subdir_count(trans, dir); + ret = check_subdir_dirents_count(trans, dir); if (ret) goto err; } @@ -2457,9 +2439,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { + if (d.v->d_type == DT_DIR) i->count++; + i->i_size += bkey_bytes(d.k); + } out: err: fsck_err: diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 14f6b6a5fb38..35e07bc8fbd3 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -92,7 +92,7 @@ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ x(directory_size, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ BCH_FSCK_ERR_directory_size_mismatch) \ #define DOWNGRADE_TABLE() \ From 4fd509c10f9687f54752fbcaf83f520c93fc1f18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Feb 2025 13:45:02 -0500 Subject: [PATCH 295/310] bcachefs: Fix bch2_indirect_extent_missing_error() We had some error handling confusion here; -BCH_ERR_missing_indirect_extent is thrown by trans_trigger_reflink_p_segment(); at this point we haven't decided whether we're generating an error.
Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 376fd0a6e868..441e648f28b5 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -172,7 +172,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, bool should_commit) { if (REFLINK_P_ERROR(p.v)) - return -BCH_ERR_missing_indirect_extent; + return 0; struct bch_fs *c = trans->c; u64 live_start = REFLINK_P_IDX(p.v); @@ -259,8 +259,6 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, return k; if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - bch2_trans_iter_exit(trans, iter); - unsigned size = min((u64) k.k->size, REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - reflink_offset); @@ -268,14 +266,16 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, k.k->p.offset, should_commit); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); + } } else if (unlikely(REFLINK_P_ERROR(p.v))) { - bch2_trans_iter_exit(trans, iter); - int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); + } } *offset_into_extent = reflink_offset - bkey_start_offset(k.k); @@ -300,7 +300,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, if (ret) return ret; - if (bkey_deleted(k.k)) { + if (!bkey_refcount_c(k)) { if (!(flags & BTREE_TRIGGER_overwrite)) ret = -BCH_ERR_missing_indirect_extent; goto next; @@ -381,8 +381,6 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, not_found: if (flags & BTREE_TRIGGER_check_repair) { ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); - if (ret == -BCH_ERR_missing_indirect_extent) - ret = 0; if (ret) goto err; 
} From b04974f759ac7574d8556deb7c602a8d01a0dcc6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Feb 2025 15:40:03 -0500 Subject: [PATCH 296/310] bcachefs: Fix srcu lock warning in btree_update_nodes_written() We don't want to be holding the srcu lock while waiting on btree write completions - easily fixed. Reported-by: Janpieter Sollie Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f4aeadbe53c1..e4e7c804625e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -681,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as) b = as->old_nodes[i]; + bch2_trans_begin(trans); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); seq = b->data ? b->data->keys.seq : 0; six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); if (seq == as->old_nodes_seq[i]) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, From 4ccacf86491d33d2486b62d4d44864d7101b299d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 12:37:04 -0800 Subject: [PATCH 297/310] gtp: Suppress list corruption splat in gtp_net_exit_batch_rtnl(). Brad Spengler reported the list_del() corruption splat in gtp_net_exit_batch_rtnl(). [0] Commit eb28fd76c0a0 ("gtp: Destroy device along with udp socket's netns dismantle.") added the for_each_netdev() loop in gtp_net_exit_batch_rtnl() to destroy devices in each netns as done in geneve and ip tunnels. However, this could trigger ->dellink() twice for the same device during ->exit_batch_rtnl(). Say we have two netns A & B and gtp device B that resides in netns B but whose UDP socket is in netns A. 1. cleanup_net() processes netns A and then B. 2. gtp_net_exit_batch_rtnl() finds the device B while iterating netns A's gn->gtp_dev_list and calls ->dellink(). 
[ device B is not yet unlinked from netns B as unregister_netdevice_many() has not been called. ] 3. gtp_net_exit_batch_rtnl() finds the device B while iterating netns B's for_each_netdev() and calls ->dellink(). gtp_dellink() cleans up the device's hash table, unlinks the dev from gn->gtp_dev_list, and calls unregister_netdevice_queue(). Basically, calling gtp_dellink() multiple times is fine unless CONFIG_DEBUG_LIST is enabled. Let's remove for_each_netdev() in gtp_net_exit_batch_rtnl() and delegate the destruction to default_device_exit_batch() as done in bareudp. [0]: list_del corruption, ffff8880aaa62c00->next (autoslab_size_M_dev_P_net_core_dev_11127_8_1328_8_S_4096_A_64_n_139+0xc00/0x1000 [slab object]) is LIST_POISON1 (ffffffffffffff02) (prev is 0xffffffffffffff04) kernel BUG at lib/list_debug.c:58! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 UID: 0 PID: 1804 Comm: kworker/u8:7 Tainted: G T 6.12.13-grsec-full-20250211091339 #1 Tainted: [T]=RANDSTRUCT Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Workqueue: netns cleanup_net RIP: 0010:[] __list_del_entry_valid_or_report+0x141/0x200 lib/list_debug.c:58 Code: c2 76 91 31 c0 e8 9f b1 f7 fc 0f 0b 4d 89 f0 48 c7 c1 02 ff ff ff 48 89 ea 48 89 ee 48 c7 c7 e0 c2 76 91 31 c0 e8 7f b1 f7 fc <0f> 0b 4d 89 e8 48 c7 c1 04 ff ff ff 48 89 ea 48 89 ee 48 c7 c7 60 RSP: 0018:fffffe8040b4fbd0 EFLAGS: 00010283 RAX: 00000000000000cc RBX: dffffc0000000000 RCX: ffffffff818c4054 RDX: ffffffff84947381 RSI: ffffffff818d1512 RDI: 0000000000000000 RBP: ffff8880aaa62c00 R08: 0000000000000001 R09: fffffbd008169f32 R10: fffffe8040b4f997 R11: 0000000000000001 R12: a1988d84f24943e4 R13: ffffffffffffff02 R14: ffffffffffffff04 R15: ffff8880aaa62c08 RBX: kasan shadow of 0x0 RCX: __wake_up_klogd.part.0+0x74/0xe0 kernel/printk/printk.c:4554 RDX: __list_del_entry_valid_or_report+0x141/0x200 lib/list_debug.c:58 RSI: vprintk+0x72/0x100 kernel/printk/printk_safe.c:71 RBP: 
autoslab_size_M_dev_P_net_core_dev_11127_8_1328_8_S_4096_A_64_n_139+0xc00/0x1000 [slab object] RSP: process kstack fffffe8040b4fbd0+0x7bd0/0x8000 [kworker/u8:7+netns 1804 ] R09: kasan shadow of process kstack fffffe8040b4f990+0x7990/0x8000 [kworker/u8:7+netns 1804 ] R10: process kstack fffffe8040b4f997+0x7997/0x8000 [kworker/u8:7+netns 1804 ] R15: autoslab_size_M_dev_P_net_core_dev_11127_8_1328_8_S_4096_A_64_n_139+0xc08/0x1000 [slab object] FS: 0000000000000000(0000) GS:ffff888116000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000748f5372c000 CR3: 0000000015408000 CR4: 00000000003406f0 shadow CR4: 00000000003406f0 Stack: 0000000000000000 ffffffff8a0c35e7 ffffffff8a0c3603 ffff8880aaa62c00 ffff8880aaa62c00 0000000000000004 ffff88811145311c 0000000000000005 0000000000000001 ffff8880aaa62000 fffffe8040b4fd40 ffffffff8a0c360d Call Trace: [] __list_del_entry_valid include/linux/list.h:131 [inline] fffffe8040b4fc28 [] __list_del_entry include/linux/list.h:248 [inline] fffffe8040b4fc28 [] list_del include/linux/list.h:262 [inline] fffffe8040b4fc28 [] gtp_dellink+0x16d/0x360 drivers/net/gtp.c:1557 fffffe8040b4fc28 [] gtp_net_exit_batch_rtnl+0x124/0x2c0 drivers/net/gtp.c:2495 fffffe8040b4fc88 [] cleanup_net+0x5a4/0xbe0 net/core/net_namespace.c:635 fffffe8040b4fcd0 [] process_one_work+0xbd7/0x2160 kernel/workqueue.c:3326 fffffe8040b4fd88 [] process_scheduled_works kernel/workqueue.c:3407 [inline] fffffe8040b4fec0 [] worker_thread+0x6b5/0xfa0 kernel/workqueue.c:3488 fffffe8040b4fec0 [] kthread+0x360/0x4c0 kernel/kthread.c:397 fffffe8040b4ff78 [] ret_from_fork+0x74/0xe0 arch/x86/kernel/process.c:172 fffffe8040b4ffb8 [] ret_from_fork_asm+0x29/0xc0 arch/x86/entry/entry_64.S:399 fffffe8040b4ffe8 Modules linked in: Fixes: eb28fd76c0a0 ("gtp: Destroy device along with udp socket's netns dismantle.") Reported-by: Brad Spengler Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250217203705.40342-2-kuniyu@amazon.com 
Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index d64740bf44ed..b7b46c5e6399 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -2481,11 +2481,6 @@ static void __net_exit gtp_net_exit_batch_rtnl(struct list_head *net_list, list_for_each_entry(net, net_list, exit_list) { struct gtp_net *gn = net_generic(net, gtp_net_id); struct gtp_dev *gtp, *gtp_next; - struct net_device *dev; - - for_each_netdev(net, dev) - if (dev->rtnl_link_ops == &gtp_link_ops) - gtp_dellink(dev, dev_to_kill); list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) gtp_dellink(gtp->dev, dev_to_kill); From 62fab6eef61f245dc8797e3a6a5b890ef40e8628 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 12:37:05 -0800 Subject: [PATCH 298/310] geneve: Suppress list corruption splat in geneve_destroy_tunnels(). As explained in the previous patch, iterating for_each_netdev() and gn->geneve_list during ->exit_batch_rtnl() could trigger ->dellink() twice for the same device. If CONFIG_DEBUG_LIST is enabled, we will see a list_del() corruption splat in the 2nd call of geneve_dellink(). Let's remove for_each_netdev() in geneve_destroy_tunnels() and delegate that part to default_device_exit_batch(). 
Fixes: 9593172d93b9 ("geneve: Fix use-after-free in geneve_find_dev().") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250217203705.40342-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index a1f674539965..dbb3960126ee 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1902,14 +1902,7 @@ static void geneve_destroy_tunnels(struct net *net, struct list_head *head) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *geneve, *next; - struct net_device *dev, *aux; - /* gather any geneve devices that were moved into this ns */ - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == &geneve_link_ops) - geneve_dellink(dev, head); - - /* now gather any other geneve devices that were created in this ns */ list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) geneve_dellink(geneve->dev, head); } From 3e5796862c692ea608d96f0a1437f9290f44953a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:07 -0800 Subject: [PATCH 299/310] flow_dissector: Fix handling of mixed port and port-range keys This patch fixes a bug in TC flower filter where rules combining a specific destination port with a source port range weren't working correctly. The specific case was when users tried to configure rules like: tc filter add dev ens38 ingress protocol ip flower ip_proto udp \ dst_port 5000 src_port 2000-3000 action drop The root cause was in the flow dissector code. While both FLOW_DISSECTOR_KEY_PORTS and FLOW_DISSECTOR_KEY_PORTS_RANGE flags were being set correctly in the classifier, the __skb_flow_dissect_ports() function was only populating one of them: whichever came first in the enum check. This meant that when the code needed both a specific port and a port range, one of them would be left as 0, causing the filter to not match packets as expected. 
Fix it by removing the either/or logic and instead checking and populating both key types independently when they're in use. Fixes: 8ffb055beae5 ("cls_flower: Fix the behavior using port ranges with hw-offload") Reported-by: Qiang Zhang Closes: https://lore.kernel.org/netdev/CAPx+-5uvFxkhkz4=j_Xuwkezjn9U6kzKTD5jz4tZ9msSJ0fOJA@mail.gmail.com/ Cc: Yoshiki Komachi Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250218043210.732959-2-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/core/flow_dissector.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5db41bf2ed93..c33af3ef0b79 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -853,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { - enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS; - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); - if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (!key_ports && !key_ports_range) return; - key_ports = skb_flow_dissector_target(flow_dissector, - dissector_ports, - target_container); - 
key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; } static void From dfc1580f960bf70bdaacda8f3d644e3e58160f9d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:08 -0800 Subject: [PATCH 300/310] selftests/net/forwarding: Add a test case for tc-flower of mixed port and port-range After this patch: # ./tc_flower_port_range.sh TEST: Port range matching - IPv4 UDP [ OK ] TEST: Port range matching - IPv4 TCP [ OK ] TEST: Port range matching - IPv6 UDP [ OK ] TEST: Port range matching - IPv6 TCP [ OK ] TEST: Port range matching - IPv4 UDP Drop [ OK ] Cc: Qiang Zhang Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250218043210.732959-3-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../net/forwarding/tc_flower_port_range.sh | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh b/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh index 3885a2a91f7d..baed5e380dae 100755 --- a/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh +++ b/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh @@ -20,6 +20,7 @@ ALL_TESTS=" test_port_range_ipv4_tcp test_port_range_ipv6_udp test_port_range_ipv6_tcp + test_port_range_ipv4_udp_drop " NUM_NETIFS=4 @@ -194,6 +195,51 @@ test_port_range_ipv6_tcp() __test_port_range $proto $ip_proto $sip $dip $mode "$name" } +test_port_range_ipv4_udp_drop() +{ + local proto=ipv4 + local ip_proto=udp + local sip=192.0.2.1 + local dip=192.0.2.2 + local mode="-4" + local name="IPv4 UDP Drop" + local dmac=$(mac_get $h2) + local smac=$(mac_get $h1) + local sport_min=2000 + local sport_max=3000 + local sport_mid=$((sport_min + 
(sport_max - sport_min) / 2)) + local dport=5000 + + RET=0 + + tc filter add dev $swp1 ingress protocol $proto handle 101 pref 1 \ + flower src_ip $sip dst_ip $dip ip_proto $ip_proto \ + src_port $sport_min-$sport_max \ + dst_port $dport \ + action drop + + # Test ports outside range - should pass + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$((sport_min - 1)),dp=$dport" + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$((sport_max + 1)),dp=$dport" + + # Test ports inside range - should be dropped + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$sport_min,dp=$dport" + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$sport_mid,dp=$dport" + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$sport_max,dp=$dport" + + tc_check_packets "dev $swp1 ingress" 101 3 + check_err $? "Filter did not drop the expected number of packets" + + tc filter del dev $swp1 ingress protocol $proto pref 1 handle 101 flower + + log_test "Port range matching - $name" +} + setup_prepare() { h1=${NETIFS[p1]} From 69ab34f705fbfabcace64b5d53bb7a4450fac875 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:09 -0800 Subject: [PATCH 301/310] flow_dissector: Fix port range key handling in BPF conversion Fix how port range keys are handled in __skb_flow_bpf_to_target() by: - Separating PORTS and PORTS_RANGE key handling - Using correct key_ports_range structure for range keys - Properly initializing both key types independently This ensures port range information is correctly stored in its dedicated structure rather than incorrectly using the regular ports key structure. 
Fixes: 59fb9b62fb6c ("flow_dissector: Fix to use new variables for port ranges in bpf hook") Reported-by: Qiang Zhang Closes: https://lore.kernel.org/netdev/CAPx+-5uvFxkhkz4=j_Xuwkezjn9U6kzKTD5jz4tZ9msSJ0fOJA@mail.gmail.com/ Cc: Yoshiki Komachi Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250218043210.732959-4-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/core/flow_dissector.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c33af3ef0b79..9cd8de6bebb5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -931,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -975,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE, - target_container); - - if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + 
key_ports_range->tp.dst = flow_keys->dport; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { From 15de6ba95dbe98af7eb71e644205a37c2f1a9aea Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:10 -0800 Subject: [PATCH 302/310] selftests/bpf: Add a specific dst port matching After this patch: #102/1 flow_dissector_classification/ipv4:OK #102/2 flow_dissector_classification/ipv4_continue_dissect:OK #102/3 flow_dissector_classification/ipip:OK #102/4 flow_dissector_classification/gre:OK #102/5 flow_dissector_classification/port_range:OK #102/6 flow_dissector_classification/ipv6:OK #102 flow_dissector_classification:OK Summary: 1/6 PASSED, 0 SKIPPED, 0 FAILED Cc: Daniel Borkmann Cc: Andrii Nakryiko Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250218043210.732959-5-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../bpf/prog_tests/flow_dissector_classification.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c index 3729fbfd3084..80b153d3ddec 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c @@ -542,8 +542,12 @@ static void detach_program(struct bpf_flow *skel, int prog_fd) static int set_port_drop(int pf, bool multi_port) { + char dst_port[16]; + + snprintf(dst_port, sizeof(dst_port), "%d", CFG_PORT_INNER); + SYS(fail, "tc qdisc add dev lo ingress"); - SYS(fail_delete_qdisc, "tc filter add %s %s %s %s %s %s %s %s %s %s", + SYS(fail_delete_qdisc, "tc filter add %s %s %s %s %s %s %s %s %s %s %s %s", "dev lo", "parent FFFF:", "protocol", pf == PF_INET6 ? "ipv6" : "ip", @@ -551,6 +555,7 @@ static int set_port_drop(int pf, bool multi_port) "flower", "ip_proto udp", "src_port", multi_port ? 
"8-10" : "9", + "dst_port", dst_port, "action drop"); return 0; From 606572eb22c1786a3957d24307f5760bb058ca19 Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Tue, 18 Feb 2025 16:12:16 +0800 Subject: [PATCH 303/310] sctp: Fix undefined behavior in left shift operation According to the C11 standard (ISO/IEC 9899:2011, 6.5.7): "If E1 has a signed type and E1 x 2^E2 is not representable in the result type, the behavior is undefined." Shifting 1 << 31 causes signed integer overflow, which leads to undefined behavior. Fix this by explicitly using '1U << 31' to ensure the shift operates on an unsigned type, avoiding undefined behavior. Signed-off-by: Yu-Chun Lin Link: https://patch.msgid.link/20250218081217.3468369-1-eleanor15x@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index c241cc552e8d..bfcff6d6a438 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -735,7 +735,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ - init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); + init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1U << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); From 4b5a28b38c4a0106c64416a1b2042405166b26ce Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:30 -0800 Subject: [PATCH 304/310] net: Add non-RCU dev_getbyhwaddr() helper Add dedicated helper for finding devices by hardware address when holding rtnl_lock, similar to existing dev_getbyhwaddr_rcu(). This prevents PROVE_LOCKING warnings when rtnl_lock is held but RCU read lock is not. Extract common address comparison logic into dev_addr_cmp(). 
The context about this change could be found in the following discussion: Link: https://lore.kernel.org/all/20250206-scarlet-ermine-of-improvement-1fcac5@leitao/ Cc: kuniyu@amazon.com Cc: ushankar@purestorage.com Suggested-by: Eric Dumazet Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-1-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85da..94b7d4eca003 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3275,6 +3275,8 @@ static inline struct net_device *first_net_device_rcu(struct net *net) } int netdev_boot_setup_check(struct net_device *dev); +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); diff --git a/net/core/dev.c b/net/core/dev.c index fafd2f4b5d5d..72459dd02f38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1121,6 +1121,12 @@ int netdev_get_name(struct net *net, char *name, int ifindex) return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -1129,7 +1135,7 @@ int netdev_get_name(struct net *net, char *name, int ifindex) * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. 
* The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -1141,14 +1147,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; From 4eae0ee0f1e6256d0b0b9dd6e72f1d9cf8f72e08 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:31 -0800 Subject: [PATCH 305/310] arp: switch to dev_getbyhwaddr() in arp_req_set_public() The arp_req_set_public() function is called with the rtnl lock held, which provides enough synchronization protection. This makes the RCU variant of dev_getbyhwaddr() unnecessary. Switch to using the simpler dev_getbyhwaddr() function since we already have the required rtnl locking. This change helps maintain consistency in the networking code by using the appropriate helper function for the existing locking context. Since we're not holding the RCU read lock in arp_req_set_public() existing code could trigger false positive locking warnings. 
Fixes: 941666c2e3e0 ("net: RCU conversion of dev_getbyhwaddr() and arp_ioctl()") Suggested-by: Kuniyuki Iwashima Reviewed-by: Kuniyuki Iwashima Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-2-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f23a1ec6694c..814300eee39d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1077,7 +1077,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, + dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; From a370295367b55662a32a4be92565fe72a5aa79bb Mon Sep 17 00:00:00 2001 From: Nick Hu Date: Mon, 17 Feb 2025 13:58:42 +0800 Subject: [PATCH 306/310] net: axienet: Set mac_managed_pm The external PHY will undergo a soft reset twice during the resume process when it wakes up from suspend. The first reset occurs when the axienet driver calls phylink_of_phy_connect(), and the second occurs when mdio_bus_phy_resume() invokes phy_init_hw(). The second soft reset of the external PHY does not reinitialize the internal PHY, which causes issues with the internal PHY, resulting in the PHY link being down. To prevent this, setting the mac_managed_pm flag skips the mdio_bus_phy_resume() function. 
Fixes: a129b41fe0a8 ("Revert "net: phy: dp83867: perform soft reset and retain established link"") Signed-off-by: Nick Hu Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250217055843.19799-1-nick.hu@sifive.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 9e7fa012e4fa..f33178f90c42 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2897,6 +2897,7 @@ static int axienet_probe(struct platform_device *pdev) lp->phylink_config.dev = &ndev->dev; lp->phylink_config.type = PHYLINK_NETDEV; + lp->phylink_config.mac_managed_pm = true; lp->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD; From 9b6412e6979f6f9e0632075f8f008937b5cd4efd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 17 Feb 2025 11:23:35 +0100 Subject: [PATCH 307/310] tcp: drop secpath at the same time as we currently drop dst Xiumei reported hitting the WARN in xfrm6_tunnel_net_exit while running tests that boil down to: - create a pair of netns - run a basic TCP test over ipcomp6 - delete the pair of netns The xfrm_state found on spi_byaddr was not deleted at the time we delete the netns, because we still have a reference on it. This lingering reference comes from a secpath (which holds a ref on the xfrm_state), which is still attached to an skb. This skb is not leaked, it ends up on sk_receive_queue and then gets defer-free'd by skb_attempt_defer_free. The problem happens when we defer freeing an skb (push it on one CPU's defer_list), and don't flush that list before the netns is deleted. In that case, we still have a reference on the xfrm_state that we don't expect at this point. 
We already drop the skb's dst in the TCP receive path when it's no longer needed, so let's also drop the secpath. At this point, tcp_filter has already called into the LSM hooks that may require the secpath, so it should not be needed anymore. However, in some of those places, the MPTCP extension has just been attached to the skb, so we cannot simply drop all extensions. Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Reported-by: Xiumei Mu Signed-off-by: Sabrina Dubroca Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/5055ba8f8f72bdcb602faa299faca73c280b7735.1739743613.git.sd@queasysnail.net Signed-off-by: Paolo Abeni --- include/net/tcp.h | 14 ++++++++++++++ net/ipv4/tcp_fastopen.c | 4 ++-- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 2 +- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2b04835688..930cda5b5eb9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -683,6 +684,19 @@ void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); +static inline void tcp_cleanup_skb(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); +} + +static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); + __skb_queue_tail(&sk->sk_receive_queue, skb); +} + /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0f523cbfe329..32b28fc21b63 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -178,7 +178,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) if (!skb) return; - skb_dst_drop(skb); + tcp_cleanup_skb(skb); /* segs_in has been initialized to 1 
in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects @@ -195,7 +195,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 98b8cc740392..0cbf81bf3d45 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4976,7 +4976,7 @@ static void tcp_ofo_queue(struct sock *sk) tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); else kfree_skb_partial(skb, fragstolen); @@ -5168,7 +5168,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, skb, fragstolen)) ? 
1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); skb_set_owner_r(skb, sk); } return eaten; @@ -5251,7 +5251,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); return; } - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -6232,7 +6232,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cc2b5194a18d..2632844d2c35 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2027,7 +2027,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, */ skb_condense(skb); - skb_dst_drop(skb); + tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); From 878e7b11736e062514e58f3b445ff343e6705537 Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Tue, 18 Feb 2025 11:04:09 +0800 Subject: [PATCH 308/310] nfp: bpf: Add check for nfp_app_ctrl_msg_alloc() Add check for the return value of nfp_app_ctrl_msg_alloc() in nfp_bpf_cmsg_alloc() to prevent null pointer dereference. 
Fixes: ff3d43f7568c ("nfp: bpf: implement helpers for FW map ops") Cc: stable@vger.kernel.org Signed-off-by: Haoxiang Li Link: https://patch.msgid.link/20250218030409.2425798-1-haoxiang_li2024@163.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 2ec62c8d86e1..59486fe2ad18 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -20,6 +20,8 @@ nfp_bpf_cmsg_alloc(struct nfp_app_bpf *bpf, unsigned int size) struct sk_buff *skb; skb = nfp_app_ctrl_msg_alloc(bpf->app, size, GFP_KERNEL); + if (!skb) + return NULL; skb_put(skb, size); return skb; From 14ad6ed30a10afbe91b0749d6378285f4225d482 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:39 +0100 Subject: [PATCH 309/310] net: allow small head cache usage with large MAX_SKB_FRAGS values Sabrina reported the following splat: WARNING: CPU: 0 PID: 1 at net/core/dev.c:6935 netif_napi_add_weight_locked+0x8f2/0xba0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.14.0-rc1-net-00092-g011b03359038 #996 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:netif_napi_add_weight_locked+0x8f2/0xba0 Code: e8 c3 e6 6a fe 48 83 c4 28 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc c7 44 24 10 ff ff ff ff e9 8f fb ff ff e8 9e e6 6a fe <0f> 0b e9 d3 fe ff ff e8 92 e6 6a fe 48 8b 04 24 be ff ff ff ff 48 RSP: 0000:ffffc9000001fc60 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff88806ce48128 RCX: 1ffff11001664b9e RDX: ffff888008f00040 RSI: ffffffff8317ca42 RDI: ffff88800b325cb6 RBP: ffff88800b325c40 R08: 0000000000000001 R09: ffffed100167502c R10: ffff88800b3a8163 R11: 0000000000000000 R12: ffff88800ac1c168 R13: ffff88800ac1c168 R14: ffff88800ac1c168 R15: 0000000000000007 FS: 0000000000000000(0000) 
GS:ffff88806ce00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888008201000 CR3: 0000000004c94001 CR4: 0000000000370ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: gro_cells_init+0x1ba/0x270 xfrm_input_init+0x4b/0x2a0 xfrm_init+0x38/0x50 ip_rt_init+0x2d7/0x350 ip_init+0xf/0x20 inet_init+0x406/0x590 do_one_initcall+0x9d/0x2e0 do_initcalls+0x23b/0x280 kernel_init_freeable+0x445/0x490 kernel_init+0x20/0x1d0 ret_from_fork+0x46/0x80 ret_from_fork_asm+0x1a/0x30 irq event stamp: 584330 hardirqs last enabled at (584338): [] __up_console_sem+0x77/0xb0 hardirqs last disabled at (584345): [] __up_console_sem+0x5c/0xb0 softirqs last enabled at (583242): [] netlink_insert+0x14d/0x470 softirqs last disabled at (583754): [] netif_napi_add_weight_locked+0x77d/0xba0 on a kernel built with MAX_SKB_FRAGS=45, where SKB_WITH_OVERHEAD(1024) is smaller than GRO_MAX_HEAD. Such a build additionally contains the revert of the single page frag cache so that napi_get_frags() ends up using the page frag allocator, triggering the splat. Note that the underlying issue is independent from the mentioned revert; address it by ensuring that the small head cache will fit both TCP and GRO allocations and updating napi_alloc_skb() and __netdev_alloc_skb() to select kmalloc() usage for any allocation fitting such cache. 
Reported-by: Sabrina Dubroca Suggested-by: Eric Dumazet Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/gro.h | 3 +++ net/core/gro.c | 3 --- net/core/skbuff.c | 10 +++++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/net/gro.h b/include/net/gro.h index b9b58c1f8d19..7b548f91754b 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -11,6 +11,9 @@ #include #include +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + struct napi_gro_cb { union { struct { diff --git a/net/core/gro.c b/net/core/gro.c index d1f44084e978..78b320b63174 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -7,9 +7,6 @@ #define MAX_GRO_SKBS 8 -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(offload_lock); /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a441613a1e6c..f5a6d50570c4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,9 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique @@ -736,7 +739,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
*/ - if (len <= SKB_WITH_OVERHEAD(1024) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); @@ -816,7 +819,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) * When the small frag allocator is available, prefer it over kmalloc * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && + len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, From 6bc7e4eb0499562ccd291712fd7be0d1a5aad00a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:40 +0100 Subject: [PATCH 310/310] Revert "net: skb: introduce and use a single page frag cache" After the previous commit, it is finally safe to revert commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"): do it here. The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. 
The revert itself required some additional mangling due to recent updates in the affected code. Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 - net/core/dev.c | 17 +++++++ net/core/skbuff.c | 104 ++------------------------------------ 3 files changed, 22 insertions(+), 100 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 94b7d4eca003..ab550a89b9bf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4117,7 +4117,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) diff --git a/net/core/dev.c b/net/core/dev.c index 72459dd02f38..1b252e9459fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6991,6 +6991,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. 
+ */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f5a6d50570c4..7b03b64fdcb2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -223,67 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif - struct 
napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -293,23 +235,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -816,11 +741,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
- * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && - len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -830,32 +752,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } + len = SKB_HEAD_ALIGN(len); + if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. - */ - len = SZ_1K; - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); - - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data))