Skip to content

Commit 341adee

Browse files
Wen Gu authored; davem330 committed
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. 
The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1 ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <[email protected]> Signed-off-by: Wen Gu <[email protected]> Acked-by: Karsten Graul <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 6449520 commit 341adee

File tree

2 files changed

+137
-16
lines changed

2 files changed

+137
-16
lines changed

net/smc/af_smc.c

Lines changed: 118 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -566,17 +566,115 @@ static void smc_stat_fallback(struct smc_sock *smc)
566566
mutex_unlock(&net->smc.mutex_fback_rsn);
567567
}
568568

569+
/* must be called under rcu read lock */
570+
static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
571+
{
572+
struct socket_wq *wq;
573+
__poll_t flags;
574+
575+
wq = rcu_dereference(smc->sk.sk_wq);
576+
if (!skwq_has_sleeper(wq))
577+
return;
578+
579+
/* wake up smc sk->sk_wq */
580+
if (!key) {
581+
/* sk_state_change */
582+
wake_up_interruptible_all(&wq->wait);
583+
} else {
584+
flags = key_to_poll(key);
585+
if (flags & (EPOLLIN | EPOLLOUT))
586+
/* sk_data_ready or sk_write_space */
587+
wake_up_interruptible_sync_poll(&wq->wait, flags);
588+
else if (flags & EPOLLERR)
589+
/* sk_error_report */
590+
wake_up_interruptible_poll(&wq->wait, flags);
591+
}
592+
}
593+
594+
/* Wait-queue callback of the probe entry: instead of waking anything,
 * just record that a wakeup happened and stash its key. Returning 0
 * (non-exclusive) lets the wait-queue scan continue past this entry.
 */
static int smc_fback_mark_woken(wait_queue_entry_t *wait,
				unsigned int mode, int sync, void *key)
{
	struct smc_mark_woken *mw;

	mw = container_of(wait, struct smc_mark_woken, wait_entry);
	mw->woken = true;
	mw->key = key;
	return 0;
}
/* Invoke the original clcsock callback while a probe entry sits on
 * clcsock's wait queue; if the callback actually delivered a wakeup
 * there, forward it to the smc socket's wait queue as well via
 * smc_fback_wakeup_waitqueue().
 */
static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
				     void (*clcsock_callback)(struct sock *sk))
{
	struct smc_mark_woken mark = { .woken = false };
	struct socket_wq *wq;

	/* the probe entry only records the wakeup key, it wakes nobody */
	init_waitqueue_func_entry(&mark.wait_entry,
				  smc_fback_mark_woken);
	rcu_read_lock();
	wq = rcu_dereference(clcsk->sk_wq);
	if (!wq)
		goto out;	/* clcsock wait queue already gone */
	/* the entry must be on the queue for the duration of the callback
	 * so that any wakeup issued inside it hits smc_fback_mark_woken()
	 */
	add_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
	clcsock_callback(clcsk);
	remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry);

	/* wakeup observed on clcsock: replay it on smc sk->sk_wq */
	if (mark.woken)
		smc_fback_wakeup_waitqueue(smc, mark.key);
out:
	rcu_read_unlock();
}
static void smc_fback_state_change(struct sock *clcsk)
628+
{
629+
struct smc_sock *smc =
630+
smc_clcsock_user_data(clcsk);
631+
632+
if (!smc)
633+
return;
634+
smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change);
635+
}
636+
637+
static void smc_fback_data_ready(struct sock *clcsk)
638+
{
639+
struct smc_sock *smc =
640+
smc_clcsock_user_data(clcsk);
641+
642+
if (!smc)
643+
return;
644+
smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready);
645+
}
646+
647+
static void smc_fback_write_space(struct sock *clcsk)
648+
{
649+
struct smc_sock *smc =
650+
smc_clcsock_user_data(clcsk);
651+
652+
if (!smc)
653+
return;
654+
smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space);
655+
}
656+
657+
static void smc_fback_error_report(struct sock *clcsk)
658+
{
659+
struct smc_sock *smc =
660+
smc_clcsock_user_data(clcsk);
661+
662+
if (!smc)
663+
return;
664+
smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report);
665+
}
666+
569667
static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
570668
{
571-
wait_queue_head_t *smc_wait = sk_sleep(&smc->sk);
572-
wait_queue_head_t *clc_wait;
573-
unsigned long flags;
669+
struct sock *clcsk;
574670

575671
mutex_lock(&smc->clcsock_release_lock);
576672
if (!smc->clcsock) {
577673
mutex_unlock(&smc->clcsock_release_lock);
578674
return -EBADF;
579675
}
676+
clcsk = smc->clcsock->sk;
677+
580678
smc->use_fallback = true;
581679
smc->fallback_rsn = reason_code;
582680
smc_stat_fallback(smc);
@@ -587,16 +685,22 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
587685
smc->clcsock->wq.fasync_list =
588686
smc->sk.sk_socket->wq.fasync_list;
589687

590-
/* There may be some entries remaining in
591-
* smc socket->wq, which should be removed
592-
* to clcsocket->wq during the fallback.
688+
/* There might be some wait entries remaining
689+
* in smc sk->sk_wq and they should be woken up
690+
* as clcsock's wait queue is woken up.
593691
*/
594-
clc_wait = sk_sleep(smc->clcsock->sk);
595-
spin_lock_irqsave(&smc_wait->lock, flags);
596-
spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING);
597-
list_splice_init(&smc_wait->head, &clc_wait->head);
598-
spin_unlock(&clc_wait->lock);
599-
spin_unlock_irqrestore(&smc_wait->lock, flags);
692+
smc->clcsk_state_change = clcsk->sk_state_change;
693+
smc->clcsk_data_ready = clcsk->sk_data_ready;
694+
smc->clcsk_write_space = clcsk->sk_write_space;
695+
smc->clcsk_error_report = clcsk->sk_error_report;
696+
697+
clcsk->sk_state_change = smc_fback_state_change;
698+
clcsk->sk_data_ready = smc_fback_data_ready;
699+
clcsk->sk_write_space = smc_fback_write_space;
700+
clcsk->sk_error_report = smc_fback_error_report;
701+
702+
smc->clcsock->sk->sk_user_data =
703+
(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
600704
}
601705
mutex_unlock(&smc->clcsock_release_lock);
602706
return 0;
@@ -2115,10 +2219,9 @@ static void smc_tcp_listen_work(struct work_struct *work)
21152219

21162220
static void smc_clcsock_data_ready(struct sock *listen_clcsock)
21172221
{
2118-
struct smc_sock *lsmc;
2222+
struct smc_sock *lsmc =
2223+
smc_clcsock_user_data(listen_clcsock);
21192224

2120-
lsmc = (struct smc_sock *)
2121-
((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY);
21222225
if (!lsmc)
21232226
return;
21242227
lsmc->clcsk_data_ready(listen_clcsock);

net/smc/smc.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,12 @@ enum smc_urg_state {
139139
SMC_URG_READ = 3, /* data was already read */
140140
};
141141

/* Records whether a wakeup was delivered to a wait queue while a probe
 * entry (whose callback is smc_fback_mark_woken()) was enqueued on it.
 */
struct smc_mark_woken {
	bool woken;		/* set when the probe entry was woken */
	void *key;		/* wakeup key (poll flags); NULL means
				 * the wakeup came via sk_state_change
				 */
	wait_queue_entry_t wait_entry;	/* probe entry put on clcsock's wq */
};
142148
struct smc_connection {
143149
struct rb_node alert_node;
144150
struct smc_link_group *lgr; /* link group of connection */
@@ -228,8 +234,14 @@ struct smc_connection {
228234
struct smc_sock { /* smc sock container */
229235
struct sock sk;
230236
struct socket *clcsock; /* internal tcp socket */
237+
void (*clcsk_state_change)(struct sock *sk);
238+
/* original stat_change fct. */
231239
void (*clcsk_data_ready)(struct sock *sk);
232-
/* original data_ready fct. **/
240+
/* original data_ready fct. */
241+
void (*clcsk_write_space)(struct sock *sk);
242+
/* original write_space fct. */
243+
void (*clcsk_error_report)(struct sock *sk);
244+
/* original error_report fct. */
233245
struct smc_connection conn; /* smc connection */
234246
struct smc_sock *listen_smc; /* listen parent */
235247
struct work_struct connect_work; /* handle non-blocking connect*/
@@ -264,6 +276,12 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
264276
return (struct smc_sock *)sk;
265277
}
266278

279+
static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk)
280+
{
281+
return (struct smc_sock *)
282+
((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY);
283+
}
284+
267285
extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
268286
extern struct workqueue_struct *smc_close_wq; /* wq for close work */
269287

0 commit comments

Comments
 (0)