typedef struct qspinlock {
    union {
        atomic_t val;

        /*
         * By using the whole 2nd least significant byte for the
         * pending bit, we can allow better optimization of the lock
         * acquisition for the pending bit holder.
         */
#ifdef __LITTLE_ENDIAN
        struct {
            u8 locked;
            u8 pending;
        };
        struct {
            u16 locked_pending;
            u16 tail;
        };
#else
        struct {
            u16 tail;
            u16 locked_pending;
        };
        struct {
            u8 reserved[2];
            u8 pending;
            u8 locked;
        };
#endif
    };
} arch_spinlock_t;
Memory layout on little-endian machines
Memory layout on big-endian machines
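In place of the original layout diagrams, here is a rough byte-by-byte sketch derived from the union above (byte 0 is the lowest address):

/*
 * Little-endian (byte 0 holds the least significant bits of val):
 *
 *   byte 0: locked            \__ locked_pending (u16)
 *   byte 1: pending           /
 *   byte 2: tail (low half)   \__ tail (u16)
 *   byte 3: tail (high half)  /
 *
 * Big-endian (byte 3 holds the least significant bits of val):
 *
 *   byte 0: tail (high half)  \__ tail (u16)
 *   byte 1: tail (low half)   /
 *   byte 2: pending           \__ locked_pending (u16)
 *   byte 3: locked            /
 */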
How each bit field is used
/*
 * Bitfields in the atomic value:
 *
 * When NR_CPUS < 16K
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 *
 * When NR_CPUS >= 16K
 *  0- 7: locked byte
 *     8: pending
 *  9-10: tail index
 * 11-31: tail cpu (+1)
 */
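The code addresses these fields through mask/offset macros. Roughly, qspinlock_types.h defines them along these lines (shown here as a reference sketch, not a verbatim copy):

#define _Q_SET_MASK(type)   (((1U << _Q_##type##_BITS) - 1) << _Q_##type##_OFFSET)

#define _Q_LOCKED_OFFSET    0
#define _Q_LOCKED_BITS      8
#define _Q_LOCKED_MASK      _Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET   (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#if CONFIG_NR_CPUS < (1U << 14)
#define _Q_PENDING_BITS     8
#else
#define _Q_PENDING_BITS     1
#endif
#define _Q_PENDING_MASK     _Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET  (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS    2
#define _Q_TAIL_IDX_MASK    _Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET  (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS    (32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK    _Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET      _Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK        (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_PENDING_MASK  (_Q_LOCKED_MASK | _Q_PENDING_MASK)

#define _Q_LOCKED_VAL       (1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL      (1U << _Q_PENDING_OFFSET)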
/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'            |  :
 *   queue               :       | ^--'                           |  :
 *                       :       v                                |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
    struct mcs_spinlock *prev, *next, *node;
    u32 old, tail;
    int idx;
    // If there are more CPUs than the tail cpu field can represent (1 << 14 = 16384
    // with the 8-bit pending layout), fail the build.
    BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

    // Taken only when !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS).
    if (pv_enabled())
        goto pv_queue;

    // With CONFIG_PARAVIRT the spinlock handling may be hijacked; ignore this branch for now.
    if (virt_spin_lock(lock))
        return;
    // [part 2 - a third CPU tries to acquire the spinlock]
    /*
     * Wait for in-progress pending->locked hand-overs with a bounded
     * number of spins so that we guarantee forward progress.
     *
     * 0,1,0 -> 0,0,1
     */
    // The triple is (0,1,0): we happened to catch the transient state in which the
    // first CPU is handing the lock over to the second CPU, so spin briefly and wait
    // for the hand-over to finish, (0,1,0) --> (0,0,1).
    if (val == _Q_PENDING_VAL) {
        int cnt = _Q_PENDING_LOOPS;
        val = atomic_cond_read_relaxed(&lock->val,
                                       (VAL != _Q_PENDING_VAL) || !cnt--); // [spin point 1] short spin
    }

    // First possible outcome of the short spin: the triple is not (0,0,*), i.e. the
    // hand-over is still in progress or other contenders have joined, so go queue up.
    /*
     * If we observe any contention; queue.
     */
    if (val & ~_Q_LOCKED_MASK)
        goto queue;
    // Second possible outcome of the short spin: the triple is (0,0,*), the hand-over
    // has completed and we have a chance to become the first waiter.
    // [part 1 - a second CPU tries to acquire the spinlock]
    /*
     * trylock || pending
     *
     * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
     */
    // The triple is (0,0,*); atomically set the pending bit, (0,0,*) -> (0,1,*).
    val = queued_fetch_set_pending_acquire(lock); // [atomic op 1]
    // Two possible outcomes here, detailed below. Note that the *old* value of the
    // triple is returned and assigned to val.

    /*
     * If we observe contention, there is a concurrent locker.
     *
     * Undo and queue; our setting of PENDING might have made the
     * n,0,0 -> 0,0,0 transition fail and it will now be waiting
     * on @next to become !NULL.
     */
    // First case: the old triple was not (0,0,*), so we failed to claim the pending
    // bit and become the first waiter; go join the long queue.
    if (unlikely(val & ~_Q_LOCKED_MASK)) {

        // If the condition below holds, the pending bit was originally 0 and it was
        // us who set it to 1; that is wrong and must be undone, see the detailed
        // discussion below.
        /* Undo PENDING if we set it. */
        if (!(val & _Q_PENDING_MASK))
            clear_pending(lock); // undo the pending bit we just set; not doing so could deadlock

        // Join the long queue.
        goto queue;
    }

    // Second case: the old triple was (0,0,*), so it was us who set the pending bit,
    // but the old value may have been either (0,0,0) or (0,0,1).
    /*
     * We're pending, wait for the owner to go away.
     *
     * 0,1,1 -> 0,1,0
     *
     * this wait loop must be a load-acquire such that we match the
     * store-release that clears the locked bit and create lock
     * sequentiality; this is because not all
     * clear_pending_set_locked() implementations imply full
     * barriers.
     */
    // If the old value was (0,0,1), the current value is (0,1,1): spin until the
    // owner releases the lock and the triple becomes (0,1,0).
    if (val & _Q_LOCKED_MASK)
        atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK)); // [spin point 2] spin on the locked byte
    // If the old value was (0,0,0), the current value is (0,1,0) and [spin point 2] is skipped.

    // Take the lock. Whichever path we came from, reaching this point means turning
    // the triple from (0,1,0) into (0,0,1); the pending bit is cleared at the same time.
    /*
     * take ownership and clear the pending bit.
     *
     * 0,1,0 -> 0,0,1
     */
    clear_pending_set_locked(lock); // [lock point 1] no atomic needed: in theory only we can get here, since a single CPU holds the pending bit and spins on the lock word
    lockevent_inc(lock_pending);    // bump the lock_pending event counter
    return;                         // lock acquired, return
    // [part 3 - entering the MCS lock queue]
    /*
     * End of pending bit optimistic spinning and beginning of MCS
     * queuing.
     */
queue:
    lockevent_inc(lock_slowpath);
pv_queue:
    node = this_cpu_ptr(&qnodes[0].mcs);
    idx = node->count++;                         // the context index is kept only in node->count of context 0 (task)
    tail = encode_tail(smp_processor_id(), idx); // encode our CPU number and context index into tail

    // Based on the assumption that spinlocks do not nest in NMI context; if that ever
    // happens anyway, simply spin on the lock word in place until we get it.
    /*
     * 4 nodes are allocated based on the assumption that there will
     * not be nested NMIs taking spinlocks. That may not be true in
     * some architectures even though the chance of needing more than
     * 4 nodes will still be extremely unlikely. When that happens,
     * we fall back to spinning on the lock directly without using
     * any MCS node. This is not the most elegant solution, but is
     * simple enough.
     */
    if (unlikely(idx >= MAX_NODES)) {
        lockevent_inc(lock_no_node);
        while (!queued_spin_trylock(lock)) // [spin point 3][lock point 2][atomic op 2] atomic trylock + spin
            cpu_relax();
        goto release;
    }
    // Use the context index idx to get the MCS node actually associated with this context.
    node = grab_mcs_node(node, idx);

    /*
     * Keep counts of non-zero index values:
     */
    lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

    /*
     * Ensure that we increment the head node->count before initialising
     * the actual node. If the compiler is kind enough to reorder these
     * stores, then an IRQ could overwrite our assignments.
     */
    barrier();

    // Initialise the newly queued MCS node. Note that locked starts at 0; when it
    // becomes 1 it means the previous waiter has acquired the lock and is handing the
    // "first waiter" position over to us.
    node->locked = 0;
    node->next = NULL;
    pv_init_node(node);

    // Try our luck: maybe the triple is (0,0,0) by now and the lock can be grabbed.
    /*
     * We touched a (possibly) cold cacheline in the per-cpu queue node;
     * attempt the trylock once more in the hope someone let go while we
     * weren't watching.
     */
    if (queued_spin_trylock(lock)) // [lock point 3][atomic op 3] atomic trylock
        goto release;              // lucky: everyone ahead of us released the lock while we were getting in line

    // The trylock failed; prepare to queue.
    /*
     * Ensure that the initialisation of @node is complete before we
     * publish the updated tail via xchg_tail() and potentially link
     * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
     */
    smp_wmb();
    // xchg_tail() below atomically installs the new tail in the qspinlock and returns
    // the tail value previously stored there (i.e. "old").
    /*
     * Publish the updated tail.
     * We have already touched the queueing cacheline; don't bother with
     * pending stuff.
     *
     * p,*,* -> n,*,*
     */
    // Atomically turn the triple (prev,*,*) into (new,*,*) and return (prev,*,*).
    // Other CPUs may be doing the same thing at this very moment; that is fine, the
    // atomic exchange guarantees the qspinlock tail always identifies the real tail
    // of the queue.
    old = xchg_tail(lock, tail); // [atomic op 4] (prev,*,*) -> (new,*,*); returns (prev,*,*)
    next = NULL;                 // local variable, used further down

    // For the Nth (N > 3) CPU trying to acquire the spinlock, the previous tail points
    // at the preceding MCS node. We need that node so that its "next" can point at us,
    // which is how we join the MCS waitqueue.
    /*
     * if there was a previous node; link it and wait until reaching the
     * head of the waitqueue.
     */
    if (old & _Q_TAIL_MASK) { // the old tail is non-zero: we are at least the third contender (the first CPU is the holder, the first waiter holds the pending bit, the second waiter made tail non-zero)
        prev = decode_tail(old); // convert the old tail back into an MCS node

        // Make the previous node's next point at us, joining the MCS waitqueue.
        /* Link @node into the waitqueue. */
        WRITE_ONCE(prev->next, node); // prev->next = node

        // Spin on our own locked field, waiting for the previous waiter to acquire
        // the lock in [part 4] and hand the "first waiter" position over to us,
        // which ends the spin.
        pv_wait_node(node, prev);
        arch_mcs_spin_lock_contended(&node->locked); // [spin point 4]

        /*
         * While waiting for the MCS lock, the next pointer may have
         * been set by another lock waiter. We optimistically load
         * the next pointer & prefetch the cacheline for writing
         * to reduce latency in the upcoming MCS unlock operation.
         */
        next = READ_ONCE(node->next);
        if (next)
            prefetchw(next);
    }
    // At this point we are at the head of the waitqueue, via one of two paths:
    // 1. the if above was not taken: we are the second waiter, waiting for the owner
    //    and the pending holder to go away;
    // 2. we are the Nth waiter: after setting the (N-1)th waiter's next pointer we spun
    //    until that waiter finally acquired the lock and handed the "first waiter"
    //    position over to us.
    /*
     * we're at the head of the waitqueue, wait for the owner & pending to
     * go away.
     *
     * *,x,y -> *,0,0
     *
     * this wait loop must use a load-acquire such that we match the
     * store-release that clears the locked bit and create lock
     * sequentiality; this is because the set_locked() function below
     * does not imply a full barrier.
     *
     * The PV pv_wait_head_or_lock function, if active, will acquire
     * the lock and return a non-zero value. So we have to skip the
     * atomic_cond_read_acquire() call. As the next PV queue head hasn't
     * been designated yet, there is no way for the locked value to become
     * _Q_SLOW_VAL. So both the set_locked() and the
     * atomic_cmpxchg_relaxed() calls will be safe.
     *
     * If PV isn't active, 0 will be returned instead.
     */
    if ((val = pv_wait_head_or_lock(lock, node)))
        goto locked;

    // In both cases we spin on (*,1,1) and stop only once the owner and pending bits
    // are both clear; then we get a shot at taking the lock.
    val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK)); // [spin point 5]
    // [part 4 - the third CPU successfully acquires the spinlock]
locked:
    /*
     * claim the lock:
     *
     * n,0,0 -> 0,0,1 : lock, uncontended   (we got the lock)
     * *,*,0 -> *,*,1 : lock, contended     (others are still around)
     *
     * If the queue head is the only one in the queue (lock value == tail)
     * and nobody is pending, clear the tail code and grab the lock.
     * Otherwise, we only need to grab the lock.
     */
    // If the qspinlock tail still points at us, we are the only node in the MCS queue,
    // so just take the spinlock with atomic_try_cmpxchg_relaxed(): the triple goes from
    // (n,0,0) to (0,0,1).
    /*
     * In the PV case we might already have _Q_LOCKED_VAL set, because
     * of lock stealing; therefore we must also allow:
     *
     * n,0,1 -> 0,0,1
     *
     * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
     * above wait condition, therefore any concurrent setting of
     * PENDING will make the uncontended transition fail.
     */
    if ((val & _Q_TAIL_MASK) == tail) {
        // Note: [barge-in point 1] a new contender may join the queue right here.
        if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) // [atomic op 5][lock point 4] we store (0,0,1), so taking the lock clears the other fields at the same time
            goto release; /* No contention */ // got it
    }

    // Otherwise other nodes are still waiting in the queue, or a contender joined at
    // [barge-in point 1] and made the atomic attempt above fail.
    /*
     * Either somebody is queued behind us or _Q_PENDING_VAL got set
     * which will then detect the remaining tail and queue behind us
     * ensuring we'll see a @next.
     */
    set_locked(lock); // [lock point 5] as the queue head we still take the lock: set the locked byte

    /*
     * contended path; wait for next if not observed yet, release.
     */
    if (!next) // taken when the contender joined at [barge-in point 1]
        next = smp_cond_load_relaxed(&node->next, (VAL)); // [spin point 6] spin until our next field is non-NULL, i.e. until the contender from [barge-in point 1] has set it

    // Per the MCS queue protocol, set the next node's locked field to 1, releasing it
    // from its spin at [spin point 4].
    arch_mcs_spin_unlock_contended(&next->locked);
    pv_kick_node(lock, next);

release:
    /*
     * release the node
     */
    __this_cpu_dec(qnodes[0].mcs.count); // decrement the count that holds the context index
}
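Several small helpers referenced above are not shown in the listing. For the NR_CPUS < 16K layout (where pending occupies a whole byte and the tail a whole u16), they look roughly like the following sketch, based on kernel/locking/qspinlock.c and include/asm-generic/qspinlock.h; treat it as a reference, not the exact source:

/* Clear the pending byte (only valid when pending occupies a full byte). */
static __always_inline void clear_pending(struct qspinlock *lock)
{
    WRITE_ONCE(lock->pending, 0);
}

/* One 16-bit store clears pending and sets locked in a single step. */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
    WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/* Return the MCS node of per-CPU context idx (0 = task, 1 = softirq, ...). */
static inline __pure struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
{
    return &((struct qnode *)base + idx)->mcs;
}

/* Try to take the lock only when the whole word is 0, i.e. the triple is (0,0,0). */
static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
    int val = atomic_read(&lock->val);

    if (unlikely(val))
        return 0;

    return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}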
But this write (a lock btsl on x86) is an atomic operation, so only one CPU can get (0, 0, 1) back as the old triple and become the first waiter.
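For reference, the generic version of queued_fetch_set_pending_acquire() is just an atomic fetch-or; the x86 override is what compiles down to the lock btsl mentioned above. Roughly:

/*
 * Generic fallback (kernel/locking/qspinlock.c): atomically set the pending
 * bit and return the previous value of the whole lock word.
 */
#ifndef queued_fetch_set_pending_acquire
static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
{
    return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
}
#endif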
include/asm-generic/barrier.h
/**
 * smp_acquire__after_ctrl_dep() - Provide ACQUIRE ordering after a control dependency
 *
 * A control dependency provides a LOAD->STORE order, the additional RMB
 * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
 * aka. (load)-ACQUIRE.
 *
 * Architectures that do not do load speculation can have this be barrier().
 */
#ifndef smp_acquire__after_ctrl_dep
#define smp_acquire__after_ctrl_dep()   smp_rmb()
#endif

/**
 * smp_cond_load_relaxed() - (Spin) wait for cond with no ordering guarantees
 * @ptr: pointer to the variable to wait on
 * @cond: boolean expression to wait for
 *
 * Equivalent to using READ_ONCE() on the condition variable.
 *
 * Due to C lacking lambda expressions we load the value of *ptr into a
 * pre-named variable @VAL to be used in @cond.
 */
#ifndef smp_cond_load_relaxed
#define smp_cond_load_relaxed(ptr, cond_expr) ({        \
    typeof(ptr) __PTR = (ptr);                          \
    __unqual_scalar_typeof(*ptr) VAL;                   \
    for (;;) {                                          \
        VAL = READ_ONCE(*__PTR);                        \
        if (cond_expr)                                  \
            break;                                      \
        cpu_relax();                                    \
    }                                                   \
    (typeof(*ptr))VAL;                                  \
})
#endif

/**
 * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
 * @ptr: pointer to the variable to wait on
 * @cond: boolean expression to wait for
 *
 * Equivalent to using smp_load_acquire() on the condition variable but employs
 * the control dependency of the wait to reduce the barrier on many platforms.
 */
#ifndef smp_cond_load_acquire
#define smp_cond_load_acquire(ptr, cond_expr) ({        \
    __unqual_scalar_typeof(*ptr) _val;                  \
    _val = smp_cond_load_relaxed(ptr, cond_expr);       \
    smp_acquire__after_ctrl_dep();                      \
    (typeof(*ptr))_val;                                 \
})
#endif
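The atomic_cond_read_relaxed()/atomic_cond_read_acquire() calls used in the slowpath are thin wrappers that apply these primitives to the counter inside an atomic_t; roughly (a sketch of the linux/atomic.h definitions):

#define atomic_cond_read_relaxed(v, c)  smp_cond_load_relaxed(&(v)->counter, (c))
#define atomic_cond_read_acquire(v, c)  smp_cond_load_acquire(&(v)->counter, (c))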
// [part 2 - a third CPU tries to acquire the spinlock]
/*
 * Wait for in-progress pending->locked hand-overs with a bounded
 * number of spins so that we guarantee forward progress.
 *
 * 0,1,0 -> 0,0,1
 */
// The triple is (0,1,0): we happened to catch the transient state in which the first
// CPU is handing the lock over to the second CPU, so spin briefly and wait for the
// hand-over to finish, (0,1,0) --> (0,0,1).
if (val == _Q_PENDING_VAL) {
    int cnt = _Q_PENDING_LOOPS;
    val = atomic_cond_read_relaxed(&lock->val,
                                   (VAL != _Q_PENDING_VAL) || !cnt--);
}

// First possible outcome of the short spin: the triple is not (0,0,*), i.e. the
// hand-over is still in progress or other contenders have joined, so go queue up.
/*
 * If we observe any contention; queue.
 */
if (val & ~_Q_LOCKED_MASK)
    goto queue;

// Second possible outcome of the short spin: the triple is (0,0,*), the hand-over has
// completed and we have a chance to become the first waiter.
"*" means the bit can be either 0 or 1.
Entering the MCS node queue
Spinlocks disable preemption and the CPU busy-waits, so once a CPU is already spinning on some lock, no other task in the same context level on that CPU can come in and contend for a lock.
#define MAX_NODES   4

/*
 * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
 * size and four of them will fit nicely in one 64-byte cacheline. For
 * pvqspinlock, however, we need more space for extra data. To accommodate
 * that, we insert two more long words to pad it up to 32 bytes. IOW, only
 * two of them can fit in a cacheline in this case. That is OK as it is rare
 * to have more than 2 levels of slowpath nesting in actual use. We don't
 * want to penalize pvqspinlocks to optimize for a rare case in native
 * qspinlocks.
 */
struct qnode {
    struct mcs_spinlock mcs;
#ifdef CONFIG_PARAVIRT_SPINLOCKS
    long reserved[2];
#endif
};

...

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 *
 * PV doubles the storage and uses the second cacheline for PV state.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
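The embedded mcs_spinlock lives in kernel/locking/mcs_spinlock.h and is, roughly:

struct mcs_spinlock {
    struct mcs_spinlock *next;  /* next waiter in the MCS queue */
    int locked;                 /* 1 if lock acquired */
    int count;                  /* nesting count, see qspinlock.c */
};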
MCS queue helper functions
The tail CPU field stores the CPU number plus 1: for CPU 0 with tail index 0, the computed tail would otherwise be 0, and a tail of 0 means there is currently no MCS node queue at all. To distinguish the two cases, encode_tail() encodes cpu + 1 into the tail.
/*
 * We must be able to distinguish between no-tail and the tail at 0:0,
 * therefore increment the cpu number by one.
 */
static inline __pure u32 encode_tail(int cpu, int idx)
{
    u32 tail;

    tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
    tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

    return tail;
}
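The inverse helper, decode_tail(), used in [part 3] to turn the previous tail back into an MCS node, looks roughly like this:

static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
{
    int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;     /* undo the +1 from encode_tail() */
    int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

    return per_cpu_ptr(&qnodes[idx].mcs, cpu);
}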
/*
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail), which heads an address dependency
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
    /*
     * We can use relaxed semantics since the caller ensures that the
     * MCS node is properly initialized before updating the tail.
     */
    return (u32)xchg_relaxed(&lock->tail,
                             tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
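This 16-bit xchg on lock->tail only works for the NR_CPUS < 16K layout, where the tail occupies its own u16. When the pending field is squeezed down to a single bit (NR_CPUS >= 16K) the kernel falls back to a cmpxchg loop over the whole word; roughly:

static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
    u32 old, new;

    old = atomic_read(&lock->val);
    do {
        /* Keep the locked and pending bits, replace only the tail. */
        new = (old & _Q_LOCKED_PENDING_MASK) | tail;
        /*
         * Relaxed semantics are fine: the caller ensures the MCS node
         * is initialized before the tail is published.
         */
    } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

    return old;
}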
/*
 * Using smp_cond_load_acquire() provides the acquire semantics
 * required so that subsequent operations happen after the
 * lock is acquired. Additionally, some architectures such as
 * ARM64 would like to do spin-waiting instead of purely
 * spinning, and smp_cond_load_acquire() provides that behavior.
 */
#ifndef arch_mcs_spin_lock_contended
#define arch_mcs_spin_lock_contended(l)                 \
do {                                                    \
    smp_cond_load_acquire(l, VAL);                      \
} while (0)
#endif
The third CPU successfully acquires the lock
set_locked() sets the lock byte: the triple goes (*,*,0) -> (*,0,1).
/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(struct qspinlock *lock)
{
    WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}
/*
 * smp_store_release() provides a memory barrier to ensure all
 * operations in the critical section has been completed before
 * unlocking.
 */
// i.e. essentially a WRITE_ONCE(*l, 1) preceded by a release barrier
#ifndef arch_mcs_spin_unlock_contended
#define arch_mcs_spin_unlock_contended(l)               \
    smp_store_release((l), 1)
#endif
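On architectures without a lighter release primitive, the generic fallback for smp_store_release() is indeed just a full barrier followed by the store; a sketch of the include/asm-generic/barrier.h definition:

#ifndef __smp_store_release
#define __smp_store_release(p, v)                       \
do {                                                    \
    compiletime_assert_atomic_type(*p);                 \
    __smp_mb();                                         \
    WRITE_ONCE(*p, v);                                  \
} while (0)
#endif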