Commit edc70c2

Authored and committed by Alexei Starovoitov
Merge branch 'bpf-next/master' into for-next
Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents: 017822b + ff34657

File tree

24 files changed: +286 additions, -616 deletions


include/asm-generic/rqspinlock.h

Lines changed: 32 additions & 28 deletions
@@ -129,8 +129,8 @@ static __always_inline void release_held_lock_entry(void)
          * <error> for lock B
          * release_held_lock_entry
          *
-         * try_cmpxchg_acquire for lock A
          * grab_held_lock_entry
+         * try_cmpxchg_acquire for lock A
          *
          * Lack of any ordering means reordering may occur such that dec, inc
          * are done before entry is overwritten. This permits a remote lock
@@ -139,13 +139,8 @@ static __always_inline void release_held_lock_entry(void)
          * CPU holds a lock it is attempting to acquire, leading to false ABBA
          * diagnosis).
          *
-         * In case of unlock, we will always do a release on the lock word after
-         * releasing the entry, ensuring that other CPUs cannot hold the lock
-         * (and make conclusions about deadlocks) until the entry has been
-         * cleared on the local CPU, preventing any anomalies. Reordering is
-         * still possible there, but a remote CPU cannot observe a lock in our
-         * table which it is already holding, since visibility entails our
-         * release store for the said lock has not retired.
+         * The case of unlock is treated differently due to NMI reentrancy, see
+         * comments in res_spin_unlock.
          *
          * In theory we don't have a problem if the dec and WRITE_ONCE above get
          * reordered with each other, we either notice an empty NULL entry on
@@ -175,10 +170,22 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock)
 {
         int val = 0;
 
-        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
-                grab_held_lock_entry(lock);
+        /*
+         * Grab the deadlock detection entry before doing the cmpxchg, so that
+         * reentrancy due to NMIs between the succeeding cmpxchg and creation of
+         * held lock entry can correctly detect an acquisition attempt in the
+         * interrupted context.
+         *
+         * cmpxchg lock A
+         * <NMI>
+         * res_spin_lock(A) --> missed AA, leads to timeout
+         * </NMI>
+         * grab_held_lock_entry(A)
+         */
+        grab_held_lock_entry(lock);
+
+        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                 return 0;
-        }
         return resilient_queued_spin_lock_slowpath(lock, val);
 }
 
@@ -192,28 +199,25 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
 {
         struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
 
-        if (unlikely(rqh->cnt > RES_NR_HELD))
-                goto unlock;
-        WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
-unlock:
         /*
-         * Release barrier, ensures correct ordering. See release_held_lock_entry
-         * for details. Perform release store instead of queued_spin_unlock,
-         * since we use this function for test-and-set fallback as well. When we
-         * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
+         * Release barrier, ensures correct ordering. Perform release store
+         * instead of queued_spin_unlock, since we use this function for the TAS
+         * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
+         * the full 4-byte lockword.
          *
-         * Like release_held_lock_entry, we can do the release before the dec.
-         * We simply care about not seeing the 'lock' in our table from a remote
-         * CPU once the lock has been released, which doesn't rely on the dec.
+         * Perform the smp_store_release before clearing the lock entry so that
+         * NMIs landing in the unlock path can correctly detect AA issues. The
+         * opposite order shown below may lead to missed AA checks:
          *
-         * Unlike smp_wmb(), release is not a two way fence, hence it is
-         * possible for a inc to move up and reorder with our clearing of the
-         * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
-         * the remote CPU needs to hold this lock, which won't be released until
-         * the store below is done, which would ensure the entry is overwritten
-         * to NULL, etc.
+         * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
+         * <NMI>
+         * res_spin_lock(A) --> missed AA, leads to timeout
+         * </NMI>
+         * smp_store_release(A->locked, 0)
          */
         smp_store_release(&lock->locked, 0);
+        if (likely(rqh->cnt <= RES_NR_HELD))
+                WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
         this_cpu_dec(rqspinlock_held_locks.cnt);
 }
 
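
To make the ordering argument in these comments easier to follow outside the kernel tree, here is a minimal user-space sketch of the same idea. Everything in it (demo_lock, held_locks, detects_aa) is invented for illustration and is not the rqspinlock API; the point is purely the ordering: record the acquisition attempt in the per-CPU table before the cmpxchg, and on unlock release the lock word before clearing the table entry, so a reentrant NMI-style attempt on the same CPU always finds the entry and reports AA instead of spinning until timeout.

/* Illustrative user-space model only -- not the kernel rqspinlock code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_HELD 31

static _Thread_local void *held_locks[NR_HELD];
static _Thread_local int held_cnt;

struct demo_lock { atomic_int val; };

static bool detects_aa(struct demo_lock *lock)
{
        for (int i = 0; i < held_cnt && i < NR_HELD; i++)
                if (held_locks[i] == lock)
                        return true;    /* AA: this CPU already wants/holds it */
        return false;
}

static int demo_lock_acquire(struct demo_lock *lock)
{
        if (detects_aa(lock))
                return -1;              /* kernel: -EDEADLK */
        /* Record the attempt *before* the cmpxchg, mirroring the patch. */
        if (held_cnt < NR_HELD)
                held_locks[held_cnt] = lock;
        held_cnt++;
        int expected = 0;
        if (atomic_compare_exchange_strong(&lock->val, &expected, 1))
                return 0;
        return 1;                       /* kernel: fall into the slow path */
}

static void demo_lock_release(struct demo_lock *lock)
{
        /*
         * Release the lock word before clearing the table entry: the opposite
         * order opens a window where a reentrant caller sees a locked lock but
         * no table entry, misses the AA diagnosis, and times out.
         */
        atomic_store_explicit(&lock->val, 0, memory_order_release);
        if (held_cnt <= NR_HELD)
                held_locks[held_cnt - 1] = NULL;
        held_cnt--;
}

int main(void)
{
        struct demo_lock a = { .val = 0 };

        demo_lock_acquire(&a);
        /* A reentrant attempt (think: NMI on this CPU) sees the entry. */
        printf("reentrant attempt returns %d (AA detected)\n",
               demo_lock_acquire(&a));
        demo_lock_release(&a);
        return 0;
}

Compile with any C11 compiler (e.g. gcc -std=c11); the second acquire fails because the entry recorded before the cmpxchg is already visible to the reentrant caller.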

kernel/bpf/bpf_insn_array.c

Lines changed: 3 additions & 0 deletions
@@ -55,6 +55,9 @@ static struct bpf_map *insn_array_alloc(union bpf_attr *attr)
 
         bpf_map_init_from_attr(&insn_array->map, attr);
 
+        /* BPF programs aren't allowed to write to the map */
+        insn_array->map.map_flags |= BPF_F_RDONLY_PROG;
+
         return &insn_array->map;
 }
 
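
For context, BPF_F_RDONLY_PROG is the existing generic flag meaning "read-only from the BPF program side, still writable from user space". A rough stand-alone illustration of the flag's effect, using an ordinary array map and libbpf's bpf_map_create() rather than the insn_array allocation path above (map name and sizes are arbitrary):

/* Illustration only: an ordinary ARRAY map created with BPF_F_RDONLY_PROG.
 * User space may still update it via bpf(BPF_MAP_UPDATE_ELEM); BPF programs
 * that try to store into its values are rejected by the verifier.
 */
#include <bpf/bpf.h>
#include <linux/bpf.h>
#include <stdio.h>

int main(void)
{
        LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_RDONLY_PROG);
        int fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "ro_demo",
                                sizeof(int), sizeof(long), 4, &opts);
        if (fd < 0) {
                perror("bpf_map_create");
                return 1;
        }

        int key = 0;
        long val = 42;
        /* Updates from user space still succeed despite BPF_F_RDONLY_PROG. */
        if (bpf_map_update_elem(fd, &key, &val, BPF_ANY))
                perror("bpf_map_update_elem");
        return 0;
}

A program loaded against such a map can read its values but cannot store into them, which appears to be why the explicit "writes into insn_array not allowed" check is dropped from verifier.c further down in this commit.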

kernel/bpf/bpf_lsm.c

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ BTF_ID(func, bpf_lsm_key_getsecurity)
 BTF_ID(func, bpf_lsm_audit_rule_match)
 #endif
 BTF_ID(func, bpf_lsm_ismaclabel)
+BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_SET_END(bpf_lsm_disabled_hooks)
 
 /* List of LSM hooks that should operate on 'current' cgroup regardless

kernel/bpf/rqspinlock.c

Lines changed: 33 additions & 36 deletions
@@ -196,32 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
         return 0;
 }
 
-static noinline int check_deadlock(rqspinlock_t *lock, u32 mask)
-{
-        int ret;
-
-        ret = check_deadlock_AA(lock);
-        if (ret)
-                return ret;
-        ret = check_deadlock_ABBA(lock, mask);
-        if (ret)
-                return ret;
-
-        return 0;
-}
-
 static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
                                   struct rqspinlock_timeout *ts)
 {
-        u64 time = ktime_get_mono_fast_ns();
         u64 prev = ts->cur;
+        u64 time;
 
         if (!ts->timeout_end) {
-                ts->cur = time;
-                ts->timeout_end = time + ts->duration;
+                if (check_deadlock_AA(lock))
+                        return -EDEADLK;
+                ts->cur = ktime_get_mono_fast_ns();
+                ts->timeout_end = ts->cur + ts->duration;
                 return 0;
         }
 
+        time = ktime_get_mono_fast_ns();
         if (time > ts->timeout_end)
                 return -ETIMEDOUT;
 
@@ -231,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
          */
         if (prev + NSEC_PER_MSEC < time) {
                 ts->cur = time;
-                return check_deadlock(lock, mask);
+                return check_deadlock_ABBA(lock, mask);
         }
 
         return 0;
@@ -275,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
         int val, ret = 0;
 
         RES_INIT_TIMEOUT(ts);
+        /*
+         * The fast path is not invoked for the TAS fallback, so we must grab
+         * the deadlock detection entry here.
+         */
         grab_held_lock_entry(lock);
 
         /*
@@ -397,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
                 goto queue;
         }
 
-        /*
-         * Grab an entry in the held locks array, to enable deadlock detection.
-         */
-        grab_held_lock_entry(lock);
+        /* Deadlock detection entry already held after failing fast path. */
 
         /*
          * We're pending, wait for the owner to go away.
@@ -447,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
          * queuing.
          */
 queue:
-        lockevent_inc(lock_slowpath);
         /*
-         * Grab deadlock detection entry for the queue path.
+         * Do not queue if we're a waiter and someone is attempting this lock on
+         * the same CPU. In case of NMIs, this prevents long timeouts where we
+         * interrupt the pending waiter, and the owner, that will eventually
+         * signal the head of our queue, both of which are logically but not
+         * physically part of the queue, hence outside the scope of the idx > 0
+         * check above for the trylock fallback.
          */
-        grab_held_lock_entry(lock);
+        if (check_deadlock_AA(lock)) {
+                ret = -EDEADLK;
+                goto err_release_entry;
+        }
 
+        lockevent_inc(lock_slowpath);
+        /* Deadlock detection entry already held after failing fast path. */
         node = this_cpu_ptr(&rqnodes[0].mcs);
         idx = node->count++;
         tail = encode_tail(smp_processor_id(), idx);
@@ -464,19 +463,17 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
          * not be nested NMIs taking spinlocks. That may not be true in
          * some architectures even though the chance of needing more than
          * 4 nodes will still be extremely unlikely. When that happens,
-         * we fall back to spinning on the lock directly without using
-         * any MCS node. This is not the most elegant solution, but is
-         * simple enough.
+         * we fall back to attempting a trylock operation without using
+         * any MCS node. Unlike qspinlock which cannot fail, we have the
+         * option of failing the slow path, and under contention, such a
+         * trylock spinning will likely be treated unfairly due to lack of
+         * queueing, hence do not spin.
          */
-        if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
+        if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
                 lockevent_inc(lock_no_node);
-                RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
-                while (!queued_spin_trylock(lock)) {
-                        if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
-                                lockevent_inc(rqspinlock_lock_timeout);
-                                goto err_release_node;
-                        }
-                        cpu_relax();
+                if (!queued_spin_trylock(lock)) {
+                        ret = -EDEADLK;
+                        goto err_release_node;
                 }
                 goto release;
         }
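
The check_timeout() rework above keeps the cheap AA check for the very first invocation (before the deadline is armed) and rate-limits the more expensive ABBA scan to roughly once per millisecond while waiting out the timeout; the check_deadlock() wrapper is no longer needed. A rough user-space model of that control flow, with placeholder names (timeout_state, check_aa, check_abba) that are not the kernel helpers:

/* Stand-alone model of the rate-limited deadlock checks; not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_MSEC 1000000ULL

struct timeout_state {
        uint64_t cur;           /* timestamp of the last ABBA scan */
        uint64_t timeout_end;   /* absolute deadline, 0 until first call */
        uint64_t duration;      /* total wait budget in ns */
};

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Placeholders for the real detectors. */
static bool check_aa(void)   { return false; }
static bool check_abba(void) { return false; }

static int check_timeout_model(struct timeout_state *ts)
{
        uint64_t prev = ts->cur;
        uint64_t time;

        if (!ts->timeout_end) {
                /* First call: AA is cheap, do it exactly once up front. */
                if (check_aa())
                        return -1;              /* kernel: -EDEADLK */
                ts->cur = now_ns();
                ts->timeout_end = ts->cur + ts->duration;
                return 0;
        }

        time = now_ns();
        if (time > ts->timeout_end)
                return -2;                      /* kernel: -ETIMEDOUT */

        /* Rate-limit the expensive ABBA scan to ~once per millisecond. */
        if (prev + NSEC_PER_MSEC < time) {
                ts->cur = time;
                if (check_abba())
                        return -1;              /* kernel: -EDEADLK */
        }
        return 0;
}

int main(void)
{
        struct timeout_state ts = { .duration = 250 * NSEC_PER_MSEC };

        while (check_timeout_model(&ts) == 0)
                ;       /* spins for ~250ms, then returns the timeout code */
        return 0;
}

In the real slow path the AA check also runs directly before queueing (the new check_deadlock_AA() call under the queue: label), so only the ABBA scan stays on the millisecond poll.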

kernel/bpf/syscall.c

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
          */
         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
             map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
-                synchronize_rcu();
+                synchronize_rcu_expedited();
 }
 
 static void unpin_uptr_kaddr(void *kaddr)

kernel/bpf/verifier.c

Lines changed: 9 additions & 10 deletions
@@ -6482,6 +6482,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
                 break;
         case PTR_TO_MAP_VALUE:
                 pointer_desc = "value ";
+                if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+                        strict = true;
                 break;
         case PTR_TO_CTX:
                 pointer_desc = "context ";
@@ -7529,16 +7531,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 {
         struct bpf_reg_state *regs = cur_regs(env);
         struct bpf_reg_state *reg = regs + regno;
-        bool insn_array = reg->type == PTR_TO_MAP_VALUE &&
-                          reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY;
         int size, err = 0;
 
         size = bpf_size_to_bytes(bpf_size);
         if (size < 0)
                 return size;
 
         /* alignment checks will add in reg->off themselves */
-        err = check_ptr_alignment(env, reg, off, size, strict_alignment_once || insn_array);
+        err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
         if (err)
                 return err;
 
@@ -7565,11 +7565,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                         verbose(env, "R%d leaks addr into map\n", value_regno);
                         return -EACCES;
                 }
-                if (t == BPF_WRITE && insn_array) {
-                        verbose(env, "writes into insn_array not allowed\n");
-                        return -EACCES;
-                }
-
                 err = check_map_access_type(env, regno, off, size, t);
                 if (err)
                         return err;
@@ -7584,10 +7579,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
         } else if (t == BPF_READ && value_regno >= 0) {
                 struct bpf_map *map = reg->map_ptr;
 
-                /* if map is read-only, track its contents as scalars */
+                /*
+                 * If map is read-only, track its contents as scalars,
+                 * unless it is an insn array (see the special case below)
+                 */
                 if (tnum_is_const(reg->var_off) &&
                     bpf_map_is_rdonly(map) &&
-                    map->ops->map_direct_value_addr) {
+                    map->ops->map_direct_value_addr &&
+                    map->map_type != BPF_MAP_TYPE_INSN_ARRAY) {
                         int map_off = off + reg->var_off.value;
                         u64 val = 0;
 

kernel/trace/bpf_trace.c

Lines changed: 1 addition & 1 deletion
@@ -2529,7 +2529,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
         return run_ctx->entry_ip;
 }
 
-static int
+static __always_inline int
 kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
                            unsigned long entry_ip, struct ftrace_regs *fregs,
                            bool is_return, void *data)

tools/bpf/Makefile

Lines changed: 3 additions & 10 deletions
@@ -32,7 +32,7 @@ FEATURE_TESTS = libbfd disassembler-four-args disassembler-init-styled
 FEATURE_DISPLAY = libbfd
 
 check_feat := 1
-NON_CHECK_FEAT_TARGETS := clean bpftool_clean runqslower_clean resolve_btfids_clean
+NON_CHECK_FEAT_TARGETS := clean bpftool_clean resolve_btfids_clean
 ifdef MAKECMDGOALS
 ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),)
   check_feat := 0
@@ -70,7 +70,7 @@ $(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c
 
 PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm
 
-all: $(PROGS) bpftool runqslower
+all: $(PROGS) bpftool
 
 $(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm'
 $(OUTPUT)bpf_jit_disasm: $(OUTPUT)bpf_jit_disasm.o
@@ -86,7 +86,7 @@ $(OUTPUT)bpf_exp.lex.c: $(OUTPUT)bpf_exp.yacc.c
 $(OUTPUT)bpf_exp.yacc.o: $(OUTPUT)bpf_exp.yacc.c
 $(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.lex.c
 
-clean: bpftool_clean runqslower_clean resolve_btfids_clean
+clean: bpftool_clean resolve_btfids_clean
         $(call QUIET_CLEAN, bpf-progs)
         $(Q)$(RM) -r -- $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
         $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.*
@@ -112,18 +112,11 @@ bpftool_install:
 bpftool_clean:
         $(call descend,bpftool,clean)
 
-runqslower:
-        $(call descend,runqslower)
-
-runqslower_clean:
-        $(call descend,runqslower,clean)
-
 resolve_btfids:
         $(call descend,resolve_btfids)
 
 resolve_btfids_clean:
         $(call descend,resolve_btfids,clean)
 
 .PHONY: all install clean bpftool bpftool_install bpftool_clean \
-        runqslower runqslower_clean \
         resolve_btfids resolve_btfids_clean

tools/bpf/runqslower/.gitignore

Lines changed: 0 additions & 2 deletions
This file was deleted.
