Commit b5f2170
Merge tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf
Pull bpf fixes from Daniel Borkmann:

 - Fix several issues for BPF LPM trie map which were found by syzbot
   and during addition of new test cases (Hou Tao)

 - Fix a missing process_iter_arg register type check in the BPF
   verifier (Kumar Kartikeya Dwivedi, Tao Lyu)

 - Fix several correctness gaps in the BPF verifier when interacting
   with the BPF stack without CAP_PERFMON (Kumar Kartikeya Dwivedi,
   Eduard Zingerman, Tao Lyu)

 - Fix OOB BPF map writes when deleting elements for the case of xsk
   map as well as devmap (Maciej Fijalkowski)

 - Fix xsk sockets to always clear DMA mapping information when
   unmapping the pool (Larysa Zaremba)

 - Fix sk_mem_uncharge logic in tcp_bpf_sendmsg to only uncharge after
   sent bytes have been finalized (Zijian Zhang)

 - Fix BPF sockmap with vsocks which was missing a queue check in poll
   and sockmap cleanup on close (Michal Luczaj)

 - Fix tools infra to override makefile ARCH variable if defined but
   empty, which addresses cross-building tools (Björn Töpel)

 - Fix two resolve_btfids build warnings on unresolved bpf_lsm symbols
   (Thomas Weißschuh)

 - Fix a NULL pointer dereference in bpftool (Amir Mohammadi)

 - Fix BPF selftests to check for CONFIG_PREEMPTION instead of
   CONFIG_PREEMPT (Sebastian Andrzej Siewior)

* tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf: (31 commits)
  selftests/bpf: Add more test cases for LPM trie
  selftests/bpf: Move test_lpm_map.c to map_tests
  bpf: Use raw_spinlock_t for LPM trie
  bpf: Switch to bpf mem allocator for LPM trie
  bpf: Fix exact match conditions in trie_get_next_key()
  bpf: Handle in-place update for full LPM trie correctly
  bpf: Handle BPF_EXIST and BPF_NOEXIST for LPM trie
  bpf: Remove unnecessary kfree(im_node) in lpm_trie_update_elem
  bpf: Remove unnecessary check when updating LPM trie
  selftests/bpf: Add test for narrow spill into 64-bit spilled scalar
  selftests/bpf: Add test for reading from STACK_INVALID slots
  selftests/bpf: Introduce __caps_unpriv annotation for tests
  bpf: Fix narrow scalar spill onto 64-bit spilled scalar slots
  bpf: Don't mark STACK_INVALID as STACK_MISC in mark_stack_slot_misc
  samples/bpf: Remove unnecessary -I flags from libbpf EXTRA_CFLAGS
  bpf: Zero index arg error string for dynptr and iter
  selftests/bpf: Add tests for iter arg check
  bpf: Ensure reg is PTR_TO_STACK in process_iter_arg
  tools: Override makefile ARCH variable if defined, but empty
  selftests/bpf: Add apply_bytes test to test_txmsg_redir_wait_sndmem in test_sockmap
  ...
2 parents f3ddc43 + 509df67 commit b5f2170

31 files changed: +813, -174 lines
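As background for the BPF_EXIST/BPF_NOEXIST handling fixed in the lpm_trie.c diff below, here is a minimal userspace sketch of how those update flags are expected to behave on an LPM trie map. It is illustrative only: the key struct, map name, and sizes are assumptions, not code from this merge; it uses the standard libbpf bpf_map_create()/bpf_map_update_elem() API.

	/* Hedged sketch: LPM trie update-flag semantics from userspace.
	 * Needs libbpf and CAP_BPF (or root) to run.
	 */
	#include <arpa/inet.h>
	#include <linux/bpf.h>
	#include <bpf/bpf.h>
	#include <stdio.h>

	/* Hypothetical IPv4 key layout; prefixlen must come first, as in
	 * the UAPI struct bpf_lpm_trie_key_u8.
	 */
	struct ipv4_lpm_key {
		__u32 prefixlen;
		__u8  data[4];
	};

	int main(void)
	{
		LIBBPF_OPTS(bpf_map_create_opts, opts,
			    .map_flags = BPF_F_NO_PREALLOC); /* LPM tries require this */
		struct ipv4_lpm_key key = { .prefixlen = 24 };
		__u32 val = 1;
		int fd;

		inet_pton(AF_INET, "192.168.0.0", key.data);
		fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_demo",
				    sizeof(key), sizeof(val), 128, &opts);
		if (fd < 0)
			return 1;

		/* BPF_NOEXIST: create only -- the second call must fail (EEXIST) */
		bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST);
		if (bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST))
			perror("update, expecting EEXIST");

		/* BPF_EXIST: update only -- fails (ENOENT) for an absent prefix */
		key.prefixlen = 16;
		if (bpf_map_update_elem(fd, &key, &val, BPF_EXIST))
			perror("update, expecting ENOENT");

		return 0;
	}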

kernel/bpf/bpf_lsm.c (-2)

@@ -375,8 +375,6 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
 
 BTF_ID(func, bpf_lsm_syslog)
 BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_current_getsecid_subj)
-BTF_ID(func, bpf_lsm_task_getsecid_obj)
 BTF_ID(func, bpf_lsm_task_prctl)
 BTF_ID(func, bpf_lsm_task_setscheduler)
 BTF_ID(func, bpf_lsm_task_to_inode)
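For context on the resolve_btfids warnings this addresses: BTF_ID(func, name) does not reference the function directly; it reserves a zero-filled slot in a dedicated ELF section which the resolve_btfids build tool later fills with the function's BTF type id. When the hook itself is removed, the leftover entry can no longer be resolved and the tool warns. A rough, simplified sketch of the idea only; the real macro in include/linux/btf_ids.h carries more bookkeeping:

	/* Rough sketch -- see include/linux/btf_ids.h for the real macro.
	 * Each use emits a named, zero-initialized 4-byte slot into the
	 * .BTF_ids section; resolve_btfids patches the slot with the BTF
	 * type id of "name" once the kernel's BTF has been generated.
	 */
	#define BTF_ID_SKETCH(prefix, name)			\
		asm(".pushsection .BTF_ids,\"a\"\n"		\
		    "__BTF_ID__" #prefix "__" #name ":\n"	\
		    ".zero 4\n"					\
		    ".popsection\n")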

kernel/bpf/devmap.c (+3, -3)

@@ -184,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 static void dev_map_free(struct bpf_map *map)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	int i;
+	u32 i;
 
 	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
 	 * so the programs (can be more than one that used this map) were
@@ -821,7 +821,7 @@ static long dev_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *old_dev;
-	int k = *(u32 *)key;
+	u32 k = *(u32 *)key;
 
 	if (k >= map->max_entries)
 		return -EINVAL;
@@ -838,7 +838,7 @@ static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *old_dev;
-	int k = *(u32 *)key;
+	u32 k = *(u32 *)key;
 	unsigned long flags;
 	int ret = -ENOENT;
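Both delete paths above used a signed int for a user-supplied u32 key. A standalone sketch of why that matters (illustrative values, not the exact kernel-side conditions): once max_entries exceeds INT_MAX, a key with the sign bit set can pass the unsigned bounds check yet still be negative when used as an array index.

	/* Standalone illustration of the signed-index pitfall; compile
	 * and run as an ordinary userspace program.
	 */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t max_entries = 0x80000001u;	/* > INT_MAX */
		uint32_t ukey = 0x80000000u;		/* sign bit set */
		int k = (int)ukey;	/* old code: int k = *(u32 *)key */

		/* In the original check, k is implicitly converted back to
		 * unsigned for the comparison (written explicitly here), so
		 * 0x80000000 < 0x80000001 and the bounds check passes...
		 */
		if ((uint32_t)k >= max_entries) {
			puts("rejected");
			return 0;
		}

		/* ...but as a signed array index k is INT_MIN, so array[k]
		 * would address memory far before the array: an OOB write.
		 * Declaring k as u32 makes the check and the index agree.
		 */
		printf("bounds check passed, yet k = %d\n", k);
		return 0;
	}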

kernel/bpf/lpm_trie.c (+85, -48)

@@ -15,14 +15,14 @@
 #include <net/ipv6.h>
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
 
 /* Intermediate node */
 #define LPM_TREE_NODE_FLAG_IM BIT(0)
 
 struct lpm_trie_node;
 
 struct lpm_trie_node {
-	struct rcu_head rcu;
 	struct lpm_trie_node __rcu *child[2];
 	u32 prefixlen;
 	u32 flags;
@@ -32,10 +32,11 @@ struct lpm_trie_node {
 struct lpm_trie {
 	struct bpf_map map;
 	struct lpm_trie_node __rcu *root;
+	struct bpf_mem_alloc ma;
 	size_t n_entries;
 	size_t max_prefixlen;
 	size_t data_size;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 };
 
 /* This trie implements a longest prefix match algorithm that can be used to
@@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 	return found->data + trie->data_size;
 }
 
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
-						 const void *value)
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
+						 const void *value,
+						 bool disable_migration)
 {
 	struct lpm_trie_node *node;
-	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
 
-	if (value)
-		size += trie->map.value_size;
+	if (disable_migration)
+		migrate_disable();
+	node = bpf_mem_cache_alloc(&trie->ma);
+	if (disable_migration)
+		migrate_enable();
 
-	node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
-				    trie->map.numa_node);
 	if (!node)
 		return NULL;
 
@@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
 	return node;
 }
 
+static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
+{
+	if (flags == BPF_EXIST)
+		return -ENOENT;
+	if (trie->n_entries == trie->map.max_entries)
+		return -ENOSPC;
+	trie->n_entries++;
+	return 0;
+}
+
 /* Called from syscall or from eBPF program */
 static long trie_update_elem(struct bpf_map *map,
 			     void *_key, void *value, u64 flags)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
-	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+	struct lpm_trie_node *node, *im_node, *new_node;
 	struct lpm_trie_node *free_node = NULL;
 	struct lpm_trie_node __rcu **slot;
 	struct bpf_lpm_trie_key_u8 *key = _key;
@@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map,
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	spin_lock_irqsave(&trie->lock, irq_flags);
-
-	/* Allocate and fill a new node */
-
-	if (trie->n_entries == trie->map.max_entries) {
-		ret = -ENOSPC;
-		goto out;
-	}
-
-	new_node = lpm_trie_node_alloc(trie, value);
-	if (!new_node) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	/* Allocate and fill a new node. Need to disable migration before
+	 * invoking bpf_mem_cache_alloc().
+	 */
+	new_node = lpm_trie_node_alloc(trie, value, true);
+	if (!new_node)
+		return -ENOMEM;
 
-	trie->n_entries++;
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	new_node->prefixlen = key->prefixlen;
 	RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map,
 		matchlen = longest_prefix_match(trie, node, key);
 
 		if (node->prefixlen != matchlen ||
-		    node->prefixlen == key->prefixlen ||
-		    node->prefixlen == trie->max_prefixlen)
+		    node->prefixlen == key->prefixlen)
 			break;
 
 		next_bit = extract_bit(key->data, node->prefixlen);
@@ -376,6 +379,10 @@
 	 * simply assign the @new_node to that slot and be done.
 	 */
 	if (!node) {
+		ret = trie_check_add_elem(trie, flags);
+		if (ret)
+			goto out;
+
 		rcu_assign_pointer(*slot, new_node);
 		goto out;
 	}
@@ -384,18 +391,30 @@
 	 * which already has the correct data array set.
 	 */
 	if (node->prefixlen == matchlen) {
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
+			if (flags == BPF_NOEXIST) {
+				ret = -EEXIST;
+				goto out;
+			}
+		} else {
+			ret = trie_check_add_elem(trie, flags);
+			if (ret)
+				goto out;
+		}
+
 		new_node->child[0] = node->child[0];
 		new_node->child[1] = node->child[1];
 
-		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
-			trie->n_entries--;
-
 		rcu_assign_pointer(*slot, new_node);
 		free_node = node;
 
 		goto out;
 	}
 
+	ret = trie_check_add_elem(trie, flags);
+	if (ret)
+		goto out;
+
 	/* If the new node matches the prefix completely, it must be inserted
 	 * as an ancestor. Simply insert it between @node and *@slot.
 	 */
@@ -406,8 +425,10 @@
 		goto out;
 	}
 
-	im_node = lpm_trie_node_alloc(trie, NULL);
+	/* migration is disabled within the locked scope */
+	im_node = lpm_trie_node_alloc(trie, NULL, false);
 	if (!im_node) {
+		trie->n_entries--;
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -429,16 +450,13 @@
 	rcu_assign_pointer(*slot, im_node);
 
 out:
-	if (ret) {
-		if (new_node)
-			trie->n_entries--;
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 
-		kfree(new_node);
-		kfree(im_node);
-	}
-
-	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_node, rcu);
+	migrate_disable();
+	if (ret)
+		bpf_mem_cache_free(&trie->ma, new_node);
+	bpf_mem_cache_free_rcu(&trie->ma, free_node);
+	migrate_enable();
 
 	return ret;
 }
@@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	spin_lock_irqsave(&trie->lock, irq_flags);
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Walk the tree looking for an exact key/length match and keeping
 	 * track of the path we traverse. We will need to know the node
@@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 	free_node = node;
 
 out:
-	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_parent, rcu);
-	kfree_rcu(free_node, rcu);
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	migrate_disable();
+	bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+	bpf_mem_cache_free_rcu(&trie->ma, free_node);
+	migrate_enable();
 
 	return ret;
 }
@@ -559,6 +580,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
 	struct lpm_trie *trie;
+	size_t leaf_size;
+	int err;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 ||
@@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 		   offsetof(struct bpf_lpm_trie_key_u8, data);
 	trie->max_prefixlen = trie->data_size * 8;
 
-	spin_lock_init(&trie->lock);
+	raw_spin_lock_init(&trie->lock);
 
+	/* Allocate intermediate and leaf nodes from the same allocator */
+	leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+		    trie->map.value_size;
+	err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+	if (err)
+		goto free_out;
 	return &trie->map;
+
+free_out:
+	bpf_map_area_free(trie);
+	return ERR_PTR(err);
 }
 
 static void trie_free(struct bpf_map *map)
@@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map)
 			continue;
 		}
 
-		kfree(node);
+		/* No bpf program may access the map, so freeing the
+		 * node without waiting for the extra RCU GP.
+		 */
+		bpf_mem_cache_raw_free(node);
 		RCU_INIT_POINTER(*slot, NULL);
 		break;
 	}
 }
 
 out:
+	bpf_mem_alloc_destroy(&trie->ma);
 	bpf_map_area_free(trie);
 }
 
@@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 	struct lpm_trie_node **node_stack = NULL;
 	int err = 0, stack_ptr = -1;
 	unsigned int next_bit;
-	size_t matchlen;
+	size_t matchlen = 0;
 
 	/* The get_next_key follows postorder. For the 4 node example in
 	 * the top of this file, the trie_get_next_key() returns the following
@@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 		next_bit = extract_bit(key->data, node->prefixlen);
 		node = rcu_dereference(node->child[next_bit]);
 	}
-	if (!node || node->prefixlen != key->prefixlen ||
+	if (!node || node->prefixlen != matchlen ||
 	    (node->flags & LPM_TREE_NODE_FLAG_IM))
 		goto find_leftmost;
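The trie_get_next_key() change above tightens the exact-match test used when resuming iteration. From userspace, such a walk looks roughly like the sketch below; map_fd is hypothetical and the IPv4 key layout is the same assumed struct as in the earlier example, using the standard libbpf bpf_map_get_next_key() helper.

	/* Hedged sketch: iterate all prefixes stored in an LPM trie. */
	#include <bpf/bpf.h>
	#include <errno.h>
	#include <stdio.h>

	struct ipv4_lpm_key {
		__u32 prefixlen;
		__u8  data[4];
	};

	static void dump_lpm_keys(int map_fd)
	{
		struct ipv4_lpm_key key, next, *cur = NULL;

		/* A NULL "current" key asks for the first key in the map. */
		while (!bpf_map_get_next_key(map_fd, cur, &next)) {
			printf("%u.%u.%u.%u/%u\n", next.data[0], next.data[1],
			       next.data[2], next.data[3], next.prefixlen);
			key = next;
			cur = &key;
		}
		if (errno != ENOENT)	/* ENOENT just marks the end of the walk */
			perror("bpf_map_get_next_key");
	}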
