Commit cf559a4

Authored and committed by Alexei Starovoitov
Merge branch 'bpf-fixes-for-per-cpu-kptr'
Hou Tao says:

====================
bpf: Fixes for per-cpu kptr

From: Hou Tao <houtao1@huawei.com>

Hi,

The patchset aims to fix the problems found during the review of the per-cpu
kptr patch-set [0]. Patch #1 moves pcpu_lock after the invocation of
pcpu_chunk_addr_search() and is a micro-optimization for free_percpu(). The
reason for including it in the patchset is that the same logic is used in the
newly-added API pcpu_alloc_size(). Patch #2 introduces pcpu_alloc_size() for
the dynamic per-cpu area. Patches #2 and #3 use pcpu_alloc_size() to check
whether unit_size matches the size of the underlying per-cpu area and to
select a matching bpf_mem_cache. Patch #4 fixes the freeing of per-cpu kptrs
when these kptrs are freed by map destruction. The last patch adds test cases
for these problems.

Please see the individual patches for details. Comments are always welcome.

Change Log:

v3:
  * rebased on bpf-next
  * patch 2: update the API document to note that pcpu_alloc_size() doesn't
    support statically allocated per-cpu areas. (Dennis)
  * patch 1 & 2: add Acked-by from Dennis

v2: https://lore.kernel.org/bpf/20231018113343.2446300-1-houtao@huaweicloud.com/
  * add a new patch "don't acquire pcpu_lock for pcpu_chunk_addr_search()"
  * patch 2: change the type of bit_off and end to unsigned long (Andrew)
  * patch 2: rename the new API to pcpu_alloc_size and follow the 80-column
    convention (Dennis)
  * patch 5: move the common declaration into bpf.h (Stanislav, Alexei)

v1: https://lore.kernel.org/bpf/20231007135106.3031284-1-houtao@huaweicloud.com/

[0]: https://lore.kernel.org/bpf/20230827152729.1995219-1-yonghong.song@linux.dev
====================

Link: https://lore.kernel.org/r/20231020133202.4043247-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2 parents: da1055b + d440ba9

9 files changed: +270, -36 lines changed


include/linux/bpf.h

Lines changed: 1 addition & 0 deletions
@@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);

include/linux/bpf_mem_alloc.h

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
         struct bpf_mem_caches __percpu *caches;
         struct bpf_mem_cache __percpu *cache;
+        bool percpu;
         struct work_struct work;
 };
 

include/linux/percpu.h

Lines changed: 1 addition & 0 deletions
@@ -132,6 +132,7 @@ extern void __init setup_per_cpu_areas(void);
 extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
 extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
 extern void free_percpu(void __percpu *__pdata);
+extern size_t pcpu_alloc_size(void __percpu *__pdata);
 
 DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
 

kernel/bpf/helpers.c

Lines changed: 14 additions & 10 deletions
@@ -1811,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
         }
 }
 
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
                         struct bpf_spin_lock *spin_lock)
 {
@@ -1844,7 +1842,7 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
          * bpf_list_head which needs to be freed.
          */
         migrate_disable();
-        __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+        __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
         migrate_enable();
         }
 }
@@ -1883,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 
 
         migrate_disable();
-        __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+        __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
         migrate_enable();
         }
 }
@@ -1915,8 +1913,10 @@ __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 }
 
 /* Must be called under migrate_disable(), as required by bpf_mem_free */
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
 {
+        struct bpf_mem_alloc *ma;
+
         if (rec && rec->refcount_off >= 0 &&
             !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
                 /* Object is refcounted and refcount_dec didn't result in 0
@@ -1928,18 +1928,22 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
         if (rec)
                 bpf_obj_free_fields(rec, p);
 
+        if (percpu)
+                ma = &bpf_global_percpu_ma;
+        else
+                ma = &bpf_global_ma;
         if (rec && rec->refcount_off >= 0)
-                bpf_mem_free_rcu(&bpf_global_ma, p);
+                bpf_mem_free_rcu(ma, p);
         else
-                bpf_mem_free(&bpf_global_ma, p);
+                bpf_mem_free(ma, p);
 }
 
 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 {
         struct btf_struct_meta *meta = meta__ign;
         void *p = p__alloc;
 
-        __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
+        __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
 }
 
 __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1983,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
          */
         if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
                 /* Only called from BPF prog, no need to migrate_disable */
-                __bpf_obj_drop_impl((void *)n - off, rec);
+                __bpf_obj_drop_impl((void *)n - off, rec, false);
                 return -EINVAL;
         }
 
@@ -2082,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
          */
         if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
                 /* Only called from BPF prog, no need to migrate_disable */
-                __bpf_obj_drop_impl((void *)n - off, rec);
+                __bpf_obj_drop_impl((void *)n - off, rec, false);
                 return -EINVAL;
         }
 

kernel/bpf/memalloc.c

Lines changed: 26 additions & 12 deletions
@@ -491,21 +491,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
         struct llist_node *first;
         unsigned int obj_size;
 
-        /* For per-cpu allocator, the size of free objects in free list doesn't
-         * match with unit_size and now there is no way to get the size of
-         * per-cpu pointer saved in free object, so just skip the checking.
-         */
-        if (c->percpu_size)
-                return 0;
-
         first = c->free_llist.first;
         if (!first)
                 return 0;
 
-        obj_size = ksize(first);
+        if (c->percpu_size)
+                obj_size = pcpu_alloc_size(((void **)first)[1]);
+        else
+                obj_size = ksize(first);
         if (obj_size != c->unit_size) {
-                WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
-                          idx, obj_size, c->unit_size);
+                WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n",
+                          idx, c->percpu_size, obj_size, c->unit_size);
                 return -EINVAL;
         }
         return 0;
@@ -529,6 +525,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
         /* room for llist_node and per-cpu pointer */
         if (percpu)
                 percpu_size = LLIST_NODE_SZ + sizeof(void *);
+        ma->percpu = percpu;
 
         if (size) {
                 pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -878,14 +875,25 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
         return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
+static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
+{
+        size_t size;
+
+        if (percpu)
+                size = pcpu_alloc_size(*((void **)ptr));
+        else
+                size = ksize(ptr - LLIST_NODE_SZ);
+        return bpf_mem_cache_idx(size);
+}
+
 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 {
         int idx;
 
         if (!ptr)
                 return;
 
-        idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+        idx = bpf_mem_free_idx(ptr, ma->percpu);
         if (idx < 0)
                 return;
 
@@ -899,7 +907,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
         if (!ptr)
                 return;
 
-        idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+        idx = bpf_mem_free_idx(ptr, ma->percpu);
         if (idx < 0)
                 return;
 
@@ -973,6 +981,12 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
         return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
+/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the
+ * actual size of dynamic per-cpu area will always be matched and there is
+ * no need to adjust size_index for per-cpu allocation. However for the
+ * simplicity of the implementation, use an unified size_index for both
+ * kmalloc and per-cpu allocation.
+ */
 static __init int bpf_mem_cache_adjust_size(void)
 {
         unsigned int size;
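
For orientation, below is a hedged sketch of the per-cpu free-object layout these hunks rely on. The struct and field names are invented for illustration; only the pointer arithmetic mirrors what check_obj_size() and bpf_mem_free_idx() actually do.

/* Hypothetical mirror of a free-list element in a per-cpu bpf_mem_cache
 * (percpu_size == LLIST_NODE_SZ + sizeof(void *)); names are illustrative.
 */
#include <linux/llist.h>
#include <linux/percpu.h>

struct pcpu_free_obj_sketch {
        struct llist_node node;         /* first LLIST_NODE_SZ bytes */
        void __percpu *pcpu_data;       /* dynamic per-cpu area handed out */
};

/* check_obj_size() holds the llist_node pointer, so ((void **)first)[1] reads
 * ->pcpu_data; bpf_mem_free() and bpf_mem_free_rcu() hold the pointer just
 * past the llist_node, so *(void **)ptr reads the same field. Both now pass
 * it to pcpu_alloc_size(), so the cache index is derived from the real size
 * of the per-cpu area rather than from ksize() of this small box.
 */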

kernel/bpf/syscall.c

Lines changed: 2 additions & 4 deletions
@@ -626,8 +626,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
         bpf_timer_cancel_and_free(obj + rec->timer_off);
 }
 
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
         const struct btf_field *fields;
@@ -662,8 +660,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
                                             field->kptr.btf_id);
                         migrate_disable();
                         __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
-                                                         pointee_struct_meta->record :
-                                                         NULL);
+                                                         pointee_struct_meta->record : NULL,
+                                            fields[i].type == BPF_KPTR_PERCPU);
                         migrate_enable();
                 } else {
                         field->kptr.dtor(xchgd_field);

mm/percpu.c

Lines changed: 32 additions & 3 deletions
@@ -2244,6 +2244,37 @@ static void pcpu_balance_workfn(struct work_struct *work)
         mutex_unlock(&pcpu_alloc_mutex);
 }
 
+/**
+ * pcpu_alloc_size - the size of the dynamic percpu area
+ * @ptr: pointer to the dynamic percpu area
+ *
+ * Returns the size of the @ptr allocation. This is undefined for statically
+ * defined percpu variables as there is no corresponding chunk->bound_map.
+ *
+ * RETURNS:
+ * The size of the dynamic percpu area.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
+ */
+size_t pcpu_alloc_size(void __percpu *ptr)
+{
+        struct pcpu_chunk *chunk;
+        unsigned long bit_off, end;
+        void *addr;
+
+        if (!ptr)
+                return 0;
+
+        addr = __pcpu_ptr_to_addr(ptr);
+        /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
+        chunk = pcpu_chunk_addr_search(addr);
+        bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;
+        end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+                            bit_off + 1);
+        return (end - bit_off) * PCPU_MIN_ALLOC_SIZE;
+}
+
 /**
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
@@ -2267,12 +2298,10 @@ void free_percpu(void __percpu *ptr)
         kmemleak_free_percpu(ptr);
 
         addr = __pcpu_ptr_to_addr(ptr);
-
-        spin_lock_irqsave(&pcpu_lock, flags);
-
         chunk = pcpu_chunk_addr_search(addr);
         off = addr - chunk->base_addr;
 
+        spin_lock_irqsave(&pcpu_lock, flags);
         size = pcpu_free_area(chunk, off);
 
         pcpu_memcg_free_hook(chunk, off, size);
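
As a usage illustration of the new helper, here is a minimal kernel-side sketch. The initcall, its name, and the 16-byte allocation are assumptions made for demonstration and are not part of this commit; only pcpu_alloc_size(), __alloc_percpu() and free_percpu() are the real APIs touched above.

/* Illustrative only: probe pcpu_alloc_size() on a dynamic per-cpu area. */
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/printk.h>

static int __init pcpu_alloc_size_demo(void)
{
        /* Only dynamic allocations are supported; statically defined
         * per-cpu variables have no chunk->bound_map to consult.
         */
        void __percpu *p = __alloc_percpu(16, 8);

        if (!p)
                return -ENOMEM;

        /* Reports the size of the area backing @p, computed from the
         * chunk's bound_map in PCPU_MIN_ALLOC_SIZE granules.
         */
        pr_info("pcpu_alloc_size() = %zu\n", pcpu_alloc_size(p));

        free_percpu(p);
        return 0;
}
late_initcall(pcpu_alloc_size_demo);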

tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c

Lines changed: 19 additions & 1 deletion
@@ -9,9 +9,10 @@
 
 #include "test_bpf_ma.skel.h"
 
-void test_test_bpf_ma(void)
+static void do_bpf_ma_test(const char *name)
 {
         struct test_bpf_ma *skel;
+        struct bpf_program *prog;
         struct btf *btf;
         int i, err;
 
@@ -34,6 +35,11 @@ void test_test_bpf_ma(void)
                 skel->rodata->data_btf_ids[i] = id;
         }
 
+        prog = bpf_object__find_program_by_name(skel->obj, name);
+        if (!ASSERT_OK_PTR(prog, "invalid prog name"))
+                goto out;
+        bpf_program__set_autoload(prog, true);
+
         err = test_bpf_ma__load(skel);
         if (!ASSERT_OK(err, "load"))
                 goto out;
@@ -48,3 +54,15 @@ void test_test_bpf_ma(void)
 out:
         test_bpf_ma__destroy(skel);
 }
+
+void test_test_bpf_ma(void)
+{
+        if (test__start_subtest("batch_alloc_free"))
+                do_bpf_ma_test("test_batch_alloc_free");
+        if (test__start_subtest("free_through_map_free"))
+                do_bpf_ma_test("test_free_through_map_free");
+        if (test__start_subtest("batch_percpu_alloc_free"))
+                do_bpf_ma_test("test_batch_percpu_alloc_free");
+        if (test__start_subtest("percpu_free_through_map_free"))
+                do_bpf_ma_test("test_percpu_free_through_map_free");
+}
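
The new *_free_through_map_free subtests exercise the path fixed in kernel/bpf/helpers.c and kernel/bpf/syscall.c: a per-cpu kptr left behind in a map value must be released through bpf_global_percpu_ma when the map is torn down. Below is a rough BPF-side sketch of that scenario; the struct names, map layout, and attach point are illustrative assumptions, not the actual progs/test_bpf_ma.c code from this series.

/* Illustrative BPF program: leaves a per-cpu object in a map value so that
 * map destruction, not the program, frees it. Not taken from this commit.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

struct pcpu_val {
        long data;
};

struct map_value {
        struct pcpu_val __percpu_kptr *pc;
};

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, int);
        __type(value, struct map_value);
} array SEC(".maps");

SEC("?tp/syscalls/sys_enter_getpgid")
int leave_percpu_kptr_in_map(void *ctx)
{
        struct pcpu_val __percpu_kptr *new, *old;
        struct map_value *v;
        int key = 0;

        v = bpf_map_lookup_elem(&array, &key);
        if (!v)
                return 0;

        new = bpf_percpu_obj_new(struct pcpu_val);
        if (!new)
                return 0;

        /* Intentionally keep the object in the map; when the map is freed,
         * bpf_obj_free_fields() must drop it via
         * __bpf_obj_drop_impl(..., percpu = true).
         */
        old = bpf_kptr_xchg(&v->pc, new);
        if (old)
                bpf_percpu_obj_drop(old);
        return 0;
}

char _license[] SEC("license") = "GPL";

With the selftests built, the new subtests would typically be run with something like ./test_progs -t test_bpf_ma (assumed invocation of the usual selftest runner).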
