
TOMOYO Linux Cross Reference
Linux/kernel/bpf/arena.c


Diff markup

Differences between /kernel/bpf/arena.c (Version linux-6.12-rc7) and /kernel/bpf/arena.c (Version linux-6.11.7)

The file is identical in the two versions, so it is listed once below.

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>

/*
 * bpf_arena is a sparsely populated shared memory region between bpf program and
 * user space process.
 *
 * For example on x86-64 the values could be:
 * user_vm_start 7f7d26200000     // picked by mmap()
 * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
 * For user space all pointers within the arena are normal 8-byte addresses.
 * In this example 7f7d26200000 is the address of the first page (pgoff=0).
 * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
 * (u32)7f7d26200000 -> 26200000
 * hence
 * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
 * kernel memory region.
 *
 * BPF JITs generate the following code to access arena:
 *   mov eax, eax  // eax has lower 32-bit of user pointer
 *   mov word ptr [rax + r12 + off], bx
 * where r12 == kern_vm_start and off is s16.
 * Hence allocate 4Gb + GUARD_SZ/2 on each side.
 *
 * Initially kernel vm_area and user vma are not populated.
 * User space can fault-in any address which will insert the page
 * into kernel and user vma.
 * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
 * which will insert it into kernel vm_area.
 * The later fault-in from user space will populate that page into user vma.
 */

/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
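
To make the address arithmetic above concrete: with a 16-bit 'off' field, GUARD_SZ is 1 << 16 = 64 KB, so KERN_VM_SZ reserves 4 GB plus 32 KB of guard on each side. Below is a minimal standalone sketch (not part of arena.c) replaying the translation with the hypothetical example values from the comment:

/* Sketch: user->kernel arena address translation, example values above. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t user_vm_start = 0x7f7d26200000ULL;     /* picked by mmap() */
        uint64_t kern_vm_start = 0xffffc90001e69000ULL; /* picked by get_vm_area() */
        uint64_t user_ptr = user_vm_start;              /* first page, pgoff=0 */

        /* the JIT keeps only the lower 32 bits of the user pointer... */
        uint32_t lo32 = (uint32_t)user_ptr;
        /* ...and adds the kernel base held in a register (r12 on x86-64) */
        uint64_t kern_ptr = kern_vm_start + lo32;

        assert(lo32 == 0x26200000U);
        assert(kern_ptr == 0xffffc90028069000ULL);
        return 0;
}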

struct bpf_arena {
        struct bpf_map map;
        u64 user_vm_start;
        u64 user_vm_end;
        struct vm_struct *kern_vm;
        struct maple_tree mt;
        struct list_head vma_list;
        struct mutex lock;
};

u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
        return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
}

u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
        return arena ? arena->user_vm_start : 0;
}

static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
        return -EOPNOTSUPP;
}

static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
        return -EOPNOTSUPP;
}

static long arena_map_pop_elem(struct bpf_map *map, void *value)
{
        return -EOPNOTSUPP;
}

static long arena_map_delete_elem(struct bpf_map *map, void *value)
{
        return -EOPNOTSUPP;
}

static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        return -EOPNOTSUPP;
}

static long compute_pgoff(struct bpf_arena *arena, long uaddr)
{
        return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}
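
Using the same example values, a quick worked check of compute_pgoff() (the uaddr here is hypothetical):

/* user_vm_start = 0x7f7d26200000, uaddr = 0x7f7d26203000 (4th page):
 *   uaddr - (u32)user_vm_start = 0x7f7d26203000 - 0x26200000 = 0x7f7d00003000
 *   (u32)0x7f7d00003000 = 0x3000,  0x3000 >> PAGE_SHIFT(12) -> pgoff = 3
 * Only the lower 32 bits matter, matching the arena's 4 GB address model.
 */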

static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
        struct vm_struct *kern_vm;
        int numa_node = bpf_map_attr_numa_node(attr);
        struct bpf_arena *arena;
        u64 vm_range;
        int err = -ENOMEM;

        if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
            /* BPF_F_MMAPABLE must be set */
            !(attr->map_flags & BPF_F_MMAPABLE) ||
            /* No unsupported flags present */
            (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
                return ERR_PTR(-EINVAL);

        if (attr->map_extra & ~PAGE_MASK)
                /* If non-zero the map_extra is an expected user VMA start address */
                return ERR_PTR(-EINVAL);

        vm_range = (u64)attr->max_entries * PAGE_SIZE;
        if (vm_range > SZ_4G)
                return ERR_PTR(-E2BIG);

        if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
                /* user vma must not cross 32-bit boundary */
                return ERR_PTR(-ERANGE);

        kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
        if (!kern_vm)
                return ERR_PTR(-ENOMEM);

        arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
        if (!arena)
                goto err;

        arena->kern_vm = kern_vm;
        arena->user_vm_start = attr->map_extra;
        if (arena->user_vm_start)
                arena->user_vm_end = arena->user_vm_start + vm_range;

        INIT_LIST_HEAD(&arena->vma_list);
        bpf_map_init_from_attr(&arena->map, attr);
        mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
        mutex_init(&arena->lock);

        return &arena->map;
err:
        free_vm_area(kern_vm);
        return ERR_PTR(err);
}
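
For reference, a hedged user-space sketch that satisfies the checks in arena_map_alloc() above (zero key/value size, BPF_F_MMAPABLE set, max_entries counted in pages). It assumes libbpf; the map name and the 256-page size are arbitrary choices:

/* Create a 256-page (1 MB with 4 KB pages) arena map via libbpf. */
#include <bpf/bpf.h>

int create_arena(void)
{
        LIBBPF_OPTS(bpf_map_create_opts, opts,
                .map_flags = BPF_F_MMAPABLE,
                /* optionally: .map_extra = <page-aligned user_vm_start> */
        );

        /* key_size == value_size == 0; max_entries is the size in pages */
        return bpf_map_create(BPF_MAP_TYPE_ARENA, "arena", 0, 0, 256, &opts);
}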

static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
        struct page *page;
        pte_t pte;

        pte = ptep_get(ptep);
        if (!pte_present(pte)) /* sanity check */
                return 0;
        page = pte_page(pte);
        /*
         * We do not update pte here:
         * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
         * 2. TLB flushing is batched or deferred. Even if we clear pte,
         * the TLB entries can stick around and continue to permit access to
         * the freed page. So it all relies on 1.
         */
        __free_page(page);
        return 0;
}

static void arena_map_free(struct bpf_map *map)
{
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

        /*
         * Check that user vma-s are not around when bpf map is freed.
         * mmap() holds vm_file which holds bpf_map refcnt.
         * munmap() must have happened on vma followed by arena_vm_close()
         * which would clear arena->vma_list.
         */
        if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
                return;

        /*
         * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
         * It unmaps everything from vmalloc area and clears pgtables.
         * Call apply_to_existing_page_range() first to find populated ptes and
         * free those pages.
         */
        apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
                                     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
        free_vm_area(arena->kern_vm);
        mtree_destroy(&arena->mt);
        bpf_map_area_free(arena);
}

static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
{
        return ERR_PTR(-EINVAL);
}

static long arena_map_update_elem(struct bpf_map *map, void *key,
                                  void *value, u64 flags)
{
        return -EOPNOTSUPP;
}

static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
                               const struct btf_type *key_type, const struct btf_type *value_type)
{
        return 0;
}

static u64 arena_map_mem_usage(const struct bpf_map *map)
{
        return 0;
}

struct vma_list {
        struct vm_area_struct *vma;
        struct list_head head;
        atomic_t mmap_count;
};

static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
{
        struct vma_list *vml;

        vml = kmalloc(sizeof(*vml), GFP_KERNEL);
        if (!vml)
                return -ENOMEM;
        atomic_set(&vml->mmap_count, 1);
        vma->vm_private_data = vml;
        vml->vma = vma;
        list_add(&vml->head, &arena->vma_list);
        return 0;
}

static void arena_vm_open(struct vm_area_struct *vma)
{
        struct vma_list *vml = vma->vm_private_data;

        atomic_inc(&vml->mmap_count);
}

static void arena_vm_close(struct vm_area_struct *vma)
{
        struct bpf_map *map = vma->vm_file->private_data;
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
        struct vma_list *vml = vma->vm_private_data;

        if (!atomic_dec_and_test(&vml->mmap_count))
                return;
        guard(mutex)(&arena->lock);
        /* update link list under lock */
        list_del(&vml->head);
        vma->vm_private_data = NULL;
        kfree(vml);
}

#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */

static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
        struct bpf_map *map = vmf->vma->vm_file->private_data;
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
        struct page *page;
        long kbase, kaddr;
        int ret;

        kbase = bpf_arena_get_kern_vm_start(arena);
        kaddr = kbase + (u32)(vmf->address);

        guard(mutex)(&arena->lock);
        page = vmalloc_to_page((void *)kaddr);
        if (page)
                /* already have a page vmap-ed */
                goto out;

        if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
                /* User space requested to segfault when page is not allocated by bpf prog */
                return VM_FAULT_SIGSEGV;

        ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
        if (ret)
                return VM_FAULT_SIGSEGV;

        /* Account into memcg of the process that created bpf_arena */
        ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
        if (ret) {
                mtree_erase(&arena->mt, vmf->pgoff);
                return VM_FAULT_SIGSEGV;
        }

        ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
        if (ret) {
                mtree_erase(&arena->mt, vmf->pgoff);
                __free_page(page);
                return VM_FAULT_SIGSEGV;
        }
out:
        page_ref_add(page, 1);
        vmf->page = page;
        return 0;
}

static const struct vm_operations_struct arena_vm_ops = {
        .open           = arena_vm_open,
        .close          = arena_vm_close,
        .fault          = arena_vm_fault,
};
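
A hedged user-space sketch of the fault-in path: mmap() the arena fd and touch a byte, which drives arena_vm_fault() above. The fd and 256-page size carry over from the creation sketch; bpf map mmap must be MAP_SHARED (bpf_map_mmap() enforces VM_SHARED) and pgoff must be 0:

/* Map the arena into user space and fault in the first page. */
#include <sys/mman.h>

void *map_arena(int arena_fd)
{
        size_t len = 256 * 4096;        /* must equal max_entries * page size */
        char *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
                          MAP_SHARED, arena_fd, 0);

        if (base == MAP_FAILED)
                return NULL;
        /* page fault -> arena_vm_fault() allocates and maps pgoff 0
         * (with BPF_F_SEGV_ON_FAULT this would SIGSEGV unless the bpf
         * program already allocated the page)
         */
        base[0] = 1;
        return base;
}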

static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
                                             unsigned long len, unsigned long pgoff,
                                             unsigned long flags)
{
        struct bpf_map *map = filp->private_data;
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
        long ret;

        if (pgoff)
                return -EINVAL;
        if (len > SZ_4G)
                return -E2BIG;

        /* if user_vm_start was specified at arena creation time */
        if (arena->user_vm_start) {
                if (len > arena->user_vm_end - arena->user_vm_start)
                        return -E2BIG;
                if (len != arena->user_vm_end - arena->user_vm_start)
                        return -EINVAL;
                if (addr != arena->user_vm_start)
                        return -EINVAL;
        }

        /*
         * Ask for twice the length: if the range found crosses a 32-bit
         * boundary, the next 4Gb-aligned address is at most len bytes away,
         * so rounding up still stays within the reserved area.
         */
        ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
        if (IS_ERR_VALUE(ret))
                return ret;
        if ((ret >> 32) == ((ret + len - 1) >> 32))
                return ret;
        if (WARN_ON_ONCE(arena->user_vm_start))
                /* checks at map creation time should prevent this */
                return -EFAULT;
        return round_up(ret, SZ_4G);
}

static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

        guard(mutex)(&arena->lock);
        if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
                /*
                 * If map_extra was not specified at arena creation time then
                 * 1st user process can do mmap(NULL, ...) to pick user_vm_start
                 * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
                 *   or
                 * specify addr in map_extra and
                 * use the same addr later with mmap(addr, MAP_FIXED..);
                 */
                return -EBUSY;

        if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
                /* all user processes must have the same size of mmap-ed region */
                return -EBUSY;

        /* Earlier checks should prevent this */
        if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
                return -EFAULT;

        if (remember_vma(arena, vma))
                return -ENOMEM;

        arena->user_vm_start = vma->vm_start;
        arena->user_vm_end = vma->vm_end;
        /*
         * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
         * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
         * potential change of user_vm_start.
         */
        vm_flags_set(vma, VM_DONTEXPAND);
        vma->vm_ops = &arena_vm_ops;
        return 0;
}

static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
{
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

        if ((u64)off > arena->user_vm_end - arena->user_vm_start)
                return -ERANGE;
        *imm = (unsigned long)arena->user_vm_start;
        return 0;
}

BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
const struct bpf_map_ops arena_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc = arena_map_alloc,
        .map_free = arena_map_free,
        .map_direct_value_addr = arena_map_direct_value_addr,
        .map_mmap = arena_map_mmap,
        .map_get_unmapped_area = arena_get_unmapped_area,
        .map_get_next_key = arena_map_get_next_key,
        .map_push_elem = arena_map_push_elem,
        .map_peek_elem = arena_map_peek_elem,
        .map_pop_elem = arena_map_pop_elem,
        .map_lookup_elem = arena_map_lookup_elem,
        .map_update_elem = arena_map_update_elem,
        .map_delete_elem = arena_map_delete_elem,
        .map_check_btf = arena_map_check_btf,
        .map_mem_usage = arena_map_mem_usage,
        .map_btf_id = &bpf_arena_map_btf_ids[0],
};

/* clears the lower 32 bits: ~(u64)~0U == 0xffffffff00000000 */
static u64 clear_lo32(u64 val)
{
        return val & ~(u64)~0U;
}

/*
 * Allocate pages and vmap them into kernel vmalloc area.
 * Later the pages will be mmaped into user space vma.
 */
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
{
        /* user_vm_end/start are fixed before bpf prog runs */
        long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
        u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
        struct page **pages;
        long pgoff = 0;
        u32 uaddr32;
        int ret, i;

        if (page_cnt > page_cnt_max)
                return 0;

        if (uaddr) {
                if (uaddr & ~PAGE_MASK)
                        return 0;
                pgoff = compute_pgoff(arena, uaddr);
                if (pgoff > page_cnt_max - page_cnt)
                        /* requested address will be outside of user VMA */
                        return 0;
        }

        /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
        pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                return 0;

        guard(mutex)(&arena->lock);

        if (uaddr)
                ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
                                         MT_ENTRY, GFP_KERNEL);
        else
                ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
                                        page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
        if (ret)
                goto out_free_pages;

        ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
                                  node_id, page_cnt, pages);
        if (ret)
                goto out;

        uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
        /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
         * will not overflow 32-bit. Lower 32-bit need to represent
         * contiguous user address range.
         * Map these pages at kern_vm_start base.
         * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
         * lower 32-bit and it's ok.
         */
        ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
                                kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
        if (ret) {
                for (i = 0; i < page_cnt; i++)
                        __free_page(pages[i]);
                goto out;
        }
        kvfree(pages);
        return clear_lo32(arena->user_vm_start) + uaddr32;
out:
        mtree_erase(&arena->mt, pgoff);
out_free_pages:
        kvfree(pages);
        return 0;
}

/*
 * If page is present in vmalloc area, unmap it from vmalloc area,
 * unmap it from all user space vma-s,
 * and free it.
 */
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
        struct vma_list *vml;

        list_for_each_entry(vml, &arena->vma_list, head)
                zap_page_range_single(vml->vma, uaddr,
                                      PAGE_SIZE * page_cnt, NULL);
}

static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
        u64 full_uaddr, uaddr_end;
        long kaddr, pgoff, i;
        struct page *page;

        /* only aligned lower 32-bit are relevant */
        uaddr = (u32)uaddr;
        uaddr &= PAGE_MASK;
        full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
        uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
        if (full_uaddr >= uaddr_end)
                return;

        page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;

        guard(mutex)(&arena->lock);

        pgoff = compute_pgoff(arena, uaddr);
        /* clear range */
        mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);

        if (page_cnt > 1)
                /* bulk zap if multiple pages being freed */
                zap_pages(arena, full_uaddr, page_cnt);

        kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
        for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
                page = vmalloc_to_page((void *)kaddr);
                if (!page)
                        continue;
                if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
                        /* Optimization for the common case of page_cnt==1:
                         * If page wasn't mapped into some user vma there
                         * is no need to call zap_pages which is slow. When
                         * page_cnt is big it's faster to do the batched zap.
                         */
                        zap_pages(arena, full_uaddr, 1);
                vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
                __free_page(page);
        }
}

__bpf_kfunc_start_defs();

__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
                                        int node_id, u64 flags)
{
        struct bpf_map *map = p__map;
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

        if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
                return NULL;

        return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
}

__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
        struct bpf_map *map = p__map;
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

        if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
                return;
        arena_free_pages(arena, (long)ptr__ign, page_cnt);
}
__bpf_kfunc_end_defs();
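
Finally, a hedged BPF-program-side sketch of calling these kfuncs. The extern declarations mirror the definitions above; the map definition, section name, and syscall-program usage follow common libbpf/selftest conventions and are assumptions, not part of arena.c:

/* BPF side: allocate and free one arena page. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_ARENA);
        __uint(map_flags, BPF_F_MMAPABLE);
        __uint(max_entries, 256);       /* pages, as in the user-space sketch */
} arena SEC(".maps");

void *bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
                            int node_id, __u64 flags) __ksym;
void bpf_arena_free_pages(void *map, void *ptr, __u32 page_cnt) __ksym;

SEC("syscall")
int alloc_one_page(void *ctx)
{
        /* addr == NULL lets mtree_alloc_range() pick a free pgoff;
         * node_id -1 is NUMA_NO_NODE; flags must be 0
         */
        void *page = bpf_arena_alloc_pages(&arena, NULL, 1, -1, 0);

        if (!page)
                return 1;
        bpf_arena_free_pages(&arena, page, 1);
        return 0;
}

char _license[] SEC("license") = "GPL";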

BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
BTF_KFUNCS_END(arena_kfuncs)

static const struct btf_kfunc_id_set common_kfunc_set = {
        .owner = THIS_MODULE,
        .set   = &arena_kfuncs,
};

static int __init kfunc_init(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);
