1 // SPDX-License-Identifier: GPL-2.0-only << 2 /* 1 /* 3 * mm/userfaultfd.c 2 * mm/userfaultfd.c 4 * 3 * 5 * Copyright (C) 2015 Red Hat, Inc. 4 * Copyright (C) 2015 Red Hat, Inc. >> 5 * >> 6 * This work is licensed under the terms of the GNU GPL, version 2. See >> 7 * the COPYING file in the top-level directory. 6 */ 8 */ 7 9 8 #include <linux/mm.h> 10 #include <linux/mm.h> 9 #include <linux/sched/signal.h> 11 #include <linux/sched/signal.h> 10 #include <linux/pagemap.h> 12 #include <linux/pagemap.h> 11 #include <linux/rmap.h> 13 #include <linux/rmap.h> 12 #include <linux/swap.h> 14 #include <linux/swap.h> 13 #include <linux/swapops.h> 15 #include <linux/swapops.h> 14 #include <linux/userfaultfd_k.h> 16 #include <linux/userfaultfd_k.h> 15 #include <linux/mmu_notifier.h> 17 #include <linux/mmu_notifier.h> 16 #include <linux/hugetlb.h> 18 #include <linux/hugetlb.h> 17 #include <linux/shmem_fs.h> 19 #include <linux/shmem_fs.h> 18 #include <asm/tlbflush.h> 20 #include <asm/tlbflush.h> 19 #include <asm/tlb.h> << 20 #include "internal.h" 21 #include "internal.h" 21 22 22 static __always_inline !! 23 static int mcopy_atomic_pte(struct mm_struct *dst_mm, 23 bool validate_dst_vma(struct vm_area_struct *d !! 24 pmd_t *dst_pmd, 24 { !! 25 struct vm_area_struct *dst_vma, 25 /* Make sure that the dst range is ful !! 26 unsigned long dst_addr, 26 if (dst_end > dst_vma->vm_end) !! 27 unsigned long src_addr, 27 return false; !! 28 struct page **pagep) 28 << 29 /* << 30 * Check the vma is registered in uffd << 31 * enforce the VM_MAYWRITE check done << 32 * time. << 33 */ << 34 if (!dst_vma->vm_userfaultfd_ctx.ctx) << 35 return false; << 36 << 37 return true; << 38 } << 39 << 40 static __always_inline << 41 struct vm_area_struct *find_vma_and_prepare_an << 42 << 43 { << 44 struct vm_area_struct *vma; << 45 << 46 mmap_assert_locked(mm); << 47 vma = vma_lookup(mm, addr); << 48 if (!vma) << 49 vma = ERR_PTR(-ENOENT); << 50 else if (!(vma->vm_flags & VM_SHARED) << 51 unlikely(anon_vma_prepare(vma << 52 vma = ERR_PTR(-ENOMEM); << 53 << 54 return vma; << 55 } << 56 << 57 #ifdef CONFIG_PER_VMA_LOCK << 58 /* << 59 * uffd_lock_vma() - Lookup and lock vma corre << 60 * @mm: mm to search vma in. << 61 * @address: address that the vma should conta << 62 * << 63 * Should be called without holding mmap_lock. << 64 * << 65 * Return: A locked vma containing @address, - << 66 * -ENOMEM if anon_vma couldn't be allocated. << 67 */ << 68 static struct vm_area_struct *uffd_lock_vma(st << 69 unsigne << 70 { << 71 struct vm_area_struct *vma; << 72 << 73 vma = lock_vma_under_rcu(mm, address); << 74 if (vma) { << 75 /* << 76 * We know we're going to need << 77 * that early. 
<< 78 */ << 79 if (!(vma->vm_flags & VM_SHARE << 80 vma_end_read(vma); << 81 else << 82 return vma; << 83 } << 84 << 85 mmap_read_lock(mm); << 86 vma = find_vma_and_prepare_anon(mm, ad << 87 if (!IS_ERR(vma)) { << 88 /* << 89 * We cannot use vma_start_rea << 90 * false locked (see comment i << 91 * can avoid that by directly << 92 * mmap_lock, which guarantees << 93 * vma for write (vma_start_wr << 94 */ << 95 down_read(&vma->vm_lock->lock) << 96 } << 97 << 98 mmap_read_unlock(mm); << 99 return vma; << 100 } << 101 << 102 static struct vm_area_struct *uffd_mfill_lock( << 103 << 104 << 105 { << 106 struct vm_area_struct *dst_vma; << 107 << 108 dst_vma = uffd_lock_vma(dst_mm, dst_st << 109 if (IS_ERR(dst_vma) || validate_dst_vm << 110 return dst_vma; << 111 << 112 vma_end_read(dst_vma); << 113 return ERR_PTR(-ENOENT); << 114 } << 115 << 116 static void uffd_mfill_unlock(struct vm_area_s << 117 { << 118 vma_end_read(vma); << 119 } << 120 << 121 #else << 122 << 123 static struct vm_area_struct *uffd_mfill_lock( << 124 << 125 << 126 { << 127 struct vm_area_struct *dst_vma; << 128 << 129 mmap_read_lock(dst_mm); << 130 dst_vma = find_vma_and_prepare_anon(ds << 131 if (IS_ERR(dst_vma)) << 132 goto out_unlock; << 133 << 134 if (validate_dst_vma(dst_vma, dst_star << 135 return dst_vma; << 136 << 137 dst_vma = ERR_PTR(-ENOENT); << 138 out_unlock: << 139 mmap_read_unlock(dst_mm); << 140 return dst_vma; << 141 } << 142 << 143 static void uffd_mfill_unlock(struct vm_area_s << 144 { << 145 mmap_read_unlock(vma->vm_mm); << 146 } << 147 #endif << 148 << 149 /* Check if dst_addr is outside of file's size << 150 static bool mfill_file_over_size(struct vm_are << 151 unsigned long << 152 { << 153 struct inode *inode; << 154 pgoff_t offset, max_off; << 155 << 156 if (!dst_vma->vm_file) << 157 return false; << 158 << 159 inode = dst_vma->vm_file->f_inode; << 160 offset = linear_page_index(dst_vma, ds << 161 max_off = DIV_ROUND_UP(i_size_read(ino << 162 return offset >= max_off; << 163 } << 164 << 165 /* << 166 * Install PTEs, to map dst_addr (within dst_v << 167 * << 168 * This function handles both MCOPY_ATOMIC_NOR << 169 * and anon, and for both shared and private V << 170 */ << 171 int mfill_atomic_install_pte(pmd_t *dst_pmd, << 172 struct vm_area_st << 173 unsigned long dst << 174 bool newly_alloca << 175 { 29 { 176 int ret; !! 30 struct mem_cgroup *memcg; 177 struct mm_struct *dst_mm = dst_vma->vm << 178 pte_t _dst_pte, *dst_pte; 31 pte_t _dst_pte, *dst_pte; 179 bool writable = dst_vma->vm_flags & VM << 180 bool vm_shared = dst_vma->vm_flags & V << 181 spinlock_t *ptl; 32 spinlock_t *ptl; 182 struct folio *folio = page_folio(page) !! 
33 void *page_kaddr; 183 bool page_in_cache = folio_mapping(fol << 184 << 185 _dst_pte = mk_pte(page, dst_vma->vm_pa << 186 _dst_pte = pte_mkdirty(_dst_pte); << 187 if (page_in_cache && !vm_shared) << 188 writable = false; << 189 if (writable) << 190 _dst_pte = pte_mkwrite(_dst_pt << 191 if (flags & MFILL_ATOMIC_WP) << 192 _dst_pte = pte_mkuffd_wp(_dst_ << 193 << 194 ret = -EAGAIN; << 195 dst_pte = pte_offset_map_lock(dst_mm, << 196 if (!dst_pte) << 197 goto out; << 198 << 199 if (mfill_file_over_size(dst_vma, dst_ << 200 ret = -EFAULT; << 201 goto out_unlock; << 202 } << 203 << 204 ret = -EEXIST; << 205 /* << 206 * We allow to overwrite a pte marker: << 207 * registered, we firstly wr-protect a << 208 * page backing it, then access the pa << 209 */ << 210 if (!pte_none_mostly(ptep_get(dst_pte) << 211 goto out_unlock; << 212 << 213 if (page_in_cache) { << 214 /* Usually, cache pages are al << 215 if (newly_allocated) << 216 folio_add_lru(folio); << 217 folio_add_file_rmap_pte(folio, << 218 } else { << 219 folio_add_new_anon_rmap(folio, << 220 folio_add_lru_vma(folio, dst_v << 221 } << 222 << 223 /* << 224 * Must happen after rmap, as mm_count << 225 * PageAnon()), which is set by __page << 226 */ << 227 inc_mm_counter(dst_mm, mm_counter(foli << 228 << 229 set_pte_at(dst_mm, dst_addr, dst_pte, << 230 << 231 /* No need to invalidate - it was non- << 232 update_mmu_cache(dst_vma, dst_addr, ds << 233 ret = 0; << 234 out_unlock: << 235 pte_unmap_unlock(dst_pte, ptl); << 236 out: << 237 return ret; << 238 } << 239 << 240 static int mfill_atomic_pte_copy(pmd_t *dst_pm << 241 struct vm_are << 242 unsigned long << 243 unsigned long << 244 uffd_flags_t << 245 struct folio << 246 { << 247 void *kaddr; << 248 int ret; 34 int ret; 249 struct folio *folio; !! 35 struct page *page; >> 36 pgoff_t offset, max_off; >> 37 struct inode *inode; 250 38 251 if (!*foliop) { !! 39 if (!*pagep) { 252 ret = -ENOMEM; 40 ret = -ENOMEM; 253 folio = vma_alloc_folio(GFP_HI !! 41 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); 254 dst_ad !! 42 if (!page) 255 if (!folio) << 256 goto out; 43 goto out; 257 44 258 kaddr = kmap_local_folio(folio !! 45 page_kaddr = kmap_atomic(page); 259 /* !! 46 ret = copy_from_user(page_kaddr, 260 * The read mmap_lock is held !! 47 (const void __user *) src_addr, 261 * mmap_lock being read recurs << 262 * possible if a writer has ta << 263 * << 264 * process A thread 1 takes re << 265 * process A thread 2 calls mm << 266 * process B thread 1 takes pa << 267 * process B thread 2 calls mm << 268 * process A thread 1 blocks t << 269 * process B thread 1 blocks t << 270 * << 271 * Disable page faults to prev << 272 * and retry the copy outside << 273 */ << 274 pagefault_disable(); << 275 ret = copy_from_user(kaddr, (c << 276 PAGE_SIZE 48 PAGE_SIZE); 277 pagefault_enable(); !! 49 kunmap_atomic(page_kaddr); 278 kunmap_local(kaddr); << 279 50 280 /* fallback to copy_from_user !! 51 /* fallback to copy_from_user outside mmap_sem */ 281 if (unlikely(ret)) { 52 if (unlikely(ret)) { 282 ret = -ENOENT; 53 ret = -ENOENT; 283 *foliop = folio; !! 54 *pagep = page; 284 /* don't free the page 55 /* don't free the page */ 285 goto out; 56 goto out; 286 } 57 } 287 58 288 flush_dcache_folio(folio); !! 59 flush_dcache_page(page); 289 } else { 60 } else { 290 folio = *foliop; !! 61 page = *pagep; 291 *foliop = NULL; !! 62 *pagep = NULL; 292 } 63 } 293 64 294 /* 65 /* 295 * The memory barrier inside __folio_m !! 
66 * The memory barrier inside __SetPageUptodate makes sure that 296 * preceding stores to the page conten !! 67 * preceeding stores to the page contents become visible before 297 * the set_pte_at() write. 68 * the set_pte_at() write. 298 */ 69 */ 299 __folio_mark_uptodate(folio); !! 70 __SetPageUptodate(page); 300 71 301 ret = -ENOMEM; 72 ret = -ENOMEM; 302 if (mem_cgroup_charge(folio, dst_vma-> !! 73 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) 303 goto out_release; 74 goto out_release; 304 75 305 ret = mfill_atomic_install_pte(dst_pmd !! 76 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 306 &folio- !! 77 if (dst_vma->vm_flags & VM_WRITE) 307 if (ret) !! 78 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); 308 goto out_release; << 309 out: << 310 return ret; << 311 out_release: << 312 folio_put(folio); << 313 goto out; << 314 } << 315 << 316 static int mfill_atomic_pte_zeroed_folio(pmd_t << 317 struc << 318 unsig << 319 { << 320 struct folio *folio; << 321 int ret = -ENOMEM; << 322 << 323 folio = vma_alloc_zeroed_movable_folio << 324 if (!folio) << 325 return ret; << 326 << 327 if (mem_cgroup_charge(folio, dst_vma-> << 328 goto out_put; << 329 << 330 /* << 331 * The memory barrier inside __folio_m << 332 * zeroing out the folio become visibl << 333 * using set_pte_at(). See do_anonymou << 334 */ << 335 __folio_mark_uptodate(folio); << 336 << 337 ret = mfill_atomic_install_pte(dst_pmd << 338 &folio- << 339 if (ret) << 340 goto out_put; << 341 << 342 return 0; << 343 out_put: << 344 folio_put(folio); << 345 return ret; << 346 } << 347 << 348 static int mfill_atomic_pte_zeropage(pmd_t *ds << 349 struct vm << 350 unsigned << 351 { << 352 pte_t _dst_pte, *dst_pte; << 353 spinlock_t *ptl; << 354 int ret; << 355 << 356 if (mm_forbids_zeropage(dst_vma->vm_mm << 357 return mfill_atomic_pte_zeroed << 358 79 359 _dst_pte = pte_mkspecial(pfn_pte(my_ze !! 80 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 360 dst_v !! 81 if (dst_vma->vm_file) { 361 ret = -EAGAIN; !! 82 /* the shmem MAP_PRIVATE case requires checking the i_size */ 362 dst_pte = pte_offset_map_lock(dst_vma- !! 83 inode = dst_vma->vm_file->f_inode; 363 if (!dst_pte) !! 84 offset = linear_page_index(dst_vma, dst_addr); 364 goto out; !! 85 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 365 if (mfill_file_over_size(dst_vma, dst_ << 366 ret = -EFAULT; 86 ret = -EFAULT; 367 goto out_unlock; !! 87 if (unlikely(offset >= max_off)) >> 88 goto out_release_uncharge_unlock; 368 } 89 } 369 ret = -EEXIST; 90 ret = -EEXIST; 370 if (!pte_none(ptep_get(dst_pte))) !! 91 if (!pte_none(*dst_pte)) 371 goto out_unlock; !! 92 goto out_release_uncharge_unlock; 372 set_pte_at(dst_vma->vm_mm, dst_addr, d << 373 /* No need to invalidate - it was non- << 374 update_mmu_cache(dst_vma, dst_addr, ds << 375 ret = 0; << 376 out_unlock: << 377 pte_unmap_unlock(dst_pte, ptl); << 378 out: << 379 return ret; << 380 } << 381 93 382 /* Handles UFFDIO_CONTINUE for all shmem VMAs !! 94 inc_mm_counter(dst_mm, MM_ANONPAGES); 383 static int mfill_atomic_pte_continue(pmd_t *ds !! 95 page_add_new_anon_rmap(page, dst_vma, dst_addr, false); 384 struct vm !! 96 mem_cgroup_commit_charge(page, memcg, false, false); 385 unsigned !! 
97 lru_cache_add_active_or_unevictable(page, dst_vma); 386 uffd_flag << 387 { << 388 struct inode *inode = file_inode(dst_v << 389 pgoff_t pgoff = linear_page_index(dst_ << 390 struct folio *folio; << 391 struct page *page; << 392 int ret; << 393 << 394 ret = shmem_get_folio(inode, pgoff, 0, << 395 /* Our caller expects us to return -EF << 396 if (ret == -ENOENT) << 397 ret = -EFAULT; << 398 if (ret) << 399 goto out; << 400 if (!folio) { << 401 ret = -EFAULT; << 402 goto out; << 403 } << 404 98 405 page = folio_file_page(folio, pgoff); !! 99 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 406 if (PageHWPoison(page)) { << 407 ret = -EIO; << 408 goto out_release; << 409 } << 410 100 411 ret = mfill_atomic_install_pte(dst_pmd !! 101 /* No need to invalidate - it was non-present before */ 412 page, f !! 102 update_mmu_cache(dst_vma, dst_addr, dst_pte); 413 if (ret) << 414 goto out_release; << 415 103 416 folio_unlock(folio); !! 104 pte_unmap_unlock(dst_pte, ptl); 417 ret = 0; 105 ret = 0; 418 out: 106 out: 419 return ret; 107 return ret; >> 108 out_release_uncharge_unlock: >> 109 pte_unmap_unlock(dst_pte, ptl); >> 110 mem_cgroup_cancel_charge(page, memcg, false); 420 out_release: 111 out_release: 421 folio_unlock(folio); !! 112 put_page(page); 422 folio_put(folio); << 423 goto out; 113 goto out; 424 } 114 } 425 115 426 /* Handles UFFDIO_POISON for all non-hugetlb V !! 116 static int mfill_zeropage_pte(struct mm_struct *dst_mm, 427 static int mfill_atomic_pte_poison(pmd_t *dst_ !! 117 pmd_t *dst_pmd, 428 struct vm_a !! 118 struct vm_area_struct *dst_vma, 429 unsigned lo !! 119 unsigned long dst_addr) 430 uffd_flags_ << 431 { 120 { 432 int ret; << 433 struct mm_struct *dst_mm = dst_vma->vm << 434 pte_t _dst_pte, *dst_pte; 121 pte_t _dst_pte, *dst_pte; 435 spinlock_t *ptl; 122 spinlock_t *ptl; >> 123 int ret; >> 124 pgoff_t offset, max_off; >> 125 struct inode *inode; 436 126 437 _dst_pte = make_pte_marker(PTE_MARKER_ !! 127 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), 438 ret = -EAGAIN; !! 128 dst_vma->vm_page_prot)); 439 dst_pte = pte_offset_map_lock(dst_mm, 129 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 440 if (!dst_pte) !! 130 if (dst_vma->vm_file) { 441 goto out; !! 131 /* the shmem MAP_PRIVATE case requires checking the i_size */ 442 !! 132 inode = dst_vma->vm_file->f_inode; 443 if (mfill_file_over_size(dst_vma, dst_ !! 133 offset = linear_page_index(dst_vma, dst_addr); >> 134 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 444 ret = -EFAULT; 135 ret = -EFAULT; 445 goto out_unlock; !! 136 if (unlikely(offset >= max_off)) >> 137 goto out_unlock; 446 } 138 } 447 << 448 ret = -EEXIST; 139 ret = -EEXIST; 449 /* Refuse to overwrite any PTE, even a !! 
140 if (!pte_none(*dst_pte)) 450 if (!pte_none(ptep_get(dst_pte))) << 451 goto out_unlock; 141 goto out_unlock; 452 << 453 set_pte_at(dst_mm, dst_addr, dst_pte, 142 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 454 << 455 /* No need to invalidate - it was non- 143 /* No need to invalidate - it was non-present before */ 456 update_mmu_cache(dst_vma, dst_addr, ds 144 update_mmu_cache(dst_vma, dst_addr, dst_pte); 457 ret = 0; 145 ret = 0; 458 out_unlock: 146 out_unlock: 459 pte_unmap_unlock(dst_pte, ptl); 147 pte_unmap_unlock(dst_pte, ptl); 460 out: << 461 return ret; 148 return ret; 462 } 149 } 463 150 464 static pmd_t *mm_alloc_pmd(struct mm_struct *m 151 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) 465 { 152 { 466 pgd_t *pgd; 153 pgd_t *pgd; 467 p4d_t *p4d; 154 p4d_t *p4d; 468 pud_t *pud; 155 pud_t *pud; 469 156 470 pgd = pgd_offset(mm, address); 157 pgd = pgd_offset(mm, address); 471 p4d = p4d_alloc(mm, pgd, address); 158 p4d = p4d_alloc(mm, pgd, address); 472 if (!p4d) 159 if (!p4d) 473 return NULL; 160 return NULL; 474 pud = pud_alloc(mm, p4d, address); 161 pud = pud_alloc(mm, p4d, address); 475 if (!pud) 162 if (!pud) 476 return NULL; 163 return NULL; 477 /* 164 /* 478 * Note that we didn't run this becaus 165 * Note that we didn't run this because the pmd was 479 * missing, the *pmd may be already es 166 * missing, the *pmd may be already established and in 480 * turn it may also be a trans_huge_pm 167 * turn it may also be a trans_huge_pmd. 481 */ 168 */ 482 return pmd_alloc(mm, pud, address); 169 return pmd_alloc(mm, pud, address); 483 } 170 } 484 171 485 #ifdef CONFIG_HUGETLB_PAGE 172 #ifdef CONFIG_HUGETLB_PAGE 486 /* 173 /* 487 * mfill_atomic processing for HUGETLB vmas. !! 174 * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is 488 * called with either vma-lock or mmap_lock he !! 175 * called with mmap_sem held, it will release mmap_sem before returning. 489 * before returning. << 490 */ 176 */ 491 static __always_inline ssize_t mfill_atomic_hu !! 177 static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, 492 << 493 178 struct vm_area_struct *dst_vma, 494 179 unsigned long dst_start, 495 180 unsigned long src_start, 496 181 unsigned long len, 497 !! 182 bool *mmap_changing, >> 183 bool zeropage) 498 { 184 { 499 struct mm_struct *dst_mm = dst_vma->vm !! 185 int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; >> 186 int vm_shared = dst_vma->vm_flags & VM_SHARED; 500 ssize_t err; 187 ssize_t err; 501 pte_t *dst_pte; 188 pte_t *dst_pte; 502 unsigned long src_addr, dst_addr; 189 unsigned long src_addr, dst_addr; 503 long copied; 190 long copied; 504 struct folio *folio; !! 191 struct page *page; >> 192 struct hstate *h; 505 unsigned long vma_hpagesize; 193 unsigned long vma_hpagesize; 506 pgoff_t idx; 194 pgoff_t idx; 507 u32 hash; 195 u32 hash; 508 struct address_space *mapping; 196 struct address_space *mapping; 509 197 510 /* 198 /* 511 * There is no default zero huge page 199 * There is no default zero huge page for all huge page sizes as 512 * supported by hugetlb. A PMD_SIZE h 200 * supported by hugetlb. A PMD_SIZE huge pages may exist as used 513 * by THP. Since we can not reliably 201 * by THP. Since we can not reliably insert a zero page, this 514 * feature is not supported. 202 * feature is not supported. 515 */ 203 */ 516 if (uffd_flags_mode_is(flags, MFILL_AT !! 204 if (zeropage) { 517 up_read(&ctx->map_changing_loc !! 
205 up_read(&dst_mm->mmap_sem); 518 uffd_mfill_unlock(dst_vma); << 519 return -EINVAL; 206 return -EINVAL; 520 } 207 } 521 208 522 src_addr = src_start; 209 src_addr = src_start; 523 dst_addr = dst_start; 210 dst_addr = dst_start; 524 copied = 0; 211 copied = 0; 525 folio = NULL; !! 212 page = NULL; 526 vma_hpagesize = vma_kernel_pagesize(ds 213 vma_hpagesize = vma_kernel_pagesize(dst_vma); 527 214 528 /* 215 /* 529 * Validate alignment based on huge pa 216 * Validate alignment based on huge page size 530 */ 217 */ 531 err = -EINVAL; 218 err = -EINVAL; 532 if (dst_start & (vma_hpagesize - 1) || 219 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) 533 goto out_unlock; 220 goto out_unlock; 534 221 535 retry: 222 retry: 536 /* 223 /* 537 * On routine entry dst_vma is set. I !! 224 * On routine entry dst_vma is set. If we had to drop mmap_sem and 538 * retry, dst_vma will be set to NULL 225 * retry, dst_vma will be set to NULL and we must lookup again. 539 */ 226 */ 540 if (!dst_vma) { 227 if (!dst_vma) { 541 dst_vma = uffd_mfill_lock(dst_ << 542 if (IS_ERR(dst_vma)) { << 543 err = PTR_ERR(dst_vma) << 544 goto out; << 545 } << 546 << 547 err = -ENOENT; 228 err = -ENOENT; 548 if (!is_vm_hugetlb_page(dst_vm !! 229 dst_vma = find_vma(dst_mm, dst_start); 549 goto out_unlock_vma; !! 230 if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) >> 231 goto out_unlock; >> 232 /* >> 233 * Check the vma is registered in uffd, this is >> 234 * required to enforce the VM_MAYWRITE check done at >> 235 * uffd registration time. >> 236 */ >> 237 if (!dst_vma->vm_userfaultfd_ctx.ctx) >> 238 goto out_unlock; >> 239 >> 240 if (dst_start < dst_vma->vm_start || >> 241 dst_start + len > dst_vma->vm_end) >> 242 goto out_unlock; 550 243 551 err = -EINVAL; 244 err = -EINVAL; 552 if (vma_hpagesize != vma_kerne 245 if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) 553 goto out_unlock_vma; !! 246 goto out_unlock; 554 247 555 /* !! 248 vm_shared = dst_vma->vm_flags & VM_SHARED; 556 * If memory mappings are chan !! 249 } 557 * operation (e.g. mremap) run !! 250 558 * request the user to retry l !! 251 if (WARN_ON(dst_addr & (vma_hpagesize - 1) || 559 */ !! 252 (len - copied) & (vma_hpagesize - 1))) 560 down_read(&ctx->map_changing_l !! 253 goto out_unlock; 561 err = -EAGAIN; !! 254 562 if (atomic_read(&ctx->mmap_cha !! 255 /* >> 256 * If not shared, ensure the dst_vma has a anon_vma. >> 257 */ >> 258 err = -ENOMEM; >> 259 if (!vm_shared) { >> 260 if (unlikely(anon_vma_prepare(dst_vma))) 563 goto out_unlock; 261 goto out_unlock; 564 } 262 } 565 263 >> 264 h = hstate_vma(dst_vma); >> 265 566 while (src_addr < src_start + len) { 266 while (src_addr < src_start + len) { >> 267 pte_t dst_pteval; >> 268 567 BUG_ON(dst_addr >= dst_start + 269 BUG_ON(dst_addr >= dst_start + len); >> 270 VM_BUG_ON(dst_addr & ~huge_page_mask(h)); 568 271 569 /* 272 /* 570 * Serialize via vma_lock and !! 273 * Serialize via hugetlb_fault_mutex 571 * vma_lock ensures the dst_pt << 572 * in the case of shared pmds. << 573 * races with other faulting t << 574 */ 274 */ 575 idx = linear_page_index(dst_vm 275 idx = linear_page_index(dst_vma, dst_addr); 576 mapping = dst_vma->vm_file->f_ 276 mapping = dst_vma->vm_file->f_mapping; 577 hash = hugetlb_fault_mutex_has !! 277 hash = hugetlb_fault_mutex_hash(h, mapping, idx); 578 mutex_lock(&hugetlb_fault_mute 278 mutex_lock(&hugetlb_fault_mutex_table[hash]); 579 hugetlb_vma_lock_read(dst_vma) << 580 279 581 err = -ENOMEM; 280 err = -ENOMEM; 582 dst_pte = huge_pte_alloc(dst_m !! 
281 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); 583 if (!dst_pte) { 282 if (!dst_pte) { 584 hugetlb_vma_unlock_rea << 585 mutex_unlock(&hugetlb_ 283 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 586 goto out_unlock; 284 goto out_unlock; 587 } 285 } 588 286 589 if (!uffd_flags_mode_is(flags, !! 287 err = -EEXIST; 590 !huge_pte_none_mostly(huge !! 288 dst_pteval = huge_ptep_get(dst_pte); 591 err = -EEXIST; !! 289 if (!huge_pte_none(dst_pteval)) { 592 hugetlb_vma_unlock_rea << 593 mutex_unlock(&hugetlb_ 290 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 594 goto out_unlock; 291 goto out_unlock; 595 } 292 } 596 293 597 err = hugetlb_mfill_atomic_pte !! 294 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, 598 !! 295 dst_addr, src_addr, &page); 599 296 600 hugetlb_vma_unlock_read(dst_vm << 601 mutex_unlock(&hugetlb_fault_mu 297 mutex_unlock(&hugetlb_fault_mutex_table[hash]); >> 298 vm_alloc_shared = vm_shared; 602 299 603 cond_resched(); 300 cond_resched(); 604 301 605 if (unlikely(err == -ENOENT)) 302 if (unlikely(err == -ENOENT)) { 606 up_read(&ctx->map_chan !! 303 up_read(&dst_mm->mmap_sem); 607 uffd_mfill_unlock(dst_ !! 304 BUG_ON(!page); 608 BUG_ON(!folio); << 609 305 610 err = copy_folio_from_ !! 306 err = copy_huge_page_from_user(page, 611 !! 307 (const void __user *)src_addr, >> 308 pages_per_huge_page(h), true); 612 if (unlikely(err)) { 309 if (unlikely(err)) { 613 err = -EFAULT; 310 err = -EFAULT; 614 goto out; 311 goto out; 615 } 312 } >> 313 down_read(&dst_mm->mmap_sem); >> 314 /* >> 315 * If memory mappings are changing because of non-cooperative >> 316 * operation (e.g. mremap) running in parallel, bail out and >> 317 * request the user to retry later >> 318 */ >> 319 if (mmap_changing && READ_ONCE(*mmap_changing)) { >> 320 err = -EAGAIN; >> 321 break; >> 322 } 616 323 617 dst_vma = NULL; 324 dst_vma = NULL; 618 goto retry; 325 goto retry; 619 } else 326 } else 620 BUG_ON(folio); !! 327 BUG_ON(page); 621 328 622 if (!err) { 329 if (!err) { 623 dst_addr += vma_hpages 330 dst_addr += vma_hpagesize; 624 src_addr += vma_hpages 331 src_addr += vma_hpagesize; 625 copied += vma_hpagesiz 332 copied += vma_hpagesize; 626 333 627 if (fatal_signal_pendi 334 if (fatal_signal_pending(current)) 628 err = -EINTR; 335 err = -EINTR; 629 } 336 } 630 if (err) 337 if (err) 631 break; 338 break; 632 } 339 } 633 340 634 out_unlock: 341 out_unlock: 635 up_read(&ctx->map_changing_lock); !! 342 up_read(&dst_mm->mmap_sem); 636 out_unlock_vma: << 637 uffd_mfill_unlock(dst_vma); << 638 out: 343 out: 639 if (folio) !! 344 if (page) { 640 folio_put(folio); !! 345 /* >> 346 * We encountered an error and are about to free a newly >> 347 * allocated huge page. >> 348 * >> 349 * Reservation handling is very subtle, and is different for >> 350 * private and shared mappings. See the routine >> 351 * restore_reserve_on_error for details. Unfortunately, we >> 352 * can not call restore_reserve_on_error now as it would >> 353 * require holding mmap_sem. >> 354 * >> 355 * If a reservation for the page existed in the reservation >> 356 * map of a private mapping, the map was modified to indicate >> 357 * the reservation was consumed when the page was allocated. >> 358 * We clear the PagePrivate flag now so that the global >> 359 * reserve count will not be incremented in free_huge_page. >> 360 * The reservation map will still indicate the reservation >> 361 * was consumed and possibly prevent later page allocation. >> 362 * This is better than leaking a global reservation. 
If no >> 363 * reservation existed, it is still safe to clear PagePrivate >> 364 * as no adjustments to reservation counts were made during >> 365 * allocation. >> 366 * >> 367 * The reservation map for shared mappings indicates which >> 368 * pages have reservations. When a huge page is allocated >> 369 * for an address with a reservation, no change is made to >> 370 * the reserve map. In this case PagePrivate will be set >> 371 * to indicate that the global reservation count should be >> 372 * incremented when the page is freed. This is the desired >> 373 * behavior. However, when a huge page is allocated for an >> 374 * address without a reservation a reservation entry is added >> 375 * to the reservation map, and PagePrivate will not be set. >> 376 * When the page is freed, the global reserve count will NOT >> 377 * be incremented and it will appear as though we have leaked >> 378 * reserved page. In this case, set PagePrivate so that the >> 379 * global reserve count will be incremented to match the >> 380 * reservation map entry which was created. >> 381 * >> 382 * Note that vm_alloc_shared is based on the flags of the vma >> 383 * for which the page was originally allocated. dst_vma could >> 384 * be different or NULL on error. >> 385 */ >> 386 if (vm_alloc_shared) >> 387 SetPagePrivate(page); >> 388 else >> 389 ClearPagePrivate(page); >> 390 put_page(page); >> 391 } 641 BUG_ON(copied < 0); 392 BUG_ON(copied < 0); 642 BUG_ON(err > 0); 393 BUG_ON(err > 0); 643 BUG_ON(!copied && !err); 394 BUG_ON(!copied && !err); 644 return copied ? copied : err; 395 return copied ? copied : err; 645 } 396 } 646 #else /* !CONFIG_HUGETLB_PAGE */ 397 #else /* !CONFIG_HUGETLB_PAGE */ 647 /* fail at build time if gcc attempts to use t 398 /* fail at build time if gcc attempts to use this */ 648 extern ssize_t mfill_atomic_hugetlb(struct use !! 399 extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, 649 struct vm_ !! 400 struct vm_area_struct *dst_vma, 650 unsigned l !! 401 unsigned long dst_start, 651 unsigned l !! 402 unsigned long src_start, 652 unsigned l !! 403 unsigned long len, 653 uffd_flags !! 404 bool *mmap_changing, >> 405 bool zeropage); 654 #endif /* CONFIG_HUGETLB_PAGE */ 406 #endif /* CONFIG_HUGETLB_PAGE */ 655 407 656 static __always_inline ssize_t mfill_atomic_pt !! 408 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, >> 409 pmd_t *dst_pmd, 657 410 struct vm_area_struct *dst_vma, 658 411 unsigned long dst_addr, 659 412 unsigned long src_addr, 660 !! 413 struct page **page, 661 !! 414 bool zeropage) 662 { 415 { 663 ssize_t err; 416 ssize_t err; 664 417 665 if (uffd_flags_mode_is(flags, MFILL_AT << 666 return mfill_atomic_pte_contin << 667 << 668 } else if (uffd_flags_mode_is(flags, M << 669 return mfill_atomic_pte_poison << 670 << 671 } << 672 << 673 /* 418 /* 674 * The normal page fault path for a sh 419 * The normal page fault path for a shmem will invoke the 675 * fault, fill the hole in the file an 420 * fault, fill the hole in the file and COW it right away. The 676 * result generates plain anonymous me 421 * result generates plain anonymous memory. So when we are 677 * asked to fill an hole in a MAP_PRIV 422 * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll 678 * generate anonymous memory directly 423 * generate anonymous memory directly without actually filling 679 * the hole. For the MAP_PRIVATE case 424 * the hole. 
For the MAP_PRIVATE case the robustness check 680 * only happens in the pagetable (to v 425 * only happens in the pagetable (to verify it's still none) 681 * and not in the radix tree. 426 * and not in the radix tree. 682 */ 427 */ 683 if (!(dst_vma->vm_flags & VM_SHARED)) 428 if (!(dst_vma->vm_flags & VM_SHARED)) { 684 if (uffd_flags_mode_is(flags, !! 429 if (!zeropage) 685 err = mfill_atomic_pte !! 430 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, 686 !! 431 dst_addr, src_addr, page); 687 << 688 else 432 else 689 err = mfill_atomic_pte !! 433 err = mfill_zeropage_pte(dst_mm, dst_pmd, 690 434 dst_vma, dst_addr); 691 } else { 435 } else { 692 err = shmem_mfill_atomic_pte(d !! 436 if (!zeropage) 693 d !! 437 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, 694 f !! 438 dst_vma, dst_addr, >> 439 src_addr, page); >> 440 else >> 441 err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, >> 442 dst_vma, dst_addr); 695 } 443 } 696 444 697 return err; 445 return err; 698 } 446 } 699 447 700 static __always_inline ssize_t mfill_atomic(st !! 448 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, 701 un !! 449 unsigned long dst_start, 702 un !! 450 unsigned long src_start, 703 un !! 451 unsigned long len, 704 uf !! 452 bool zeropage, >> 453 bool *mmap_changing) 705 { 454 { 706 struct mm_struct *dst_mm = ctx->mm; << 707 struct vm_area_struct *dst_vma; 455 struct vm_area_struct *dst_vma; 708 ssize_t err; 456 ssize_t err; 709 pmd_t *dst_pmd; 457 pmd_t *dst_pmd; 710 unsigned long src_addr, dst_addr; 458 unsigned long src_addr, dst_addr; 711 long copied; 459 long copied; 712 struct folio *folio; !! 460 struct page *page; 713 461 714 /* 462 /* 715 * Sanitize the command parameters: 463 * Sanitize the command parameters: 716 */ 464 */ 717 BUG_ON(dst_start & ~PAGE_MASK); 465 BUG_ON(dst_start & ~PAGE_MASK); 718 BUG_ON(len & ~PAGE_MASK); 466 BUG_ON(len & ~PAGE_MASK); 719 467 720 /* Does the address range wrap, or is 468 /* Does the address range wrap, or is the span zero-sized? */ 721 BUG_ON(src_start + len <= src_start); 469 BUG_ON(src_start + len <= src_start); 722 BUG_ON(dst_start + len <= dst_start); 470 BUG_ON(dst_start + len <= dst_start); 723 471 724 src_addr = src_start; 472 src_addr = src_start; 725 dst_addr = dst_start; 473 dst_addr = dst_start; 726 copied = 0; 474 copied = 0; 727 folio = NULL; !! 475 page = NULL; 728 retry: 476 retry: 729 /* !! 477 down_read(&dst_mm->mmap_sem); 730 * Make sure the vma is not shared, th << 731 * both valid and fully within a singl << 732 */ << 733 dst_vma = uffd_mfill_lock(dst_mm, dst_ << 734 if (IS_ERR(dst_vma)) { << 735 err = PTR_ERR(dst_vma); << 736 goto out; << 737 } << 738 478 739 /* 479 /* 740 * If memory mappings are changing bec 480 * If memory mappings are changing because of non-cooperative 741 * operation (e.g. mremap) running in 481 * operation (e.g. mremap) running in parallel, bail out and 742 * request the user to retry later 482 * request the user to retry later 743 */ 483 */ 744 down_read(&ctx->map_changing_lock); << 745 err = -EAGAIN; 484 err = -EAGAIN; 746 if (atomic_read(&ctx->mmap_changing)) !! 485 if (mmap_changing && READ_ONCE(*mmap_changing)) >> 486 goto out_unlock; >> 487 >> 488 /* >> 489 * Make sure the vma is not shared, that the dst range is >> 490 * both valid and fully within a single existing vma. 
>> 491 */ >> 492 err = -ENOENT; >> 493 dst_vma = find_vma(dst_mm, dst_start); >> 494 if (!dst_vma) >> 495 goto out_unlock; >> 496 /* >> 497 * Check the vma is registered in uffd, this is required to >> 498 * enforce the VM_MAYWRITE check done at uffd registration >> 499 * time. >> 500 */ >> 501 if (!dst_vma->vm_userfaultfd_ctx.ctx) >> 502 goto out_unlock; >> 503 >> 504 if (dst_start < dst_vma->vm_start || >> 505 dst_start + len > dst_vma->vm_end) 747 goto out_unlock; 506 goto out_unlock; 748 507 749 err = -EINVAL; 508 err = -EINVAL; 750 /* 509 /* 751 * shmem_zero_setup is invoked in mmap 510 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but 752 * it will overwrite vm_ops, so vma_is 511 * it will overwrite vm_ops, so vma_is_anonymous must return false. 753 */ 512 */ 754 if (WARN_ON_ONCE(vma_is_anonymous(dst_ 513 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && 755 dst_vma->vm_flags & VM_SHARED)) 514 dst_vma->vm_flags & VM_SHARED)) 756 goto out_unlock; 515 goto out_unlock; 757 516 758 /* 517 /* 759 * validate 'mode' now that we know th << 760 * a wrprotect copy if the userfaultfd << 761 */ << 762 if ((flags & MFILL_ATOMIC_WP) && !(dst << 763 goto out_unlock; << 764 << 765 /* << 766 * If this is a HUGETLB vma, pass off 518 * If this is a HUGETLB vma, pass off to appropriate routine 767 */ 519 */ 768 if (is_vm_hugetlb_page(dst_vma)) 520 if (is_vm_hugetlb_page(dst_vma)) 769 return mfill_atomic_hugetlb(c !! 521 return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, 770 s !! 522 src_start, len, mmap_changing, >> 523 zeropage); 771 524 772 if (!vma_is_anonymous(dst_vma) && !vma 525 if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) 773 goto out_unlock; 526 goto out_unlock; 774 if (!vma_is_shmem(dst_vma) && !! 527 775 uffd_flags_mode_is(flags, MFILL_AT !! 528 /* >> 529 * Ensure the dst_vma has a anon_vma or this page >> 530 * would get a NULL anon_vma when moved in the >> 531 * dst_vma. >> 532 */ >> 533 err = -ENOMEM; >> 534 if (!(dst_vma->vm_flags & VM_SHARED) && >> 535 unlikely(anon_vma_prepare(dst_vma))) 776 goto out_unlock; 536 goto out_unlock; 777 537 778 while (src_addr < src_start + len) { 538 while (src_addr < src_start + len) { 779 pmd_t dst_pmdval; 539 pmd_t dst_pmdval; 780 540 781 BUG_ON(dst_addr >= dst_start + 541 BUG_ON(dst_addr >= dst_start + len); 782 542 783 dst_pmd = mm_alloc_pmd(dst_mm, 543 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); 784 if (unlikely(!dst_pmd)) { 544 if (unlikely(!dst_pmd)) { 785 err = -ENOMEM; 545 err = -ENOMEM; 786 break; 546 break; 787 } 547 } 788 548 789 dst_pmdval = pmdp_get_lockless !! 549 dst_pmdval = pmd_read_atomic(dst_pmd); 790 if (unlikely(pmd_none(dst_pmdv << 791 unlikely(__pte_alloc(dst_m << 792 err = -ENOMEM; << 793 break; << 794 } << 795 dst_pmdval = pmdp_get_lockless << 796 /* 550 /* 797 * If the dst_pmd is THP don't !! 551 * If the dst_pmd is mapped as THP don't 798 * (This includes the case whe !! 552 * override it and just be strict. 799 * changed back to none after << 800 */ 553 */ 801 if (unlikely(!pmd_present(dst_ !! 554 if (unlikely(pmd_trans_huge(dst_pmdval))) { 802 pmd_devmap(dst_pm << 803 err = -EEXIST; 555 err = -EEXIST; 804 break; 556 break; 805 } 557 } 806 if (unlikely(pmd_bad(dst_pmdva !! 
558 if (unlikely(pmd_none(dst_pmdval)) && >> 559 unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { >> 560 err = -ENOMEM; >> 561 break; >> 562 } >> 563 /* If an huge pmd materialized from under us fail */ >> 564 if (unlikely(pmd_trans_huge(*dst_pmd))) { 807 err = -EFAULT; 565 err = -EFAULT; 808 break; 566 break; 809 } 567 } 810 /* << 811 * For shmem mappings, khugepa << 812 * tables under us; pte_offset << 813 */ << 814 568 815 err = mfill_atomic_pte(dst_pmd !! 569 BUG_ON(pmd_none(*dst_pmd)); 816 src_add !! 570 BUG_ON(pmd_trans_huge(*dst_pmd)); >> 571 >> 572 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, >> 573 src_addr, &page, zeropage); 817 cond_resched(); 574 cond_resched(); 818 575 819 if (unlikely(err == -ENOENT)) 576 if (unlikely(err == -ENOENT)) { 820 void *kaddr; !! 577 void *page_kaddr; 821 578 822 up_read(&ctx->map_chan !! 579 up_read(&dst_mm->mmap_sem); 823 uffd_mfill_unlock(dst_ !! 580 BUG_ON(!page); 824 BUG_ON(!folio); << 825 581 826 kaddr = kmap_local_fol !! 582 page_kaddr = kmap(page); 827 err = copy_from_user(k !! 583 err = copy_from_user(page_kaddr, 828 ( 584 (const void __user *) src_addr, 829 P 585 PAGE_SIZE); 830 kunmap_local(kaddr); !! 586 kunmap(page); 831 if (unlikely(err)) { 587 if (unlikely(err)) { 832 err = -EFAULT; 588 err = -EFAULT; 833 goto out; 589 goto out; 834 } 590 } 835 flush_dcache_folio(fol !! 591 flush_dcache_page(page); 836 goto retry; 592 goto retry; 837 } else 593 } else 838 BUG_ON(folio); !! 594 BUG_ON(page); 839 595 840 if (!err) { 596 if (!err) { 841 dst_addr += PAGE_SIZE; 597 dst_addr += PAGE_SIZE; 842 src_addr += PAGE_SIZE; 598 src_addr += PAGE_SIZE; 843 copied += PAGE_SIZE; 599 copied += PAGE_SIZE; 844 600 845 if (fatal_signal_pendi 601 if (fatal_signal_pending(current)) 846 err = -EINTR; 602 err = -EINTR; 847 } 603 } 848 if (err) 604 if (err) 849 break; 605 break; 850 } 606 } 851 607 852 out_unlock: 608 out_unlock: 853 up_read(&ctx->map_changing_lock); !! 609 up_read(&dst_mm->mmap_sem); 854 uffd_mfill_unlock(dst_vma); << 855 out: 610 out: 856 if (folio) !! 611 if (page) 857 folio_put(folio); !! 612 put_page(page); 858 BUG_ON(copied < 0); 613 BUG_ON(copied < 0); 859 BUG_ON(err > 0); 614 BUG_ON(err > 0); 860 BUG_ON(!copied && !err); 615 BUG_ON(!copied && !err); 861 return copied ? copied : err; 616 return copied ? copied : err; 862 } 617 } 863 618 864 ssize_t mfill_atomic_copy(struct userfaultfd_c !! 619 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, 865 unsigned long src_st !! 620 unsigned long src_start, unsigned long len, 866 uffd_flags_t flags) !! 621 bool *mmap_changing) 867 { 622 { 868 return mfill_atomic(ctx, dst_start, sr !! 623 return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, 869 uffd_flags_set_mod !! 624 mmap_changing); 870 } 625 } 871 626 872 ssize_t mfill_atomic_zeropage(struct userfault !! 627 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, 873 unsigned long st !! 628 unsigned long len, bool *mmap_changing) 874 unsigned long le << 875 { 629 { 876 return mfill_atomic(ctx, start, 0, len !! 
630 return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); 877 uffd_flags_set_mod << 878 } << 879 << 880 ssize_t mfill_atomic_continue(struct userfault << 881 unsigned long le << 882 { << 883 << 884 /* << 885 * A caller might reasonably assume th << 886 * smp_wmb() to ensure that any writes << 887 * the thread doing the UFFDIO_CONTINU << 888 * subsequent loads from the page thro << 889 */ << 890 smp_wmb(); << 891 << 892 return mfill_atomic(ctx, start, 0, len << 893 uffd_flags_set_mod << 894 } << 895 << 896 ssize_t mfill_atomic_poison(struct userfaultfd << 897 unsigned long len, << 898 { << 899 return mfill_atomic(ctx, start, 0, len << 900 uffd_flags_set_mod << 901 } << 902 << 903 long uffd_wp_range(struct vm_area_struct *dst_ << 904 unsigned long start, unsign << 905 { << 906 unsigned int mm_cp_flags; << 907 struct mmu_gather tlb; << 908 long ret; << 909 << 910 VM_WARN_ONCE(start < dst_vma->vm_start << 911 "The address range exc << 912 if (enable_wp) << 913 mm_cp_flags = MM_CP_UFFD_WP; << 914 else << 915 mm_cp_flags = MM_CP_UFFD_WP_RE << 916 << 917 /* << 918 * vma->vm_page_prot already reflects << 919 * VMA (see userfaultfd_set_vm_flags() << 920 * to be write-protected as default wh << 921 * Try upgrading write permissions man << 922 */ << 923 if (!enable_wp && vma_wants_manual_pte << 924 mm_cp_flags |= MM_CP_TRY_CHANG << 925 tlb_gather_mmu(&tlb, dst_vma->vm_mm); << 926 ret = change_protection(&tlb, dst_vma, << 927 tlb_finish_mmu(&tlb); << 928 << 929 return ret; << 930 } << 931 << 932 int mwriteprotect_range(struct userfaultfd_ctx << 933 unsigned long len, boo << 934 { << 935 struct mm_struct *dst_mm = ctx->mm; << 936 unsigned long end = start + len; << 937 unsigned long _start, _end; << 938 struct vm_area_struct *dst_vma; << 939 unsigned long page_mask; << 940 long err; << 941 VMA_ITERATOR(vmi, dst_mm, start); << 942 << 943 /* << 944 * Sanitize the command parameters: << 945 */ << 946 BUG_ON(start & ~PAGE_MASK); << 947 BUG_ON(len & ~PAGE_MASK); << 948 << 949 /* Does the address range wrap, or is << 950 BUG_ON(start + len <= start); << 951 << 952 mmap_read_lock(dst_mm); << 953 << 954 /* << 955 * If memory mappings are changing bec << 956 * operation (e.g. 
mremap) running in << 957 * request the user to retry later << 958 */ << 959 down_read(&ctx->map_changing_lock); << 960 err = -EAGAIN; << 961 if (atomic_read(&ctx->mmap_changing)) << 962 goto out_unlock; << 963 << 964 err = -ENOENT; << 965 for_each_vma_range(vmi, dst_vma, end) << 966 << 967 if (!userfaultfd_wp(dst_vma)) << 968 err = -ENOENT; << 969 break; << 970 } << 971 << 972 if (is_vm_hugetlb_page(dst_vma << 973 err = -EINVAL; << 974 page_mask = vma_kernel << 975 if ((start & page_mask << 976 break; << 977 } << 978 << 979 _start = max(dst_vma->vm_start << 980 _end = min(dst_vma->vm_end, en << 981 << 982 err = uffd_wp_range(dst_vma, _ << 983 << 984 /* Return 0 on success, <0 on << 985 if (err < 0) << 986 break; << 987 err = 0; << 988 } << 989 out_unlock: << 990 up_read(&ctx->map_changing_lock); << 991 mmap_read_unlock(dst_mm); << 992 return err; << 993 } << 994 << 995 << 996 void double_pt_lock(spinlock_t *ptl1, << 997 spinlock_t *ptl2) << 998 __acquires(ptl1) << 999 __acquires(ptl2) << 1000 { << 1001 if (ptl1 > ptl2) << 1002 swap(ptl1, ptl2); << 1003 /* lock in virtual address order to a << 1004 spin_lock(ptl1); << 1005 if (ptl1 != ptl2) << 1006 spin_lock_nested(ptl2, SINGLE << 1007 else << 1008 __acquire(ptl2); << 1009 } << 1010 << 1011 void double_pt_unlock(spinlock_t *ptl1, << 1012 spinlock_t *ptl2) << 1013 __releases(ptl1) << 1014 __releases(ptl2) << 1015 { << 1016 spin_unlock(ptl1); << 1017 if (ptl1 != ptl2) << 1018 spin_unlock(ptl2); << 1019 else << 1020 __release(ptl2); << 1021 } << 1022 << 1023 << 1024 static int move_present_pte(struct mm_struct << 1025 struct vm_area_st << 1026 struct vm_area_st << 1027 unsigned long dst << 1028 pte_t *dst_pte, p << 1029 pte_t orig_dst_pt << 1030 spinlock_t *dst_p << 1031 struct folio *src << 1032 { << 1033 int err = 0; << 1034 << 1035 double_pt_lock(dst_ptl, src_ptl); << 1036 << 1037 if (!pte_same(ptep_get(src_pte), orig << 1038 !pte_same(ptep_get(dst_pte), orig << 1039 err = -EAGAIN; << 1040 goto out; << 1041 } << 1042 if (folio_test_large(src_folio) || << 1043 folio_maybe_dma_pinned(src_folio) << 1044 !PageAnonExclusive(&src_folio->pa << 1045 err = -EBUSY; << 1046 goto out; << 1047 } << 1048 << 1049 orig_src_pte = ptep_clear_flush(src_v << 1050 /* Folio got pinned from under us. 
Pu << 1051 if (folio_maybe_dma_pinned(src_folio) << 1052 set_pte_at(mm, src_addr, src_ << 1053 err = -EBUSY; << 1054 goto out; << 1055 } << 1056 << 1057 folio_move_anon_rmap(src_folio, dst_v << 1058 src_folio->index = linear_page_index( << 1059 << 1060 orig_dst_pte = mk_pte(&src_folio->pag << 1061 /* Follow mremap() behavior and treat << 1062 orig_dst_pte = pte_mkwrite(pte_mkdirt << 1063 << 1064 set_pte_at(mm, dst_addr, dst_pte, ori << 1065 out: << 1066 double_pt_unlock(dst_ptl, src_ptl); << 1067 return err; << 1068 } << 1069 << 1070 static int move_swap_pte(struct mm_struct *mm << 1071 unsigned long dst_ad << 1072 pte_t *dst_pte, pte_ << 1073 pte_t orig_dst_pte, << 1074 spinlock_t *dst_ptl, << 1075 { << 1076 if (!pte_swp_exclusive(orig_src_pte)) << 1077 return -EBUSY; << 1078 << 1079 double_pt_lock(dst_ptl, src_ptl); << 1080 << 1081 if (!pte_same(ptep_get(src_pte), orig << 1082 !pte_same(ptep_get(dst_pte), orig << 1083 double_pt_unlock(dst_ptl, src << 1084 return -EAGAIN; << 1085 } << 1086 << 1087 orig_src_pte = ptep_get_and_clear(mm, << 1088 set_pte_at(mm, dst_addr, dst_pte, ori << 1089 double_pt_unlock(dst_ptl, src_ptl); << 1090 << 1091 return 0; << 1092 } << 1093 << 1094 static int move_zeropage_pte(struct mm_struct << 1095 struct vm_area_s << 1096 struct vm_area_s << 1097 unsigned long ds << 1098 pte_t *dst_pte, << 1099 pte_t orig_dst_p << 1100 spinlock_t *dst_ << 1101 { << 1102 pte_t zero_pte; << 1103 << 1104 double_pt_lock(dst_ptl, src_ptl); << 1105 if (!pte_same(ptep_get(src_pte), orig << 1106 !pte_same(ptep_get(dst_pte), orig << 1107 double_pt_unlock(dst_ptl, src << 1108 return -EAGAIN; << 1109 } << 1110 << 1111 zero_pte = pte_mkspecial(pfn_pte(my_z << 1112 dst_ << 1113 ptep_clear_flush(src_vma, src_addr, s << 1114 set_pte_at(mm, dst_addr, dst_pte, zer << 1115 double_pt_unlock(dst_ptl, src_ptl); << 1116 << 1117 return 0; << 1118 } << 1119 << 1120 << 1121 /* << 1122 * The mmap_lock for reading is held by the c << 1123 * from src_pmd to dst_pmd if possible, and r << 1124 * in moving the page. << 1125 */ << 1126 static int move_pages_pte(struct mm_struct *m << 1127 struct vm_area_stru << 1128 struct vm_area_stru << 1129 unsigned long dst_a << 1130 __u64 mode) << 1131 { << 1132 swp_entry_t entry; << 1133 pte_t orig_src_pte, orig_dst_pte; << 1134 pte_t src_folio_pte; << 1135 spinlock_t *src_ptl, *dst_ptl; << 1136 pte_t *src_pte = NULL; << 1137 pte_t *dst_pte = NULL; << 1138 << 1139 struct folio *src_folio = NULL; << 1140 struct anon_vma *src_anon_vma = NULL; << 1141 struct mmu_notifier_range range; << 1142 int err = 0; << 1143 << 1144 flush_cache_range(src_vma, src_addr, << 1145 mmu_notifier_range_init(&range, MMU_N << 1146 src_addr, src << 1147 mmu_notifier_invalidate_range_start(& << 1148 retry: << 1149 dst_pte = pte_offset_map_nolock(mm, d << 1150 << 1151 /* Retry if a huge pmd materialized f << 1152 if (unlikely(!dst_pte)) { << 1153 err = -EAGAIN; << 1154 goto out; << 1155 } << 1156 << 1157 src_pte = pte_offset_map_nolock(mm, s << 1158 << 1159 /* << 1160 * We held the mmap_lock for reading << 1161 * can zap transparent huge pages und << 1162 * transparent huge page fault can es << 1163 * transparent huge pages under us. 
<< 1164 */ << 1165 if (unlikely(!src_pte)) { << 1166 err = -EAGAIN; << 1167 goto out; << 1168 } << 1169 << 1170 /* Sanity checks before the operation << 1171 if (WARN_ON_ONCE(pmd_none(*dst_pmd)) << 1172 WARN_ON_ONCE(pmd_trans_huge(*dst_ << 1173 err = -EINVAL; << 1174 goto out; << 1175 } << 1176 << 1177 spin_lock(dst_ptl); << 1178 orig_dst_pte = ptep_get(dst_pte); << 1179 spin_unlock(dst_ptl); << 1180 if (!pte_none(orig_dst_pte)) { << 1181 err = -EEXIST; << 1182 goto out; << 1183 } << 1184 << 1185 spin_lock(src_ptl); << 1186 orig_src_pte = ptep_get(src_pte); << 1187 spin_unlock(src_ptl); << 1188 if (pte_none(orig_src_pte)) { << 1189 if (!(mode & UFFDIO_MOVE_MODE << 1190 err = -ENOENT; << 1191 else /* nothing to do to move << 1192 err = 0; << 1193 goto out; << 1194 } << 1195 << 1196 /* If PTE changed after we locked the << 1197 if (src_folio && unlikely(!pte_same(s << 1198 err = -EAGAIN; << 1199 goto out; << 1200 } << 1201 << 1202 if (pte_present(orig_src_pte)) { << 1203 if (is_zero_pfn(pte_pfn(orig_ << 1204 err = move_zeropage_p << 1205 << 1206 << 1207 << 1208 goto out; << 1209 } << 1210 << 1211 /* << 1212 * Pin and lock both source f << 1213 * RCU read section, we can't << 1214 * unmap the ptes, obtain the << 1215 */ << 1216 if (!src_folio) { << 1217 struct folio *folio; << 1218 << 1219 /* << 1220 * Pin the page while << 1221 * page isn't freed u << 1222 */ << 1223 spin_lock(src_ptl); << 1224 if (!pte_same(orig_sr << 1225 spin_unlock(s << 1226 err = -EAGAIN << 1227 goto out; << 1228 } << 1229 << 1230 folio = vm_normal_fol << 1231 if (!folio || !PageAn << 1232 spin_unlock(s << 1233 err = -EBUSY; << 1234 goto out; << 1235 } << 1236 << 1237 folio_get(folio); << 1238 src_folio = folio; << 1239 src_folio_pte = orig_ << 1240 spin_unlock(src_ptl); << 1241 << 1242 if (!folio_trylock(sr << 1243 pte_unmap(&or << 1244 pte_unmap(&or << 1245 src_pte = dst << 1246 /* now we can << 1247 folio_lock(sr << 1248 goto retry; << 1249 } << 1250 << 1251 if (WARN_ON_ONCE(!fol << 1252 err = -EBUSY; << 1253 goto out; << 1254 } << 1255 } << 1256 << 1257 /* at this point we have src_ << 1258 if (folio_test_large(src_foli << 1259 /* split_folio() can << 1260 pte_unmap(&orig_src_p << 1261 pte_unmap(&orig_dst_p << 1262 src_pte = dst_pte = N << 1263 err = split_folio(src << 1264 if (err) << 1265 goto out; << 1266 /* have to reacquire << 1267 folio_unlock(src_foli << 1268 folio_put(src_folio); << 1269 src_folio = NULL; << 1270 goto retry; << 1271 } << 1272 << 1273 if (!src_anon_vma) { << 1274 /* << 1275 * folio_referenced w << 1276 * without the folio << 1277 * the anon_vma lock, << 1278 */ << 1279 src_anon_vma = folio_ << 1280 if (!src_anon_vma) { << 1281 /* page was u << 1282 err = -EAGAIN << 1283 goto out; << 1284 } << 1285 if (!anon_vma_trylock << 1286 pte_unmap(&or << 1287 pte_unmap(&or << 1288 src_pte = dst << 1289 /* now we can << 1290 anon_vma_lock << 1291 goto retry; << 1292 } << 1293 } << 1294 << 1295 err = move_present_pte(mm, d << 1296 dst_ad << 1297 orig_d << 1298 dst_pt << 1299 } else { << 1300 entry = pte_to_swp_entry(orig << 1301 if (non_swap_entry(entry)) { << 1302 if (is_migration_entr << 1303 pte_unmap(&or << 1304 pte_unmap(&or << 1305 src_pte = dst << 1306 migration_ent << 1307 err = -EAGAIN << 1308 } else << 1309 err = -EFAULT << 1310 goto out; << 1311 } << 1312 << 1313 err = move_swap_pte(mm, dst_a << 1314 dst_pte, << 1315 orig_dst_ << 1316 dst_ptl, << 1317 } << 1318 << 1319 out: << 1320 if (src_anon_vma) { << 1321 anon_vma_unlock_write(src_ano << 1322 put_anon_vma(src_anon_vma); << 1323 } << 1324 if 
(src_folio) { << 1325 folio_unlock(src_folio); << 1326 folio_put(src_folio); << 1327 } << 1328 if (dst_pte) << 1329 pte_unmap(dst_pte); << 1330 if (src_pte) << 1331 pte_unmap(src_pte); << 1332 mmu_notifier_invalidate_range_end(&ra << 1333 << 1334 return err; << 1335 } << 1336 << 1337 #ifdef CONFIG_TRANSPARENT_HUGEPAGE << 1338 static inline bool move_splits_huge_pmd(unsig << 1339 unsig << 1340 unsig << 1341 { << 1342 return (src_addr & ~HPAGE_PMD_MASK) | << 1343 src_end - src_addr < HPAGE_PM << 1344 } << 1345 #else << 1346 static inline bool move_splits_huge_pmd(unsig << 1347 unsig << 1348 unsig << 1349 { << 1350 /* This is unreachable anyway, just t << 1351 return false; << 1352 } << 1353 #endif << 1354 << 1355 static inline bool vma_move_compatible(struct << 1356 { << 1357 return !(vma->vm_flags & (VM_PFNMAP | << 1358 VM_MIXEDMAP << 1359 } << 1360 << 1361 static int validate_move_areas(struct userfau << 1362 struct vm_area << 1363 struct vm_area << 1364 { << 1365 /* Only allow moving if both have the << 1366 if ((src_vma->vm_flags & VM_ACCESS_FL << 1367 pgprot_val(src_vma->vm_page_prot) << 1368 return -EINVAL; << 1369 << 1370 /* Only allow moving if both are mloc << 1371 if ((src_vma->vm_flags & VM_LOCKED) ! << 1372 return -EINVAL; << 1373 << 1374 /* << 1375 * For now, we keep it simple and onl << 1376 * Access flags are equal, therefore << 1377 */ << 1378 if (!(src_vma->vm_flags & VM_WRITE)) << 1379 return -EINVAL; << 1380 << 1381 /* Check if vma flags indicate conten << 1382 if (!vma_move_compatible(src_vma) || << 1383 return -EINVAL; << 1384 << 1385 /* Ensure dst_vma is registered in uf << 1386 if (!dst_vma->vm_userfaultfd_ctx.ctx << 1387 dst_vma->vm_userfaultfd_ctx.ctx ! << 1388 return -EINVAL; << 1389 << 1390 /* Only allow moving across anonymous << 1391 if (!vma_is_anonymous(src_vma) || !vm << 1392 return -EINVAL; << 1393 << 1394 return 0; << 1395 } << 1396 << 1397 static __always_inline << 1398 int find_vmas_mm_locked(struct mm_struct *mm, << 1399 unsigned long dst_sta << 1400 unsigned long src_sta << 1401 struct vm_area_struct << 1402 struct vm_area_struct << 1403 { << 1404 struct vm_area_struct *vma; << 1405 << 1406 mmap_assert_locked(mm); << 1407 vma = find_vma_and_prepare_anon(mm, d << 1408 if (IS_ERR(vma)) << 1409 return PTR_ERR(vma); << 1410 << 1411 *dst_vmap = vma; << 1412 /* Skip finding src_vma if src_start << 1413 if (src_start >= vma->vm_start && src << 1414 goto out_success; << 1415 << 1416 vma = vma_lookup(mm, src_start); << 1417 if (!vma) << 1418 return -ENOENT; << 1419 out_success: << 1420 *src_vmap = vma; << 1421 return 0; << 1422 } << 1423 << 1424 #ifdef CONFIG_PER_VMA_LOCK << 1425 static int uffd_move_lock(struct mm_struct *m << 1426 unsigned long dst_s << 1427 unsigned long src_s << 1428 struct vm_area_stru << 1429 struct vm_area_stru << 1430 { << 1431 struct vm_area_struct *vma; << 1432 int err; << 1433 << 1434 vma = uffd_lock_vma(mm, dst_start); << 1435 if (IS_ERR(vma)) << 1436 return PTR_ERR(vma); << 1437 << 1438 *dst_vmap = vma; << 1439 /* << 1440 * Skip finding src_vma if src_start << 1441 * that we don't lock the same vma tw << 1442 */ << 1443 if (src_start >= vma->vm_start && src << 1444 *src_vmap = vma; << 1445 return 0; << 1446 } << 1447 << 1448 /* << 1449 * Using uffd_lock_vma() to get src_v << 1450 * << 1451 * Thread1 << 1452 * ------- << 1453 * vma_start_read(dst_vma) << 1454 * << 1455 * << 1456 * vma_start_read(src_vma) << 1457 * mmap_read_lock(mm) << 1458 * << 1459 */ << 1460 *src_vmap = lock_vma_under_rcu(mm, sr << 1461 if (likely(*src_vmap)) << 
1462 return 0; << 1463 << 1464 /* Undo any locking and retry in mmap << 1465 vma_end_read(*dst_vmap); << 1466 << 1467 mmap_read_lock(mm); << 1468 err = find_vmas_mm_locked(mm, dst_sta << 1469 if (!err) { << 1470 /* << 1471 * See comment in uffd_lock_v << 1472 * vma_start_read() here. << 1473 */ << 1474 down_read(&(*dst_vmap)->vm_lo << 1475 if (*dst_vmap != *src_vmap) << 1476 down_read_nested(&(*s << 1477 SING << 1478 } << 1479 mmap_read_unlock(mm); << 1480 return err; << 1481 } << 1482 << 1483 static void uffd_move_unlock(struct vm_area_s << 1484 struct vm_area_s << 1485 { << 1486 vma_end_read(src_vma); << 1487 if (src_vma != dst_vma) << 1488 vma_end_read(dst_vma); << 1489 } << 1490 << 1491 #else << 1492 << 1493 static int uffd_move_lock(struct mm_struct *m << 1494 unsigned long dst_s << 1495 unsigned long src_s << 1496 struct vm_area_stru << 1497 struct vm_area_stru << 1498 { << 1499 int err; << 1500 << 1501 mmap_read_lock(mm); << 1502 err = find_vmas_mm_locked(mm, dst_sta << 1503 if (err) << 1504 mmap_read_unlock(mm); << 1505 return err; << 1506 } << 1507 << 1508 static void uffd_move_unlock(struct vm_area_s << 1509 struct vm_area_s << 1510 { << 1511 mmap_assert_locked(src_vma->vm_mm); << 1512 mmap_read_unlock(dst_vma->vm_mm); << 1513 } << 1514 #endif << 1515 << 1516 /** << 1517 * move_pages - move arbitrary anonymous page << 1518 * @ctx: pointer to the userfaultfd context << 1519 * @dst_start: start of the destination virtu << 1520 * @src_start: start of the source virtual me << 1521 * @len: length of the virtual memory range << 1522 * @mode: flags from uffdio_move.mode << 1523 * << 1524 * It will either use the mmap_lock in read m << 1525 * << 1526 * move_pages() remaps arbitrary anonymous pa << 1527 * copy. It only works on non shared anonymou << 1528 * be relocated without generating non linear << 1529 * code. << 1530 * << 1531 * It provides a zero copy mechanism to handl << 1532 * The source vma pages should have mapcount << 1533 * enforced by using madvise(MADV_DONTFORK) o << 1534 * << 1535 * The thread receiving the page during the u << 1536 * will receive the faulting page in the sour << 1537 * storage or any other I/O device (MADV_DONT << 1538 * avoids move_pages() to fail with -EBUSY if << 1539 * move_pages() is called), then it will call << 1540 * page in the faulting address in the destin << 1541 * << 1542 * This userfaultfd command works purely via << 1543 * most efficient way to move physical non sh << 1544 * across different virtual addresses. Unlike << 1545 * it does not create any new vmas. The mappi << 1546 * address is atomic. << 1547 * << 1548 * It only works if the vma protection bits a << 1549 * source and destination vma. << 1550 * << 1551 * It can remap non shared anonymous pages wi << 1552 * << 1553 * If the source virtual memory range has any << 1554 * the destination virtual memory range is no << 1555 * move_pages() will fail respectively with - << 1556 * provides a very strict behavior to avoid a << 1557 * corruption going unnoticed if there are us << 1558 * Only one thread should resolve the userlan << 1559 * time for any given faulting address. This << 1560 * try to both call move_pages() on the same << 1561 * same time, the second thread will get an e << 1562 * command. << 1563 * << 1564 * The command retval will return "len" is su << 1565 * however can be interrupted by fatal signal << 1566 * interrupted it will return the number of b << 1567 * remapped before the interruption if any, o << 1568 * none. It will never return zero. 
Either it << 1569 * an amount of bytes successfully moved. If << 1570 * "short" remap, the move_pages() command sh << 1571 * userland with src+retval, dst+reval, len-r << 1572 * about the error that interrupted it. << 1573 * << 1574 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag << 1575 * prevent -ENOENT errors to materialize if t << 1576 * source virtual range that is being remappe << 1577 * accounted as successfully remapped in the << 1578 * command. This is mostly useful to remap hu << 1579 * virtual regions without knowing if there a << 1580 * in the regions or not, but preventing the << 1581 * the hugepmd during the remap. << 1582 * << 1583 * If there's any rmap walk that is taking th << 1584 * first obtaining the folio lock (the only c << 1585 * folio_referenced), they will have to verif << 1586 * has changed after taking the anon_vma lock << 1587 * should release the lock and retry obtainin << 1588 * it means the anon_vma was changed by move_ << 1589 * could be obtained. This is the only additi << 1590 * the rmap code to provide this anonymous pa << 1591 */ << 1592 ssize_t move_pages(struct userfaultfd_ctx *ct << 1593 unsigned long src_start, u << 1594 { << 1595 struct mm_struct *mm = ctx->mm; << 1596 struct vm_area_struct *src_vma, *dst_ << 1597 unsigned long src_addr, dst_addr; << 1598 pmd_t *src_pmd, *dst_pmd; << 1599 long err = -EINVAL; << 1600 ssize_t moved = 0; << 1601 << 1602 /* Sanitize the command parameters. * << 1603 if (WARN_ON_ONCE(src_start & ~PAGE_MA << 1604 WARN_ON_ONCE(dst_start & ~PAGE_MA << 1605 WARN_ON_ONCE(len & ~PAGE_MASK)) << 1606 goto out; << 1607 << 1608 /* Does the address range wrap, or is << 1609 if (WARN_ON_ONCE(src_start + len <= s << 1610 WARN_ON_ONCE(dst_start + len <= d << 1611 goto out; << 1612 << 1613 err = uffd_move_lock(mm, dst_start, s << 1614 if (err) << 1615 goto out; << 1616 << 1617 /* Re-check after taking map_changing << 1618 err = -EAGAIN; << 1619 down_read(&ctx->map_changing_lock); << 1620 if (likely(atomic_read(&ctx->mmap_cha << 1621 goto out_unlock; << 1622 /* << 1623 * Make sure the vma is not shared, t << 1624 * ranges are both valid and fully wi << 1625 * vma. << 1626 */ << 1627 err = -EINVAL; << 1628 if (src_vma->vm_flags & VM_SHARED) << 1629 goto out_unlock; << 1630 if (src_start + len > src_vma->vm_end << 1631 goto out_unlock; << 1632 << 1633 if (dst_vma->vm_flags & VM_SHARED) << 1634 goto out_unlock; << 1635 if (dst_start + len > dst_vma->vm_end << 1636 goto out_unlock; << 1637 << 1638 err = validate_move_areas(ctx, src_vm << 1639 if (err) << 1640 goto out_unlock; << 1641 << 1642 for (src_addr = src_start, dst_addr = << 1643 src_addr < src_start + len;) { << 1644 spinlock_t *ptl; << 1645 pmd_t dst_pmdval; << 1646 unsigned long step_size; << 1647 << 1648 /* << 1649 * Below works because anonym << 1650 * transparent huge PUD. If f << 1651 * that case would need to be << 1652 */ << 1653 src_pmd = mm_find_pmd(mm, src << 1654 if (unlikely(!src_pmd)) { << 1655 if (!(mode & UFFDIO_M << 1656 err = -ENOENT << 1657 break; << 1658 } << 1659 src_pmd = mm_alloc_pm << 1660 if (unlikely(!src_pmd << 1661 err = -ENOMEM << 1662 break; << 1663 } << 1664 } << 1665 dst_pmd = mm_alloc_pmd(mm, ds << 1666 if (unlikely(!dst_pmd)) { << 1667 err = -ENOMEM; << 1668 break; << 1669 } << 1670 << 1671 dst_pmdval = pmdp_get_lockles << 1672 /* << 1673 * If the dst_pmd is mapped a << 1674 * be strict. 
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;

	vm_flags_reset(vma, flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static void userfaultfd_set_ctx(struct vm_area_struct *vma,
				struct userfaultfd_ctx *ctx,
				unsigned long flags)
{
	vma_start_write(vma);
	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
	userfaultfd_set_vm_flags(vma,
				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
}

void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
	userfaultfd_set_ctx(vma, NULL, 0);
}

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *ret;

	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);

	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
				    vma->vm_flags & ~__VM_UFFD_FLAGS,
				    NULL_VM_UFFD_CTX);

	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	return ret;
}

/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       unsigned long vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async)
{
	VMA_ITERATOR(vmi, ctx->mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	unsigned long vma_end;
	unsigned long new_flags;

	if (vma->vm_start < start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx});
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_ctx(vma, ctx, vm_flags);

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

skip:
		prev = vma;
		start = vma->vm_end;
	}

	return 0;
}
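/*
 * Illustrative userspace-side sketch (not part of this file): the
 * registration that userfaultfd_register_range() implements is reached from
 * userspace through the UFFDIO_API handshake followed by UFFDIO_REGISTER.
 * The helper name and the choice of missing-fault tracking are assumptions
 * of the example; the range must come from a mapping that
 * vma_can_userfault() accepts, e.g. private anonymous memory.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Returns a uffd registered for missing faults on [addr, addr + len), or -1. */
static int uffd_open_and_register(void *addr, unsigned long len)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (uffd < 0)
		return -1;
	if (ioctl(uffd, UFFDIO_API, &api) || ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		close(uffd);
		return -1;
	}
	return uffd;
}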
void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* the various vma->vm_userfaultfd_ctx still points to it */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
	}
	mmap_write_unlock(mm);
}

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, 0);

	if (!mmget_not_zero(mm))
		return;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (see above),
	 * before taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}

		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    vma->vm_start, vma->vm_end);
		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
}
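/*
 * Illustrative userspace-side sketch (not part of this file): tearing down
 * the tracking set up earlier. UFFDIO_UNREGISTER drops a single range, while
 * the final close() of the userfaultfd releases the whole context, which is
 * what ends up running the release path above to strip the uffd state from
 * every remaining vma. The helper name is an assumption of the example.
 */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <unistd.h>

static void uffd_teardown(int uffd, void *addr, unsigned long len)
{
	struct uffdio_range range = {
		.start = (unsigned long)addr,
		.len = len,
	};

	/* Optional: closing the fd below also unregisters every remaining vma. */
	ioctl(uffd, UFFDIO_UNREGISTER, &range);

	/* Dropping the last reference triggers the context release. */
	close(uffd);
}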