// SPDX-License-Identifier: GPL-2.0-only
/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"

static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}

static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		vma = ERR_PTR(-ENOENT);
	else if (!(vma->vm_flags & VM_SHARED) &&
		 unlikely(anon_vma_prepare(vma)))
		vma = ERR_PTR(-ENOMEM);

	return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
					    unsigned long address)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * We know we're going to need to use anon_vma, so check
		 * that early.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}

	mmap_read_lock(mm);
	vma = find_vma_and_prepare_anon(mm, address);
	if (!IS_ERR(vma)) {
		/*
		 * We cannot use vma_start_read() as it may be called in a
		 * false locked state (see comment in vma_start_read()). We
		 * can avoid that by directly locking vm_lock under
		 * mmap_lock, which guarantees that nobody can lock the
		 * vma for write (vma_start_write()) under us.
		 */
		down_read(&vma->vm_lock->lock);
	}

	mmap_read_unlock(mm);
	return vma;
}

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	dst_vma = uffd_lock_vma(dst_mm, dst_start);
	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	vma_end_read(dst_vma);
	return ERR_PTR(-ENOENT);
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}

#else

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	mmap_read_lock(dst_mm);
	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
	if (IS_ERR(dst_vma))
		goto out_unlock;

	if (validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	dst_vma = ERR_PTR(-ENOENT);
out_unlock:
	mmap_read_unlock(dst_mm);
	return dst_vma;
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	spinlock_t *ptl;
	struct folio *folio = page_folio(page);
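	/*
	 * Below, decide whether the new PTE may be made writable: a page-cache
	 * page mapped into a private (!VM_SHARED) VMA must be installed
	 * read-only so that a later write fault can CoW it, and
	 * MFILL_ATOMIC_WP additionally marks the PTE uffd-write-protected.
	 */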
34 void *page_kaddr; 183 bool page_in_cache = folio_mapping(fol << 184 << 185 _dst_pte = mk_pte(page, dst_vma->vm_pa << 186 _dst_pte = pte_mkdirty(_dst_pte); << 187 if (page_in_cache && !vm_shared) << 188 writable = false; << 189 if (writable) << 190 _dst_pte = pte_mkwrite(_dst_pt << 191 if (flags & MFILL_ATOMIC_WP) << 192 _dst_pte = pte_mkuffd_wp(_dst_ << 193 << 194 ret = -EAGAIN; << 195 dst_pte = pte_offset_map_lock(dst_mm, << 196 if (!dst_pte) << 197 goto out; << 198 << 199 if (mfill_file_over_size(dst_vma, dst_ << 200 ret = -EFAULT; << 201 goto out_unlock; << 202 } << 203 << 204 ret = -EEXIST; << 205 /* << 206 * We allow to overwrite a pte marker: << 207 * registered, we firstly wr-protect a << 208 * page backing it, then access the pa << 209 */ << 210 if (!pte_none_mostly(ptep_get(dst_pte) << 211 goto out_unlock; << 212 << 213 if (page_in_cache) { << 214 /* Usually, cache pages are al << 215 if (newly_allocated) << 216 folio_add_lru(folio); << 217 folio_add_file_rmap_pte(folio, << 218 } else { << 219 folio_add_new_anon_rmap(folio, << 220 folio_add_lru_vma(folio, dst_v << 221 } << 222 << 223 /* << 224 * Must happen after rmap, as mm_count << 225 * PageAnon()), which is set by __page << 226 */ << 227 inc_mm_counter(dst_mm, mm_counter(foli << 228 << 229 set_pte_at(dst_mm, dst_addr, dst_pte, << 230 << 231 /* No need to invalidate - it was non- << 232 update_mmu_cache(dst_vma, dst_addr, ds << 233 ret = 0; << 234 out_unlock: << 235 pte_unmap_unlock(dst_pte, ptl); << 236 out: << 237 return ret; << 238 } << 239 << 240 static int mfill_atomic_pte_copy(pmd_t *dst_pm << 241 struct vm_are << 242 unsigned long << 243 unsigned long << 244 uffd_flags_t << 245 struct folio << 246 { << 247 void *kaddr; << 248 int ret; 35 int ret; 249 struct folio *folio; !! 36 struct page *page; 250 37 251 if (!*foliop) { !! 38 if (!*pagep) { 252 ret = -ENOMEM; 39 ret = -ENOMEM; 253 folio = vma_alloc_folio(GFP_HI !! 40 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); 254 dst_ad !! 41 if (!page) 255 if (!folio) << 256 goto out; 42 goto out; 257 43 258 kaddr = kmap_local_folio(folio !! 44 page_kaddr = kmap_atomic(page); 259 /* !! 45 ret = copy_from_user(page_kaddr, 260 * The read mmap_lock is held !! 46 (const void __user *) src_addr, 261 * mmap_lock being read recurs << 262 * possible if a writer has ta << 263 * << 264 * process A thread 1 takes re << 265 * process A thread 2 calls mm << 266 * process B thread 1 takes pa << 267 * process B thread 2 calls mm << 268 * process A thread 1 blocks t << 269 * process B thread 1 blocks t << 270 * << 271 * Disable page faults to prev << 272 * and retry the copy outside << 273 */ << 274 pagefault_disable(); << 275 ret = copy_from_user(kaddr, (c << 276 PAGE_SIZE 47 PAGE_SIZE); 277 pagefault_enable(); !! 48 kunmap_atomic(page_kaddr); 278 kunmap_local(kaddr); << 279 49 280 /* fallback to copy_from_user !! 50 /* fallback to copy_from_user outside mmap_sem */ 281 if (unlikely(ret)) { 51 if (unlikely(ret)) { 282 ret = -ENOENT; !! 52 ret = -EFAULT; 283 *foliop = folio; !! 53 *pagep = page; 284 /* don't free the page 54 /* don't free the page */ 285 goto out; 55 goto out; 286 } 56 } 287 << 288 flush_dcache_folio(folio); << 289 } else { 57 } else { 290 folio = *foliop; !! 58 page = *pagep; 291 *foliop = NULL; !! 59 *pagep = NULL; 292 } 60 } 293 61 294 /* 62 /* 295 * The memory barrier inside __folio_m !! 63 * The memory barrier inside __SetPageUptodate makes sure that 296 * preceding stores to the page conten !! 
64 * preceeding stores to the page contents become visible before 297 * the set_pte_at() write. 65 * the set_pte_at() write. 298 */ 66 */ 299 __folio_mark_uptodate(folio); !! 67 __SetPageUptodate(page); 300 68 301 ret = -ENOMEM; 69 ret = -ENOMEM; 302 if (mem_cgroup_charge(folio, dst_vma-> !! 70 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) 303 goto out_release; 71 goto out_release; 304 72 305 ret = mfill_atomic_install_pte(dst_pmd !! 73 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 306 &folio- !! 74 if (dst_vma->vm_flags & VM_WRITE) 307 if (ret) !! 75 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); 308 goto out_release; << 309 out: << 310 return ret; << 311 out_release: << 312 folio_put(folio); << 313 goto out; << 314 } << 315 << 316 static int mfill_atomic_pte_zeroed_folio(pmd_t << 317 struc << 318 unsig << 319 { << 320 struct folio *folio; << 321 int ret = -ENOMEM; << 322 << 323 folio = vma_alloc_zeroed_movable_folio << 324 if (!folio) << 325 return ret; << 326 << 327 if (mem_cgroup_charge(folio, dst_vma-> << 328 goto out_put; << 329 << 330 /* << 331 * The memory barrier inside __folio_m << 332 * zeroing out the folio become visibl << 333 * using set_pte_at(). See do_anonymou << 334 */ << 335 __folio_mark_uptodate(folio); << 336 76 337 ret = mfill_atomic_install_pte(dst_pmd !! 77 ret = -EEXIST; 338 &folio- !! 78 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 339 if (ret) !! 79 if (!pte_none(*dst_pte)) 340 goto out_put; !! 80 goto out_release_uncharge_unlock; 341 << 342 return 0; << 343 out_put: << 344 folio_put(folio); << 345 return ret; << 346 } << 347 81 348 static int mfill_atomic_pte_zeropage(pmd_t *ds !! 82 inc_mm_counter(dst_mm, MM_ANONPAGES); 349 struct vm !! 83 page_add_new_anon_rmap(page, dst_vma, dst_addr, false); 350 unsigned !! 84 mem_cgroup_commit_charge(page, memcg, false, false); 351 { !! 85 lru_cache_add_active_or_unevictable(page, dst_vma); 352 pte_t _dst_pte, *dst_pte; << 353 spinlock_t *ptl; << 354 int ret; << 355 86 356 if (mm_forbids_zeropage(dst_vma->vm_mm !! 87 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 357 return mfill_atomic_pte_zeroed << 358 88 359 _dst_pte = pte_mkspecial(pfn_pte(my_ze << 360 dst_v << 361 ret = -EAGAIN; << 362 dst_pte = pte_offset_map_lock(dst_vma- << 363 if (!dst_pte) << 364 goto out; << 365 if (mfill_file_over_size(dst_vma, dst_ << 366 ret = -EFAULT; << 367 goto out_unlock; << 368 } << 369 ret = -EEXIST; << 370 if (!pte_none(ptep_get(dst_pte))) << 371 goto out_unlock; << 372 set_pte_at(dst_vma->vm_mm, dst_addr, d << 373 /* No need to invalidate - it was non- 89 /* No need to invalidate - it was non-present before */ 374 update_mmu_cache(dst_vma, dst_addr, ds 90 update_mmu_cache(dst_vma, dst_addr, dst_pte); 375 ret = 0; << 376 out_unlock: << 377 pte_unmap_unlock(dst_pte, ptl); << 378 out: << 379 return ret; << 380 } << 381 << 382 /* Handles UFFDIO_CONTINUE for all shmem VMAs << 383 static int mfill_atomic_pte_continue(pmd_t *ds << 384 struct vm << 385 unsigned << 386 uffd_flag << 387 { << 388 struct inode *inode = file_inode(dst_v << 389 pgoff_t pgoff = linear_page_index(dst_ << 390 struct folio *folio; << 391 struct page *page; << 392 int ret; << 393 << 394 ret = shmem_get_folio(inode, pgoff, 0, << 395 /* Our caller expects us to return -EF << 396 if (ret == -ENOENT) << 397 ret = -EFAULT; << 398 if (ret) << 399 goto out; << 400 if (!folio) { << 401 ret = -EFAULT; << 402 goto out; << 403 } << 404 91 405 page = folio_file_page(folio, pgoff); !! 
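	/*
	 * This is the UFFDIO_CONTINUE (minor fault) path: the page already
	 * exists in the shmem page cache, typically filled through a second
	 * mapping of the same file, so nothing is copied here; the existing
	 * page is simply mapped at dst_addr. Hardware-poisoned pages are
	 * refused with -EIO below.
	 */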
92 pte_unmap_unlock(dst_pte, ptl); 406 if (PageHWPoison(page)) { << 407 ret = -EIO; << 408 goto out_release; << 409 } << 410 << 411 ret = mfill_atomic_install_pte(dst_pmd << 412 page, f << 413 if (ret) << 414 goto out_release; << 415 << 416 folio_unlock(folio); << 417 ret = 0; 93 ret = 0; 418 out: 94 out: 419 return ret; 95 return ret; >> 96 out_release_uncharge_unlock: >> 97 pte_unmap_unlock(dst_pte, ptl); >> 98 mem_cgroup_cancel_charge(page, memcg, false); 420 out_release: 99 out_release: 421 folio_unlock(folio); !! 100 put_page(page); 422 folio_put(folio); << 423 goto out; 101 goto out; 424 } 102 } 425 103 426 /* Handles UFFDIO_POISON for all non-hugetlb V !! 104 static int mfill_zeropage_pte(struct mm_struct *dst_mm, 427 static int mfill_atomic_pte_poison(pmd_t *dst_ !! 105 pmd_t *dst_pmd, 428 struct vm_a !! 106 struct vm_area_struct *dst_vma, 429 unsigned lo !! 107 unsigned long dst_addr) 430 uffd_flags_ << 431 { 108 { 432 int ret; << 433 struct mm_struct *dst_mm = dst_vma->vm << 434 pte_t _dst_pte, *dst_pte; 109 pte_t _dst_pte, *dst_pte; 435 spinlock_t *ptl; 110 spinlock_t *ptl; >> 111 int ret; 436 112 437 _dst_pte = make_pte_marker(PTE_MARKER_ !! 113 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), 438 ret = -EAGAIN; !! 114 dst_vma->vm_page_prot)); 439 dst_pte = pte_offset_map_lock(dst_mm, << 440 if (!dst_pte) << 441 goto out; << 442 << 443 if (mfill_file_over_size(dst_vma, dst_ << 444 ret = -EFAULT; << 445 goto out_unlock; << 446 } << 447 << 448 ret = -EEXIST; 115 ret = -EEXIST; 449 /* Refuse to overwrite any PTE, even a !! 116 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 450 if (!pte_none(ptep_get(dst_pte))) !! 117 if (!pte_none(*dst_pte)) 451 goto out_unlock; 118 goto out_unlock; 452 << 453 set_pte_at(dst_mm, dst_addr, dst_pte, 119 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 454 << 455 /* No need to invalidate - it was non- 120 /* No need to invalidate - it was non-present before */ 456 update_mmu_cache(dst_vma, dst_addr, ds 121 update_mmu_cache(dst_vma, dst_addr, dst_pte); 457 ret = 0; 122 ret = 0; 458 out_unlock: 123 out_unlock: 459 pte_unmap_unlock(dst_pte, ptl); 124 pte_unmap_unlock(dst_pte, ptl); 460 out: << 461 return ret; 125 return ret; 462 } 126 } 463 127 464 static pmd_t *mm_alloc_pmd(struct mm_struct *m 128 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) 465 { 129 { 466 pgd_t *pgd; 130 pgd_t *pgd; 467 p4d_t *p4d; 131 p4d_t *p4d; 468 pud_t *pud; 132 pud_t *pud; 469 133 470 pgd = pgd_offset(mm, address); 134 pgd = pgd_offset(mm, address); 471 p4d = p4d_alloc(mm, pgd, address); 135 p4d = p4d_alloc(mm, pgd, address); 472 if (!p4d) 136 if (!p4d) 473 return NULL; 137 return NULL; 474 pud = pud_alloc(mm, p4d, address); 138 pud = pud_alloc(mm, p4d, address); 475 if (!pud) 139 if (!pud) 476 return NULL; 140 return NULL; 477 /* 141 /* 478 * Note that we didn't run this becaus 142 * Note that we didn't run this because the pmd was 479 * missing, the *pmd may be already es 143 * missing, the *pmd may be already established and in 480 * turn it may also be a trans_huge_pm 144 * turn it may also be a trans_huge_pmd. 481 */ 145 */ 482 return pmd_alloc(mm, pud, address); 146 return pmd_alloc(mm, pud, address); 483 } 147 } 484 148 485 #ifdef CONFIG_HUGETLB_PAGE 149 #ifdef CONFIG_HUGETLB_PAGE 486 /* 150 /* 487 * mfill_atomic processing for HUGETLB vmas. !! 151 * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is 488 * called with either vma-lock or mmap_lock he !! 
152 * called with mmap_sem held, it will release mmap_sem before returning. 489 * before returning. << 490 */ 153 */ 491 static __always_inline ssize_t mfill_atomic_hu !! 154 static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, 492 << 493 155 struct vm_area_struct *dst_vma, 494 156 unsigned long dst_start, 495 157 unsigned long src_start, 496 158 unsigned long len, 497 !! 159 bool zeropage) 498 { 160 { 499 struct mm_struct *dst_mm = dst_vma->vm !! 161 int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; >> 162 int vm_shared = dst_vma->vm_flags & VM_SHARED; 500 ssize_t err; 163 ssize_t err; 501 pte_t *dst_pte; 164 pte_t *dst_pte; 502 unsigned long src_addr, dst_addr; 165 unsigned long src_addr, dst_addr; 503 long copied; 166 long copied; 504 struct folio *folio; !! 167 struct page *page; >> 168 struct hstate *h; 505 unsigned long vma_hpagesize; 169 unsigned long vma_hpagesize; 506 pgoff_t idx; 170 pgoff_t idx; 507 u32 hash; 171 u32 hash; 508 struct address_space *mapping; 172 struct address_space *mapping; 509 173 510 /* 174 /* 511 * There is no default zero huge page 175 * There is no default zero huge page for all huge page sizes as 512 * supported by hugetlb. A PMD_SIZE h 176 * supported by hugetlb. A PMD_SIZE huge pages may exist as used 513 * by THP. Since we can not reliably 177 * by THP. Since we can not reliably insert a zero page, this 514 * feature is not supported. 178 * feature is not supported. 515 */ 179 */ 516 if (uffd_flags_mode_is(flags, MFILL_AT !! 180 if (zeropage) { 517 up_read(&ctx->map_changing_loc !! 181 up_read(&dst_mm->mmap_sem); 518 uffd_mfill_unlock(dst_vma); << 519 return -EINVAL; 182 return -EINVAL; 520 } 183 } 521 184 522 src_addr = src_start; 185 src_addr = src_start; 523 dst_addr = dst_start; 186 dst_addr = dst_start; 524 copied = 0; 187 copied = 0; 525 folio = NULL; !! 188 page = NULL; 526 vma_hpagesize = vma_kernel_pagesize(ds 189 vma_hpagesize = vma_kernel_pagesize(dst_vma); 527 190 528 /* 191 /* 529 * Validate alignment based on huge pa 192 * Validate alignment based on huge page size 530 */ 193 */ 531 err = -EINVAL; 194 err = -EINVAL; 532 if (dst_start & (vma_hpagesize - 1) || 195 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) 533 goto out_unlock; 196 goto out_unlock; 534 197 535 retry: 198 retry: 536 /* 199 /* 537 * On routine entry dst_vma is set. I !! 200 * On routine entry dst_vma is set. If we had to drop mmap_sem and 538 * retry, dst_vma will be set to NULL 201 * retry, dst_vma will be set to NULL and we must lookup again. 539 */ 202 */ 540 if (!dst_vma) { 203 if (!dst_vma) { 541 dst_vma = uffd_mfill_lock(dst_ << 542 if (IS_ERR(dst_vma)) { << 543 err = PTR_ERR(dst_vma) << 544 goto out; << 545 } << 546 << 547 err = -ENOENT; 204 err = -ENOENT; 548 if (!is_vm_hugetlb_page(dst_vm !! 205 dst_vma = find_vma(dst_mm, dst_start); 549 goto out_unlock_vma; !! 206 if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) >> 207 goto out_unlock; >> 208 /* >> 209 * Only allow __mcopy_atomic_hugetlb on userfaultfd >> 210 * registered ranges. >> 211 */ >> 212 if (!dst_vma->vm_userfaultfd_ctx.ctx) >> 213 goto out_unlock; >> 214 >> 215 if (dst_start < dst_vma->vm_start || >> 216 dst_start + len > dst_vma->vm_end) >> 217 goto out_unlock; 550 218 551 err = -EINVAL; 219 err = -EINVAL; 552 if (vma_hpagesize != vma_kerne 220 if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) 553 goto out_unlock_vma; !! 221 goto out_unlock; 554 222 555 /* !! 223 vm_shared = dst_vma->vm_flags & VM_SHARED; 556 * If memory mappings are chan !! 
224 } 557 * operation (e.g. mremap) run !! 225 558 * request the user to retry l !! 226 if (WARN_ON(dst_addr & (vma_hpagesize - 1) || 559 */ !! 227 (len - copied) & (vma_hpagesize - 1))) 560 down_read(&ctx->map_changing_l !! 228 goto out_unlock; 561 err = -EAGAIN; !! 229 562 if (atomic_read(&ctx->mmap_cha !! 230 /* >> 231 * If not shared, ensure the dst_vma has a anon_vma. >> 232 */ >> 233 err = -ENOMEM; >> 234 if (!vm_shared) { >> 235 if (unlikely(anon_vma_prepare(dst_vma))) 563 goto out_unlock; 236 goto out_unlock; 564 } 237 } 565 238 >> 239 h = hstate_vma(dst_vma); >> 240 566 while (src_addr < src_start + len) { 241 while (src_addr < src_start + len) { >> 242 pte_t dst_pteval; >> 243 567 BUG_ON(dst_addr >= dst_start + 244 BUG_ON(dst_addr >= dst_start + len); >> 245 VM_BUG_ON(dst_addr & ~huge_page_mask(h)); 568 246 569 /* 247 /* 570 * Serialize via vma_lock and !! 248 * Serialize via hugetlb_fault_mutex 571 * vma_lock ensures the dst_pt << 572 * in the case of shared pmds. << 573 * races with other faulting t << 574 */ 249 */ 575 idx = linear_page_index(dst_vm 250 idx = linear_page_index(dst_vma, dst_addr); 576 mapping = dst_vma->vm_file->f_ 251 mapping = dst_vma->vm_file->f_mapping; 577 hash = hugetlb_fault_mutex_has !! 252 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, >> 253 idx, dst_addr); 578 mutex_lock(&hugetlb_fault_mute 254 mutex_lock(&hugetlb_fault_mutex_table[hash]); 579 hugetlb_vma_lock_read(dst_vma) << 580 255 581 err = -ENOMEM; 256 err = -ENOMEM; 582 dst_pte = huge_pte_alloc(dst_m !! 257 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); 583 if (!dst_pte) { 258 if (!dst_pte) { 584 hugetlb_vma_unlock_rea << 585 mutex_unlock(&hugetlb_ 259 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 586 goto out_unlock; 260 goto out_unlock; 587 } 261 } 588 262 589 if (!uffd_flags_mode_is(flags, !! 263 err = -EEXIST; 590 !huge_pte_none_mostly(huge !! 264 dst_pteval = huge_ptep_get(dst_pte); 591 err = -EEXIST; !! 265 if (!huge_pte_none(dst_pteval)) { 592 hugetlb_vma_unlock_rea << 593 mutex_unlock(&hugetlb_ 266 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 594 goto out_unlock; 267 goto out_unlock; 595 } 268 } 596 269 597 err = hugetlb_mfill_atomic_pte !! 270 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, 598 !! 271 dst_addr, src_addr, &page); 599 272 600 hugetlb_vma_unlock_read(dst_vm << 601 mutex_unlock(&hugetlb_fault_mu 273 mutex_unlock(&hugetlb_fault_mutex_table[hash]); >> 274 vm_alloc_shared = vm_shared; 602 275 603 cond_resched(); 276 cond_resched(); 604 277 605 if (unlikely(err == -ENOENT)) !! 278 if (unlikely(err == -EFAULT)) { 606 up_read(&ctx->map_chan !! 279 up_read(&dst_mm->mmap_sem); 607 uffd_mfill_unlock(dst_ !! 280 BUG_ON(!page); 608 BUG_ON(!folio); !! 281 609 !! 282 err = copy_huge_page_from_user(page, 610 err = copy_folio_from_ !! 283 (const void __user *)src_addr, 611 !! 284 pages_per_huge_page(h), true); 612 if (unlikely(err)) { 285 if (unlikely(err)) { 613 err = -EFAULT; 286 err = -EFAULT; 614 goto out; 287 goto out; 615 } 288 } >> 289 down_read(&dst_mm->mmap_sem); 616 290 617 dst_vma = NULL; 291 dst_vma = NULL; 618 goto retry; 292 goto retry; 619 } else 293 } else 620 BUG_ON(folio); !! 
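		/*
		 * On success the cursors advance by one huge page and the
		 * bytes are accounted in 'copied'; a pending fatal signal
		 * ends the loop with -EINTR, and whatever was copied so far
		 * is reported back to the caller.
		 */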
294 BUG_ON(page); 621 295 622 if (!err) { 296 if (!err) { 623 dst_addr += vma_hpages 297 dst_addr += vma_hpagesize; 624 src_addr += vma_hpages 298 src_addr += vma_hpagesize; 625 copied += vma_hpagesiz 299 copied += vma_hpagesize; 626 300 627 if (fatal_signal_pendi 301 if (fatal_signal_pending(current)) 628 err = -EINTR; 302 err = -EINTR; 629 } 303 } 630 if (err) 304 if (err) 631 break; 305 break; 632 } 306 } 633 307 634 out_unlock: 308 out_unlock: 635 up_read(&ctx->map_changing_lock); !! 309 up_read(&dst_mm->mmap_sem); 636 out_unlock_vma: << 637 uffd_mfill_unlock(dst_vma); << 638 out: 310 out: 639 if (folio) !! 311 if (page) { 640 folio_put(folio); !! 312 /* >> 313 * We encountered an error and are about to free a newly >> 314 * allocated huge page. >> 315 * >> 316 * Reservation handling is very subtle, and is different for >> 317 * private and shared mappings. See the routine >> 318 * restore_reserve_on_error for details. Unfortunately, we >> 319 * can not call restore_reserve_on_error now as it would >> 320 * require holding mmap_sem. >> 321 * >> 322 * If a reservation for the page existed in the reservation >> 323 * map of a private mapping, the map was modified to indicate >> 324 * the reservation was consumed when the page was allocated. >> 325 * We clear the PagePrivate flag now so that the global >> 326 * reserve count will not be incremented in free_huge_page. >> 327 * The reservation map will still indicate the reservation >> 328 * was consumed and possibly prevent later page allocation. >> 329 * This is better than leaking a global reservation. If no >> 330 * reservation existed, it is still safe to clear PagePrivate >> 331 * as no adjustments to reservation counts were made during >> 332 * allocation. >> 333 * >> 334 * The reservation map for shared mappings indicates which >> 335 * pages have reservations. When a huge page is allocated >> 336 * for an address with a reservation, no change is made to >> 337 * the reserve map. In this case PagePrivate will be set >> 338 * to indicate that the global reservation count should be >> 339 * incremented when the page is freed. This is the desired >> 340 * behavior. However, when a huge page is allocated for an >> 341 * address without a reservation a reservation entry is added >> 342 * to the reservation map, and PagePrivate will not be set. >> 343 * When the page is freed, the global reserve count will NOT >> 344 * be incremented and it will appear as though we have leaked >> 345 * reserved page. In this case, set PagePrivate so that the >> 346 * global reserve count will be incremented to match the >> 347 * reservation map entry which was created. >> 348 * >> 349 * Note that vm_alloc_shared is based on the flags of the vma >> 350 * for which the page was originally allocated. dst_vma could >> 351 * be different or NULL on error. >> 352 */ >> 353 if (vm_alloc_shared) >> 354 SetPagePrivate(page); >> 355 else >> 356 ClearPagePrivate(page); >> 357 put_page(page); >> 358 } 641 BUG_ON(copied < 0); 359 BUG_ON(copied < 0); 642 BUG_ON(err > 0); 360 BUG_ON(err > 0); 643 BUG_ON(!copied && !err); 361 BUG_ON(!copied && !err); 644 return copied ? copied : err; 362 return copied ? copied : err; 645 } 363 } 646 #else /* !CONFIG_HUGETLB_PAGE */ 364 #else /* !CONFIG_HUGETLB_PAGE */ 647 /* fail at build time if gcc attempts to use t 365 /* fail at build time if gcc attempts to use this */ 648 extern ssize_t mfill_atomic_hugetlb(struct use !! 366 extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, 649 struct vm_ !! 
367 struct vm_area_struct *dst_vma, 650 unsigned l !! 368 unsigned long dst_start, 651 unsigned l !! 369 unsigned long src_start, 652 unsigned l !! 370 unsigned long len, 653 uffd_flags !! 371 bool zeropage); 654 #endif /* CONFIG_HUGETLB_PAGE */ 372 #endif /* CONFIG_HUGETLB_PAGE */ 655 373 656 static __always_inline ssize_t mfill_atomic_pt !! 374 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, 657 !! 375 unsigned long dst_start, 658 !! 376 unsigned long src_start, 659 !! 377 unsigned long len, 660 !! 378 bool zeropage) 661 << 662 { << 663 ssize_t err; << 664 << 665 if (uffd_flags_mode_is(flags, MFILL_AT << 666 return mfill_atomic_pte_contin << 667 << 668 } else if (uffd_flags_mode_is(flags, M << 669 return mfill_atomic_pte_poison << 670 << 671 } << 672 << 673 /* << 674 * The normal page fault path for a sh << 675 * fault, fill the hole in the file an << 676 * result generates plain anonymous me << 677 * asked to fill an hole in a MAP_PRIV << 678 * generate anonymous memory directly << 679 * the hole. For the MAP_PRIVATE case << 680 * only happens in the pagetable (to v << 681 * and not in the radix tree. << 682 */ << 683 if (!(dst_vma->vm_flags & VM_SHARED)) << 684 if (uffd_flags_mode_is(flags, << 685 err = mfill_atomic_pte << 686 << 687 << 688 else << 689 err = mfill_atomic_pte << 690 << 691 } else { << 692 err = shmem_mfill_atomic_pte(d << 693 d << 694 f << 695 } << 696 << 697 return err; << 698 } << 699 << 700 static __always_inline ssize_t mfill_atomic(st << 701 un << 702 un << 703 un << 704 uf << 705 { 379 { 706 struct mm_struct *dst_mm = ctx->mm; << 707 struct vm_area_struct *dst_vma; 380 struct vm_area_struct *dst_vma; 708 ssize_t err; 381 ssize_t err; 709 pmd_t *dst_pmd; 382 pmd_t *dst_pmd; 710 unsigned long src_addr, dst_addr; 383 unsigned long src_addr, dst_addr; 711 long copied; 384 long copied; 712 struct folio *folio; !! 385 struct page *page; 713 386 714 /* 387 /* 715 * Sanitize the command parameters: 388 * Sanitize the command parameters: 716 */ 389 */ 717 BUG_ON(dst_start & ~PAGE_MASK); 390 BUG_ON(dst_start & ~PAGE_MASK); 718 BUG_ON(len & ~PAGE_MASK); 391 BUG_ON(len & ~PAGE_MASK); 719 392 720 /* Does the address range wrap, or is 393 /* Does the address range wrap, or is the span zero-sized? */ 721 BUG_ON(src_start + len <= src_start); 394 BUG_ON(src_start + len <= src_start); 722 BUG_ON(dst_start + len <= dst_start); 395 BUG_ON(dst_start + len <= dst_start); 723 396 724 src_addr = src_start; 397 src_addr = src_start; 725 dst_addr = dst_start; 398 dst_addr = dst_start; 726 copied = 0; 399 copied = 0; 727 folio = NULL; !! 400 page = NULL; 728 retry: 401 retry: >> 402 down_read(&dst_mm->mmap_sem); >> 403 729 /* 404 /* 730 * Make sure the vma is not shared, th 405 * Make sure the vma is not shared, that the dst range is 731 * both valid and fully within a singl 406 * both valid and fully within a single existing vma. 732 */ 407 */ 733 dst_vma = uffd_mfill_lock(dst_mm, dst_ !! 408 err = -ENOENT; 734 if (IS_ERR(dst_vma)) { !! 409 dst_vma = find_vma(dst_mm, dst_start); 735 err = PTR_ERR(dst_vma); !! 410 if (!dst_vma) 736 goto out; !! 411 goto out_unlock; 737 } << 738 << 739 /* 412 /* 740 * If memory mappings are changing bec !! 413 * Be strict and only allow __mcopy_atomic on userfaultfd 741 * operation (e.g. mremap) running in !! 414 * registered ranges to prevent userland errors going 742 * request the user to retry later !! 415 * unnoticed. 
As far as the VM consistency is concerned, it >> 416 * would be perfectly safe to remove this check, but there's >> 417 * no useful usage for __mcopy_atomic ouside of userfaultfd >> 418 * registered ranges. This is after all why these are ioctls >> 419 * belonging to the userfaultfd and not syscalls. 743 */ 420 */ 744 down_read(&ctx->map_changing_lock); !! 421 if (!dst_vma->vm_userfaultfd_ctx.ctx) 745 err = -EAGAIN; !! 422 goto out_unlock; 746 if (atomic_read(&ctx->mmap_changing)) !! 423 >> 424 if (dst_start < dst_vma->vm_start || >> 425 dst_start + len > dst_vma->vm_end) 747 goto out_unlock; 426 goto out_unlock; 748 427 749 err = -EINVAL; 428 err = -EINVAL; 750 /* 429 /* 751 * shmem_zero_setup is invoked in mmap 430 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but 752 * it will overwrite vm_ops, so vma_is 431 * it will overwrite vm_ops, so vma_is_anonymous must return false. 753 */ 432 */ 754 if (WARN_ON_ONCE(vma_is_anonymous(dst_ 433 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && 755 dst_vma->vm_flags & VM_SHARED)) 434 dst_vma->vm_flags & VM_SHARED)) 756 goto out_unlock; 435 goto out_unlock; 757 436 758 /* 437 /* 759 * validate 'mode' now that we know th << 760 * a wrprotect copy if the userfaultfd << 761 */ << 762 if ((flags & MFILL_ATOMIC_WP) && !(dst << 763 goto out_unlock; << 764 << 765 /* << 766 * If this is a HUGETLB vma, pass off 438 * If this is a HUGETLB vma, pass off to appropriate routine 767 */ 439 */ 768 if (is_vm_hugetlb_page(dst_vma)) 440 if (is_vm_hugetlb_page(dst_vma)) 769 return mfill_atomic_hugetlb(c !! 441 return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, 770 s !! 442 src_start, len, zeropage); 771 443 772 if (!vma_is_anonymous(dst_vma) && !vma 444 if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) 773 goto out_unlock; 445 goto out_unlock; 774 if (!vma_is_shmem(dst_vma) && !! 446 775 uffd_flags_mode_is(flags, MFILL_AT !! 447 /* >> 448 * Ensure the dst_vma has a anon_vma or this page >> 449 * would get a NULL anon_vma when moved in the >> 450 * dst_vma. >> 451 */ >> 452 err = -ENOMEM; >> 453 if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma))) 776 goto out_unlock; 454 goto out_unlock; 777 455 778 while (src_addr < src_start + len) { 456 while (src_addr < src_start + len) { 779 pmd_t dst_pmdval; 457 pmd_t dst_pmdval; 780 458 781 BUG_ON(dst_addr >= dst_start + 459 BUG_ON(dst_addr >= dst_start + len); 782 460 783 dst_pmd = mm_alloc_pmd(dst_mm, 461 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); 784 if (unlikely(!dst_pmd)) { 462 if (unlikely(!dst_pmd)) { 785 err = -ENOMEM; 463 err = -ENOMEM; 786 break; 464 break; 787 } 465 } 788 466 789 dst_pmdval = pmdp_get_lockless !! 467 dst_pmdval = pmd_read_atomic(dst_pmd); 790 if (unlikely(pmd_none(dst_pmdv << 791 unlikely(__pte_alloc(dst_m << 792 err = -ENOMEM; << 793 break; << 794 } << 795 dst_pmdval = pmdp_get_lockless << 796 /* 468 /* 797 * If the dst_pmd is THP don't !! 469 * If the dst_pmd is mapped as THP don't 798 * (This includes the case whe !! 470 * override it and just be strict. 799 * changed back to none after << 800 */ 471 */ 801 if (unlikely(!pmd_present(dst_ !! 472 if (unlikely(pmd_trans_huge(dst_pmdval))) { 802 pmd_devmap(dst_pm << 803 err = -EEXIST; 473 err = -EEXIST; 804 break; 474 break; 805 } 475 } 806 if (unlikely(pmd_bad(dst_pmdva !! 
476 if (unlikely(pmd_none(dst_pmdval)) && >> 477 unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { >> 478 err = -ENOMEM; >> 479 break; >> 480 } >> 481 /* If an huge pmd materialized from under us fail */ >> 482 if (unlikely(pmd_trans_huge(*dst_pmd))) { 807 err = -EFAULT; 483 err = -EFAULT; 808 break; 484 break; 809 } 485 } 810 /* << 811 * For shmem mappings, khugepa << 812 * tables under us; pte_offset << 813 */ << 814 486 815 err = mfill_atomic_pte(dst_pmd !! 487 BUG_ON(pmd_none(*dst_pmd)); 816 src_add !! 488 BUG_ON(pmd_trans_huge(*dst_pmd)); >> 489 >> 490 if (vma_is_anonymous(dst_vma)) { >> 491 if (!zeropage) >> 492 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, >> 493 dst_addr, src_addr, >> 494 &page); >> 495 else >> 496 err = mfill_zeropage_pte(dst_mm, dst_pmd, >> 497 dst_vma, dst_addr); >> 498 } else { >> 499 err = -EINVAL; /* if zeropage is true return -EINVAL */ >> 500 if (likely(!zeropage)) >> 501 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, >> 502 dst_vma, dst_addr, >> 503 src_addr, &page); >> 504 } >> 505 817 cond_resched(); 506 cond_resched(); 818 507 819 if (unlikely(err == -ENOENT)) !! 508 if (unlikely(err == -EFAULT)) { 820 void *kaddr; !! 509 void *page_kaddr; 821 510 822 up_read(&ctx->map_chan !! 511 up_read(&dst_mm->mmap_sem); 823 uffd_mfill_unlock(dst_ !! 512 BUG_ON(!page); 824 BUG_ON(!folio); << 825 513 826 kaddr = kmap_local_fol !! 514 page_kaddr = kmap(page); 827 err = copy_from_user(k !! 515 err = copy_from_user(page_kaddr, 828 ( 516 (const void __user *) src_addr, 829 P 517 PAGE_SIZE); 830 kunmap_local(kaddr); !! 518 kunmap(page); 831 if (unlikely(err)) { 519 if (unlikely(err)) { 832 err = -EFAULT; 520 err = -EFAULT; 833 goto out; 521 goto out; 834 } 522 } 835 flush_dcache_folio(fol << 836 goto retry; 523 goto retry; 837 } else 524 } else 838 BUG_ON(folio); !! 525 BUG_ON(page); 839 526 840 if (!err) { 527 if (!err) { 841 dst_addr += PAGE_SIZE; 528 dst_addr += PAGE_SIZE; 842 src_addr += PAGE_SIZE; 529 src_addr += PAGE_SIZE; 843 copied += PAGE_SIZE; 530 copied += PAGE_SIZE; 844 531 845 if (fatal_signal_pendi 532 if (fatal_signal_pending(current)) 846 err = -EINTR; 533 err = -EINTR; 847 } 534 } 848 if (err) 535 if (err) 849 break; 536 break; 850 } 537 } 851 538 852 out_unlock: 539 out_unlock: 853 up_read(&ctx->map_changing_lock); !! 540 up_read(&dst_mm->mmap_sem); 854 uffd_mfill_unlock(dst_vma); << 855 out: 541 out: 856 if (folio) !! 542 if (page) 857 folio_put(folio); !! 543 put_page(page); 858 BUG_ON(copied < 0); 544 BUG_ON(copied < 0); 859 BUG_ON(err > 0); 545 BUG_ON(err > 0); 860 BUG_ON(!copied && !err); 546 BUG_ON(!copied && !err); 861 return copied ? copied : err; 547 return copied ? copied : err; 862 } 548 } 863 549 864 ssize_t mfill_atomic_copy(struct userfaultfd_c !! 550 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, 865 unsigned long src_st !! 551 unsigned long src_start, unsigned long len) 866 uffd_flags_t flags) << 867 { 552 { 868 return mfill_atomic(ctx, dst_start, sr !! 553 return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); 869 uffd_flags_set_mod << 870 } 554 } 871 555 872 ssize_t mfill_atomic_zeropage(struct userfault !! 556 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, 873 unsigned long st !! 557 unsigned long len) 874 unsigned long le << 875 { 558 { 876 return mfill_atomic(ctx, start, 0, len !! 
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
			      unsigned long len, uffd_flags_t flags)
{

	/*
	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
	 * subsequent loads from the page through the newly mapped address range.
	 */
	smp_wmb();

	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
			    unsigned long len, uffd_flags_t flags)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
			"The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			unsigned long len, bool enable_wp)
{
	struct mm_struct *dst_mm = ctx->mm;
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g.
mremap) running in << 957 * request the user to retry later << 958 */ << 959 down_read(&ctx->map_changing_lock); << 960 err = -EAGAIN; << 961 if (atomic_read(&ctx->mmap_changing)) << 962 goto out_unlock; << 963 << 964 err = -ENOENT; << 965 for_each_vma_range(vmi, dst_vma, end) << 966 << 967 if (!userfaultfd_wp(dst_vma)) << 968 err = -ENOENT; << 969 break; << 970 } << 971 << 972 if (is_vm_hugetlb_page(dst_vma << 973 err = -EINVAL; << 974 page_mask = vma_kernel << 975 if ((start & page_mask << 976 break; << 977 } << 978 << 979 _start = max(dst_vma->vm_start << 980 _end = min(dst_vma->vm_end, en << 981 << 982 err = uffd_wp_range(dst_vma, _ << 983 << 984 /* Return 0 on success, <0 on << 985 if (err < 0) << 986 break; << 987 err = 0; << 988 } << 989 out_unlock: << 990 up_read(&ctx->map_changing_lock); << 991 mmap_read_unlock(dst_mm); << 992 return err; << 993 } << 994 << 995 << 996 void double_pt_lock(spinlock_t *ptl1, << 997 spinlock_t *ptl2) << 998 __acquires(ptl1) << 999 __acquires(ptl2) << 1000 { << 1001 if (ptl1 > ptl2) << 1002 swap(ptl1, ptl2); << 1003 /* lock in virtual address order to a << 1004 spin_lock(ptl1); << 1005 if (ptl1 != ptl2) << 1006 spin_lock_nested(ptl2, SINGLE << 1007 else << 1008 __acquire(ptl2); << 1009 } << 1010 << 1011 void double_pt_unlock(spinlock_t *ptl1, << 1012 spinlock_t *ptl2) << 1013 __releases(ptl1) << 1014 __releases(ptl2) << 1015 { << 1016 spin_unlock(ptl1); << 1017 if (ptl1 != ptl2) << 1018 spin_unlock(ptl2); << 1019 else << 1020 __release(ptl2); << 1021 } << 1022 << 1023 << 1024 static int move_present_pte(struct mm_struct << 1025 struct vm_area_st << 1026 struct vm_area_st << 1027 unsigned long dst << 1028 pte_t *dst_pte, p << 1029 pte_t orig_dst_pt << 1030 spinlock_t *dst_p << 1031 struct folio *src << 1032 { << 1033 int err = 0; << 1034 << 1035 double_pt_lock(dst_ptl, src_ptl); << 1036 << 1037 if (!pte_same(ptep_get(src_pte), orig << 1038 !pte_same(ptep_get(dst_pte), orig << 1039 err = -EAGAIN; << 1040 goto out; << 1041 } << 1042 if (folio_test_large(src_folio) || << 1043 folio_maybe_dma_pinned(src_folio) << 1044 !PageAnonExclusive(&src_folio->pa << 1045 err = -EBUSY; << 1046 goto out; << 1047 } << 1048 << 1049 orig_src_pte = ptep_clear_flush(src_v << 1050 /* Folio got pinned from under us. 
Pu << 1051 if (folio_maybe_dma_pinned(src_folio) << 1052 set_pte_at(mm, src_addr, src_ << 1053 err = -EBUSY; << 1054 goto out; << 1055 } << 1056 << 1057 folio_move_anon_rmap(src_folio, dst_v << 1058 src_folio->index = linear_page_index( << 1059 << 1060 orig_dst_pte = mk_pte(&src_folio->pag << 1061 /* Follow mremap() behavior and treat << 1062 orig_dst_pte = pte_mkwrite(pte_mkdirt << 1063 << 1064 set_pte_at(mm, dst_addr, dst_pte, ori << 1065 out: << 1066 double_pt_unlock(dst_ptl, src_ptl); << 1067 return err; << 1068 } << 1069 << 1070 static int move_swap_pte(struct mm_struct *mm << 1071 unsigned long dst_ad << 1072 pte_t *dst_pte, pte_ << 1073 pte_t orig_dst_pte, << 1074 spinlock_t *dst_ptl, << 1075 { << 1076 if (!pte_swp_exclusive(orig_src_pte)) << 1077 return -EBUSY; << 1078 << 1079 double_pt_lock(dst_ptl, src_ptl); << 1080 << 1081 if (!pte_same(ptep_get(src_pte), orig << 1082 !pte_same(ptep_get(dst_pte), orig << 1083 double_pt_unlock(dst_ptl, src << 1084 return -EAGAIN; << 1085 } << 1086 << 1087 orig_src_pte = ptep_get_and_clear(mm, << 1088 set_pte_at(mm, dst_addr, dst_pte, ori << 1089 double_pt_unlock(dst_ptl, src_ptl); << 1090 << 1091 return 0; << 1092 } << 1093 << 1094 static int move_zeropage_pte(struct mm_struct << 1095 struct vm_area_s << 1096 struct vm_area_s << 1097 unsigned long ds << 1098 pte_t *dst_pte, << 1099 pte_t orig_dst_p << 1100 spinlock_t *dst_ << 1101 { << 1102 pte_t zero_pte; << 1103 << 1104 double_pt_lock(dst_ptl, src_ptl); << 1105 if (!pte_same(ptep_get(src_pte), orig << 1106 !pte_same(ptep_get(dst_pte), orig << 1107 double_pt_unlock(dst_ptl, src << 1108 return -EAGAIN; << 1109 } << 1110 << 1111 zero_pte = pte_mkspecial(pfn_pte(my_z << 1112 dst_ << 1113 ptep_clear_flush(src_vma, src_addr, s << 1114 set_pte_at(mm, dst_addr, dst_pte, zer << 1115 double_pt_unlock(dst_ptl, src_ptl); << 1116 << 1117 return 0; << 1118 } << 1119 << 1120 << 1121 /* << 1122 * The mmap_lock for reading is held by the c << 1123 * from src_pmd to dst_pmd if possible, and r << 1124 * in moving the page. << 1125 */ << 1126 static int move_pages_pte(struct mm_struct *m << 1127 struct vm_area_stru << 1128 struct vm_area_stru << 1129 unsigned long dst_a << 1130 __u64 mode) << 1131 { << 1132 swp_entry_t entry; << 1133 pte_t orig_src_pte, orig_dst_pte; << 1134 pte_t src_folio_pte; << 1135 spinlock_t *src_ptl, *dst_ptl; << 1136 pte_t *src_pte = NULL; << 1137 pte_t *dst_pte = NULL; << 1138 << 1139 struct folio *src_folio = NULL; << 1140 struct anon_vma *src_anon_vma = NULL; << 1141 struct mmu_notifier_range range; << 1142 int err = 0; << 1143 << 1144 flush_cache_range(src_vma, src_addr, << 1145 mmu_notifier_range_init(&range, MMU_N << 1146 src_addr, src << 1147 mmu_notifier_invalidate_range_start(& << 1148 retry: << 1149 dst_pte = pte_offset_map_nolock(mm, d << 1150 << 1151 /* Retry if a huge pmd materialized f << 1152 if (unlikely(!dst_pte)) { << 1153 err = -EAGAIN; << 1154 goto out; << 1155 } << 1156 << 1157 src_pte = pte_offset_map_nolock(mm, s << 1158 << 1159 /* << 1160 * We held the mmap_lock for reading << 1161 * can zap transparent huge pages und << 1162 * transparent huge page fault can es << 1163 * transparent huge pages under us. 
<< 1164 */ << 1165 if (unlikely(!src_pte)) { << 1166 err = -EAGAIN; << 1167 goto out; << 1168 } << 1169 << 1170 /* Sanity checks before the operation << 1171 if (WARN_ON_ONCE(pmd_none(*dst_pmd)) << 1172 WARN_ON_ONCE(pmd_trans_huge(*dst_ << 1173 err = -EINVAL; << 1174 goto out; << 1175 } << 1176 << 1177 spin_lock(dst_ptl); << 1178 orig_dst_pte = ptep_get(dst_pte); << 1179 spin_unlock(dst_ptl); << 1180 if (!pte_none(orig_dst_pte)) { << 1181 err = -EEXIST; << 1182 goto out; << 1183 } << 1184 << 1185 spin_lock(src_ptl); << 1186 orig_src_pte = ptep_get(src_pte); << 1187 spin_unlock(src_ptl); << 1188 if (pte_none(orig_src_pte)) { << 1189 if (!(mode & UFFDIO_MOVE_MODE << 1190 err = -ENOENT; << 1191 else /* nothing to do to move << 1192 err = 0; << 1193 goto out; << 1194 } << 1195 << 1196 /* If PTE changed after we locked the << 1197 if (src_folio && unlikely(!pte_same(s << 1198 err = -EAGAIN; << 1199 goto out; << 1200 } << 1201 << 1202 if (pte_present(orig_src_pte)) { << 1203 if (is_zero_pfn(pte_pfn(orig_ << 1204 err = move_zeropage_p << 1205 << 1206 << 1207 << 1208 goto out; << 1209 } << 1210 << 1211 /* << 1212 * Pin and lock both source f << 1213 * RCU read section, we can't << 1214 * unmap the ptes, obtain the << 1215 */ << 1216 if (!src_folio) { << 1217 struct folio *folio; << 1218 << 1219 /* << 1220 * Pin the page while << 1221 * page isn't freed u << 1222 */ << 1223 spin_lock(src_ptl); << 1224 if (!pte_same(orig_sr << 1225 spin_unlock(s << 1226 err = -EAGAIN << 1227 goto out; << 1228 } << 1229 << 1230 folio = vm_normal_fol << 1231 if (!folio || !PageAn << 1232 spin_unlock(s << 1233 err = -EBUSY; << 1234 goto out; << 1235 } << 1236 << 1237 folio_get(folio); << 1238 src_folio = folio; << 1239 src_folio_pte = orig_ << 1240 spin_unlock(src_ptl); << 1241 << 1242 if (!folio_trylock(sr << 1243 pte_unmap(&or << 1244 pte_unmap(&or << 1245 src_pte = dst << 1246 /* now we can << 1247 folio_lock(sr << 1248 goto retry; << 1249 } << 1250 << 1251 if (WARN_ON_ONCE(!fol << 1252 err = -EBUSY; << 1253 goto out; << 1254 } << 1255 } << 1256 << 1257 /* at this point we have src_ << 1258 if (folio_test_large(src_foli << 1259 /* split_folio() can << 1260 pte_unmap(&orig_src_p << 1261 pte_unmap(&orig_dst_p << 1262 src_pte = dst_pte = N << 1263 err = split_folio(src << 1264 if (err) << 1265 goto out; << 1266 /* have to reacquire << 1267 folio_unlock(src_foli << 1268 folio_put(src_folio); << 1269 src_folio = NULL; << 1270 goto retry; << 1271 } << 1272 << 1273 if (!src_anon_vma) { << 1274 /* << 1275 * folio_referenced w << 1276 * without the folio << 1277 * the anon_vma lock, << 1278 */ << 1279 src_anon_vma = folio_ << 1280 if (!src_anon_vma) { << 1281 /* page was u << 1282 err = -EAGAIN << 1283 goto out; << 1284 } << 1285 if (!anon_vma_trylock << 1286 pte_unmap(&or << 1287 pte_unmap(&or << 1288 src_pte = dst << 1289 /* now we can << 1290 anon_vma_lock << 1291 goto retry; << 1292 } << 1293 } << 1294 << 1295 err = move_present_pte(mm, d << 1296 dst_ad << 1297 orig_d << 1298 dst_pt << 1299 } else { << 1300 entry = pte_to_swp_entry(orig << 1301 if (non_swap_entry(entry)) { << 1302 if (is_migration_entr << 1303 pte_unmap(&or << 1304 pte_unmap(&or << 1305 src_pte = dst << 1306 migration_ent << 1307 err = -EAGAIN << 1308 } else << 1309 err = -EFAULT << 1310 goto out; << 1311 } << 1312 << 1313 err = move_swap_pte(mm, dst_a << 1314 dst_pte, << 1315 orig_dst_ << 1316 dst_ptl, << 1317 } << 1318 << 1319 out: << 1320 if (src_anon_vma) { << 1321 anon_vma_unlock_write(src_ano << 1322 put_anon_vma(src_anon_vma); << 1323 } << 1324 if 
(src_folio) { << 1325 folio_unlock(src_folio); << 1326 folio_put(src_folio); << 1327 } << 1328 if (dst_pte) << 1329 pte_unmap(dst_pte); << 1330 if (src_pte) << 1331 pte_unmap(src_pte); << 1332 mmu_notifier_invalidate_range_end(&ra << 1333 << 1334 return err; << 1335 } << 1336 << 1337 #ifdef CONFIG_TRANSPARENT_HUGEPAGE << 1338 static inline bool move_splits_huge_pmd(unsig << 1339 unsig << 1340 unsig << 1341 { << 1342 return (src_addr & ~HPAGE_PMD_MASK) | << 1343 src_end - src_addr < HPAGE_PM << 1344 } << 1345 #else << 1346 static inline bool move_splits_huge_pmd(unsig << 1347 unsig << 1348 unsig << 1349 { << 1350 /* This is unreachable anyway, just t << 1351 return false; << 1352 } << 1353 #endif << 1354 << 1355 static inline bool vma_move_compatible(struct << 1356 { << 1357 return !(vma->vm_flags & (VM_PFNMAP | << 1358 VM_MIXEDMAP << 1359 } << 1360 << 1361 static int validate_move_areas(struct userfau << 1362 struct vm_area << 1363 struct vm_area << 1364 { << 1365 /* Only allow moving if both have the << 1366 if ((src_vma->vm_flags & VM_ACCESS_FL << 1367 pgprot_val(src_vma->vm_page_prot) << 1368 return -EINVAL; << 1369 << 1370 /* Only allow moving if both are mloc << 1371 if ((src_vma->vm_flags & VM_LOCKED) ! << 1372 return -EINVAL; << 1373 << 1374 /* << 1375 * For now, we keep it simple and onl << 1376 * Access flags are equal, therefore << 1377 */ << 1378 if (!(src_vma->vm_flags & VM_WRITE)) << 1379 return -EINVAL; << 1380 << 1381 /* Check if vma flags indicate conten << 1382 if (!vma_move_compatible(src_vma) || << 1383 return -EINVAL; << 1384 << 1385 /* Ensure dst_vma is registered in uf << 1386 if (!dst_vma->vm_userfaultfd_ctx.ctx << 1387 dst_vma->vm_userfaultfd_ctx.ctx ! << 1388 return -EINVAL; << 1389 << 1390 /* Only allow moving across anonymous << 1391 if (!vma_is_anonymous(src_vma) || !vm << 1392 return -EINVAL; << 1393 << 1394 return 0; << 1395 } << 1396 << 1397 static __always_inline << 1398 int find_vmas_mm_locked(struct mm_struct *mm, << 1399 unsigned long dst_sta << 1400 unsigned long src_sta << 1401 struct vm_area_struct << 1402 struct vm_area_struct << 1403 { << 1404 struct vm_area_struct *vma; << 1405 << 1406 mmap_assert_locked(mm); << 1407 vma = find_vma_and_prepare_anon(mm, d << 1408 if (IS_ERR(vma)) << 1409 return PTR_ERR(vma); << 1410 << 1411 *dst_vmap = vma; << 1412 /* Skip finding src_vma if src_start << 1413 if (src_start >= vma->vm_start && src << 1414 goto out_success; << 1415 << 1416 vma = vma_lookup(mm, src_start); << 1417 if (!vma) << 1418 return -ENOENT; << 1419 out_success: << 1420 *src_vmap = vma; << 1421 return 0; << 1422 } << 1423 << 1424 #ifdef CONFIG_PER_VMA_LOCK << 1425 static int uffd_move_lock(struct mm_struct *m << 1426 unsigned long dst_s << 1427 unsigned long src_s << 1428 struct vm_area_stru << 1429 struct vm_area_stru << 1430 { << 1431 struct vm_area_struct *vma; << 1432 int err; << 1433 << 1434 vma = uffd_lock_vma(mm, dst_start); << 1435 if (IS_ERR(vma)) << 1436 return PTR_ERR(vma); << 1437 << 1438 *dst_vmap = vma; << 1439 /* << 1440 * Skip finding src_vma if src_start << 1441 * that we don't lock the same vma tw << 1442 */ << 1443 if (src_start >= vma->vm_start && src << 1444 *src_vmap = vma; << 1445 return 0; << 1446 } << 1447 << 1448 /* << 1449 * Using uffd_lock_vma() to get src_v << 1450 * << 1451 * Thread1 << 1452 * ------- << 1453 * vma_start_read(dst_vma) << 1454 * << 1455 * << 1456 * vma_start_read(src_vma) << 1457 * mmap_read_lock(mm) << 1458 * << 1459 */ << 1460 *src_vmap = lock_vma_under_rcu(mm, sr << 1461 if (likely(*src_vmap)) << 
1462 return 0; << 1463 << 1464 /* Undo any locking and retry in mmap << 1465 vma_end_read(*dst_vmap); << 1466 << 1467 mmap_read_lock(mm); << 1468 err = find_vmas_mm_locked(mm, dst_sta << 1469 if (!err) { << 1470 /* << 1471 * See comment in uffd_lock_v << 1472 * vma_start_read() here. << 1473 */ << 1474 down_read(&(*dst_vmap)->vm_lo << 1475 if (*dst_vmap != *src_vmap) << 1476 down_read_nested(&(*s << 1477 SING << 1478 } << 1479 mmap_read_unlock(mm); << 1480 return err; << 1481 } << 1482 << 1483 static void uffd_move_unlock(struct vm_area_s << 1484 struct vm_area_s << 1485 { << 1486 vma_end_read(src_vma); << 1487 if (src_vma != dst_vma) << 1488 vma_end_read(dst_vma); << 1489 } << 1490 << 1491 #else << 1492 << 1493 static int uffd_move_lock(struct mm_struct *m << 1494 unsigned long dst_s << 1495 unsigned long src_s << 1496 struct vm_area_stru << 1497 struct vm_area_stru << 1498 { << 1499 int err; << 1500 << 1501 mmap_read_lock(mm); << 1502 err = find_vmas_mm_locked(mm, dst_sta << 1503 if (err) << 1504 mmap_read_unlock(mm); << 1505 return err; << 1506 } << 1507 << 1508 static void uffd_move_unlock(struct vm_area_s << 1509 struct vm_area_s << 1510 { << 1511 mmap_assert_locked(src_vma->vm_mm); << 1512 mmap_read_unlock(dst_vma->vm_mm); << 1513 } << 1514 #endif << 1515 << 1516 /** << 1517 * move_pages - move arbitrary anonymous page << 1518 * @ctx: pointer to the userfaultfd context << 1519 * @dst_start: start of the destination virtu << 1520 * @src_start: start of the source virtual me << 1521 * @len: length of the virtual memory range << 1522 * @mode: flags from uffdio_move.mode << 1523 * << 1524 * It will either use the mmap_lock in read m << 1525 * << 1526 * move_pages() remaps arbitrary anonymous pa << 1527 * copy. It only works on non shared anonymou << 1528 * be relocated without generating non linear << 1529 * code. << 1530 * << 1531 * It provides a zero copy mechanism to handl << 1532 * The source vma pages should have mapcount << 1533 * enforced by using madvise(MADV_DONTFORK) o << 1534 * << 1535 * The thread receiving the page during the u << 1536 * will receive the faulting page in the sour << 1537 * storage or any other I/O device (MADV_DONT << 1538 * avoids move_pages() to fail with -EBUSY if << 1539 * move_pages() is called), then it will call << 1540 * page in the faulting address in the destin << 1541 * << 1542 * This userfaultfd command works purely via << 1543 * most efficient way to move physical non sh << 1544 * across different virtual addresses. Unlike << 1545 * it does not create any new vmas. The mappi << 1546 * address is atomic. << 1547 * << 1548 * It only works if the vma protection bits a << 1549 * source and destination vma. << 1550 * << 1551 * It can remap non shared anonymous pages wi << 1552 * << 1553 * If the source virtual memory range has any << 1554 * the destination virtual memory range is no << 1555 * move_pages() will fail respectively with - << 1556 * provides a very strict behavior to avoid a << 1557 * corruption going unnoticed if there are us << 1558 * Only one thread should resolve the userlan << 1559 * time for any given faulting address. This << 1560 * try to both call move_pages() on the same << 1561 * same time, the second thread will get an e << 1562 * command. << 1563 * << 1564 * The command retval will return "len" is su << 1565 * however can be interrupted by fatal signal << 1566 * interrupted it will return the number of b << 1567 * remapped before the interruption if any, o << 1568 * none. It will never return zero. 
Either it << 1569 * an amount of bytes successfully moved. If << 1570 * "short" remap, the move_pages() command sh << 1571 * userland with src+retval, dst+reval, len-r << 1572 * about the error that interrupted it. << 1573 * << 1574 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag << 1575 * prevent -ENOENT errors to materialize if t << 1576 * source virtual range that is being remappe << 1577 * accounted as successfully remapped in the << 1578 * command. This is mostly useful to remap hu << 1579 * virtual regions without knowing if there a << 1580 * in the regions or not, but preventing the << 1581 * the hugepmd during the remap. << 1582 * << 1583 * If there's any rmap walk that is taking th << 1584 * first obtaining the folio lock (the only c << 1585 * folio_referenced), they will have to verif << 1586 * has changed after taking the anon_vma lock << 1587 * should release the lock and retry obtainin << 1588 * it means the anon_vma was changed by move_ << 1589 * could be obtained. This is the only additi << 1590 * the rmap code to provide this anonymous pa << 1591 */ << 1592 ssize_t move_pages(struct userfaultfd_ctx *ct << 1593 unsigned long src_start, u << 1594 { << 1595 struct mm_struct *mm = ctx->mm; << 1596 struct vm_area_struct *src_vma, *dst_ << 1597 unsigned long src_addr, dst_addr; << 1598 pmd_t *src_pmd, *dst_pmd; << 1599 long err = -EINVAL; << 1600 ssize_t moved = 0; << 1601 << 1602 /* Sanitize the command parameters. * << 1603 if (WARN_ON_ONCE(src_start & ~PAGE_MA << 1604 WARN_ON_ONCE(dst_start & ~PAGE_MA << 1605 WARN_ON_ONCE(len & ~PAGE_MASK)) << 1606 goto out; << 1607 << 1608 /* Does the address range wrap, or is << 1609 if (WARN_ON_ONCE(src_start + len <= s << 1610 WARN_ON_ONCE(dst_start + len <= d << 1611 goto out; << 1612 << 1613 err = uffd_move_lock(mm, dst_start, s << 1614 if (err) << 1615 goto out; << 1616 << 1617 /* Re-check after taking map_changing << 1618 err = -EAGAIN; << 1619 down_read(&ctx->map_changing_lock); << 1620 if (likely(atomic_read(&ctx->mmap_cha << 1621 goto out_unlock; << 1622 /* << 1623 * Make sure the vma is not shared, t << 1624 * ranges are both valid and fully wi << 1625 * vma. << 1626 */ << 1627 err = -EINVAL; << 1628 if (src_vma->vm_flags & VM_SHARED) << 1629 goto out_unlock; << 1630 if (src_start + len > src_vma->vm_end << 1631 goto out_unlock; << 1632 << 1633 if (dst_vma->vm_flags & VM_SHARED) << 1634 goto out_unlock; << 1635 if (dst_start + len > dst_vma->vm_end << 1636 goto out_unlock; << 1637 << 1638 err = validate_move_areas(ctx, src_vm << 1639 if (err) << 1640 goto out_unlock; << 1641 << 1642 for (src_addr = src_start, dst_addr = << 1643 src_addr < src_start + len;) { << 1644 spinlock_t *ptl; << 1645 pmd_t dst_pmdval; << 1646 unsigned long step_size; << 1647 << 1648 /* << 1649 * Below works because anonym << 1650 * transparent huge PUD. If f << 1651 * that case would need to be << 1652 */ << 1653 src_pmd = mm_find_pmd(mm, src << 1654 if (unlikely(!src_pmd)) { << 1655 if (!(mode & UFFDIO_M << 1656 err = -ENOENT << 1657 break; << 1658 } << 1659 src_pmd = mm_alloc_pm << 1660 if (unlikely(!src_pmd << 1661 err = -ENOMEM << 1662 break; << 1663 } << 1664 } << 1665 dst_pmd = mm_alloc_pmd(mm, ds << 1666 if (unlikely(!dst_pmd)) { << 1667 err = -ENOMEM; << 1668 break; << 1669 } << 1670 << 1671 dst_pmdval = pmdp_get_lockles << 1672 /* << 1673 * If the dst_pmd is mapped a << 1674 * be strict. 

static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;

	vm_flags_reset(vma, flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static void userfaultfd_set_ctx(struct vm_area_struct *vma,
				struct userfaultfd_ctx *ctx,
				unsigned long flags)
{
	vma_start_write(vma);
	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
	userfaultfd_set_vm_flags(vma,
				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
}

void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
	userfaultfd_set_ctx(vma, NULL, 0);
}

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *ret;

	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);

	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
				    vma->vm_flags & ~__VM_UFFD_FLAGS,
				    NULL_VM_UFFD_CTX);

	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	return ret;
}

/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       unsigned long vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async)
{
	VMA_ITERATOR(vmi, ctx->mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	unsigned long vma_end;
	unsigned long new_flags;

	if (vma->vm_start < start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx});
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_ctx(vma, ctx, vm_flags);

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

skip:
		prev = vma;
		start = vma->vm_end;
	}

	return 0;
}

void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* the various vma->vm_userfaultfd_ctx still points to it */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
	}
	mmap_write_unlock(mm);
}

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, 0);

	if (!mmget_not_zero(mm))
		return;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}

		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    vma->vm_start, vma->vm_end);
		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
}
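
/*
 * Illustrative only, not part of the kernel sources: a minimal userland
 * sketch of the registration step that ends up in
 * userfaultfd_register_range() above, and of the release path that ends
 * up in userfaultfd_release_all() when the file descriptor is closed.
 * The helper name register_region() and the region size are made up for
 * the example; the ioctls and structures come from
 * include/uapi/linux/userfaultfd.h.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/syscall.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int register_region(size_t len, void **area_out)
 *	{
 *		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *		struct uffdio_api api = { .api = UFFD_API };
 *		struct uffdio_register reg;
 *		void *area;
 *
 *		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
 *			return -1;
 *		area = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (area == MAP_FAILED)
 *			return -1;
 *		reg.range.start = (unsigned long)area;
 *		reg.range.len = len;
 *		reg.mode = UFFDIO_REGISTER_MODE_MISSING;
 *		if (ioctl(uffd, UFFDIO_REGISTER, &reg))
 *			return -1;
 *		*area_out = area;
 *		// Faults in [area, area + len) now raise uffd events; a later
 *		// close(uffd) undoes the registration for every affected vma.
 *		return uffd;
 *	}
 */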