1 // SPDX-License-Identifier: GPL-2.0-only 1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 2 /* 3 * mm/userfaultfd.c 3 * mm/userfaultfd.c 4 * 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 5 * Copyright (C) 2015 Red Hat, Inc. 6 */ 6 */ 7 7 8 #include <linux/mm.h> 8 #include <linux/mm.h> 9 #include <linux/sched/signal.h> 9 #include <linux/sched/signal.h> 10 #include <linux/pagemap.h> 10 #include <linux/pagemap.h> 11 #include <linux/rmap.h> 11 #include <linux/rmap.h> 12 #include <linux/swap.h> 12 #include <linux/swap.h> 13 #include <linux/swapops.h> 13 #include <linux/swapops.h> 14 #include <linux/userfaultfd_k.h> 14 #include <linux/userfaultfd_k.h> 15 #include <linux/mmu_notifier.h> 15 #include <linux/mmu_notifier.h> 16 #include <linux/hugetlb.h> 16 #include <linux/hugetlb.h> 17 #include <linux/shmem_fs.h> 17 #include <linux/shmem_fs.h> 18 #include <asm/tlbflush.h> 18 #include <asm/tlbflush.h> 19 #include <asm/tlb.h> << 20 #include "internal.h" 19 #include "internal.h" 21 20 22 static __always_inline 21 static __always_inline 23 bool validate_dst_vma(struct vm_area_struct *d !! 22 struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, >> 23 unsigned long dst_start, >> 24 unsigned long len) 24 { 25 { 25 /* Make sure that the dst range is ful !! 26 /* 26 if (dst_end > dst_vma->vm_end) !! 27 * Make sure that the dst range is both valid and fully within a 27 return false; !! 28 * single existing vma. >> 29 */ >> 30 struct vm_area_struct *dst_vma; >> 31 >> 32 dst_vma = find_vma(dst_mm, dst_start); >> 33 if (!dst_vma) >> 34 return NULL; >> 35 >> 36 if (dst_start < dst_vma->vm_start || >> 37 dst_start + len > dst_vma->vm_end) >> 38 return NULL; 28 39 29 /* 40 /* 30 * Check the vma is registered in uffd 41 * Check the vma is registered in uffd, this is required to 31 * enforce the VM_MAYWRITE check done 42 * enforce the VM_MAYWRITE check done at uffd registration 32 * time. 43 * time. 33 */ 44 */ 34 if (!dst_vma->vm_userfaultfd_ctx.ctx) 45 if (!dst_vma->vm_userfaultfd_ctx.ctx) 35 return false; !! 46 return NULL; 36 << 37 return true; << 38 } << 39 << 40 static __always_inline << 41 struct vm_area_struct *find_vma_and_prepare_an << 42 << 43 { << 44 struct vm_area_struct *vma; << 45 << 46 mmap_assert_locked(mm); << 47 vma = vma_lookup(mm, addr); << 48 if (!vma) << 49 vma = ERR_PTR(-ENOENT); << 50 else if (!(vma->vm_flags & VM_SHARED) << 51 unlikely(anon_vma_prepare(vma << 52 vma = ERR_PTR(-ENOMEM); << 53 << 54 return vma; << 55 } << 56 << 57 #ifdef CONFIG_PER_VMA_LOCK << 58 /* << 59 * uffd_lock_vma() - Lookup and lock vma corre << 60 * @mm: mm to search vma in. << 61 * @address: address that the vma should conta << 62 * << 63 * Should be called without holding mmap_lock. << 64 * << 65 * Return: A locked vma containing @address, - << 66 * -ENOMEM if anon_vma couldn't be allocated. << 67 */ << 68 static struct vm_area_struct *uffd_lock_vma(st << 69 unsigne << 70 { << 71 struct vm_area_struct *vma; << 72 << 73 vma = lock_vma_under_rcu(mm, address); << 74 if (vma) { << 75 /* << 76 * We know we're going to need << 77 * that early. 
<< 78 */ << 79 if (!(vma->vm_flags & VM_SHARE << 80 vma_end_read(vma); << 81 else << 82 return vma; << 83 } << 84 << 85 mmap_read_lock(mm); << 86 vma = find_vma_and_prepare_anon(mm, ad << 87 if (!IS_ERR(vma)) { << 88 /* << 89 * We cannot use vma_start_rea << 90 * false locked (see comment i << 91 * can avoid that by directly << 92 * mmap_lock, which guarantees << 93 * vma for write (vma_start_wr << 94 */ << 95 down_read(&vma->vm_lock->lock) << 96 } << 97 << 98 mmap_read_unlock(mm); << 99 return vma; << 100 } << 101 << 102 static struct vm_area_struct *uffd_mfill_lock( << 103 << 104 << 105 { << 106 struct vm_area_struct *dst_vma; << 107 << 108 dst_vma = uffd_lock_vma(dst_mm, dst_st << 109 if (IS_ERR(dst_vma) || validate_dst_vm << 110 return dst_vma; << 111 << 112 vma_end_read(dst_vma); << 113 return ERR_PTR(-ENOENT); << 114 } << 115 << 116 static void uffd_mfill_unlock(struct vm_area_s << 117 { << 118 vma_end_read(vma); << 119 } << 120 << 121 #else << 122 << 123 static struct vm_area_struct *uffd_mfill_lock( << 124 << 125 << 126 { << 127 struct vm_area_struct *dst_vma; << 128 << 129 mmap_read_lock(dst_mm); << 130 dst_vma = find_vma_and_prepare_anon(ds << 131 if (IS_ERR(dst_vma)) << 132 goto out_unlock; << 133 << 134 if (validate_dst_vma(dst_vma, dst_star << 135 return dst_vma; << 136 47 137 dst_vma = ERR_PTR(-ENOENT); << 138 out_unlock: << 139 mmap_read_unlock(dst_mm); << 140 return dst_vma; 48 return dst_vma; 141 } 49 } 142 50 143 static void uffd_mfill_unlock(struct vm_area_s !! 51 static int mcopy_atomic_pte(struct mm_struct *dst_mm, 144 { !! 52 pmd_t *dst_pmd, 145 mmap_read_unlock(vma->vm_mm); !! 53 struct vm_area_struct *dst_vma, 146 } !! 54 unsigned long dst_addr, 147 #endif !! 55 unsigned long src_addr, 148 !! 56 struct page **pagep, 149 /* Check if dst_addr is outside of file's size !! 57 bool wp_copy) 150 static bool mfill_file_over_size(struct vm_are << 151 unsigned long << 152 { << 153 struct inode *inode; << 154 pgoff_t offset, max_off; << 155 << 156 if (!dst_vma->vm_file) << 157 return false; << 158 << 159 inode = dst_vma->vm_file->f_inode; << 160 offset = linear_page_index(dst_vma, ds << 161 max_off = DIV_ROUND_UP(i_size_read(ino << 162 return offset >= max_off; << 163 } << 164 << 165 /* << 166 * Install PTEs, to map dst_addr (within dst_v << 167 * << 168 * This function handles both MCOPY_ATOMIC_NOR << 169 * and anon, and for both shared and private V << 170 */ << 171 int mfill_atomic_install_pte(pmd_t *dst_pmd, << 172 struct vm_area_st << 173 unsigned long dst << 174 bool newly_alloca << 175 { 58 { 176 int ret; << 177 struct mm_struct *dst_mm = dst_vma->vm << 178 pte_t _dst_pte, *dst_pte; 59 pte_t _dst_pte, *dst_pte; 179 bool writable = dst_vma->vm_flags & VM << 180 bool vm_shared = dst_vma->vm_flags & V << 181 spinlock_t *ptl; 60 spinlock_t *ptl; 182 struct folio *folio = page_folio(page) !! 
61 void *page_kaddr; 183 bool page_in_cache = folio_mapping(fol << 184 << 185 _dst_pte = mk_pte(page, dst_vma->vm_pa << 186 _dst_pte = pte_mkdirty(_dst_pte); << 187 if (page_in_cache && !vm_shared) << 188 writable = false; << 189 if (writable) << 190 _dst_pte = pte_mkwrite(_dst_pt << 191 if (flags & MFILL_ATOMIC_WP) << 192 _dst_pte = pte_mkuffd_wp(_dst_ << 193 << 194 ret = -EAGAIN; << 195 dst_pte = pte_offset_map_lock(dst_mm, << 196 if (!dst_pte) << 197 goto out; << 198 << 199 if (mfill_file_over_size(dst_vma, dst_ << 200 ret = -EFAULT; << 201 goto out_unlock; << 202 } << 203 << 204 ret = -EEXIST; << 205 /* << 206 * We allow to overwrite a pte marker: << 207 * registered, we firstly wr-protect a << 208 * page backing it, then access the pa << 209 */ << 210 if (!pte_none_mostly(ptep_get(dst_pte) << 211 goto out_unlock; << 212 << 213 if (page_in_cache) { << 214 /* Usually, cache pages are al << 215 if (newly_allocated) << 216 folio_add_lru(folio); << 217 folio_add_file_rmap_pte(folio, << 218 } else { << 219 folio_add_new_anon_rmap(folio, << 220 folio_add_lru_vma(folio, dst_v << 221 } << 222 << 223 /* << 224 * Must happen after rmap, as mm_count << 225 * PageAnon()), which is set by __page << 226 */ << 227 inc_mm_counter(dst_mm, mm_counter(foli << 228 << 229 set_pte_at(dst_mm, dst_addr, dst_pte, << 230 << 231 /* No need to invalidate - it was non- << 232 update_mmu_cache(dst_vma, dst_addr, ds << 233 ret = 0; << 234 out_unlock: << 235 pte_unmap_unlock(dst_pte, ptl); << 236 out: << 237 return ret; << 238 } << 239 << 240 static int mfill_atomic_pte_copy(pmd_t *dst_pm << 241 struct vm_are << 242 unsigned long << 243 unsigned long << 244 uffd_flags_t << 245 struct folio << 246 { << 247 void *kaddr; << 248 int ret; 62 int ret; 249 struct folio *folio; !! 63 struct page *page; >> 64 pgoff_t offset, max_off; >> 65 struct inode *inode; 250 66 251 if (!*foliop) { !! 67 if (!*pagep) { 252 ret = -ENOMEM; 68 ret = -ENOMEM; 253 folio = vma_alloc_folio(GFP_HI !! 69 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); 254 dst_ad !! 70 if (!page) 255 if (!folio) << 256 goto out; 71 goto out; 257 72 258 kaddr = kmap_local_folio(folio !! 73 page_kaddr = kmap_atomic(page); 259 /* !! 74 ret = copy_from_user(page_kaddr, 260 * The read mmap_lock is held !! 75 (const void __user *) src_addr, 261 * mmap_lock being read recurs << 262 * possible if a writer has ta << 263 * << 264 * process A thread 1 takes re << 265 * process A thread 2 calls mm << 266 * process B thread 1 takes pa << 267 * process B thread 2 calls mm << 268 * process A thread 1 blocks t << 269 * process B thread 1 blocks t << 270 * << 271 * Disable page faults to prev << 272 * and retry the copy outside << 273 */ << 274 pagefault_disable(); << 275 ret = copy_from_user(kaddr, (c << 276 PAGE_SIZE 76 PAGE_SIZE); 277 pagefault_enable(); !! 77 kunmap_atomic(page_kaddr); 278 kunmap_local(kaddr); << 279 78 280 /* fallback to copy_from_user 79 /* fallback to copy_from_user outside mmap_lock */ 281 if (unlikely(ret)) { 80 if (unlikely(ret)) { 282 ret = -ENOENT; 81 ret = -ENOENT; 283 *foliop = folio; !! 82 *pagep = page; 284 /* don't free the page 83 /* don't free the page */ 285 goto out; 84 goto out; 286 } 85 } 287 86 288 flush_dcache_folio(folio); !! 87 flush_dcache_page(page); 289 } else { 88 } else { 290 folio = *foliop; !! 89 page = *pagep; 291 *foliop = NULL; !! 90 *pagep = NULL; 292 } 91 } 293 92 294 /* 93 /* 295 * The memory barrier inside __folio_m !! 
94 * The memory barrier inside __SetPageUptodate makes sure that 296 * preceding stores to the page conten 95 * preceding stores to the page contents become visible before 297 * the set_pte_at() write. 96 * the set_pte_at() write. 298 */ 97 */ 299 __folio_mark_uptodate(folio); !! 98 __SetPageUptodate(page); 300 99 301 ret = -ENOMEM; 100 ret = -ENOMEM; 302 if (mem_cgroup_charge(folio, dst_vma-> !! 101 if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) 303 goto out_release; << 304 << 305 ret = mfill_atomic_install_pte(dst_pmd << 306 &folio- << 307 if (ret) << 308 goto out_release; 102 goto out_release; 309 out: << 310 return ret; << 311 out_release: << 312 folio_put(folio); << 313 goto out; << 314 } << 315 << 316 static int mfill_atomic_pte_zeroed_folio(pmd_t << 317 struc << 318 unsig << 319 { << 320 struct folio *folio; << 321 int ret = -ENOMEM; << 322 << 323 folio = vma_alloc_zeroed_movable_folio << 324 if (!folio) << 325 return ret; << 326 103 327 if (mem_cgroup_charge(folio, dst_vma-> !! 104 _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); 328 goto out_put; !! 105 if (dst_vma->vm_flags & VM_WRITE) { 329 !! 106 if (wp_copy) 330 /* !! 107 _dst_pte = pte_mkuffd_wp(_dst_pte); 331 * The memory barrier inside __folio_m !! 108 else 332 * zeroing out the folio become visibl !! 109 _dst_pte = pte_mkwrite(_dst_pte); 333 * using set_pte_at(). See do_anonymou !! 110 } 334 */ << 335 __folio_mark_uptodate(folio); << 336 << 337 ret = mfill_atomic_install_pte(dst_pmd << 338 &folio- << 339 if (ret) << 340 goto out_put; << 341 << 342 return 0; << 343 out_put: << 344 folio_put(folio); << 345 return ret; << 346 } << 347 << 348 static int mfill_atomic_pte_zeropage(pmd_t *ds << 349 struct vm << 350 unsigned << 351 { << 352 pte_t _dst_pte, *dst_pte; << 353 spinlock_t *ptl; << 354 int ret; << 355 << 356 if (mm_forbids_zeropage(dst_vma->vm_mm << 357 return mfill_atomic_pte_zeroed << 358 111 359 _dst_pte = pte_mkspecial(pfn_pte(my_ze !! 112 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 360 dst_v !! 113 if (dst_vma->vm_file) { 361 ret = -EAGAIN; !! 114 /* the shmem MAP_PRIVATE case requires checking the i_size */ 362 dst_pte = pte_offset_map_lock(dst_vma- !! 115 inode = dst_vma->vm_file->f_inode; 363 if (!dst_pte) !! 116 offset = linear_page_index(dst_vma, dst_addr); 364 goto out; !! 117 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 365 if (mfill_file_over_size(dst_vma, dst_ << 366 ret = -EFAULT; 118 ret = -EFAULT; 367 goto out_unlock; !! 119 if (unlikely(offset >= max_off)) >> 120 goto out_release_uncharge_unlock; 368 } 121 } 369 ret = -EEXIST; 122 ret = -EEXIST; 370 if (!pte_none(ptep_get(dst_pte))) !! 123 if (!pte_none(*dst_pte)) 371 goto out_unlock; !! 124 goto out_release_uncharge_unlock; 372 set_pte_at(dst_vma->vm_mm, dst_addr, d << 373 /* No need to invalidate - it was non- << 374 update_mmu_cache(dst_vma, dst_addr, ds << 375 ret = 0; << 376 out_unlock: << 377 pte_unmap_unlock(dst_pte, ptl); << 378 out: << 379 return ret; << 380 } << 381 125 382 /* Handles UFFDIO_CONTINUE for all shmem VMAs !! 126 inc_mm_counter(dst_mm, MM_ANONPAGES); 383 static int mfill_atomic_pte_continue(pmd_t *ds !! 127 page_add_new_anon_rmap(page, dst_vma, dst_addr, false); 384 struct vm !! 
128 lru_cache_add_inactive_or_unevictable(page, dst_vma); 385 unsigned << 386 uffd_flag << 387 { << 388 struct inode *inode = file_inode(dst_v << 389 pgoff_t pgoff = linear_page_index(dst_ << 390 struct folio *folio; << 391 struct page *page; << 392 int ret; << 393 129 394 ret = shmem_get_folio(inode, pgoff, 0, !! 130 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 395 /* Our caller expects us to return -EF << 396 if (ret == -ENOENT) << 397 ret = -EFAULT; << 398 if (ret) << 399 goto out; << 400 if (!folio) { << 401 ret = -EFAULT; << 402 goto out; << 403 } << 404 << 405 page = folio_file_page(folio, pgoff); << 406 if (PageHWPoison(page)) { << 407 ret = -EIO; << 408 goto out_release; << 409 } << 410 131 411 ret = mfill_atomic_install_pte(dst_pmd !! 132 /* No need to invalidate - it was non-present before */ 412 page, f !! 133 update_mmu_cache(dst_vma, dst_addr, dst_pte); 413 if (ret) << 414 goto out_release; << 415 134 416 folio_unlock(folio); !! 135 pte_unmap_unlock(dst_pte, ptl); 417 ret = 0; 136 ret = 0; 418 out: 137 out: 419 return ret; 138 return ret; >> 139 out_release_uncharge_unlock: >> 140 pte_unmap_unlock(dst_pte, ptl); 420 out_release: 141 out_release: 421 folio_unlock(folio); !! 142 put_page(page); 422 folio_put(folio); << 423 goto out; 143 goto out; 424 } 144 } 425 145 426 /* Handles UFFDIO_POISON for all non-hugetlb V !! 146 static int mfill_zeropage_pte(struct mm_struct *dst_mm, 427 static int mfill_atomic_pte_poison(pmd_t *dst_ !! 147 pmd_t *dst_pmd, 428 struct vm_a !! 148 struct vm_area_struct *dst_vma, 429 unsigned lo !! 149 unsigned long dst_addr) 430 uffd_flags_ << 431 { 150 { 432 int ret; << 433 struct mm_struct *dst_mm = dst_vma->vm << 434 pte_t _dst_pte, *dst_pte; 151 pte_t _dst_pte, *dst_pte; 435 spinlock_t *ptl; 152 spinlock_t *ptl; >> 153 int ret; >> 154 pgoff_t offset, max_off; >> 155 struct inode *inode; 436 156 437 _dst_pte = make_pte_marker(PTE_MARKER_ !! 157 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), 438 ret = -EAGAIN; !! 158 dst_vma->vm_page_prot)); 439 dst_pte = pte_offset_map_lock(dst_mm, 159 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 440 if (!dst_pte) !! 160 if (dst_vma->vm_file) { 441 goto out; !! 161 /* the shmem MAP_PRIVATE case requires checking the i_size */ 442 !! 162 inode = dst_vma->vm_file->f_inode; 443 if (mfill_file_over_size(dst_vma, dst_ !! 163 offset = linear_page_index(dst_vma, dst_addr); >> 164 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 444 ret = -EFAULT; 165 ret = -EFAULT; 445 goto out_unlock; !! 166 if (unlikely(offset >= max_off)) >> 167 goto out_unlock; 446 } 168 } 447 << 448 ret = -EEXIST; 169 ret = -EEXIST; 449 /* Refuse to overwrite any PTE, even a !! 
170 if (!pte_none(*dst_pte)) 450 if (!pte_none(ptep_get(dst_pte))) << 451 goto out_unlock; 171 goto out_unlock; 452 << 453 set_pte_at(dst_mm, dst_addr, dst_pte, 172 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 454 << 455 /* No need to invalidate - it was non- 173 /* No need to invalidate - it was non-present before */ 456 update_mmu_cache(dst_vma, dst_addr, ds 174 update_mmu_cache(dst_vma, dst_addr, dst_pte); 457 ret = 0; 175 ret = 0; 458 out_unlock: 176 out_unlock: 459 pte_unmap_unlock(dst_pte, ptl); 177 pte_unmap_unlock(dst_pte, ptl); 460 out: << 461 return ret; 178 return ret; 462 } 179 } 463 180 464 static pmd_t *mm_alloc_pmd(struct mm_struct *m 181 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) 465 { 182 { 466 pgd_t *pgd; 183 pgd_t *pgd; 467 p4d_t *p4d; 184 p4d_t *p4d; 468 pud_t *pud; 185 pud_t *pud; 469 186 470 pgd = pgd_offset(mm, address); 187 pgd = pgd_offset(mm, address); 471 p4d = p4d_alloc(mm, pgd, address); 188 p4d = p4d_alloc(mm, pgd, address); 472 if (!p4d) 189 if (!p4d) 473 return NULL; 190 return NULL; 474 pud = pud_alloc(mm, p4d, address); 191 pud = pud_alloc(mm, p4d, address); 475 if (!pud) 192 if (!pud) 476 return NULL; 193 return NULL; 477 /* 194 /* 478 * Note that we didn't run this becaus 195 * Note that we didn't run this because the pmd was 479 * missing, the *pmd may be already es 196 * missing, the *pmd may be already established and in 480 * turn it may also be a trans_huge_pm 197 * turn it may also be a trans_huge_pmd. 481 */ 198 */ 482 return pmd_alloc(mm, pud, address); 199 return pmd_alloc(mm, pud, address); 483 } 200 } 484 201 485 #ifdef CONFIG_HUGETLB_PAGE 202 #ifdef CONFIG_HUGETLB_PAGE 486 /* 203 /* 487 * mfill_atomic processing for HUGETLB vmas. !! 204 * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is 488 * called with either vma-lock or mmap_lock he !! 205 * called with mmap_lock held, it will release mmap_lock before returning. 489 * before returning. << 490 */ 206 */ 491 static __always_inline ssize_t mfill_atomic_hu !! 207 static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, 492 << 493 208 struct vm_area_struct *dst_vma, 494 209 unsigned long dst_start, 495 210 unsigned long src_start, 496 211 unsigned long len, 497 !! 212 bool *mmap_changing, >> 213 bool zeropage) 498 { 214 { 499 struct mm_struct *dst_mm = dst_vma->vm !! 215 int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; >> 216 int vm_shared = dst_vma->vm_flags & VM_SHARED; 500 ssize_t err; 217 ssize_t err; 501 pte_t *dst_pte; 218 pte_t *dst_pte; 502 unsigned long src_addr, dst_addr; 219 unsigned long src_addr, dst_addr; 503 long copied; 220 long copied; 504 struct folio *folio; !! 221 struct page *page; 505 unsigned long vma_hpagesize; 222 unsigned long vma_hpagesize; 506 pgoff_t idx; 223 pgoff_t idx; 507 u32 hash; 224 u32 hash; 508 struct address_space *mapping; 225 struct address_space *mapping; 509 226 510 /* 227 /* 511 * There is no default zero huge page 228 * There is no default zero huge page for all huge page sizes as 512 * supported by hugetlb. A PMD_SIZE h 229 * supported by hugetlb. A PMD_SIZE huge pages may exist as used 513 * by THP. Since we can not reliably 230 * by THP. Since we can not reliably insert a zero page, this 514 * feature is not supported. 231 * feature is not supported. 515 */ 232 */ 516 if (uffd_flags_mode_is(flags, MFILL_AT !! 233 if (zeropage) { 517 up_read(&ctx->map_changing_loc !! 
234 mmap_read_unlock(dst_mm); 518 uffd_mfill_unlock(dst_vma); << 519 return -EINVAL; 235 return -EINVAL; 520 } 236 } 521 237 522 src_addr = src_start; 238 src_addr = src_start; 523 dst_addr = dst_start; 239 dst_addr = dst_start; 524 copied = 0; 240 copied = 0; 525 folio = NULL; !! 241 page = NULL; 526 vma_hpagesize = vma_kernel_pagesize(ds 242 vma_hpagesize = vma_kernel_pagesize(dst_vma); 527 243 528 /* 244 /* 529 * Validate alignment based on huge pa 245 * Validate alignment based on huge page size 530 */ 246 */ 531 err = -EINVAL; 247 err = -EINVAL; 532 if (dst_start & (vma_hpagesize - 1) || 248 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) 533 goto out_unlock; 249 goto out_unlock; 534 250 535 retry: 251 retry: 536 /* 252 /* 537 * On routine entry dst_vma is set. I 253 * On routine entry dst_vma is set. If we had to drop mmap_lock and 538 * retry, dst_vma will be set to NULL 254 * retry, dst_vma will be set to NULL and we must lookup again. 539 */ 255 */ 540 if (!dst_vma) { 256 if (!dst_vma) { 541 dst_vma = uffd_mfill_lock(dst_ << 542 if (IS_ERR(dst_vma)) { << 543 err = PTR_ERR(dst_vma) << 544 goto out; << 545 } << 546 << 547 err = -ENOENT; 257 err = -ENOENT; 548 if (!is_vm_hugetlb_page(dst_vm !! 258 dst_vma = find_dst_vma(dst_mm, dst_start, len); 549 goto out_unlock_vma; !! 259 if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) >> 260 goto out_unlock; 550 261 551 err = -EINVAL; 262 err = -EINVAL; 552 if (vma_hpagesize != vma_kerne 263 if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) 553 goto out_unlock_vma; !! 264 goto out_unlock; 554 265 555 /* !! 266 vm_shared = dst_vma->vm_flags & VM_SHARED; 556 * If memory mappings are chan !! 267 } 557 * operation (e.g. mremap) run !! 268 558 * request the user to retry l !! 269 /* 559 */ !! 270 * If not shared, ensure the dst_vma has a anon_vma. 560 down_read(&ctx->map_changing_l !! 271 */ 561 err = -EAGAIN; !! 272 err = -ENOMEM; 562 if (atomic_read(&ctx->mmap_cha !! 273 if (!vm_shared) { >> 274 if (unlikely(anon_vma_prepare(dst_vma))) 563 goto out_unlock; 275 goto out_unlock; 564 } 276 } 565 277 566 while (src_addr < src_start + len) { 278 while (src_addr < src_start + len) { >> 279 pte_t dst_pteval; >> 280 567 BUG_ON(dst_addr >= dst_start + 281 BUG_ON(dst_addr >= dst_start + len); 568 282 569 /* 283 /* 570 * Serialize via vma_lock and !! 284 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. 571 * vma_lock ensures the dst_pt !! 285 * i_mmap_rwsem ensures the dst_pte remains valid even 572 * in the case of shared pmds. 286 * in the case of shared pmds. fault mutex prevents 573 * races with other faulting t 287 * races with other faulting threads. 574 */ 288 */ 575 idx = linear_page_index(dst_vm << 576 mapping = dst_vma->vm_file->f_ 289 mapping = dst_vma->vm_file->f_mapping; >> 290 i_mmap_lock_read(mapping); >> 291 idx = linear_page_index(dst_vma, dst_addr); 577 hash = hugetlb_fault_mutex_has 292 hash = hugetlb_fault_mutex_hash(mapping, idx); 578 mutex_lock(&hugetlb_fault_mute 293 mutex_lock(&hugetlb_fault_mutex_table[hash]); 579 hugetlb_vma_lock_read(dst_vma) << 580 294 581 err = -ENOMEM; 295 err = -ENOMEM; 582 dst_pte = huge_pte_alloc(dst_m !! 296 dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize); 583 if (!dst_pte) { 297 if (!dst_pte) { 584 hugetlb_vma_unlock_rea << 585 mutex_unlock(&hugetlb_ 298 mutex_unlock(&hugetlb_fault_mutex_table[hash]); >> 299 i_mmap_unlock_read(mapping); 586 goto out_unlock; 300 goto out_unlock; 587 } 301 } 588 302 589 if (!uffd_flags_mode_is(flags, !! 
303 err = -EEXIST; 590 !huge_pte_none_mostly(huge !! 304 dst_pteval = huge_ptep_get(dst_pte); 591 err = -EEXIST; !! 305 if (!huge_pte_none(dst_pteval)) { 592 hugetlb_vma_unlock_rea << 593 mutex_unlock(&hugetlb_ 306 mutex_unlock(&hugetlb_fault_mutex_table[hash]); >> 307 i_mmap_unlock_read(mapping); 594 goto out_unlock; 308 goto out_unlock; 595 } 309 } 596 310 597 err = hugetlb_mfill_atomic_pte !! 311 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, 598 !! 312 dst_addr, src_addr, &page); 599 313 600 hugetlb_vma_unlock_read(dst_vm << 601 mutex_unlock(&hugetlb_fault_mu 314 mutex_unlock(&hugetlb_fault_mutex_table[hash]); >> 315 i_mmap_unlock_read(mapping); >> 316 vm_alloc_shared = vm_shared; 602 317 603 cond_resched(); 318 cond_resched(); 604 319 605 if (unlikely(err == -ENOENT)) 320 if (unlikely(err == -ENOENT)) { 606 up_read(&ctx->map_chan !! 321 mmap_read_unlock(dst_mm); 607 uffd_mfill_unlock(dst_ !! 322 BUG_ON(!page); 608 BUG_ON(!folio); << 609 323 610 err = copy_folio_from_ !! 324 err = copy_huge_page_from_user(page, 611 !! 325 (const void __user *)src_addr, >> 326 vma_hpagesize / PAGE_SIZE, >> 327 true); 612 if (unlikely(err)) { 328 if (unlikely(err)) { 613 err = -EFAULT; 329 err = -EFAULT; 614 goto out; 330 goto out; 615 } 331 } >> 332 mmap_read_lock(dst_mm); >> 333 /* >> 334 * If memory mappings are changing because of non-cooperative >> 335 * operation (e.g. mremap) running in parallel, bail out and >> 336 * request the user to retry later >> 337 */ >> 338 if (mmap_changing && READ_ONCE(*mmap_changing)) { >> 339 err = -EAGAIN; >> 340 break; >> 341 } 616 342 617 dst_vma = NULL; 343 dst_vma = NULL; 618 goto retry; 344 goto retry; 619 } else 345 } else 620 BUG_ON(folio); !! 346 BUG_ON(page); 621 347 622 if (!err) { 348 if (!err) { 623 dst_addr += vma_hpages 349 dst_addr += vma_hpagesize; 624 src_addr += vma_hpages 350 src_addr += vma_hpagesize; 625 copied += vma_hpagesiz 351 copied += vma_hpagesize; 626 352 627 if (fatal_signal_pendi 353 if (fatal_signal_pending(current)) 628 err = -EINTR; 354 err = -EINTR; 629 } 355 } 630 if (err) 356 if (err) 631 break; 357 break; 632 } 358 } 633 359 634 out_unlock: 360 out_unlock: 635 up_read(&ctx->map_changing_lock); !! 361 mmap_read_unlock(dst_mm); 636 out_unlock_vma: << 637 uffd_mfill_unlock(dst_vma); << 638 out: 362 out: 639 if (folio) !! 363 if (page) { 640 folio_put(folio); !! 364 /* >> 365 * We encountered an error and are about to free a newly >> 366 * allocated huge page. >> 367 * >> 368 * Reservation handling is very subtle, and is different for >> 369 * private and shared mappings. See the routine >> 370 * restore_reserve_on_error for details. Unfortunately, we >> 371 * can not call restore_reserve_on_error now as it would >> 372 * require holding mmap_lock. >> 373 * >> 374 * If a reservation for the page existed in the reservation >> 375 * map of a private mapping, the map was modified to indicate >> 376 * the reservation was consumed when the page was allocated. >> 377 * We clear the PagePrivate flag now so that the global >> 378 * reserve count will not be incremented in free_huge_page. >> 379 * The reservation map will still indicate the reservation >> 380 * was consumed and possibly prevent later page allocation. >> 381 * This is better than leaking a global reservation. If no >> 382 * reservation existed, it is still safe to clear PagePrivate >> 383 * as no adjustments to reservation counts were made during >> 384 * allocation. 
>> 385 * >> 386 * The reservation map for shared mappings indicates which >> 387 * pages have reservations. When a huge page is allocated >> 388 * for an address with a reservation, no change is made to >> 389 * the reserve map. In this case PagePrivate will be set >> 390 * to indicate that the global reservation count should be >> 391 * incremented when the page is freed. This is the desired >> 392 * behavior. However, when a huge page is allocated for an >> 393 * address without a reservation a reservation entry is added >> 394 * to the reservation map, and PagePrivate will not be set. >> 395 * When the page is freed, the global reserve count will NOT >> 396 * be incremented and it will appear as though we have leaked >> 397 * reserved page. In this case, set PagePrivate so that the >> 398 * global reserve count will be incremented to match the >> 399 * reservation map entry which was created. >> 400 * >> 401 * Note that vm_alloc_shared is based on the flags of the vma >> 402 * for which the page was originally allocated. dst_vma could >> 403 * be different or NULL on error. >> 404 */ >> 405 if (vm_alloc_shared) >> 406 SetPagePrivate(page); >> 407 else >> 408 ClearPagePrivate(page); >> 409 put_page(page); >> 410 } 641 BUG_ON(copied < 0); 411 BUG_ON(copied < 0); 642 BUG_ON(err > 0); 412 BUG_ON(err > 0); 643 BUG_ON(!copied && !err); 413 BUG_ON(!copied && !err); 644 return copied ? copied : err; 414 return copied ? copied : err; 645 } 415 } 646 #else /* !CONFIG_HUGETLB_PAGE */ 416 #else /* !CONFIG_HUGETLB_PAGE */ 647 /* fail at build time if gcc attempts to use t 417 /* fail at build time if gcc attempts to use this */ 648 extern ssize_t mfill_atomic_hugetlb(struct use !! 418 extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, 649 struct vm_ !! 419 struct vm_area_struct *dst_vma, 650 unsigned l !! 420 unsigned long dst_start, 651 unsigned l !! 421 unsigned long src_start, 652 unsigned l !! 422 unsigned long len, 653 uffd_flags !! 423 bool *mmap_changing, >> 424 bool zeropage); 654 #endif /* CONFIG_HUGETLB_PAGE */ 425 #endif /* CONFIG_HUGETLB_PAGE */ 655 426 656 static __always_inline ssize_t mfill_atomic_pt !! 427 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, >> 428 pmd_t *dst_pmd, 657 429 struct vm_area_struct *dst_vma, 658 430 unsigned long dst_addr, 659 431 unsigned long src_addr, 660 !! 432 struct page **page, 661 !! 433 bool zeropage, >> 434 bool wp_copy) 662 { 435 { 663 ssize_t err; 436 ssize_t err; 664 437 665 if (uffd_flags_mode_is(flags, MFILL_AT << 666 return mfill_atomic_pte_contin << 667 << 668 } else if (uffd_flags_mode_is(flags, M << 669 return mfill_atomic_pte_poison << 670 << 671 } << 672 << 673 /* 438 /* 674 * The normal page fault path for a sh 439 * The normal page fault path for a shmem will invoke the 675 * fault, fill the hole in the file an 440 * fault, fill the hole in the file and COW it right away. The 676 * result generates plain anonymous me 441 * result generates plain anonymous memory. So when we are 677 * asked to fill an hole in a MAP_PRIV 442 * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll 678 * generate anonymous memory directly 443 * generate anonymous memory directly without actually filling 679 * the hole. For the MAP_PRIVATE case 444 * the hole. For the MAP_PRIVATE case the robustness check 680 * only happens in the pagetable (to v 445 * only happens in the pagetable (to verify it's still none) 681 * and not in the radix tree. 446 * and not in the radix tree. 
682 */ 447 */ 683 if (!(dst_vma->vm_flags & VM_SHARED)) 448 if (!(dst_vma->vm_flags & VM_SHARED)) { 684 if (uffd_flags_mode_is(flags, !! 449 if (!zeropage) 685 err = mfill_atomic_pte !! 450 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, 686 !! 451 dst_addr, src_addr, page, 687 !! 452 wp_copy); 688 else 453 else 689 err = mfill_atomic_pte !! 454 err = mfill_zeropage_pte(dst_mm, dst_pmd, 690 455 dst_vma, dst_addr); 691 } else { 456 } else { 692 err = shmem_mfill_atomic_pte(d !! 457 VM_WARN_ON_ONCE(wp_copy); 693 d !! 458 if (!zeropage) 694 f !! 459 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, >> 460 dst_vma, dst_addr, >> 461 src_addr, page); >> 462 else >> 463 err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, >> 464 dst_vma, dst_addr); 695 } 465 } 696 466 697 return err; 467 return err; 698 } 468 } 699 469 700 static __always_inline ssize_t mfill_atomic(st !! 470 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, 701 un !! 471 unsigned long dst_start, 702 un !! 472 unsigned long src_start, 703 un !! 473 unsigned long len, 704 uf !! 474 bool zeropage, >> 475 bool *mmap_changing, >> 476 __u64 mode) 705 { 477 { 706 struct mm_struct *dst_mm = ctx->mm; << 707 struct vm_area_struct *dst_vma; 478 struct vm_area_struct *dst_vma; 708 ssize_t err; 479 ssize_t err; 709 pmd_t *dst_pmd; 480 pmd_t *dst_pmd; 710 unsigned long src_addr, dst_addr; 481 unsigned long src_addr, dst_addr; 711 long copied; 482 long copied; 712 struct folio *folio; !! 483 struct page *page; >> 484 bool wp_copy; 713 485 714 /* 486 /* 715 * Sanitize the command parameters: 487 * Sanitize the command parameters: 716 */ 488 */ 717 BUG_ON(dst_start & ~PAGE_MASK); 489 BUG_ON(dst_start & ~PAGE_MASK); 718 BUG_ON(len & ~PAGE_MASK); 490 BUG_ON(len & ~PAGE_MASK); 719 491 720 /* Does the address range wrap, or is 492 /* Does the address range wrap, or is the span zero-sized? */ 721 BUG_ON(src_start + len <= src_start); 493 BUG_ON(src_start + len <= src_start); 722 BUG_ON(dst_start + len <= dst_start); 494 BUG_ON(dst_start + len <= dst_start); 723 495 724 src_addr = src_start; 496 src_addr = src_start; 725 dst_addr = dst_start; 497 dst_addr = dst_start; 726 copied = 0; 498 copied = 0; 727 folio = NULL; !! 499 page = NULL; 728 retry: 500 retry: 729 /* !! 501 mmap_read_lock(dst_mm); 730 * Make sure the vma is not shared, th << 731 * both valid and fully within a singl << 732 */ << 733 dst_vma = uffd_mfill_lock(dst_mm, dst_ << 734 if (IS_ERR(dst_vma)) { << 735 err = PTR_ERR(dst_vma); << 736 goto out; << 737 } << 738 502 739 /* 503 /* 740 * If memory mappings are changing bec 504 * If memory mappings are changing because of non-cooperative 741 * operation (e.g. mremap) running in 505 * operation (e.g. mremap) running in parallel, bail out and 742 * request the user to retry later 506 * request the user to retry later 743 */ 507 */ 744 down_read(&ctx->map_changing_lock); << 745 err = -EAGAIN; 508 err = -EAGAIN; 746 if (atomic_read(&ctx->mmap_changing)) !! 509 if (mmap_changing && READ_ONCE(*mmap_changing)) >> 510 goto out_unlock; >> 511 >> 512 /* >> 513 * Make sure the vma is not shared, that the dst range is >> 514 * both valid and fully within a single existing vma. 
>> 515 */ >> 516 err = -ENOENT; >> 517 dst_vma = find_dst_vma(dst_mm, dst_start, len); >> 518 if (!dst_vma) 747 goto out_unlock; 519 goto out_unlock; 748 520 749 err = -EINVAL; 521 err = -EINVAL; 750 /* 522 /* 751 * shmem_zero_setup is invoked in mmap 523 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but 752 * it will overwrite vm_ops, so vma_is 524 * it will overwrite vm_ops, so vma_is_anonymous must return false. 753 */ 525 */ 754 if (WARN_ON_ONCE(vma_is_anonymous(dst_ 526 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && 755 dst_vma->vm_flags & VM_SHARED)) 527 dst_vma->vm_flags & VM_SHARED)) 756 goto out_unlock; 528 goto out_unlock; 757 529 758 /* 530 /* 759 * validate 'mode' now that we know th 531 * validate 'mode' now that we know the dst_vma: don't allow 760 * a wrprotect copy if the userfaultfd 532 * a wrprotect copy if the userfaultfd didn't register as WP. 761 */ 533 */ 762 if ((flags & MFILL_ATOMIC_WP) && !(dst !! 534 wp_copy = mode & UFFDIO_COPY_MODE_WP; >> 535 if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP)) 763 goto out_unlock; 536 goto out_unlock; 764 537 765 /* 538 /* 766 * If this is a HUGETLB vma, pass off 539 * If this is a HUGETLB vma, pass off to appropriate routine 767 */ 540 */ 768 if (is_vm_hugetlb_page(dst_vma)) 541 if (is_vm_hugetlb_page(dst_vma)) 769 return mfill_atomic_hugetlb(c !! 542 return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, 770 s !! 543 src_start, len, mmap_changing, >> 544 zeropage); 771 545 772 if (!vma_is_anonymous(dst_vma) && !vma 546 if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) 773 goto out_unlock; 547 goto out_unlock; 774 if (!vma_is_shmem(dst_vma) && !! 548 775 uffd_flags_mode_is(flags, MFILL_AT !! 549 /* >> 550 * Ensure the dst_vma has a anon_vma or this page >> 551 * would get a NULL anon_vma when moved in the >> 552 * dst_vma. >> 553 */ >> 554 err = -ENOMEM; >> 555 if (!(dst_vma->vm_flags & VM_SHARED) && >> 556 unlikely(anon_vma_prepare(dst_vma))) 776 goto out_unlock; 557 goto out_unlock; 777 558 778 while (src_addr < src_start + len) { 559 while (src_addr < src_start + len) { 779 pmd_t dst_pmdval; 560 pmd_t dst_pmdval; 780 561 781 BUG_ON(dst_addr >= dst_start + 562 BUG_ON(dst_addr >= dst_start + len); 782 563 783 dst_pmd = mm_alloc_pmd(dst_mm, 564 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); 784 if (unlikely(!dst_pmd)) { 565 if (unlikely(!dst_pmd)) { 785 err = -ENOMEM; 566 err = -ENOMEM; 786 break; 567 break; 787 } 568 } 788 569 789 dst_pmdval = pmdp_get_lockless !! 570 dst_pmdval = pmd_read_atomic(dst_pmd); 790 if (unlikely(pmd_none(dst_pmdv << 791 unlikely(__pte_alloc(dst_m << 792 err = -ENOMEM; << 793 break; << 794 } << 795 dst_pmdval = pmdp_get_lockless << 796 /* 571 /* 797 * If the dst_pmd is THP don't !! 572 * If the dst_pmd is mapped as THP don't 798 * (This includes the case whe !! 573 * override it and just be strict. 799 * changed back to none after << 800 */ 574 */ 801 if (unlikely(!pmd_present(dst_ !! 575 if (unlikely(pmd_trans_huge(dst_pmdval))) { 802 pmd_devmap(dst_pm << 803 err = -EEXIST; 576 err = -EEXIST; 804 break; 577 break; 805 } 578 } 806 if (unlikely(pmd_bad(dst_pmdva !! 
579 if (unlikely(pmd_none(dst_pmdval)) && >> 580 unlikely(__pte_alloc(dst_mm, dst_pmd))) { >> 581 err = -ENOMEM; >> 582 break; >> 583 } >> 584 /* If an huge pmd materialized from under us fail */ >> 585 if (unlikely(pmd_trans_huge(*dst_pmd))) { 807 err = -EFAULT; 586 err = -EFAULT; 808 break; 587 break; 809 } 588 } 810 /* << 811 * For shmem mappings, khugepa << 812 * tables under us; pte_offset << 813 */ << 814 589 815 err = mfill_atomic_pte(dst_pmd !! 590 BUG_ON(pmd_none(*dst_pmd)); 816 src_add !! 591 BUG_ON(pmd_trans_huge(*dst_pmd)); >> 592 >> 593 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, >> 594 src_addr, &page, zeropage, wp_copy); 817 cond_resched(); 595 cond_resched(); 818 596 819 if (unlikely(err == -ENOENT)) 597 if (unlikely(err == -ENOENT)) { 820 void *kaddr; !! 598 void *page_kaddr; 821 599 822 up_read(&ctx->map_chan !! 600 mmap_read_unlock(dst_mm); 823 uffd_mfill_unlock(dst_ !! 601 BUG_ON(!page); 824 BUG_ON(!folio); << 825 602 826 kaddr = kmap_local_fol !! 603 page_kaddr = kmap(page); 827 err = copy_from_user(k !! 604 err = copy_from_user(page_kaddr, 828 ( 605 (const void __user *) src_addr, 829 P 606 PAGE_SIZE); 830 kunmap_local(kaddr); !! 607 kunmap(page); 831 if (unlikely(err)) { 608 if (unlikely(err)) { 832 err = -EFAULT; 609 err = -EFAULT; 833 goto out; 610 goto out; 834 } 611 } 835 flush_dcache_folio(fol !! 612 flush_dcache_page(page); 836 goto retry; 613 goto retry; 837 } else 614 } else 838 BUG_ON(folio); !! 615 BUG_ON(page); 839 616 840 if (!err) { 617 if (!err) { 841 dst_addr += PAGE_SIZE; 618 dst_addr += PAGE_SIZE; 842 src_addr += PAGE_SIZE; 619 src_addr += PAGE_SIZE; 843 copied += PAGE_SIZE; 620 copied += PAGE_SIZE; 844 621 845 if (fatal_signal_pendi 622 if (fatal_signal_pending(current)) 846 err = -EINTR; 623 err = -EINTR; 847 } 624 } 848 if (err) 625 if (err) 849 break; 626 break; 850 } 627 } 851 628 852 out_unlock: 629 out_unlock: 853 up_read(&ctx->map_changing_lock); !! 630 mmap_read_unlock(dst_mm); 854 uffd_mfill_unlock(dst_vma); << 855 out: 631 out: 856 if (folio) !! 632 if (page) 857 folio_put(folio); !! 633 put_page(page); 858 BUG_ON(copied < 0); 634 BUG_ON(copied < 0); 859 BUG_ON(err > 0); 635 BUG_ON(err > 0); 860 BUG_ON(!copied && !err); 636 BUG_ON(!copied && !err); 861 return copied ? copied : err; 637 return copied ? copied : err; 862 } 638 } 863 639 864 ssize_t mfill_atomic_copy(struct userfaultfd_c !! 640 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, 865 unsigned long src_st !! 641 unsigned long src_start, unsigned long len, 866 uffd_flags_t flags) !! 642 bool *mmap_changing, __u64 mode) 867 { << 868 return mfill_atomic(ctx, dst_start, sr << 869 uffd_flags_set_mod << 870 } << 871 << 872 ssize_t mfill_atomic_zeropage(struct userfault << 873 unsigned long st << 874 unsigned long le << 875 { << 876 return mfill_atomic(ctx, start, 0, len << 877 uffd_flags_set_mod << 878 } << 879 << 880 ssize_t mfill_atomic_continue(struct userfault << 881 unsigned long le << 882 { << 883 << 884 /* << 885 * A caller might reasonably assume th << 886 * smp_wmb() to ensure that any writes << 887 * the thread doing the UFFDIO_CONTINU << 888 * subsequent loads from the page thro << 889 */ << 890 smp_wmb(); << 891 << 892 return mfill_atomic(ctx, start, 0, len << 893 uffd_flags_set_mod << 894 } << 895 << 896 ssize_t mfill_atomic_poison(struct userfaultfd << 897 unsigned long len, << 898 { 643 { 899 return mfill_atomic(ctx, start, 0, len !! 644 return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, 900 uffd_flags_set_mod !! 
645 mmap_changing, mode); 901 } 646 } 902 647 903 long uffd_wp_range(struct vm_area_struct *dst_ !! 648 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, 904 unsigned long start, unsign !! 649 unsigned long len, bool *mmap_changing) 905 { 650 { 906 unsigned int mm_cp_flags; !! 651 return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); 907 struct mmu_gather tlb; << 908 long ret; << 909 << 910 VM_WARN_ONCE(start < dst_vma->vm_start << 911 "The address range exc << 912 if (enable_wp) << 913 mm_cp_flags = MM_CP_UFFD_WP; << 914 else << 915 mm_cp_flags = MM_CP_UFFD_WP_RE << 916 << 917 /* << 918 * vma->vm_page_prot already reflects << 919 * VMA (see userfaultfd_set_vm_flags() << 920 * to be write-protected as default wh << 921 * Try upgrading write permissions man << 922 */ << 923 if (!enable_wp && vma_wants_manual_pte << 924 mm_cp_flags |= MM_CP_TRY_CHANG << 925 tlb_gather_mmu(&tlb, dst_vma->vm_mm); << 926 ret = change_protection(&tlb, dst_vma, << 927 tlb_finish_mmu(&tlb); << 928 << 929 return ret; << 930 } 652 } 931 653 932 int mwriteprotect_range(struct userfaultfd_ctx !! 654 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, 933 unsigned long len, boo !! 655 unsigned long len, bool enable_wp, bool *mmap_changing) 934 { 656 { 935 struct mm_struct *dst_mm = ctx->mm; << 936 unsigned long end = start + len; << 937 unsigned long _start, _end; << 938 struct vm_area_struct *dst_vma; 657 struct vm_area_struct *dst_vma; 939 unsigned long page_mask; !! 658 pgprot_t newprot; 940 long err; !! 659 int err; 941 VMA_ITERATOR(vmi, dst_mm, start); << 942 660 943 /* 661 /* 944 * Sanitize the command parameters: 662 * Sanitize the command parameters: 945 */ 663 */ 946 BUG_ON(start & ~PAGE_MASK); 664 BUG_ON(start & ~PAGE_MASK); 947 BUG_ON(len & ~PAGE_MASK); 665 BUG_ON(len & ~PAGE_MASK); 948 666 949 /* Does the address range wrap, or is 667 /* Does the address range wrap, or is the span zero-sized? */ 950 BUG_ON(start + len <= start); 668 BUG_ON(start + len <= start); 951 669 952 mmap_read_lock(dst_mm); 670 mmap_read_lock(dst_mm); 953 671 954 /* 672 /* 955 * If memory mappings are changing bec 673 * If memory mappings are changing because of non-cooperative 956 * operation (e.g. mremap) running in 674 * operation (e.g. mremap) running in parallel, bail out and 957 * request the user to retry later 675 * request the user to retry later 958 */ 676 */ 959 down_read(&ctx->map_changing_lock); << 960 err = -EAGAIN; 677 err = -EAGAIN; 961 if (atomic_read(&ctx->mmap_changing)) !! 678 if (mmap_changing && READ_ONCE(*mmap_changing)) 962 goto out_unlock; 679 goto out_unlock; 963 680 964 err = -ENOENT; 681 err = -ENOENT; 965 for_each_vma_range(vmi, dst_vma, end) !! 
682 dst_vma = find_dst_vma(dst_mm, start, len); 966 << 967 if (!userfaultfd_wp(dst_vma)) << 968 err = -ENOENT; << 969 break; << 970 } << 971 << 972 if (is_vm_hugetlb_page(dst_vma << 973 err = -EINVAL; << 974 page_mask = vma_kernel << 975 if ((start & page_mask << 976 break; << 977 } << 978 << 979 _start = max(dst_vma->vm_start << 980 _end = min(dst_vma->vm_end, en << 981 << 982 err = uffd_wp_range(dst_vma, _ << 983 << 984 /* Return 0 on success, <0 on << 985 if (err < 0) << 986 break; << 987 err = 0; << 988 } << 989 out_unlock: << 990 up_read(&ctx->map_changing_lock); << 991 mmap_read_unlock(dst_mm); << 992 return err; << 993 } << 994 << 995 << 996 void double_pt_lock(spinlock_t *ptl1, << 997 spinlock_t *ptl2) << 998 __acquires(ptl1) << 999 __acquires(ptl2) << 1000 { << 1001 if (ptl1 > ptl2) << 1002 swap(ptl1, ptl2); << 1003 /* lock in virtual address order to a << 1004 spin_lock(ptl1); << 1005 if (ptl1 != ptl2) << 1006 spin_lock_nested(ptl2, SINGLE << 1007 else << 1008 __acquire(ptl2); << 1009 } << 1010 << 1011 void double_pt_unlock(spinlock_t *ptl1, << 1012 spinlock_t *ptl2) << 1013 __releases(ptl1) << 1014 __releases(ptl2) << 1015 { << 1016 spin_unlock(ptl1); << 1017 if (ptl1 != ptl2) << 1018 spin_unlock(ptl2); << 1019 else << 1020 __release(ptl2); << 1021 } << 1022 << 1023 << 1024 static int move_present_pte(struct mm_struct << 1025 struct vm_area_st << 1026 struct vm_area_st << 1027 unsigned long dst << 1028 pte_t *dst_pte, p << 1029 pte_t orig_dst_pt << 1030 spinlock_t *dst_p << 1031 struct folio *src << 1032 { << 1033 int err = 0; << 1034 << 1035 double_pt_lock(dst_ptl, src_ptl); << 1036 << 1037 if (!pte_same(ptep_get(src_pte), orig << 1038 !pte_same(ptep_get(dst_pte), orig << 1039 err = -EAGAIN; << 1040 goto out; << 1041 } << 1042 if (folio_test_large(src_folio) || << 1043 folio_maybe_dma_pinned(src_folio) << 1044 !PageAnonExclusive(&src_folio->pa << 1045 err = -EBUSY; << 1046 goto out; << 1047 } << 1048 << 1049 orig_src_pte = ptep_clear_flush(src_v << 1050 /* Folio got pinned from under us. 
Pu << 1051 if (folio_maybe_dma_pinned(src_folio) << 1052 set_pte_at(mm, src_addr, src_ << 1053 err = -EBUSY; << 1054 goto out; << 1055 } << 1056 << 1057 folio_move_anon_rmap(src_folio, dst_v << 1058 src_folio->index = linear_page_index( << 1059 << 1060 orig_dst_pte = mk_pte(&src_folio->pag << 1061 /* Follow mremap() behavior and treat << 1062 orig_dst_pte = pte_mkwrite(pte_mkdirt << 1063 << 1064 set_pte_at(mm, dst_addr, dst_pte, ori << 1065 out: << 1066 double_pt_unlock(dst_ptl, src_ptl); << 1067 return err; << 1068 } << 1069 << 1070 static int move_swap_pte(struct mm_struct *mm << 1071 unsigned long dst_ad << 1072 pte_t *dst_pte, pte_ << 1073 pte_t orig_dst_pte, << 1074 spinlock_t *dst_ptl, << 1075 { << 1076 if (!pte_swp_exclusive(orig_src_pte)) << 1077 return -EBUSY; << 1078 << 1079 double_pt_lock(dst_ptl, src_ptl); << 1080 << 1081 if (!pte_same(ptep_get(src_pte), orig << 1082 !pte_same(ptep_get(dst_pte), orig << 1083 double_pt_unlock(dst_ptl, src << 1084 return -EAGAIN; << 1085 } << 1086 << 1087 orig_src_pte = ptep_get_and_clear(mm, << 1088 set_pte_at(mm, dst_addr, dst_pte, ori << 1089 double_pt_unlock(dst_ptl, src_ptl); << 1090 << 1091 return 0; << 1092 } << 1093 << 1094 static int move_zeropage_pte(struct mm_struct << 1095 struct vm_area_s << 1096 struct vm_area_s << 1097 unsigned long ds << 1098 pte_t *dst_pte, << 1099 pte_t orig_dst_p << 1100 spinlock_t *dst_ << 1101 { << 1102 pte_t zero_pte; << 1103 << 1104 double_pt_lock(dst_ptl, src_ptl); << 1105 if (!pte_same(ptep_get(src_pte), orig << 1106 !pte_same(ptep_get(dst_pte), orig << 1107 double_pt_unlock(dst_ptl, src << 1108 return -EAGAIN; << 1109 } << 1110 << 1111 zero_pte = pte_mkspecial(pfn_pte(my_z << 1112 dst_ << 1113 ptep_clear_flush(src_vma, src_addr, s << 1114 set_pte_at(mm, dst_addr, dst_pte, zer << 1115 double_pt_unlock(dst_ptl, src_ptl); << 1116 << 1117 return 0; << 1118 } << 1119 << 1120 << 1121 /* << 1122 * The mmap_lock for reading is held by the c << 1123 * from src_pmd to dst_pmd if possible, and r << 1124 * in moving the page. << 1125 */ << 1126 static int move_pages_pte(struct mm_struct *m << 1127 struct vm_area_stru << 1128 struct vm_area_stru << 1129 unsigned long dst_a << 1130 __u64 mode) << 1131 { << 1132 swp_entry_t entry; << 1133 pte_t orig_src_pte, orig_dst_pte; << 1134 pte_t src_folio_pte; << 1135 spinlock_t *src_ptl, *dst_ptl; << 1136 pte_t *src_pte = NULL; << 1137 pte_t *dst_pte = NULL; << 1138 << 1139 struct folio *src_folio = NULL; << 1140 struct anon_vma *src_anon_vma = NULL; << 1141 struct mmu_notifier_range range; << 1142 int err = 0; << 1143 << 1144 flush_cache_range(src_vma, src_addr, << 1145 mmu_notifier_range_init(&range, MMU_N << 1146 src_addr, src << 1147 mmu_notifier_invalidate_range_start(& << 1148 retry: << 1149 dst_pte = pte_offset_map_nolock(mm, d << 1150 << 1151 /* Retry if a huge pmd materialized f << 1152 if (unlikely(!dst_pte)) { << 1153 err = -EAGAIN; << 1154 goto out; << 1155 } << 1156 << 1157 src_pte = pte_offset_map_nolock(mm, s << 1158 << 1159 /* << 1160 * We held the mmap_lock for reading << 1161 * can zap transparent huge pages und << 1162 * transparent huge page fault can es << 1163 * transparent huge pages under us. 
<< 1164 */ << 1165 if (unlikely(!src_pte)) { << 1166 err = -EAGAIN; << 1167 goto out; << 1168 } << 1169 << 1170 /* Sanity checks before the operation << 1171 if (WARN_ON_ONCE(pmd_none(*dst_pmd)) << 1172 WARN_ON_ONCE(pmd_trans_huge(*dst_ << 1173 err = -EINVAL; << 1174 goto out; << 1175 } << 1176 << 1177 spin_lock(dst_ptl); << 1178 orig_dst_pte = ptep_get(dst_pte); << 1179 spin_unlock(dst_ptl); << 1180 if (!pte_none(orig_dst_pte)) { << 1181 err = -EEXIST; << 1182 goto out; << 1183 } << 1184 << 1185 spin_lock(src_ptl); << 1186 orig_src_pte = ptep_get(src_pte); << 1187 spin_unlock(src_ptl); << 1188 if (pte_none(orig_src_pte)) { << 1189 if (!(mode & UFFDIO_MOVE_MODE << 1190 err = -ENOENT; << 1191 else /* nothing to do to move << 1192 err = 0; << 1193 goto out; << 1194 } << 1195 << 1196 /* If PTE changed after we locked the << 1197 if (src_folio && unlikely(!pte_same(s << 1198 err = -EAGAIN; << 1199 goto out; << 1200 } << 1201 << 1202 if (pte_present(orig_src_pte)) { << 1203 if (is_zero_pfn(pte_pfn(orig_ << 1204 err = move_zeropage_p << 1205 << 1206 << 1207 << 1208 goto out; << 1209 } << 1210 << 1211 /* << 1212 * Pin and lock both source f << 1213 * RCU read section, we can't << 1214 * unmap the ptes, obtain the << 1215 */ << 1216 if (!src_folio) { << 1217 struct folio *folio; << 1218 << 1219 /* << 1220 * Pin the page while << 1221 * page isn't freed u << 1222 */ << 1223 spin_lock(src_ptl); << 1224 if (!pte_same(orig_sr << 1225 spin_unlock(s << 1226 err = -EAGAIN << 1227 goto out; << 1228 } << 1229 << 1230 folio = vm_normal_fol << 1231 if (!folio || !PageAn << 1232 spin_unlock(s << 1233 err = -EBUSY; << 1234 goto out; << 1235 } << 1236 << 1237 folio_get(folio); << 1238 src_folio = folio; << 1239 src_folio_pte = orig_ << 1240 spin_unlock(src_ptl); << 1241 << 1242 if (!folio_trylock(sr << 1243 pte_unmap(&or << 1244 pte_unmap(&or << 1245 src_pte = dst << 1246 /* now we can << 1247 folio_lock(sr << 1248 goto retry; << 1249 } << 1250 << 1251 if (WARN_ON_ONCE(!fol << 1252 err = -EBUSY; << 1253 goto out; << 1254 } << 1255 } << 1256 << 1257 /* at this point we have src_ << 1258 if (folio_test_large(src_foli << 1259 /* split_folio() can << 1260 pte_unmap(&orig_src_p << 1261 pte_unmap(&orig_dst_p << 1262 src_pte = dst_pte = N << 1263 err = split_folio(src << 1264 if (err) << 1265 goto out; << 1266 /* have to reacquire << 1267 folio_unlock(src_foli << 1268 folio_put(src_folio); << 1269 src_folio = NULL; << 1270 goto retry; << 1271 } << 1272 << 1273 if (!src_anon_vma) { << 1274 /* << 1275 * folio_referenced w << 1276 * without the folio << 1277 * the anon_vma lock, << 1278 */ << 1279 src_anon_vma = folio_ << 1280 if (!src_anon_vma) { << 1281 /* page was u << 1282 err = -EAGAIN << 1283 goto out; << 1284 } << 1285 if (!anon_vma_trylock << 1286 pte_unmap(&or << 1287 pte_unmap(&or << 1288 src_pte = dst << 1289 /* now we can << 1290 anon_vma_lock << 1291 goto retry; << 1292 } << 1293 } << 1294 << 1295 err = move_present_pte(mm, d << 1296 dst_ad << 1297 orig_d << 1298 dst_pt << 1299 } else { << 1300 entry = pte_to_swp_entry(orig << 1301 if (non_swap_entry(entry)) { << 1302 if (is_migration_entr << 1303 pte_unmap(&or << 1304 pte_unmap(&or << 1305 src_pte = dst << 1306 migration_ent << 1307 err = -EAGAIN << 1308 } else << 1309 err = -EFAULT << 1310 goto out; << 1311 } << 1312 << 1313 err = move_swap_pte(mm, dst_a << 1314 dst_pte, << 1315 orig_dst_ << 1316 dst_ptl, << 1317 } << 1318 << 1319 out: << 1320 if (src_anon_vma) { << 1321 anon_vma_unlock_write(src_ano << 1322 put_anon_vma(src_anon_vma); << 1323 } << 1324 if 
(src_folio) { << 1325 folio_unlock(src_folio); << 1326 folio_put(src_folio); << 1327 } << 1328 if (dst_pte) << 1329 pte_unmap(dst_pte); << 1330 if (src_pte) << 1331 pte_unmap(src_pte); << 1332 mmu_notifier_invalidate_range_end(&ra << 1333 << 1334 return err; << 1335 } << 1336 << 1337 #ifdef CONFIG_TRANSPARENT_HUGEPAGE << 1338 static inline bool move_splits_huge_pmd(unsig << 1339 unsig << 1340 unsig << 1341 { << 1342 return (src_addr & ~HPAGE_PMD_MASK) | << 1343 src_end - src_addr < HPAGE_PM << 1344 } << 1345 #else << 1346 static inline bool move_splits_huge_pmd(unsig << 1347 unsig << 1348 unsig << 1349 { << 1350 /* This is unreachable anyway, just t << 1351 return false; << 1352 } << 1353 #endif << 1354 << 1355 static inline bool vma_move_compatible(struct << 1356 { << 1357 return !(vma->vm_flags & (VM_PFNMAP | << 1358 VM_MIXEDMAP << 1359 } << 1360 << 1361 static int validate_move_areas(struct userfau << 1362 struct vm_area << 1363 struct vm_area << 1364 { << 1365 /* Only allow moving if both have the << 1366 if ((src_vma->vm_flags & VM_ACCESS_FL << 1367 pgprot_val(src_vma->vm_page_prot) << 1368 return -EINVAL; << 1369 << 1370 /* Only allow moving if both are mloc << 1371 if ((src_vma->vm_flags & VM_LOCKED) ! << 1372 return -EINVAL; << 1373 << 1374 /* << 1375 * For now, we keep it simple and onl << 1376 * Access flags are equal, therefore << 1377 */ << 1378 if (!(src_vma->vm_flags & VM_WRITE)) << 1379 return -EINVAL; << 1380 << 1381 /* Check if vma flags indicate conten << 1382 if (!vma_move_compatible(src_vma) || << 1383 return -EINVAL; << 1384 << 1385 /* Ensure dst_vma is registered in uf << 1386 if (!dst_vma->vm_userfaultfd_ctx.ctx << 1387 dst_vma->vm_userfaultfd_ctx.ctx ! << 1388 return -EINVAL; << 1389 << 1390 /* Only allow moving across anonymous << 1391 if (!vma_is_anonymous(src_vma) || !vm << 1392 return -EINVAL; << 1393 << 1394 return 0; << 1395 } << 1396 << 1397 static __always_inline << 1398 int find_vmas_mm_locked(struct mm_struct *mm, << 1399 unsigned long dst_sta << 1400 unsigned long src_sta << 1401 struct vm_area_struct << 1402 struct vm_area_struct << 1403 { << 1404 struct vm_area_struct *vma; << 1405 << 1406 mmap_assert_locked(mm); << 1407 vma = find_vma_and_prepare_anon(mm, d << 1408 if (IS_ERR(vma)) << 1409 return PTR_ERR(vma); << 1410 << 1411 *dst_vmap = vma; << 1412 /* Skip finding src_vma if src_start << 1413 if (src_start >= vma->vm_start && src << 1414 goto out_success; << 1415 << 1416 vma = vma_lookup(mm, src_start); << 1417 if (!vma) << 1418 return -ENOENT; << 1419 out_success: << 1420 *src_vmap = vma; << 1421 return 0; << 1422 } << 1423 << 1424 #ifdef CONFIG_PER_VMA_LOCK << 1425 static int uffd_move_lock(struct mm_struct *m << 1426 unsigned long dst_s << 1427 unsigned long src_s << 1428 struct vm_area_stru << 1429 struct vm_area_stru << 1430 { << 1431 struct vm_area_struct *vma; << 1432 int err; << 1433 << 1434 vma = uffd_lock_vma(mm, dst_start); << 1435 if (IS_ERR(vma)) << 1436 return PTR_ERR(vma); << 1437 << 1438 *dst_vmap = vma; << 1439 /* 683 /* 1440 * Skip finding src_vma if src_start !! 684 * Make sure the vma is not shared, that the dst range is 1441 * that we don't lock the same vma tw !! 685 * both valid and fully within a single existing vma. 
1442 */ << 1443 if (src_start >= vma->vm_start && src << 1444 *src_vmap = vma; << 1445 return 0; << 1446 } << 1447 << 1448 /* << 1449 * Using uffd_lock_vma() to get src_v << 1450 * << 1451 * Thread1 << 1452 * ------- << 1453 * vma_start_read(dst_vma) << 1454 * << 1455 * << 1456 * vma_start_read(src_vma) << 1457 * mmap_read_lock(mm) << 1458 * << 1459 */ << 1460 *src_vmap = lock_vma_under_rcu(mm, sr << 1461 if (likely(*src_vmap)) << 1462 return 0; << 1463 << 1464 /* Undo any locking and retry in mmap << 1465 vma_end_read(*dst_vmap); << 1466 << 1467 mmap_read_lock(mm); << 1468 err = find_vmas_mm_locked(mm, dst_sta << 1469 if (!err) { << 1470 /* << 1471 * See comment in uffd_lock_v << 1472 * vma_start_read() here. << 1473 */ << 1474 down_read(&(*dst_vmap)->vm_lo << 1475 if (*dst_vmap != *src_vmap) << 1476 down_read_nested(&(*s << 1477 SING << 1478 } << 1479 mmap_read_unlock(mm); << 1480 return err; << 1481 } << 1482 << 1483 static void uffd_move_unlock(struct vm_area_s << 1484 struct vm_area_s << 1485 { << 1486 vma_end_read(src_vma); << 1487 if (src_vma != dst_vma) << 1488 vma_end_read(dst_vma); << 1489 } << 1490 << 1491 #else << 1492 << 1493 static int uffd_move_lock(struct mm_struct *m << 1494 unsigned long dst_s << 1495 unsigned long src_s << 1496 struct vm_area_stru << 1497 struct vm_area_stru << 1498 { << 1499 int err; << 1500 << 1501 mmap_read_lock(mm); << 1502 err = find_vmas_mm_locked(mm, dst_sta << 1503 if (err) << 1504 mmap_read_unlock(mm); << 1505 return err; << 1506 } << 1507 << 1508 static void uffd_move_unlock(struct vm_area_s << 1509 struct vm_area_s << 1510 { << 1511 mmap_assert_locked(src_vma->vm_mm); << 1512 mmap_read_unlock(dst_vma->vm_mm); << 1513 } << 1514 #endif << 1515 << 1516 /** << 1517 * move_pages - move arbitrary anonymous page << 1518 * @ctx: pointer to the userfaultfd context << 1519 * @dst_start: start of the destination virtu << 1520 * @src_start: start of the source virtual me << 1521 * @len: length of the virtual memory range << 1522 * @mode: flags from uffdio_move.mode << 1523 * << 1524 * It will either use the mmap_lock in read m << 1525 * << 1526 * move_pages() remaps arbitrary anonymous pa << 1527 * copy. It only works on non shared anonymou << 1528 * be relocated without generating non linear << 1529 * code. << 1530 * << 1531 * It provides a zero copy mechanism to handl << 1532 * The source vma pages should have mapcount << 1533 * enforced by using madvise(MADV_DONTFORK) o << 1534 * << 1535 * The thread receiving the page during the u << 1536 * will receive the faulting page in the sour << 1537 * storage or any other I/O device (MADV_DONT << 1538 * avoids move_pages() to fail with -EBUSY if << 1539 * move_pages() is called), then it will call << 1540 * page in the faulting address in the destin << 1541 * << 1542 * This userfaultfd command works purely via << 1543 * most efficient way to move physical non sh << 1544 * across different virtual addresses. Unlike << 1545 * it does not create any new vmas. The mappi << 1546 * address is atomic. << 1547 * << 1548 * It only works if the vma protection bits a << 1549 * source and destination vma. 
<< 1550 * << 1551 * It can remap non shared anonymous pages wi << 1552 * << 1553 * If the source virtual memory range has any << 1554 * the destination virtual memory range is no << 1555 * move_pages() will fail respectively with - << 1556 * provides a very strict behavior to avoid a << 1557 * corruption going unnoticed if there are us << 1558 * Only one thread should resolve the userlan << 1559 * time for any given faulting address. This << 1560 * try to both call move_pages() on the same << 1561 * same time, the second thread will get an e << 1562 * command. << 1563 * << 1564 * The command retval will return "len" is su << 1565 * however can be interrupted by fatal signal << 1566 * interrupted it will return the number of b << 1567 * remapped before the interruption if any, o << 1568 * none. It will never return zero. Either it << 1569 * an amount of bytes successfully moved. If << 1570 * "short" remap, the move_pages() command sh << 1571 * userland with src+retval, dst+reval, len-r << 1572 * about the error that interrupted it. << 1573 * << 1574 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag << 1575 * prevent -ENOENT errors to materialize if t << 1576 * source virtual range that is being remappe << 1577 * accounted as successfully remapped in the << 1578 * command. This is mostly useful to remap hu << 1579 * virtual regions without knowing if there a << 1580 * in the regions or not, but preventing the << 1581 * the hugepmd during the remap. << 1582 * << 1583 * If there's any rmap walk that is taking th << 1584 * first obtaining the folio lock (the only c << 1585 * folio_referenced), they will have to verif << 1586 * has changed after taking the anon_vma lock << 1587 * should release the lock and retry obtainin << 1588 * it means the anon_vma was changed by move_ << 1589 * could be obtained. This is the only additi << 1590 * the rmap code to provide this anonymous pa << 1591 */ << 1592 ssize_t move_pages(struct userfaultfd_ctx *ct << 1593 unsigned long src_start, u << 1594 { << 1595 struct mm_struct *mm = ctx->mm; << 1596 struct vm_area_struct *src_vma, *dst_ << 1597 unsigned long src_addr, dst_addr; << 1598 pmd_t *src_pmd, *dst_pmd; << 1599 long err = -EINVAL; << 1600 ssize_t moved = 0; << 1601 << 1602 /* Sanitize the command parameters. * << 1603 if (WARN_ON_ONCE(src_start & ~PAGE_MA << 1604 WARN_ON_ONCE(dst_start & ~PAGE_MA << 1605 WARN_ON_ONCE(len & ~PAGE_MASK)) << 1606 goto out; << 1607 << 1608 /* Does the address range wrap, or is << 1609 if (WARN_ON_ONCE(src_start + len <= s << 1610 WARN_ON_ONCE(dst_start + len <= d << 1611 goto out; << 1612 << 1613 err = uffd_move_lock(mm, dst_start, s << 1614 if (err) << 1615 goto out; << 1616 << 1617 /* Re-check after taking map_changing << 1618 err = -EAGAIN; << 1619 down_read(&ctx->map_changing_lock); << 1620 if (likely(atomic_read(&ctx->mmap_cha << 1621 goto out_unlock; << 1622 /* << 1623 * Make sure the vma is not shared, t << 1624 * ranges are both valid and fully wi << 1625 * vma. << 1626 */ 686 */ 1627 err = -EINVAL; !! 687 if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) 1628 if (src_vma->vm_flags & VM_SHARED) << 1629 goto out_unlock; << 1630 if (src_start + len > src_vma->vm_end << 1631 goto out_unlock; 688 goto out_unlock; 1632 !! 689 if (!userfaultfd_wp(dst_vma)) 1633 if (dst_vma->vm_flags & VM_SHARED) << 1634 goto out_unlock; 690 goto out_unlock; 1635 if (dst_start + len > dst_vma->vm_end !! 
691 if (!vma_is_anonymous(dst_vma)) 1636 goto out_unlock; << 1637 << 1638 err = validate_move_areas(ctx, src_vm << 1639 if (err) << 1640 goto out_unlock; 692 goto out_unlock; 1641 693 1642 for (src_addr = src_start, dst_addr = !! 694 if (enable_wp) 1643 src_addr < src_start + len;) { !! 695 newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); 1644 spinlock_t *ptl; !! 696 else 1645 pmd_t dst_pmdval; !! 697 newprot = vm_get_page_prot(dst_vma->vm_flags); 1646 unsigned long step_size; << 1647 << 1648 /* << 1649 * Below works because anonym << 1650 * transparent huge PUD. If f << 1651 * that case would need to be << 1652 */ << 1653 src_pmd = mm_find_pmd(mm, src << 1654 if (unlikely(!src_pmd)) { << 1655 if (!(mode & UFFDIO_M << 1656 err = -ENOENT << 1657 break; << 1658 } << 1659 src_pmd = mm_alloc_pm << 1660 if (unlikely(!src_pmd << 1661 err = -ENOMEM << 1662 break; << 1663 } << 1664 } << 1665 dst_pmd = mm_alloc_pmd(mm, ds << 1666 if (unlikely(!dst_pmd)) { << 1667 err = -ENOMEM; << 1668 break; << 1669 } << 1670 << 1671 dst_pmdval = pmdp_get_lockles << 1672 /* << 1673 * If the dst_pmd is mapped a << 1674 * be strict. If dst_pmd chan << 1675 * move_pages_huge_pmd() will << 1676 * while move_pages_pte() wil << 1677 */ << 1678 if (unlikely(pmd_trans_huge(d << 1679 err = -EEXIST; << 1680 break; << 1681 } << 1682 << 1683 ptl = pmd_trans_huge_lock(src << 1684 if (ptl) { << 1685 if (pmd_devmap(*src_p << 1686 spin_unlock(p << 1687 err = -ENOENT << 1688 break; << 1689 } << 1690 << 1691 /* Check if we can mo << 1692 if (move_splits_huge_ << 1693 !pmd_none(dst_pmd << 1694 struct folio << 1695 << 1696 if (!folio || << 1697 << 1698 spin_ << 1699 err = << 1700 break << 1701 } << 1702 << 1703 spin_unlock(p << 1704 split_huge_pm << 1705 /* The folio << 1706 continue; << 1707 } << 1708 << 1709 err = move_pages_huge << 1710 << 1711 << 1712 step_size = HPAGE_PMD << 1713 } else { << 1714 if (pmd_none(*src_pmd << 1715 if (!(mode & << 1716 err = << 1717 break << 1718 } << 1719 if (unlikely( << 1720 err = << 1721 break << 1722 } << 1723 } << 1724 << 1725 if (unlikely(pte_allo << 1726 err = -ENOMEM << 1727 break; << 1728 } << 1729 << 1730 err = move_pages_pte( << 1731 << 1732 << 1733 step_size = PAGE_SIZE << 1734 } << 1735 << 1736 cond_resched(); << 1737 << 1738 if (fatal_signal_pending(curr << 1739 /* Do not override an << 1740 if (!err || err == -E << 1741 err = -EINTR; << 1742 break; << 1743 } << 1744 << 1745 if (err) { << 1746 if (err == -EAGAIN) << 1747 continue; << 1748 break; << 1749 } << 1750 698 1751 /* Proceed to the next page * !! 699 change_protection(dst_vma, start, start + len, newprot, 1752 dst_addr += step_size; !! 700 enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); 1753 src_addr += step_size; << 1754 moved += step_size; << 1755 } << 1756 701 >> 702 err = 0; 1757 out_unlock: 703 out_unlock: 1758 up_read(&ctx->map_changing_lock); !! 704 mmap_read_unlock(dst_mm); 1759 uffd_move_unlock(dst_vma, src_vma); !! 705 return err; 1760 out: << 1761 VM_WARN_ON(moved < 0); << 1762 VM_WARN_ON(err > 0); << 1763 VM_WARN_ON(!moved && !err); << 1764 return moved ? 
moved : err; << 1765 } << 1766 << 1767 static void userfaultfd_set_vm_flags(struct v << 1768 vm_flags << 1769 { << 1770 const bool uffd_wp_changed = (vma->vm << 1771 << 1772 vm_flags_reset(vma, flags); << 1773 /* << 1774 * For shared mappings, we want to en << 1775 * userfaultfd-wp is enabled (see vma << 1776 * recalculate vma->vm_page_prot when << 1777 */ << 1778 if ((vma->vm_flags & VM_SHARED) && uf << 1779 vma_set_page_prot(vma); << 1780 } << 1781 << 1782 static void userfaultfd_set_ctx(struct vm_are << 1783 struct userfa << 1784 unsigned long << 1785 { << 1786 vma_start_write(vma); << 1787 vma->vm_userfaultfd_ctx = (struct vm_ << 1788 userfaultfd_set_vm_flags(vma, << 1789 (vma->vm_fla << 1790 } << 1791 << 1792 void userfaultfd_reset_ctx(struct vm_area_str << 1793 { << 1794 userfaultfd_set_ctx(vma, NULL, 0); << 1795 } << 1796 << 1797 struct vm_area_struct *userfaultfd_clear_vma( << 1798 << 1799 << 1800 << 1801 << 1802 { << 1803 struct vm_area_struct *ret; << 1804 << 1805 /* Reset ptes for the whole vma range << 1806 if (userfaultfd_wp(vma)) << 1807 uffd_wp_range(vma, start, end << 1808 << 1809 ret = vma_modify_flags_uffd(vmi, prev << 1810 vma->vm_f << 1811 NULL_VM_U << 1812 << 1813 /* << 1814 * In the vma_merge() successful mpro << 1815 * the next vma was merged into the c << 1816 * the current one has not been updat << 1817 */ << 1818 if (!IS_ERR(ret)) << 1819 userfaultfd_reset_ctx(ret); << 1820 << 1821 return ret; << 1822 } << 1823 << 1824 /* Assumes mmap write lock taken, and mm_stru << 1825 int userfaultfd_register_range(struct userfau << 1826 struct vm_area << 1827 unsigned long << 1828 unsigned long << 1829 bool wp_async) << 1830 { << 1831 VMA_ITERATOR(vmi, ctx->mm, start); << 1832 struct vm_area_struct *prev = vma_pre << 1833 unsigned long vma_end; << 1834 unsigned long new_flags; << 1835 << 1836 if (vma->vm_start < start) << 1837 prev = vma; << 1838 << 1839 for_each_vma_range(vmi, vma, end) { << 1840 cond_resched(); << 1841 << 1842 BUG_ON(!vma_can_userfault(vma << 1843 BUG_ON(vma->vm_userfaultfd_ct << 1844 vma->vm_userfaultfd_ct << 1845 WARN_ON(!(vma->vm_flags & VM_ << 1846 << 1847 /* << 1848 * Nothing to do: this vma is << 1849 * userfaultfd and with the r << 1850 */ << 1851 if (vma->vm_userfaultfd_ctx.c << 1852 (vma->vm_flags & vm_flags << 1853 goto skip; << 1854 << 1855 if (vma->vm_start > start) << 1856 start = vma->vm_start << 1857 vma_end = min(end, vma->vm_en << 1858 << 1859 new_flags = (vma->vm_flags & << 1860 vma = vma_modify_flags_uffd(& << 1861 n << 1862 ( << 1863 if (IS_ERR(vma)) << 1864 return PTR_ERR(vma); << 1865 << 1866 /* << 1867 * In the vma_merge() success << 1868 * the next vma was merged in << 1869 * the current one has not be << 1870 */ << 1871 userfaultfd_set_ctx(vma, ctx, << 1872 << 1873 if (is_vm_hugetlb_page(vma) & << 1874 hugetlb_unshare_all_p << 1875 << 1876 skip: << 1877 prev = vma; << 1878 start = vma->vm_end; << 1879 } << 1880 << 1881 return 0; << 1882 } << 1883 << 1884 void userfaultfd_release_new(struct userfault << 1885 { << 1886 struct mm_struct *mm = ctx->mm; << 1887 struct vm_area_struct *vma; << 1888 VMA_ITERATOR(vmi, mm, 0); << 1889 << 1890 /* the various vma->vm_userfaultfd_ct << 1891 mmap_write_lock(mm); << 1892 for_each_vma(vmi, vma) { << 1893 if (vma->vm_userfaultfd_ctx.c << 1894 userfaultfd_reset_ctx << 1895 } << 1896 mmap_write_unlock(mm); << 1897 } << 1898 << 1899 void userfaultfd_release_all(struct mm_struct << 1900 struct userfault << 1901 { << 1902 struct vm_area_struct *vma, *prev; << 1903 VMA_ITERATOR(vmi, mm, 0); << 1904 << 
1905 if (!mmget_not_zero(mm)) << 1906 return; << 1907 << 1908 /* << 1909 * Flush page faults out of all CPUs. << 1910 * must be retried without returning << 1911 * userfaultfd_ctx_get() succeeds but << 1912 * changes while handle_userfault rel << 1913 * it's critical that released is set << 1914 * taking the mmap_lock for writing. << 1915 */ << 1916 mmap_write_lock(mm); << 1917 prev = NULL; << 1918 for_each_vma(vmi, vma) { << 1919 cond_resched(); << 1920 BUG_ON(!!vma->vm_userfaultfd_ << 1921 !!(vma->vm_flags & __V << 1922 if (vma->vm_userfaultfd_ctx.c << 1923 prev = vma; << 1924 continue; << 1925 } << 1926 << 1927 vma = userfaultfd_clear_vma(& << 1928 v << 1929 prev = vma; << 1930 } << 1931 mmap_write_unlock(mm); << 1932 mmput(mm); << 1933 } 706 } 1934 707
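The move_pages() comment above spells out the userspace-visible contract of UFFDIO_MOVE: a fully successful call moves the whole range, while an interrupted ("short") move reports the number of bytes already moved so the caller can retry with src+retval, dst+retval and len-retval. The fragment below is a minimal userspace sketch of that retry loop and is not part of mm/userfaultfd.c: uffd_move_range() is a hypothetical helper, and it assumes the UFFDIO_MOVE ioctl from <linux/userfaultfd.h> reports progress in uffdio_move.move (mirroring uffdio_copy.copy) and fails with EAGAIN on a short move.

/*
 * Hypothetical userspace helper, not kernel code: retry UFFDIO_MOVE over
 * [src, src + len) -> [dst, dst + len) until the whole range has been
 * moved or a real error is hit.
 */
#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int uffd_move_range(int uffd, uint64_t src, uint64_t dst,
			   uint64_t len, uint64_t mode)
{
	while (len) {
		struct uffdio_move mv = {
			.src  = src,
			.dst  = dst,
			.len  = len,
			.mode = mode,	/* e.g. UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES */
		};

		if (ioctl(uffd, UFFDIO_MOVE, &mv) == 0)
			return 0;	/* whole range moved */

		/*
		 * A short move leaves the bytes moved so far in mv.move;
		 * anything else (EINVAL, ENOENT, EEXIST, EBUSY, ...) is
		 * handed back to the caller.
		 */
		if (mv.move <= 0)
			return -errno;

		src += mv.move;
		dst += mv.move;
		len -= mv.move;
	}
	return 0;
}

Passing UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES in mode makes unmapped holes in the source range count as successfully moved bytes instead of failing the call with -ENOENT, which matches the batched-remap use case described in the comment above.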
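The same comment also notes that the pages in the source range must be mapped exactly once (mapcount == 1) and that madvise(MADV_DONTFORK) on the source vma keeps a fork() between receiving the data and issuing UFFDIO_MOVE from turning into an -EBUSY failure. Below is a small, equally hypothetical sketch of setting up such a staging area; alloc_move_staging() is not taken from this file.

/*
 * Hypothetical userspace helper: allocate a private anonymous staging
 * area whose pages stay exclusively mapped (mapcount == 1) across
 * fork(), so a later UFFDIO_MOVE of those pages cannot fail with -EBUSY.
 */
#include <stddef.h>
#include <sys/mman.h>

static void *alloc_move_staging(size_t len)
{
	void *area = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (area == MAP_FAILED)
		return NULL;

	/* Children must never inherit a mapping of the source pages. */
	if (madvise(area, len, MADV_DONTFORK)) {
		munmap(area, len);
		return NULL;
	}
	return area;
}

Private anonymous memory is used because, as the comment states, UFFDIO_MOVE only operates on non-shared anonymous vmas on both the source and the destination side.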