1 // SPDX-License-Identifier: GPL-2.0-only 1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 2 /* 3 * mm/userfaultfd.c 3 * mm/userfaultfd.c 4 * 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 5 * Copyright (C) 2015 Red Hat, Inc. 6 */ 6 */ 7 7 8 #include <linux/mm.h> 8 #include <linux/mm.h> 9 #include <linux/sched/signal.h> 9 #include <linux/sched/signal.h> 10 #include <linux/pagemap.h> 10 #include <linux/pagemap.h> 11 #include <linux/rmap.h> 11 #include <linux/rmap.h> 12 #include <linux/swap.h> 12 #include <linux/swap.h> 13 #include <linux/swapops.h> 13 #include <linux/swapops.h> 14 #include <linux/userfaultfd_k.h> 14 #include <linux/userfaultfd_k.h> 15 #include <linux/mmu_notifier.h> 15 #include <linux/mmu_notifier.h> 16 #include <linux/hugetlb.h> 16 #include <linux/hugetlb.h> 17 #include <linux/shmem_fs.h> 17 #include <linux/shmem_fs.h> 18 #include <asm/tlbflush.h> 18 #include <asm/tlbflush.h> 19 #include <asm/tlb.h> << 20 #include "internal.h" 19 #include "internal.h" 21 20 22 static __always_inline 21 static __always_inline 23 bool validate_dst_vma(struct vm_area_struct *d !! 22 struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, >> 23 unsigned long dst_start, >> 24 unsigned long len) 24 { 25 { 25 /* Make sure that the dst range is ful !! 26 /* 26 if (dst_end > dst_vma->vm_end) !! 27 * Make sure that the dst range is both valid and fully within a 27 return false; !! 28 * single existing vma. >> 29 */ >> 30 struct vm_area_struct *dst_vma; >> 31 >> 32 dst_vma = find_vma(dst_mm, dst_start); >> 33 if (!dst_vma) >> 34 return NULL; >> 35 >> 36 if (dst_start < dst_vma->vm_start || >> 37 dst_start + len > dst_vma->vm_end) >> 38 return NULL; 28 39 29 /* 40 /* 30 * Check the vma is registered in uffd 41 * Check the vma is registered in uffd, this is required to 31 * enforce the VM_MAYWRITE check done 42 * enforce the VM_MAYWRITE check done at uffd registration 32 * time. 43 * time. 33 */ 44 */ 34 if (!dst_vma->vm_userfaultfd_ctx.ctx) 45 if (!dst_vma->vm_userfaultfd_ctx.ctx) 35 return false; !! 46 return NULL; 36 << 37 return true; << 38 } << 39 << 40 static __always_inline << 41 struct vm_area_struct *find_vma_and_prepare_an << 42 << 43 { << 44 struct vm_area_struct *vma; << 45 << 46 mmap_assert_locked(mm); << 47 vma = vma_lookup(mm, addr); << 48 if (!vma) << 49 vma = ERR_PTR(-ENOENT); << 50 else if (!(vma->vm_flags & VM_SHARED) << 51 unlikely(anon_vma_prepare(vma << 52 vma = ERR_PTR(-ENOMEM); << 53 << 54 return vma; << 55 } << 56 << 57 #ifdef CONFIG_PER_VMA_LOCK << 58 /* << 59 * uffd_lock_vma() - Lookup and lock vma corre << 60 * @mm: mm to search vma in. << 61 * @address: address that the vma should conta << 62 * << 63 * Should be called without holding mmap_lock. << 64 * << 65 * Return: A locked vma containing @address, - << 66 * -ENOMEM if anon_vma couldn't be allocated. << 67 */ << 68 static struct vm_area_struct *uffd_lock_vma(st << 69 unsigne << 70 { << 71 struct vm_area_struct *vma; << 72 << 73 vma = lock_vma_under_rcu(mm, address); << 74 if (vma) { << 75 /* << 76 * We know we're going to need << 77 * that early. 
<< 78 */ << 79 if (!(vma->vm_flags & VM_SHARE << 80 vma_end_read(vma); << 81 else << 82 return vma; << 83 } << 84 << 85 mmap_read_lock(mm); << 86 vma = find_vma_and_prepare_anon(mm, ad << 87 if (!IS_ERR(vma)) { << 88 /* << 89 * We cannot use vma_start_rea << 90 * false locked (see comment i << 91 * can avoid that by directly << 92 * mmap_lock, which guarantees << 93 * vma for write (vma_start_wr << 94 */ << 95 down_read(&vma->vm_lock->lock) << 96 } << 97 << 98 mmap_read_unlock(mm); << 99 return vma; << 100 } << 101 << 102 static struct vm_area_struct *uffd_mfill_lock( << 103 << 104 << 105 { << 106 struct vm_area_struct *dst_vma; << 107 << 108 dst_vma = uffd_lock_vma(dst_mm, dst_st << 109 if (IS_ERR(dst_vma) || validate_dst_vm << 110 return dst_vma; << 111 << 112 vma_end_read(dst_vma); << 113 return ERR_PTR(-ENOENT); << 114 } << 115 << 116 static void uffd_mfill_unlock(struct vm_area_s << 117 { << 118 vma_end_read(vma); << 119 } << 120 << 121 #else << 122 << 123 static struct vm_area_struct *uffd_mfill_lock( << 124 << 125 << 126 { << 127 struct vm_area_struct *dst_vma; << 128 << 129 mmap_read_lock(dst_mm); << 130 dst_vma = find_vma_and_prepare_anon(ds << 131 if (IS_ERR(dst_vma)) << 132 goto out_unlock; << 133 << 134 if (validate_dst_vma(dst_vma, dst_star << 135 return dst_vma; << 136 47 137 dst_vma = ERR_PTR(-ENOENT); << 138 out_unlock: << 139 mmap_read_unlock(dst_mm); << 140 return dst_vma; 48 return dst_vma; 141 } 49 } 142 50 143 static void uffd_mfill_unlock(struct vm_area_s << 144 { << 145 mmap_read_unlock(vma->vm_mm); << 146 } << 147 #endif << 148 << 149 /* Check if dst_addr is outside of file's size << 150 static bool mfill_file_over_size(struct vm_are << 151 unsigned long << 152 { << 153 struct inode *inode; << 154 pgoff_t offset, max_off; << 155 << 156 if (!dst_vma->vm_file) << 157 return false; << 158 << 159 inode = dst_vma->vm_file->f_inode; << 160 offset = linear_page_index(dst_vma, ds << 161 max_off = DIV_ROUND_UP(i_size_read(ino << 162 return offset >= max_off; << 163 } << 164 << 165 /* 51 /* 166 * Install PTEs, to map dst_addr (within dst_v 52 * Install PTEs, to map dst_addr (within dst_vma) to page. 167 * 53 * 168 * This function handles both MCOPY_ATOMIC_NOR 54 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem 169 * and anon, and for both shared and private V 55 * and anon, and for both shared and private VMAs. 170 */ 56 */ 171 int mfill_atomic_install_pte(pmd_t *dst_pmd, !! 57 int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, 172 struct vm_area_st 58 struct vm_area_struct *dst_vma, 173 unsigned long dst 59 unsigned long dst_addr, struct page *page, 174 bool newly_alloca !! 60 bool newly_allocated, bool wp_copy) 175 { 61 { 176 int ret; 62 int ret; 177 struct mm_struct *dst_mm = dst_vma->vm << 178 pte_t _dst_pte, *dst_pte; 63 pte_t _dst_pte, *dst_pte; 179 bool writable = dst_vma->vm_flags & VM 64 bool writable = dst_vma->vm_flags & VM_WRITE; 180 bool vm_shared = dst_vma->vm_flags & V 65 bool vm_shared = dst_vma->vm_flags & VM_SHARED; >> 66 bool page_in_cache = page->mapping; 181 spinlock_t *ptl; 67 spinlock_t *ptl; 182 struct folio *folio = page_folio(page) !! 68 struct inode *inode; 183 bool page_in_cache = folio_mapping(fol !! 
69 pgoff_t offset, max_off; 184 70 185 _dst_pte = mk_pte(page, dst_vma->vm_pa 71 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 186 _dst_pte = pte_mkdirty(_dst_pte); 72 _dst_pte = pte_mkdirty(_dst_pte); 187 if (page_in_cache && !vm_shared) 73 if (page_in_cache && !vm_shared) 188 writable = false; 74 writable = false; 189 if (writable) !! 75 190 _dst_pte = pte_mkwrite(_dst_pt !! 76 /* 191 if (flags & MFILL_ATOMIC_WP) !! 77 * Always mark a PTE as write-protected when needed, regardless of >> 78 * VM_WRITE, which the user might change. >> 79 */ >> 80 if (wp_copy) 192 _dst_pte = pte_mkuffd_wp(_dst_ 81 _dst_pte = pte_mkuffd_wp(_dst_pte); >> 82 else if (writable) >> 83 _dst_pte = pte_mkwrite(_dst_pte); 193 84 194 ret = -EAGAIN; << 195 dst_pte = pte_offset_map_lock(dst_mm, 85 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 196 if (!dst_pte) << 197 goto out; << 198 86 199 if (mfill_file_over_size(dst_vma, dst_ !! 87 if (vma_is_shmem(dst_vma)) { >> 88 /* serialize against truncate with the page table lock */ >> 89 inode = dst_vma->vm_file->f_inode; >> 90 offset = linear_page_index(dst_vma, dst_addr); >> 91 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 200 ret = -EFAULT; 92 ret = -EFAULT; 201 goto out_unlock; !! 93 if (unlikely(offset >= max_off)) >> 94 goto out_unlock; 202 } 95 } 203 96 204 ret = -EEXIST; 97 ret = -EEXIST; 205 /* !! 98 if (!pte_none(*dst_pte)) 206 * We allow to overwrite a pte marker: << 207 * registered, we firstly wr-protect a << 208 * page backing it, then access the pa << 209 */ << 210 if (!pte_none_mostly(ptep_get(dst_pte) << 211 goto out_unlock; 99 goto out_unlock; 212 100 213 if (page_in_cache) { !! 101 if (page_in_cache) 214 /* Usually, cache pages are al !! 102 page_add_file_rmap(page, false); 215 if (newly_allocated) !! 103 else 216 folio_add_lru(folio); !! 104 page_add_new_anon_rmap(page, dst_vma, dst_addr, false); 217 folio_add_file_rmap_pte(folio, << 218 } else { << 219 folio_add_new_anon_rmap(folio, << 220 folio_add_lru_vma(folio, dst_v << 221 } << 222 105 223 /* 106 /* 224 * Must happen after rmap, as mm_count 107 * Must happen after rmap, as mm_counter() checks mapping (via 225 * PageAnon()), which is set by __page 108 * PageAnon()), which is set by __page_set_anon_rmap(). 226 */ 109 */ 227 inc_mm_counter(dst_mm, mm_counter(foli !! 110 inc_mm_counter(dst_mm, mm_counter(page)); >> 111 >> 112 if (newly_allocated) >> 113 lru_cache_add_inactive_or_unevictable(page, dst_vma); 228 114 229 set_pte_at(dst_mm, dst_addr, dst_pte, 115 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 230 116 231 /* No need to invalidate - it was non- 117 /* No need to invalidate - it was non-present before */ 232 update_mmu_cache(dst_vma, dst_addr, ds 118 update_mmu_cache(dst_vma, dst_addr, dst_pte); 233 ret = 0; 119 ret = 0; 234 out_unlock: 120 out_unlock: 235 pte_unmap_unlock(dst_pte, ptl); 121 pte_unmap_unlock(dst_pte, ptl); 236 out: << 237 return ret; 122 return ret; 238 } 123 } 239 124 240 static int mfill_atomic_pte_copy(pmd_t *dst_pm !! 125 static int mcopy_atomic_pte(struct mm_struct *dst_mm, 241 struct vm_are !! 126 pmd_t *dst_pmd, 242 unsigned long !! 127 struct vm_area_struct *dst_vma, 243 unsigned long !! 128 unsigned long dst_addr, 244 uffd_flags_t !! 129 unsigned long src_addr, 245 struct folio !! 130 struct page **pagep, >> 131 bool wp_copy) 246 { 132 { 247 void *kaddr; !! 133 void *page_kaddr; 248 int ret; 134 int ret; 249 struct folio *folio; !! 135 struct page *page; 250 136 251 if (!*foliop) { !! 
137 if (!*pagep) { 252 ret = -ENOMEM; 138 ret = -ENOMEM; 253 folio = vma_alloc_folio(GFP_HI !! 139 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); 254 dst_ad !! 140 if (!page) 255 if (!folio) << 256 goto out; 141 goto out; 257 142 258 kaddr = kmap_local_folio(folio !! 143 page_kaddr = kmap_atomic(page); 259 /* !! 144 ret = copy_from_user(page_kaddr, 260 * The read mmap_lock is held !! 145 (const void __user *) src_addr, 261 * mmap_lock being read recurs << 262 * possible if a writer has ta << 263 * << 264 * process A thread 1 takes re << 265 * process A thread 2 calls mm << 266 * process B thread 1 takes pa << 267 * process B thread 2 calls mm << 268 * process A thread 1 blocks t << 269 * process B thread 1 blocks t << 270 * << 271 * Disable page faults to prev << 272 * and retry the copy outside << 273 */ << 274 pagefault_disable(); << 275 ret = copy_from_user(kaddr, (c << 276 PAGE_SIZE 146 PAGE_SIZE); 277 pagefault_enable(); !! 147 kunmap_atomic(page_kaddr); 278 kunmap_local(kaddr); << 279 148 280 /* fallback to copy_from_user 149 /* fallback to copy_from_user outside mmap_lock */ 281 if (unlikely(ret)) { 150 if (unlikely(ret)) { 282 ret = -ENOENT; 151 ret = -ENOENT; 283 *foliop = folio; !! 152 *pagep = page; 284 /* don't free the page 153 /* don't free the page */ 285 goto out; 154 goto out; 286 } 155 } 287 156 288 flush_dcache_folio(folio); !! 157 flush_dcache_page(page); 289 } else { 158 } else { 290 folio = *foliop; !! 159 page = *pagep; 291 *foliop = NULL; !! 160 *pagep = NULL; 292 } 161 } 293 162 294 /* 163 /* 295 * The memory barrier inside __folio_m !! 164 * The memory barrier inside __SetPageUptodate makes sure that 296 * preceding stores to the page conten 165 * preceding stores to the page contents become visible before 297 * the set_pte_at() write. 166 * the set_pte_at() write. 298 */ 167 */ 299 __folio_mark_uptodate(folio); !! 168 __SetPageUptodate(page); 300 169 301 ret = -ENOMEM; 170 ret = -ENOMEM; 302 if (mem_cgroup_charge(folio, dst_vma-> !! 171 if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL)) 303 goto out_release; 172 goto out_release; 304 173 305 ret = mfill_atomic_install_pte(dst_pmd !! 174 ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, 306 &folio- !! 175 page, true, wp_copy); 307 if (ret) 176 if (ret) 308 goto out_release; 177 goto out_release; 309 out: 178 out: 310 return ret; 179 return ret; 311 out_release: 180 out_release: 312 folio_put(folio); !! 181 put_page(page); 313 goto out; 182 goto out; 314 } 183 } 315 184 316 static int mfill_atomic_pte_zeroed_folio(pmd_t !! 185 static int mfill_zeropage_pte(struct mm_struct *dst_mm, 317 struc !! 186 pmd_t *dst_pmd, 318 unsig !! 187 struct vm_area_struct *dst_vma, 319 { !! 188 unsigned long dst_addr) 320 struct folio *folio; << 321 int ret = -ENOMEM; << 322 << 323 folio = vma_alloc_zeroed_movable_folio << 324 if (!folio) << 325 return ret; << 326 << 327 if (mem_cgroup_charge(folio, dst_vma-> << 328 goto out_put; << 329 << 330 /* << 331 * The memory barrier inside __folio_m << 332 * zeroing out the folio become visibl << 333 * using set_pte_at(). 
See do_anonymou << 334 */ << 335 __folio_mark_uptodate(folio); << 336 << 337 ret = mfill_atomic_install_pte(dst_pmd << 338 &folio- << 339 if (ret) << 340 goto out_put; << 341 << 342 return 0; << 343 out_put: << 344 folio_put(folio); << 345 return ret; << 346 } << 347 << 348 static int mfill_atomic_pte_zeropage(pmd_t *ds << 349 struct vm << 350 unsigned << 351 { 189 { 352 pte_t _dst_pte, *dst_pte; 190 pte_t _dst_pte, *dst_pte; 353 spinlock_t *ptl; 191 spinlock_t *ptl; 354 int ret; 192 int ret; 355 !! 193 pgoff_t offset, max_off; 356 if (mm_forbids_zeropage(dst_vma->vm_mm !! 194 struct inode *inode; 357 return mfill_atomic_pte_zeroed << 358 195 359 _dst_pte = pte_mkspecial(pfn_pte(my_ze 196 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), 360 dst_v 197 dst_vma->vm_page_prot)); 361 ret = -EAGAIN; !! 198 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 362 dst_pte = pte_offset_map_lock(dst_vma- !! 199 if (dst_vma->vm_file) { 363 if (!dst_pte) !! 200 /* the shmem MAP_PRIVATE case requires checking the i_size */ 364 goto out; !! 201 inode = dst_vma->vm_file->f_inode; 365 if (mfill_file_over_size(dst_vma, dst_ !! 202 offset = linear_page_index(dst_vma, dst_addr); >> 203 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 366 ret = -EFAULT; 204 ret = -EFAULT; 367 goto out_unlock; !! 205 if (unlikely(offset >= max_off)) >> 206 goto out_unlock; 368 } 207 } 369 ret = -EEXIST; 208 ret = -EEXIST; 370 if (!pte_none(ptep_get(dst_pte))) !! 209 if (!pte_none(*dst_pte)) 371 goto out_unlock; 210 goto out_unlock; 372 set_pte_at(dst_vma->vm_mm, dst_addr, d !! 211 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 373 /* No need to invalidate - it was non- 212 /* No need to invalidate - it was non-present before */ 374 update_mmu_cache(dst_vma, dst_addr, ds 213 update_mmu_cache(dst_vma, dst_addr, dst_pte); 375 ret = 0; 214 ret = 0; 376 out_unlock: 215 out_unlock: 377 pte_unmap_unlock(dst_pte, ptl); 216 pte_unmap_unlock(dst_pte, ptl); 378 out: << 379 return ret; 217 return ret; 380 } 218 } 381 219 382 /* Handles UFFDIO_CONTINUE for all shmem VMAs 220 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ 383 static int mfill_atomic_pte_continue(pmd_t *ds !! 221 static int mcontinue_atomic_pte(struct mm_struct *dst_mm, 384 struct vm !! 222 pmd_t *dst_pmd, 385 unsigned !! 223 struct vm_area_struct *dst_vma, 386 uffd_flag !! 224 unsigned long dst_addr, >> 225 bool wp_copy) 387 { 226 { 388 struct inode *inode = file_inode(dst_v 227 struct inode *inode = file_inode(dst_vma->vm_file); 389 pgoff_t pgoff = linear_page_index(dst_ 228 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 390 struct folio *folio; << 391 struct page *page; 229 struct page *page; 392 int ret; 230 int ret; 393 231 394 ret = shmem_get_folio(inode, pgoff, 0, !! 232 ret = shmem_getpage(inode, pgoff, &page, SGP_READ); 395 /* Our caller expects us to return -EF << 396 if (ret == -ENOENT) << 397 ret = -EFAULT; << 398 if (ret) 233 if (ret) 399 goto out; 234 goto out; 400 if (!folio) { !! 235 if (!page) { 401 ret = -EFAULT; 236 ret = -EFAULT; 402 goto out; 237 goto out; 403 } 238 } 404 239 405 page = folio_file_page(folio, pgoff); << 406 if (PageHWPoison(page)) { 240 if (PageHWPoison(page)) { 407 ret = -EIO; 241 ret = -EIO; 408 goto out_release; 242 goto out_release; 409 } 243 } 410 244 411 ret = mfill_atomic_install_pte(dst_pmd !! 245 ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, 412 page, f !! 
246 page, false, wp_copy); 413 if (ret) 247 if (ret) 414 goto out_release; 248 goto out_release; 415 249 416 folio_unlock(folio); !! 250 unlock_page(page); 417 ret = 0; 251 ret = 0; 418 out: 252 out: 419 return ret; 253 return ret; 420 out_release: 254 out_release: 421 folio_unlock(folio); !! 255 unlock_page(page); 422 folio_put(folio); !! 256 put_page(page); 423 goto out; 257 goto out; 424 } 258 } 425 259 426 /* Handles UFFDIO_POISON for all non-hugetlb V << 427 static int mfill_atomic_pte_poison(pmd_t *dst_ << 428 struct vm_a << 429 unsigned lo << 430 uffd_flags_ << 431 { << 432 int ret; << 433 struct mm_struct *dst_mm = dst_vma->vm << 434 pte_t _dst_pte, *dst_pte; << 435 spinlock_t *ptl; << 436 << 437 _dst_pte = make_pte_marker(PTE_MARKER_ << 438 ret = -EAGAIN; << 439 dst_pte = pte_offset_map_lock(dst_mm, << 440 if (!dst_pte) << 441 goto out; << 442 << 443 if (mfill_file_over_size(dst_vma, dst_ << 444 ret = -EFAULT; << 445 goto out_unlock; << 446 } << 447 << 448 ret = -EEXIST; << 449 /* Refuse to overwrite any PTE, even a << 450 if (!pte_none(ptep_get(dst_pte))) << 451 goto out_unlock; << 452 << 453 set_pte_at(dst_mm, dst_addr, dst_pte, << 454 << 455 /* No need to invalidate - it was non- << 456 update_mmu_cache(dst_vma, dst_addr, ds << 457 ret = 0; << 458 out_unlock: << 459 pte_unmap_unlock(dst_pte, ptl); << 460 out: << 461 return ret; << 462 } << 463 << 464 static pmd_t *mm_alloc_pmd(struct mm_struct *m 260 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) 465 { 261 { 466 pgd_t *pgd; 262 pgd_t *pgd; 467 p4d_t *p4d; 263 p4d_t *p4d; 468 pud_t *pud; 264 pud_t *pud; 469 265 470 pgd = pgd_offset(mm, address); 266 pgd = pgd_offset(mm, address); 471 p4d = p4d_alloc(mm, pgd, address); 267 p4d = p4d_alloc(mm, pgd, address); 472 if (!p4d) 268 if (!p4d) 473 return NULL; 269 return NULL; 474 pud = pud_alloc(mm, p4d, address); 270 pud = pud_alloc(mm, p4d, address); 475 if (!pud) 271 if (!pud) 476 return NULL; 272 return NULL; 477 /* 273 /* 478 * Note that we didn't run this becaus 274 * Note that we didn't run this because the pmd was 479 * missing, the *pmd may be already es 275 * missing, the *pmd may be already established and in 480 * turn it may also be a trans_huge_pm 276 * turn it may also be a trans_huge_pmd. 481 */ 277 */ 482 return pmd_alloc(mm, pud, address); 278 return pmd_alloc(mm, pud, address); 483 } 279 } 484 280 485 #ifdef CONFIG_HUGETLB_PAGE 281 #ifdef CONFIG_HUGETLB_PAGE 486 /* 282 /* 487 * mfill_atomic processing for HUGETLB vmas. !! 283 * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is 488 * called with either vma-lock or mmap_lock he !! 284 * called with mmap_lock held, it will release mmap_lock before returning. 489 * before returning. << 490 */ 285 */ 491 static __always_inline ssize_t mfill_atomic_hu !! 286 static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, 492 << 493 287 struct vm_area_struct *dst_vma, 494 288 unsigned long dst_start, 495 289 unsigned long src_start, 496 290 unsigned long len, 497 !! 291 enum mcopy_atomic_mode mode) 498 { 292 { 499 struct mm_struct *dst_mm = dst_vma->vm !! 293 int vm_shared = dst_vma->vm_flags & VM_SHARED; 500 ssize_t err; 294 ssize_t err; 501 pte_t *dst_pte; 295 pte_t *dst_pte; 502 unsigned long src_addr, dst_addr; 296 unsigned long src_addr, dst_addr; 503 long copied; 297 long copied; 504 struct folio *folio; !! 
298 struct page *page; 505 unsigned long vma_hpagesize; 299 unsigned long vma_hpagesize; 506 pgoff_t idx; 300 pgoff_t idx; 507 u32 hash; 301 u32 hash; 508 struct address_space *mapping; 302 struct address_space *mapping; 509 303 510 /* 304 /* 511 * There is no default zero huge page 305 * There is no default zero huge page for all huge page sizes as 512 * supported by hugetlb. A PMD_SIZE h 306 * supported by hugetlb. A PMD_SIZE huge pages may exist as used 513 * by THP. Since we can not reliably 307 * by THP. Since we can not reliably insert a zero page, this 514 * feature is not supported. 308 * feature is not supported. 515 */ 309 */ 516 if (uffd_flags_mode_is(flags, MFILL_AT !! 310 if (mode == MCOPY_ATOMIC_ZEROPAGE) { 517 up_read(&ctx->map_changing_loc !! 311 mmap_read_unlock(dst_mm); 518 uffd_mfill_unlock(dst_vma); << 519 return -EINVAL; 312 return -EINVAL; 520 } 313 } 521 314 522 src_addr = src_start; 315 src_addr = src_start; 523 dst_addr = dst_start; 316 dst_addr = dst_start; 524 copied = 0; 317 copied = 0; 525 folio = NULL; !! 318 page = NULL; 526 vma_hpagesize = vma_kernel_pagesize(ds 319 vma_hpagesize = vma_kernel_pagesize(dst_vma); 527 320 528 /* 321 /* 529 * Validate alignment based on huge pa 322 * Validate alignment based on huge page size 530 */ 323 */ 531 err = -EINVAL; 324 err = -EINVAL; 532 if (dst_start & (vma_hpagesize - 1) || 325 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) 533 goto out_unlock; 326 goto out_unlock; 534 327 535 retry: 328 retry: 536 /* 329 /* 537 * On routine entry dst_vma is set. I 330 * On routine entry dst_vma is set. If we had to drop mmap_lock and 538 * retry, dst_vma will be set to NULL 331 * retry, dst_vma will be set to NULL and we must lookup again. 539 */ 332 */ 540 if (!dst_vma) { 333 if (!dst_vma) { 541 dst_vma = uffd_mfill_lock(dst_ << 542 if (IS_ERR(dst_vma)) { << 543 err = PTR_ERR(dst_vma) << 544 goto out; << 545 } << 546 << 547 err = -ENOENT; 334 err = -ENOENT; 548 if (!is_vm_hugetlb_page(dst_vm !! 335 dst_vma = find_dst_vma(dst_mm, dst_start, len); 549 goto out_unlock_vma; !! 336 if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) >> 337 goto out_unlock; 550 338 551 err = -EINVAL; 339 err = -EINVAL; 552 if (vma_hpagesize != vma_kerne 340 if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) 553 goto out_unlock_vma; !! 341 goto out_unlock; 554 342 555 /* !! 343 vm_shared = dst_vma->vm_flags & VM_SHARED; 556 * If memory mappings are chan !! 344 } 557 * operation (e.g. mremap) run !! 345 558 * request the user to retry l !! 346 /* 559 */ !! 347 * If not shared, ensure the dst_vma has a anon_vma. 560 down_read(&ctx->map_changing_l !! 348 */ 561 err = -EAGAIN; !! 349 err = -ENOMEM; 562 if (atomic_read(&ctx->mmap_cha !! 350 if (!vm_shared) { >> 351 if (unlikely(anon_vma_prepare(dst_vma))) 563 goto out_unlock; 352 goto out_unlock; 564 } 353 } 565 354 566 while (src_addr < src_start + len) { 355 while (src_addr < src_start + len) { 567 BUG_ON(dst_addr >= dst_start + 356 BUG_ON(dst_addr >= dst_start + len); 568 357 569 /* 358 /* 570 * Serialize via vma_lock and !! 359 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. 571 * vma_lock ensures the dst_pt !! 360 * i_mmap_rwsem ensures the dst_pte remains valid even 572 * in the case of shared pmds. 361 * in the case of shared pmds. fault mutex prevents 573 * races with other faulting t 362 * races with other faulting threads. 
574 */ 363 */ 575 idx = linear_page_index(dst_vm << 576 mapping = dst_vma->vm_file->f_ 364 mapping = dst_vma->vm_file->f_mapping; >> 365 i_mmap_lock_read(mapping); >> 366 idx = linear_page_index(dst_vma, dst_addr); 577 hash = hugetlb_fault_mutex_has 367 hash = hugetlb_fault_mutex_hash(mapping, idx); 578 mutex_lock(&hugetlb_fault_mute 368 mutex_lock(&hugetlb_fault_mutex_table[hash]); 579 hugetlb_vma_lock_read(dst_vma) << 580 369 581 err = -ENOMEM; 370 err = -ENOMEM; 582 dst_pte = huge_pte_alloc(dst_m 371 dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); 583 if (!dst_pte) { 372 if (!dst_pte) { 584 hugetlb_vma_unlock_rea << 585 mutex_unlock(&hugetlb_ 373 mutex_unlock(&hugetlb_fault_mutex_table[hash]); >> 374 i_mmap_unlock_read(mapping); 586 goto out_unlock; 375 goto out_unlock; 587 } 376 } 588 377 589 if (!uffd_flags_mode_is(flags, !! 378 if (mode != MCOPY_ATOMIC_CONTINUE && 590 !huge_pte_none_mostly(huge !! 379 !huge_pte_none(huge_ptep_get(dst_pte))) { 591 err = -EEXIST; 380 err = -EEXIST; 592 hugetlb_vma_unlock_rea << 593 mutex_unlock(&hugetlb_ 381 mutex_unlock(&hugetlb_fault_mutex_table[hash]); >> 382 i_mmap_unlock_read(mapping); 594 goto out_unlock; 383 goto out_unlock; 595 } 384 } 596 385 597 err = hugetlb_mfill_atomic_pte !! 386 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, 598 !! 387 dst_addr, src_addr, mode, &page); 599 388 600 hugetlb_vma_unlock_read(dst_vm << 601 mutex_unlock(&hugetlb_fault_mu 389 mutex_unlock(&hugetlb_fault_mutex_table[hash]); >> 390 i_mmap_unlock_read(mapping); 602 391 603 cond_resched(); 392 cond_resched(); 604 393 605 if (unlikely(err == -ENOENT)) 394 if (unlikely(err == -ENOENT)) { 606 up_read(&ctx->map_chan !! 395 mmap_read_unlock(dst_mm); 607 uffd_mfill_unlock(dst_ !! 396 BUG_ON(!page); 608 BUG_ON(!folio); << 609 397 610 err = copy_folio_from_ !! 398 err = copy_huge_page_from_user(page, 611 !! 399 (const void __user *)src_addr, >> 400 vma_hpagesize / PAGE_SIZE, >> 401 true); 612 if (unlikely(err)) { 402 if (unlikely(err)) { 613 err = -EFAULT; 403 err = -EFAULT; 614 goto out; 404 goto out; 615 } 405 } >> 406 mmap_read_lock(dst_mm); 616 407 617 dst_vma = NULL; 408 dst_vma = NULL; 618 goto retry; 409 goto retry; 619 } else 410 } else 620 BUG_ON(folio); !! 411 BUG_ON(page); 621 412 622 if (!err) { 413 if (!err) { 623 dst_addr += vma_hpages 414 dst_addr += vma_hpagesize; 624 src_addr += vma_hpages 415 src_addr += vma_hpagesize; 625 copied += vma_hpagesiz 416 copied += vma_hpagesize; 626 417 627 if (fatal_signal_pendi 418 if (fatal_signal_pending(current)) 628 err = -EINTR; 419 err = -EINTR; 629 } 420 } 630 if (err) 421 if (err) 631 break; 422 break; 632 } 423 } 633 424 634 out_unlock: 425 out_unlock: 635 up_read(&ctx->map_changing_lock); !! 426 mmap_read_unlock(dst_mm); 636 out_unlock_vma: << 637 uffd_mfill_unlock(dst_vma); << 638 out: 427 out: 639 if (folio) !! 428 if (page) 640 folio_put(folio); !! 429 put_page(page); 641 BUG_ON(copied < 0); 430 BUG_ON(copied < 0); 642 BUG_ON(err > 0); 431 BUG_ON(err > 0); 643 BUG_ON(!copied && !err); 432 BUG_ON(!copied && !err); 644 return copied ? copied : err; 433 return copied ? copied : err; 645 } 434 } 646 #else /* !CONFIG_HUGETLB_PAGE */ 435 #else /* !CONFIG_HUGETLB_PAGE */ 647 /* fail at build time if gcc attempts to use t 436 /* fail at build time if gcc attempts to use this */ 648 extern ssize_t mfill_atomic_hugetlb(struct use !! 437 extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, 649 struct vm_ !! 438 struct vm_area_struct *dst_vma, 650 unsigned l !! 
439 unsigned long dst_start, 651 unsigned l !! 440 unsigned long src_start, 652 unsigned l !! 441 unsigned long len, 653 uffd_flags !! 442 enum mcopy_atomic_mode mode); 654 #endif /* CONFIG_HUGETLB_PAGE */ 443 #endif /* CONFIG_HUGETLB_PAGE */ 655 444 656 static __always_inline ssize_t mfill_atomic_pt !! 445 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, >> 446 pmd_t *dst_pmd, 657 447 struct vm_area_struct *dst_vma, 658 448 unsigned long dst_addr, 659 449 unsigned long src_addr, 660 !! 450 struct page **page, 661 !! 451 enum mcopy_atomic_mode mode, >> 452 bool wp_copy) 662 { 453 { 663 ssize_t err; 454 ssize_t err; 664 455 665 if (uffd_flags_mode_is(flags, MFILL_AT !! 456 if (mode == MCOPY_ATOMIC_CONTINUE) { 666 return mfill_atomic_pte_contin !! 457 return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, 667 !! 458 wp_copy); 668 } else if (uffd_flags_mode_is(flags, M << 669 return mfill_atomic_pte_poison << 670 << 671 } 459 } 672 460 673 /* 461 /* 674 * The normal page fault path for a sh 462 * The normal page fault path for a shmem will invoke the 675 * fault, fill the hole in the file an 463 * fault, fill the hole in the file and COW it right away. The 676 * result generates plain anonymous me 464 * result generates plain anonymous memory. So when we are 677 * asked to fill an hole in a MAP_PRIV 465 * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll 678 * generate anonymous memory directly 466 * generate anonymous memory directly without actually filling 679 * the hole. For the MAP_PRIVATE case 467 * the hole. For the MAP_PRIVATE case the robustness check 680 * only happens in the pagetable (to v 468 * only happens in the pagetable (to verify it's still none) 681 * and not in the radix tree. 469 * and not in the radix tree. 682 */ 470 */ 683 if (!(dst_vma->vm_flags & VM_SHARED)) 471 if (!(dst_vma->vm_flags & VM_SHARED)) { 684 if (uffd_flags_mode_is(flags, !! 472 if (mode == MCOPY_ATOMIC_NORMAL) 685 err = mfill_atomic_pte !! 473 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, 686 !! 474 dst_addr, src_addr, page, 687 !! 475 wp_copy); 688 else 476 else 689 err = mfill_atomic_pte !! 477 err = mfill_zeropage_pte(dst_mm, dst_pmd, 690 478 dst_vma, dst_addr); 691 } else { 479 } else { 692 err = shmem_mfill_atomic_pte(d !! 480 VM_WARN_ON_ONCE(wp_copy); >> 481 err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, 693 d 482 dst_addr, src_addr, 694 f !! 483 mode != MCOPY_ATOMIC_NORMAL, >> 484 page); 695 } 485 } 696 486 697 return err; 487 return err; 698 } 488 } 699 489 700 static __always_inline ssize_t mfill_atomic(st !! 490 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, 701 un !! 491 unsigned long dst_start, 702 un !! 492 unsigned long src_start, 703 un !! 493 unsigned long len, 704 uf !! 494 enum mcopy_atomic_mode mcopy_mode, >> 495 atomic_t *mmap_changing, >> 496 __u64 mode) 705 { 497 { 706 struct mm_struct *dst_mm = ctx->mm; << 707 struct vm_area_struct *dst_vma; 498 struct vm_area_struct *dst_vma; 708 ssize_t err; 499 ssize_t err; 709 pmd_t *dst_pmd; 500 pmd_t *dst_pmd; 710 unsigned long src_addr, dst_addr; 501 unsigned long src_addr, dst_addr; 711 long copied; 502 long copied; 712 struct folio *folio; !! 
503 struct page *page; >> 504 bool wp_copy; 713 505 714 /* 506 /* 715 * Sanitize the command parameters: 507 * Sanitize the command parameters: 716 */ 508 */ 717 BUG_ON(dst_start & ~PAGE_MASK); 509 BUG_ON(dst_start & ~PAGE_MASK); 718 BUG_ON(len & ~PAGE_MASK); 510 BUG_ON(len & ~PAGE_MASK); 719 511 720 /* Does the address range wrap, or is 512 /* Does the address range wrap, or is the span zero-sized? */ 721 BUG_ON(src_start + len <= src_start); 513 BUG_ON(src_start + len <= src_start); 722 BUG_ON(dst_start + len <= dst_start); 514 BUG_ON(dst_start + len <= dst_start); 723 515 724 src_addr = src_start; 516 src_addr = src_start; 725 dst_addr = dst_start; 517 dst_addr = dst_start; 726 copied = 0; 518 copied = 0; 727 folio = NULL; !! 519 page = NULL; 728 retry: 520 retry: 729 /* !! 521 mmap_read_lock(dst_mm); 730 * Make sure the vma is not shared, th << 731 * both valid and fully within a singl << 732 */ << 733 dst_vma = uffd_mfill_lock(dst_mm, dst_ << 734 if (IS_ERR(dst_vma)) { << 735 err = PTR_ERR(dst_vma); << 736 goto out; << 737 } << 738 522 739 /* 523 /* 740 * If memory mappings are changing bec 524 * If memory mappings are changing because of non-cooperative 741 * operation (e.g. mremap) running in 525 * operation (e.g. mremap) running in parallel, bail out and 742 * request the user to retry later 526 * request the user to retry later 743 */ 527 */ 744 down_read(&ctx->map_changing_lock); << 745 err = -EAGAIN; 528 err = -EAGAIN; 746 if (atomic_read(&ctx->mmap_changing)) !! 529 if (mmap_changing && atomic_read(mmap_changing)) >> 530 goto out_unlock; >> 531 >> 532 /* >> 533 * Make sure the vma is not shared, that the dst range is >> 534 * both valid and fully within a single existing vma. >> 535 */ >> 536 err = -ENOENT; >> 537 dst_vma = find_dst_vma(dst_mm, dst_start, len); >> 538 if (!dst_vma) 747 goto out_unlock; 539 goto out_unlock; 748 540 749 err = -EINVAL; 541 err = -EINVAL; 750 /* 542 /* 751 * shmem_zero_setup is invoked in mmap 543 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but 752 * it will overwrite vm_ops, so vma_is 544 * it will overwrite vm_ops, so vma_is_anonymous must return false. 753 */ 545 */ 754 if (WARN_ON_ONCE(vma_is_anonymous(dst_ 546 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && 755 dst_vma->vm_flags & VM_SHARED)) 547 dst_vma->vm_flags & VM_SHARED)) 756 goto out_unlock; 548 goto out_unlock; 757 549 758 /* 550 /* 759 * validate 'mode' now that we know th 551 * validate 'mode' now that we know the dst_vma: don't allow 760 * a wrprotect copy if the userfaultfd 552 * a wrprotect copy if the userfaultfd didn't register as WP. 761 */ 553 */ 762 if ((flags & MFILL_ATOMIC_WP) && !(dst !! 554 wp_copy = mode & UFFDIO_COPY_MODE_WP; >> 555 if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP)) 763 goto out_unlock; 556 goto out_unlock; 764 557 765 /* 558 /* 766 * If this is a HUGETLB vma, pass off 559 * If this is a HUGETLB vma, pass off to appropriate routine 767 */ 560 */ 768 if (is_vm_hugetlb_page(dst_vma)) 561 if (is_vm_hugetlb_page(dst_vma)) 769 return mfill_atomic_hugetlb(c !! 562 return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, 770 s !! 563 src_start, len, mcopy_mode); 771 564 772 if (!vma_is_anonymous(dst_vma) && !vma 565 if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) 773 goto out_unlock; 566 goto out_unlock; 774 if (!vma_is_shmem(dst_vma) && !! 567 if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE) 775 uffd_flags_mode_is(flags, MFILL_AT !! 
568 goto out_unlock; >> 569 >> 570 /* >> 571 * Ensure the dst_vma has a anon_vma or this page >> 572 * would get a NULL anon_vma when moved in the >> 573 * dst_vma. >> 574 */ >> 575 err = -ENOMEM; >> 576 if (!(dst_vma->vm_flags & VM_SHARED) && >> 577 unlikely(anon_vma_prepare(dst_vma))) 776 goto out_unlock; 578 goto out_unlock; 777 579 778 while (src_addr < src_start + len) { 580 while (src_addr < src_start + len) { 779 pmd_t dst_pmdval; 581 pmd_t dst_pmdval; 780 582 781 BUG_ON(dst_addr >= dst_start + 583 BUG_ON(dst_addr >= dst_start + len); 782 584 783 dst_pmd = mm_alloc_pmd(dst_mm, 585 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); 784 if (unlikely(!dst_pmd)) { 586 if (unlikely(!dst_pmd)) { 785 err = -ENOMEM; 587 err = -ENOMEM; 786 break; 588 break; 787 } 589 } 788 590 789 dst_pmdval = pmdp_get_lockless !! 591 dst_pmdval = pmd_read_atomic(dst_pmd); 790 if (unlikely(pmd_none(dst_pmdv << 791 unlikely(__pte_alloc(dst_m << 792 err = -ENOMEM; << 793 break; << 794 } << 795 dst_pmdval = pmdp_get_lockless << 796 /* 592 /* 797 * If the dst_pmd is THP don't !! 593 * If the dst_pmd is mapped as THP don't 798 * (This includes the case whe !! 594 * override it and just be strict. 799 * changed back to none after << 800 */ 595 */ 801 if (unlikely(!pmd_present(dst_ !! 596 if (unlikely(pmd_trans_huge(dst_pmdval))) { 802 pmd_devmap(dst_pm << 803 err = -EEXIST; 597 err = -EEXIST; 804 break; 598 break; 805 } 599 } 806 if (unlikely(pmd_bad(dst_pmdva !! 600 if (unlikely(pmd_none(dst_pmdval)) && >> 601 unlikely(__pte_alloc(dst_mm, dst_pmd))) { >> 602 err = -ENOMEM; >> 603 break; >> 604 } >> 605 /* If an huge pmd materialized from under us fail */ >> 606 if (unlikely(pmd_trans_huge(*dst_pmd))) { 807 err = -EFAULT; 607 err = -EFAULT; 808 break; 608 break; 809 } 609 } 810 /* << 811 * For shmem mappings, khugepa << 812 * tables under us; pte_offset << 813 */ << 814 610 815 err = mfill_atomic_pte(dst_pmd !! 611 BUG_ON(pmd_none(*dst_pmd)); 816 src_add !! 612 BUG_ON(pmd_trans_huge(*dst_pmd)); >> 613 >> 614 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, >> 615 src_addr, &page, mcopy_mode, wp_copy); 817 cond_resched(); 616 cond_resched(); 818 617 819 if (unlikely(err == -ENOENT)) 618 if (unlikely(err == -ENOENT)) { 820 void *kaddr; !! 619 void *page_kaddr; 821 620 822 up_read(&ctx->map_chan !! 621 mmap_read_unlock(dst_mm); 823 uffd_mfill_unlock(dst_ !! 622 BUG_ON(!page); 824 BUG_ON(!folio); << 825 623 826 kaddr = kmap_local_fol !! 624 page_kaddr = kmap(page); 827 err = copy_from_user(k !! 625 err = copy_from_user(page_kaddr, 828 ( 626 (const void __user *) src_addr, 829 P 627 PAGE_SIZE); 830 kunmap_local(kaddr); !! 628 kunmap(page); 831 if (unlikely(err)) { 629 if (unlikely(err)) { 832 err = -EFAULT; 630 err = -EFAULT; 833 goto out; 631 goto out; 834 } 632 } 835 flush_dcache_folio(fol !! 633 flush_dcache_page(page); 836 goto retry; 634 goto retry; 837 } else 635 } else 838 BUG_ON(folio); !! 636 BUG_ON(page); 839 637 840 if (!err) { 638 if (!err) { 841 dst_addr += PAGE_SIZE; 639 dst_addr += PAGE_SIZE; 842 src_addr += PAGE_SIZE; 640 src_addr += PAGE_SIZE; 843 copied += PAGE_SIZE; 641 copied += PAGE_SIZE; 844 642 845 if (fatal_signal_pendi 643 if (fatal_signal_pending(current)) 846 err = -EINTR; 644 err = -EINTR; 847 } 645 } 848 if (err) 646 if (err) 849 break; 647 break; 850 } 648 } 851 649 852 out_unlock: 650 out_unlock: 853 up_read(&ctx->map_changing_lock); !! 651 mmap_read_unlock(dst_mm); 854 uffd_mfill_unlock(dst_vma); << 855 out: 652 out: 856 if (folio) !! 653 if (page) 857 folio_put(folio); !! 
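/*
 * Illustrative userspace sketch (not part of mm/userfaultfd.c): the
 * mfill_atomic() engine around here is reached through the UFFDIO_COPY
 * ioctl (via mfill_atomic_copy() below).  Assumes @uffd is a userfaultfd
 * file descriptor already registered with UFFDIO_REGISTER_MODE_MISSING
 * over the destination range; the helper name and error policy are made
 * up for the example.
 */
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int resolve_missing_fault(int uffd, unsigned long dst,
				 unsigned long src, unsigned long len)
{
	struct uffdio_copy copy;

	for (;;) {
		copy.dst  = dst;	/* must be page aligned */
		copy.src  = src;
		copy.len  = len;
		copy.mode = 0;		/* or UFFDIO_COPY_MODE_WP to map it write-protected */
		copy.copy = 0;

		if (!ioctl(uffd, UFFDIO_COPY, &copy))
			return 0;	/* whole range filled, faulting threads woken */
		if (errno != EAGAIN)
			return -errno;	/* e.g. EEXIST: a page was already mapped there */
		/*
		 * -EAGAIN corresponds to the mmap_changing checks in this
		 * file: a non-cooperative event is changing the address
		 * space.  Retry, skipping any partial progress the kernel
		 * reported back in copy.copy (a real handler might back off
		 * here instead of spinning).
		 */
		if (copy.copy > 0) {
			dst += copy.copy;
			src += copy.copy;
			len -= copy.copy;
		}
	}
}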
654 put_page(page); 858 BUG_ON(copied < 0); 655 BUG_ON(copied < 0); 859 BUG_ON(err > 0); 656 BUG_ON(err > 0); 860 BUG_ON(!copied && !err); 657 BUG_ON(!copied && !err); 861 return copied ? copied : err; 658 return copied ? copied : err; 862 } 659 } 863 660 864 ssize_t mfill_atomic_copy(struct userfaultfd_c !! 661 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, 865 unsigned long src_st !! 662 unsigned long src_start, unsigned long len, 866 uffd_flags_t flags) !! 663 atomic_t *mmap_changing, __u64 mode) 867 { 664 { 868 return mfill_atomic(ctx, dst_start, sr !! 665 return __mcopy_atomic(dst_mm, dst_start, src_start, len, 869 uffd_flags_set_mod !! 666 MCOPY_ATOMIC_NORMAL, mmap_changing, mode); 870 } << 871 << 872 ssize_t mfill_atomic_zeropage(struct userfault << 873 unsigned long st << 874 unsigned long le << 875 { << 876 return mfill_atomic(ctx, start, 0, len << 877 uffd_flags_set_mod << 878 } << 879 << 880 ssize_t mfill_atomic_continue(struct userfault << 881 unsigned long le << 882 { << 883 << 884 /* << 885 * A caller might reasonably assume th << 886 * smp_wmb() to ensure that any writes << 887 * the thread doing the UFFDIO_CONTINU << 888 * subsequent loads from the page thro << 889 */ << 890 smp_wmb(); << 891 << 892 return mfill_atomic(ctx, start, 0, len << 893 uffd_flags_set_mod << 894 } 667 } 895 668 896 ssize_t mfill_atomic_poison(struct userfaultfd !! 669 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, 897 unsigned long len, !! 670 unsigned long len, atomic_t *mmap_changing) 898 { 671 { 899 return mfill_atomic(ctx, start, 0, len !! 672 return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, 900 uffd_flags_set_mod !! 673 mmap_changing, 0); 901 } 674 } 902 675 903 long uffd_wp_range(struct vm_area_struct *dst_ !! 676 ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, 904 unsigned long start, unsign !! 677 unsigned long len, atomic_t *mmap_changing) 905 { 678 { 906 unsigned int mm_cp_flags; !! 679 return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, 907 struct mmu_gather tlb; !! 680 mmap_changing, 0); 908 long ret; << 909 << 910 VM_WARN_ONCE(start < dst_vma->vm_start << 911 "The address range exc << 912 if (enable_wp) << 913 mm_cp_flags = MM_CP_UFFD_WP; << 914 else << 915 mm_cp_flags = MM_CP_UFFD_WP_RE << 916 << 917 /* << 918 * vma->vm_page_prot already reflects << 919 * VMA (see userfaultfd_set_vm_flags() << 920 * to be write-protected as default wh << 921 * Try upgrading write permissions man << 922 */ << 923 if (!enable_wp && vma_wants_manual_pte << 924 mm_cp_flags |= MM_CP_TRY_CHANG << 925 tlb_gather_mmu(&tlb, dst_vma->vm_mm); << 926 ret = change_protection(&tlb, dst_vma, << 927 tlb_finish_mmu(&tlb); << 928 << 929 return ret; << 930 } 681 } 931 682 932 int mwriteprotect_range(struct userfaultfd_ctx !! 683 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, 933 unsigned long len, boo !! 684 unsigned long len, bool enable_wp, >> 685 atomic_t *mmap_changing) 934 { 686 { 935 struct mm_struct *dst_mm = ctx->mm; << 936 unsigned long end = start + len; << 937 unsigned long _start, _end; << 938 struct vm_area_struct *dst_vma; 687 struct vm_area_struct *dst_vma; 939 unsigned long page_mask; !! 688 pgprot_t newprot; 940 long err; !! 
689 int err; 941 VMA_ITERATOR(vmi, dst_mm, start); << 942 690 943 /* 691 /* 944 * Sanitize the command parameters: 692 * Sanitize the command parameters: 945 */ 693 */ 946 BUG_ON(start & ~PAGE_MASK); 694 BUG_ON(start & ~PAGE_MASK); 947 BUG_ON(len & ~PAGE_MASK); 695 BUG_ON(len & ~PAGE_MASK); 948 696 949 /* Does the address range wrap, or is 697 /* Does the address range wrap, or is the span zero-sized? */ 950 BUG_ON(start + len <= start); 698 BUG_ON(start + len <= start); 951 699 952 mmap_read_lock(dst_mm); 700 mmap_read_lock(dst_mm); 953 701 954 /* 702 /* 955 * If memory mappings are changing bec 703 * If memory mappings are changing because of non-cooperative 956 * operation (e.g. mremap) running in 704 * operation (e.g. mremap) running in parallel, bail out and 957 * request the user to retry later 705 * request the user to retry later 958 */ 706 */ 959 down_read(&ctx->map_changing_lock); << 960 err = -EAGAIN; 707 err = -EAGAIN; 961 if (atomic_read(&ctx->mmap_changing)) !! 708 if (mmap_changing && atomic_read(mmap_changing)) 962 goto out_unlock; 709 goto out_unlock; 963 710 964 err = -ENOENT; 711 err = -ENOENT; 965 for_each_vma_range(vmi, dst_vma, end) !! 712 dst_vma = find_dst_vma(dst_mm, start, len); 966 << 967 if (!userfaultfd_wp(dst_vma)) << 968 err = -ENOENT; << 969 break; << 970 } << 971 << 972 if (is_vm_hugetlb_page(dst_vma << 973 err = -EINVAL; << 974 page_mask = vma_kernel << 975 if ((start & page_mask << 976 break; << 977 } << 978 << 979 _start = max(dst_vma->vm_start << 980 _end = min(dst_vma->vm_end, en << 981 << 982 err = uffd_wp_range(dst_vma, _ << 983 << 984 /* Return 0 on success, <0 on << 985 if (err < 0) << 986 break; << 987 err = 0; << 988 } << 989 out_unlock: << 990 up_read(&ctx->map_changing_lock); << 991 mmap_read_unlock(dst_mm); << 992 return err; << 993 } << 994 << 995 << 996 void double_pt_lock(spinlock_t *ptl1, << 997 spinlock_t *ptl2) << 998 __acquires(ptl1) << 999 __acquires(ptl2) << 1000 { << 1001 if (ptl1 > ptl2) << 1002 swap(ptl1, ptl2); << 1003 /* lock in virtual address order to a << 1004 spin_lock(ptl1); << 1005 if (ptl1 != ptl2) << 1006 spin_lock_nested(ptl2, SINGLE << 1007 else << 1008 __acquire(ptl2); << 1009 } << 1010 << 1011 void double_pt_unlock(spinlock_t *ptl1, << 1012 spinlock_t *ptl2) << 1013 __releases(ptl1) << 1014 __releases(ptl2) << 1015 { << 1016 spin_unlock(ptl1); << 1017 if (ptl1 != ptl2) << 1018 spin_unlock(ptl2); << 1019 else << 1020 __release(ptl2); << 1021 } << 1022 << 1023 << 1024 static int move_present_pte(struct mm_struct << 1025 struct vm_area_st << 1026 struct vm_area_st << 1027 unsigned long dst << 1028 pte_t *dst_pte, p << 1029 pte_t orig_dst_pt << 1030 spinlock_t *dst_p << 1031 struct folio *src << 1032 { << 1033 int err = 0; << 1034 << 1035 double_pt_lock(dst_ptl, src_ptl); << 1036 << 1037 if (!pte_same(ptep_get(src_pte), orig << 1038 !pte_same(ptep_get(dst_pte), orig << 1039 err = -EAGAIN; << 1040 goto out; << 1041 } << 1042 if (folio_test_large(src_folio) || << 1043 folio_maybe_dma_pinned(src_folio) << 1044 !PageAnonExclusive(&src_folio->pa << 1045 err = -EBUSY; << 1046 goto out; << 1047 } << 1048 << 1049 orig_src_pte = ptep_clear_flush(src_v << 1050 /* Folio got pinned from under us. 
Pu << 1051 if (folio_maybe_dma_pinned(src_folio) << 1052 set_pte_at(mm, src_addr, src_ << 1053 err = -EBUSY; << 1054 goto out; << 1055 } << 1056 << 1057 folio_move_anon_rmap(src_folio, dst_v << 1058 src_folio->index = linear_page_index( << 1059 << 1060 orig_dst_pte = mk_pte(&src_folio->pag << 1061 /* Follow mremap() behavior and treat << 1062 orig_dst_pte = pte_mkwrite(pte_mkdirt << 1063 << 1064 set_pte_at(mm, dst_addr, dst_pte, ori << 1065 out: << 1066 double_pt_unlock(dst_ptl, src_ptl); << 1067 return err; << 1068 } << 1069 << 1070 static int move_swap_pte(struct mm_struct *mm << 1071 unsigned long dst_ad << 1072 pte_t *dst_pte, pte_ << 1073 pte_t orig_dst_pte, << 1074 spinlock_t *dst_ptl, << 1075 { << 1076 if (!pte_swp_exclusive(orig_src_pte)) << 1077 return -EBUSY; << 1078 << 1079 double_pt_lock(dst_ptl, src_ptl); << 1080 << 1081 if (!pte_same(ptep_get(src_pte), orig << 1082 !pte_same(ptep_get(dst_pte), orig << 1083 double_pt_unlock(dst_ptl, src << 1084 return -EAGAIN; << 1085 } << 1086 << 1087 orig_src_pte = ptep_get_and_clear(mm, << 1088 set_pte_at(mm, dst_addr, dst_pte, ori << 1089 double_pt_unlock(dst_ptl, src_ptl); << 1090 << 1091 return 0; << 1092 } << 1093 << 1094 static int move_zeropage_pte(struct mm_struct << 1095 struct vm_area_s << 1096 struct vm_area_s << 1097 unsigned long ds << 1098 pte_t *dst_pte, << 1099 pte_t orig_dst_p << 1100 spinlock_t *dst_ << 1101 { << 1102 pte_t zero_pte; << 1103 << 1104 double_pt_lock(dst_ptl, src_ptl); << 1105 if (!pte_same(ptep_get(src_pte), orig << 1106 !pte_same(ptep_get(dst_pte), orig << 1107 double_pt_unlock(dst_ptl, src << 1108 return -EAGAIN; << 1109 } << 1110 << 1111 zero_pte = pte_mkspecial(pfn_pte(my_z << 1112 dst_ << 1113 ptep_clear_flush(src_vma, src_addr, s << 1114 set_pte_at(mm, dst_addr, dst_pte, zer << 1115 double_pt_unlock(dst_ptl, src_ptl); << 1116 << 1117 return 0; << 1118 } << 1119 << 1120 << 1121 /* << 1122 * The mmap_lock for reading is held by the c << 1123 * from src_pmd to dst_pmd if possible, and r << 1124 * in moving the page. << 1125 */ << 1126 static int move_pages_pte(struct mm_struct *m << 1127 struct vm_area_stru << 1128 struct vm_area_stru << 1129 unsigned long dst_a << 1130 __u64 mode) << 1131 { << 1132 swp_entry_t entry; << 1133 pte_t orig_src_pte, orig_dst_pte; << 1134 pte_t src_folio_pte; << 1135 spinlock_t *src_ptl, *dst_ptl; << 1136 pte_t *src_pte = NULL; << 1137 pte_t *dst_pte = NULL; << 1138 << 1139 struct folio *src_folio = NULL; << 1140 struct anon_vma *src_anon_vma = NULL; << 1141 struct mmu_notifier_range range; << 1142 int err = 0; << 1143 << 1144 flush_cache_range(src_vma, src_addr, << 1145 mmu_notifier_range_init(&range, MMU_N << 1146 src_addr, src << 1147 mmu_notifier_invalidate_range_start(& << 1148 retry: << 1149 dst_pte = pte_offset_map_nolock(mm, d << 1150 << 1151 /* Retry if a huge pmd materialized f << 1152 if (unlikely(!dst_pte)) { << 1153 err = -EAGAIN; << 1154 goto out; << 1155 } << 1156 << 1157 src_pte = pte_offset_map_nolock(mm, s << 1158 << 1159 /* << 1160 * We held the mmap_lock for reading << 1161 * can zap transparent huge pages und << 1162 * transparent huge page fault can es << 1163 * transparent huge pages under us. 
<< 1164 */ << 1165 if (unlikely(!src_pte)) { << 1166 err = -EAGAIN; << 1167 goto out; << 1168 } << 1169 << 1170 /* Sanity checks before the operation << 1171 if (WARN_ON_ONCE(pmd_none(*dst_pmd)) << 1172 WARN_ON_ONCE(pmd_trans_huge(*dst_ << 1173 err = -EINVAL; << 1174 goto out; << 1175 } << 1176 << 1177 spin_lock(dst_ptl); << 1178 orig_dst_pte = ptep_get(dst_pte); << 1179 spin_unlock(dst_ptl); << 1180 if (!pte_none(orig_dst_pte)) { << 1181 err = -EEXIST; << 1182 goto out; << 1183 } << 1184 << 1185 spin_lock(src_ptl); << 1186 orig_src_pte = ptep_get(src_pte); << 1187 spin_unlock(src_ptl); << 1188 if (pte_none(orig_src_pte)) { << 1189 if (!(mode & UFFDIO_MOVE_MODE << 1190 err = -ENOENT; << 1191 else /* nothing to do to move << 1192 err = 0; << 1193 goto out; << 1194 } << 1195 << 1196 /* If PTE changed after we locked the << 1197 if (src_folio && unlikely(!pte_same(s << 1198 err = -EAGAIN; << 1199 goto out; << 1200 } << 1201 << 1202 if (pte_present(orig_src_pte)) { << 1203 if (is_zero_pfn(pte_pfn(orig_ << 1204 err = move_zeropage_p << 1205 << 1206 << 1207 << 1208 goto out; << 1209 } << 1210 << 1211 /* << 1212 * Pin and lock both source f << 1213 * RCU read section, we can't << 1214 * unmap the ptes, obtain the << 1215 */ << 1216 if (!src_folio) { << 1217 struct folio *folio; << 1218 << 1219 /* << 1220 * Pin the page while << 1221 * page isn't freed u << 1222 */ << 1223 spin_lock(src_ptl); << 1224 if (!pte_same(orig_sr << 1225 spin_unlock(s << 1226 err = -EAGAIN << 1227 goto out; << 1228 } << 1229 << 1230 folio = vm_normal_fol << 1231 if (!folio || !PageAn << 1232 spin_unlock(s << 1233 err = -EBUSY; << 1234 goto out; << 1235 } << 1236 << 1237 folio_get(folio); << 1238 src_folio = folio; << 1239 src_folio_pte = orig_ << 1240 spin_unlock(src_ptl); << 1241 << 1242 if (!folio_trylock(sr << 1243 pte_unmap(&or << 1244 pte_unmap(&or << 1245 src_pte = dst << 1246 /* now we can << 1247 folio_lock(sr << 1248 goto retry; << 1249 } << 1250 << 1251 if (WARN_ON_ONCE(!fol << 1252 err = -EBUSY; << 1253 goto out; << 1254 } << 1255 } << 1256 << 1257 /* at this point we have src_ << 1258 if (folio_test_large(src_foli << 1259 /* split_folio() can << 1260 pte_unmap(&orig_src_p << 1261 pte_unmap(&orig_dst_p << 1262 src_pte = dst_pte = N << 1263 err = split_folio(src << 1264 if (err) << 1265 goto out; << 1266 /* have to reacquire << 1267 folio_unlock(src_foli << 1268 folio_put(src_folio); << 1269 src_folio = NULL; << 1270 goto retry; << 1271 } << 1272 << 1273 if (!src_anon_vma) { << 1274 /* << 1275 * folio_referenced w << 1276 * without the folio << 1277 * the anon_vma lock, << 1278 */ << 1279 src_anon_vma = folio_ << 1280 if (!src_anon_vma) { << 1281 /* page was u << 1282 err = -EAGAIN << 1283 goto out; << 1284 } << 1285 if (!anon_vma_trylock << 1286 pte_unmap(&or << 1287 pte_unmap(&or << 1288 src_pte = dst << 1289 /* now we can << 1290 anon_vma_lock << 1291 goto retry; << 1292 } << 1293 } << 1294 << 1295 err = move_present_pte(mm, d << 1296 dst_ad << 1297 orig_d << 1298 dst_pt << 1299 } else { << 1300 entry = pte_to_swp_entry(orig << 1301 if (non_swap_entry(entry)) { << 1302 if (is_migration_entr << 1303 pte_unmap(&or << 1304 pte_unmap(&or << 1305 src_pte = dst << 1306 migration_ent << 1307 err = -EAGAIN << 1308 } else << 1309 err = -EFAULT << 1310 goto out; << 1311 } << 1312 << 1313 err = move_swap_pte(mm, dst_a << 1314 dst_pte, << 1315 orig_dst_ << 1316 dst_ptl, << 1317 } << 1318 << 1319 out: << 1320 if (src_anon_vma) { << 1321 anon_vma_unlock_write(src_ano << 1322 put_anon_vma(src_anon_vma); << 1323 } << 1324 if 
(src_folio) { << 1325 folio_unlock(src_folio); << 1326 folio_put(src_folio); << 1327 } << 1328 if (dst_pte) << 1329 pte_unmap(dst_pte); << 1330 if (src_pte) << 1331 pte_unmap(src_pte); << 1332 mmu_notifier_invalidate_range_end(&ra << 1333 << 1334 return err; << 1335 } << 1336 << 1337 #ifdef CONFIG_TRANSPARENT_HUGEPAGE << 1338 static inline bool move_splits_huge_pmd(unsig << 1339 unsig << 1340 unsig << 1341 { << 1342 return (src_addr & ~HPAGE_PMD_MASK) | << 1343 src_end - src_addr < HPAGE_PM << 1344 } << 1345 #else << 1346 static inline bool move_splits_huge_pmd(unsig << 1347 unsig << 1348 unsig << 1349 { << 1350 /* This is unreachable anyway, just t << 1351 return false; << 1352 } << 1353 #endif << 1354 << 1355 static inline bool vma_move_compatible(struct << 1356 { << 1357 return !(vma->vm_flags & (VM_PFNMAP | << 1358 VM_MIXEDMAP << 1359 } << 1360 << 1361 static int validate_move_areas(struct userfau << 1362 struct vm_area << 1363 struct vm_area << 1364 { << 1365 /* Only allow moving if both have the << 1366 if ((src_vma->vm_flags & VM_ACCESS_FL << 1367 pgprot_val(src_vma->vm_page_prot) << 1368 return -EINVAL; << 1369 << 1370 /* Only allow moving if both are mloc << 1371 if ((src_vma->vm_flags & VM_LOCKED) ! << 1372 return -EINVAL; << 1373 << 1374 /* << 1375 * For now, we keep it simple and onl << 1376 * Access flags are equal, therefore << 1377 */ << 1378 if (!(src_vma->vm_flags & VM_WRITE)) << 1379 return -EINVAL; << 1380 << 1381 /* Check if vma flags indicate conten << 1382 if (!vma_move_compatible(src_vma) || << 1383 return -EINVAL; << 1384 << 1385 /* Ensure dst_vma is registered in uf << 1386 if (!dst_vma->vm_userfaultfd_ctx.ctx << 1387 dst_vma->vm_userfaultfd_ctx.ctx ! << 1388 return -EINVAL; << 1389 << 1390 /* Only allow moving across anonymous << 1391 if (!vma_is_anonymous(src_vma) || !vm << 1392 return -EINVAL; << 1393 << 1394 return 0; << 1395 } << 1396 << 1397 static __always_inline << 1398 int find_vmas_mm_locked(struct mm_struct *mm, << 1399 unsigned long dst_sta << 1400 unsigned long src_sta << 1401 struct vm_area_struct << 1402 struct vm_area_struct << 1403 { << 1404 struct vm_area_struct *vma; << 1405 << 1406 mmap_assert_locked(mm); << 1407 vma = find_vma_and_prepare_anon(mm, d << 1408 if (IS_ERR(vma)) << 1409 return PTR_ERR(vma); << 1410 << 1411 *dst_vmap = vma; << 1412 /* Skip finding src_vma if src_start << 1413 if (src_start >= vma->vm_start && src << 1414 goto out_success; << 1415 << 1416 vma = vma_lookup(mm, src_start); << 1417 if (!vma) << 1418 return -ENOENT; << 1419 out_success: << 1420 *src_vmap = vma; << 1421 return 0; << 1422 } << 1423 << 1424 #ifdef CONFIG_PER_VMA_LOCK << 1425 static int uffd_move_lock(struct mm_struct *m << 1426 unsigned long dst_s << 1427 unsigned long src_s << 1428 struct vm_area_stru << 1429 struct vm_area_stru << 1430 { << 1431 struct vm_area_struct *vma; << 1432 int err; << 1433 << 1434 vma = uffd_lock_vma(mm, dst_start); << 1435 if (IS_ERR(vma)) << 1436 return PTR_ERR(vma); << 1437 << 1438 *dst_vmap = vma; << 1439 /* << 1440 * Skip finding src_vma if src_start << 1441 * that we don't lock the same vma tw << 1442 */ << 1443 if (src_start >= vma->vm_start && src << 1444 *src_vmap = vma; << 1445 return 0; << 1446 } << 1447 << 1448 /* << 1449 * Using uffd_lock_vma() to get src_v << 1450 * << 1451 * Thread1 << 1452 * ------- << 1453 * vma_start_read(dst_vma) << 1454 * << 1455 * << 1456 * vma_start_read(src_vma) << 1457 * mmap_read_lock(mm) << 1458 * << 1459 */ << 1460 *src_vmap = lock_vma_under_rcu(mm, sr << 1461 if (likely(*src_vmap)) << 
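/*
 * Illustrative userspace sketch (not part of mm/userfaultfd.c): the
 * move_pages() machinery in this part of the file is reached through the
 * UFFDIO_MOVE ioctl, which lets a fault handler remap a page from a
 * pre-populated staging area instead of copying it.  Assumes @uffd was
 * created with the UFFD_FEATURE_MOVE feature and registered over the
 * destination range; the helper name is made up for the example.
 */
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static long move_staged_page(int uffd, unsigned long dst, unsigned long src,
			     unsigned long len)
{
	struct uffdio_move move = {
		.dst  = dst,	/* both ranges must be page aligned */
		.src  = src,
		.len  = len,
		/* tolerate holes in the source instead of failing with -ENOENT */
		.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES,
		.move = 0,
	};

	if (!ioctl(uffd, UFFDIO_MOVE, &move))
		return len;	/* fully moved, the faulting thread was woken */
	/*
	 * On failure move.move reports partial progress (bytes moved) or a
	 * negative error; -EAGAIN again maps to the mmap_changing checks.
	 */
	return move.move > 0 ? move.move : -errno;
}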
1462 return 0; << 1463 << 1464 /* Undo any locking and retry in mmap << 1465 vma_end_read(*dst_vmap); << 1466 << 1467 mmap_read_lock(mm); << 1468 err = find_vmas_mm_locked(mm, dst_sta << 1469 if (!err) { << 1470 /* << 1471 * See comment in uffd_lock_v << 1472 * vma_start_read() here. << 1473 */ << 1474 down_read(&(*dst_vmap)->vm_lo << 1475 if (*dst_vmap != *src_vmap) << 1476 down_read_nested(&(*s << 1477 SING << 1478 } << 1479 mmap_read_unlock(mm); << 1480 return err; << 1481 } << 1482 << 1483 static void uffd_move_unlock(struct vm_area_s << 1484 struct vm_area_s << 1485 { << 1486 vma_end_read(src_vma); << 1487 if (src_vma != dst_vma) << 1488 vma_end_read(dst_vma); << 1489 } << 1490 << 1491 #else << 1492 << 1493 static int uffd_move_lock(struct mm_struct *m << 1494 unsigned long dst_s << 1495 unsigned long src_s << 1496 struct vm_area_stru << 1497 struct vm_area_stru << 1498 { << 1499 int err; << 1500 << 1501 mmap_read_lock(mm); << 1502 err = find_vmas_mm_locked(mm, dst_sta << 1503 if (err) << 1504 mmap_read_unlock(mm); << 1505 return err; << 1506 } << 1507 << 1508 static void uffd_move_unlock(struct vm_area_s << 1509 struct vm_area_s << 1510 { << 1511 mmap_assert_locked(src_vma->vm_mm); << 1512 mmap_read_unlock(dst_vma->vm_mm); << 1513 } << 1514 #endif << 1515 << 1516 /** << 1517 * move_pages - move arbitrary anonymous page << 1518 * @ctx: pointer to the userfaultfd context << 1519 * @dst_start: start of the destination virtu << 1520 * @src_start: start of the source virtual me << 1521 * @len: length of the virtual memory range << 1522 * @mode: flags from uffdio_move.mode << 1523 * << 1524 * It will either use the mmap_lock in read m << 1525 * << 1526 * move_pages() remaps arbitrary anonymous pa << 1527 * copy. It only works on non shared anonymou << 1528 * be relocated without generating non linear << 1529 * code. << 1530 * << 1531 * It provides a zero copy mechanism to handl << 1532 * The source vma pages should have mapcount << 1533 * enforced by using madvise(MADV_DONTFORK) o << 1534 * << 1535 * The thread receiving the page during the u << 1536 * will receive the faulting page in the sour << 1537 * storage or any other I/O device (MADV_DONT << 1538 * avoids move_pages() to fail with -EBUSY if << 1539 * move_pages() is called), then it will call << 1540 * page in the faulting address in the destin << 1541 * << 1542 * This userfaultfd command works purely via << 1543 * most efficient way to move physical non sh << 1544 * across different virtual addresses. Unlike << 1545 * it does not create any new vmas. The mappi << 1546 * address is atomic. << 1547 * << 1548 * It only works if the vma protection bits a << 1549 * source and destination vma. << 1550 * << 1551 * It can remap non shared anonymous pages wi << 1552 * << 1553 * If the source virtual memory range has any << 1554 * the destination virtual memory range is no << 1555 * move_pages() will fail respectively with - << 1556 * provides a very strict behavior to avoid a << 1557 * corruption going unnoticed if there are us << 1558 * Only one thread should resolve the userlan << 1559 * time for any given faulting address. This << 1560 * try to both call move_pages() on the same << 1561 * same time, the second thread will get an e << 1562 * command. << 1563 * << 1564 * The command retval will return "len" is su << 1565 * however can be interrupted by fatal signal << 1566 * interrupted it will return the number of b << 1567 * remapped before the interruption if any, o << 1568 * none. It will never return zero. 
Either it << 1569 * an amount of bytes successfully moved. If << 1570 * "short" remap, the move_pages() command sh << 1571 * userland with src+retval, dst+reval, len-r << 1572 * about the error that interrupted it. << 1573 * << 1574 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag << 1575 * prevent -ENOENT errors to materialize if t << 1576 * source virtual range that is being remappe << 1577 * accounted as successfully remapped in the << 1578 * command. This is mostly useful to remap hu << 1579 * virtual regions without knowing if there a << 1580 * in the regions or not, but preventing the << 1581 * the hugepmd during the remap. << 1582 * << 1583 * If there's any rmap walk that is taking th << 1584 * first obtaining the folio lock (the only c << 1585 * folio_referenced), they will have to verif << 1586 * has changed after taking the anon_vma lock << 1587 * should release the lock and retry obtainin << 1588 * it means the anon_vma was changed by move_ << 1589 * could be obtained. This is the only additi << 1590 * the rmap code to provide this anonymous pa << 1591 */ << 1592 ssize_t move_pages(struct userfaultfd_ctx *ct << 1593 unsigned long src_start, u << 1594 { << 1595 struct mm_struct *mm = ctx->mm; << 1596 struct vm_area_struct *src_vma, *dst_ << 1597 unsigned long src_addr, dst_addr; << 1598 pmd_t *src_pmd, *dst_pmd; << 1599 long err = -EINVAL; << 1600 ssize_t moved = 0; << 1601 << 1602 /* Sanitize the command parameters. * << 1603 if (WARN_ON_ONCE(src_start & ~PAGE_MA << 1604 WARN_ON_ONCE(dst_start & ~PAGE_MA << 1605 WARN_ON_ONCE(len & ~PAGE_MASK)) << 1606 goto out; << 1607 << 1608 /* Does the address range wrap, or is << 1609 if (WARN_ON_ONCE(src_start + len <= s << 1610 WARN_ON_ONCE(dst_start + len <= d << 1611 goto out; << 1612 << 1613 err = uffd_move_lock(mm, dst_start, s << 1614 if (err) << 1615 goto out; << 1616 << 1617 /* Re-check after taking map_changing << 1618 err = -EAGAIN; << 1619 down_read(&ctx->map_changing_lock); << 1620 if (likely(atomic_read(&ctx->mmap_cha << 1621 goto out_unlock; << 1622 /* 713 /* 1623 * Make sure the vma is not shared, t !! 714 * Make sure the vma is not shared, that the dst range is 1624 * ranges are both valid and fully wi !! 715 * both valid and fully within a single existing vma. 1625 * vma. << 1626 */ 716 */ 1627 err = -EINVAL; !! 717 if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) 1628 if (src_vma->vm_flags & VM_SHARED) << 1629 goto out_unlock; 718 goto out_unlock; 1630 if (src_start + len > src_vma->vm_end !! 719 if (!userfaultfd_wp(dst_vma)) 1631 goto out_unlock; 720 goto out_unlock; 1632 !! 721 if (!vma_is_anonymous(dst_vma)) 1633 if (dst_vma->vm_flags & VM_SHARED) << 1634 goto out_unlock; << 1635 if (dst_start + len > dst_vma->vm_end << 1636 goto out_unlock; << 1637 << 1638 err = validate_move_areas(ctx, src_vm << 1639 if (err) << 1640 goto out_unlock; 722 goto out_unlock; 1641 723 1642 for (src_addr = src_start, dst_addr = !! 724 if (enable_wp) 1643 src_addr < src_start + len;) { !! 725 newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); 1644 spinlock_t *ptl; !! 726 else 1645 pmd_t dst_pmdval; !! 727 newprot = vm_get_page_prot(dst_vma->vm_flags); 1646 unsigned long step_size; << 1647 << 1648 /* << 1649 * Below works because anonym << 1650 * transparent huge PUD. 
If f << 1651 * that case would need to be << 1652 */ << 1653 src_pmd = mm_find_pmd(mm, src << 1654 if (unlikely(!src_pmd)) { << 1655 if (!(mode & UFFDIO_M << 1656 err = -ENOENT << 1657 break; << 1658 } << 1659 src_pmd = mm_alloc_pm << 1660 if (unlikely(!src_pmd << 1661 err = -ENOMEM << 1662 break; << 1663 } << 1664 } << 1665 dst_pmd = mm_alloc_pmd(mm, ds << 1666 if (unlikely(!dst_pmd)) { << 1667 err = -ENOMEM; << 1668 break; << 1669 } << 1670 << 1671 dst_pmdval = pmdp_get_lockles << 1672 /* << 1673 * If the dst_pmd is mapped a << 1674 * be strict. If dst_pmd chan << 1675 * move_pages_huge_pmd() will << 1676 * while move_pages_pte() wil << 1677 */ << 1678 if (unlikely(pmd_trans_huge(d << 1679 err = -EEXIST; << 1680 break; << 1681 } << 1682 << 1683 ptl = pmd_trans_huge_lock(src << 1684 if (ptl) { << 1685 if (pmd_devmap(*src_p << 1686 spin_unlock(p << 1687 err = -ENOENT << 1688 break; << 1689 } << 1690 << 1691 /* Check if we can mo << 1692 if (move_splits_huge_ << 1693 !pmd_none(dst_pmd << 1694 struct folio << 1695 << 1696 if (!folio || << 1697 << 1698 spin_ << 1699 err = << 1700 break << 1701 } << 1702 << 1703 spin_unlock(p << 1704 split_huge_pm << 1705 /* The folio << 1706 continue; << 1707 } << 1708 << 1709 err = move_pages_huge << 1710 << 1711 << 1712 step_size = HPAGE_PMD << 1713 } else { << 1714 if (pmd_none(*src_pmd << 1715 if (!(mode & << 1716 err = << 1717 break << 1718 } << 1719 if (unlikely( << 1720 err = << 1721 break << 1722 } << 1723 } << 1724 << 1725 if (unlikely(pte_allo << 1726 err = -ENOMEM << 1727 break; << 1728 } << 1729 << 1730 err = move_pages_pte( << 1731 << 1732 << 1733 step_size = PAGE_SIZE << 1734 } << 1735 << 1736 cond_resched(); << 1737 << 1738 if (fatal_signal_pending(curr << 1739 /* Do not override an << 1740 if (!err || err == -E << 1741 err = -EINTR; << 1742 break; << 1743 } << 1744 << 1745 if (err) { << 1746 if (err == -EAGAIN) << 1747 continue; << 1748 break; << 1749 } << 1750 728 1751 /* Proceed to the next page * !! 729 change_protection(dst_vma, start, start + len, newprot, 1752 dst_addr += step_size; !! 730 enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); 1753 src_addr += step_size; << 1754 moved += step_size; << 1755 } << 1756 731 >> 732 err = 0; 1757 out_unlock: 733 out_unlock: 1758 up_read(&ctx->map_changing_lock); !! 734 mmap_read_unlock(dst_mm); 1759 uffd_move_unlock(dst_vma, src_vma); !! 735 return err; 1760 out: << 1761 VM_WARN_ON(moved < 0); << 1762 VM_WARN_ON(err > 0); << 1763 VM_WARN_ON(!moved && !err); << 1764 return moved ? 
moved : err; << 1765 } << 1766 << 1767 static void userfaultfd_set_vm_flags(struct v << 1768 vm_flags << 1769 { << 1770 const bool uffd_wp_changed = (vma->vm << 1771 << 1772 vm_flags_reset(vma, flags); << 1773 /* << 1774 * For shared mappings, we want to en << 1775 * userfaultfd-wp is enabled (see vma << 1776 * recalculate vma->vm_page_prot when << 1777 */ << 1778 if ((vma->vm_flags & VM_SHARED) && uf << 1779 vma_set_page_prot(vma); << 1780 } << 1781 << 1782 static void userfaultfd_set_ctx(struct vm_are << 1783 struct userfa << 1784 unsigned long << 1785 { << 1786 vma_start_write(vma); << 1787 vma->vm_userfaultfd_ctx = (struct vm_ << 1788 userfaultfd_set_vm_flags(vma, << 1789 (vma->vm_fla << 1790 } << 1791 << 1792 void userfaultfd_reset_ctx(struct vm_area_str << 1793 { << 1794 userfaultfd_set_ctx(vma, NULL, 0); << 1795 } << 1796 << 1797 struct vm_area_struct *userfaultfd_clear_vma( << 1798 << 1799 << 1800 << 1801 << 1802 { << 1803 struct vm_area_struct *ret; << 1804 << 1805 /* Reset ptes for the whole vma range << 1806 if (userfaultfd_wp(vma)) << 1807 uffd_wp_range(vma, start, end << 1808 << 1809 ret = vma_modify_flags_uffd(vmi, prev << 1810 vma->vm_f << 1811 NULL_VM_U << 1812 << 1813 /* << 1814 * In the vma_merge() successful mpro << 1815 * the next vma was merged into the c << 1816 * the current one has not been updat << 1817 */ << 1818 if (!IS_ERR(ret)) << 1819 userfaultfd_reset_ctx(ret); << 1820 << 1821 return ret; << 1822 } << 1823 << 1824 /* Assumes mmap write lock taken, and mm_stru << 1825 int userfaultfd_register_range(struct userfau << 1826 struct vm_area << 1827 unsigned long << 1828 unsigned long << 1829 bool wp_async) << 1830 { << 1831 VMA_ITERATOR(vmi, ctx->mm, start); << 1832 struct vm_area_struct *prev = vma_pre << 1833 unsigned long vma_end; << 1834 unsigned long new_flags; << 1835 << 1836 if (vma->vm_start < start) << 1837 prev = vma; << 1838 << 1839 for_each_vma_range(vmi, vma, end) { << 1840 cond_resched(); << 1841 << 1842 BUG_ON(!vma_can_userfault(vma << 1843 BUG_ON(vma->vm_userfaultfd_ct << 1844 vma->vm_userfaultfd_ct << 1845 WARN_ON(!(vma->vm_flags & VM_ << 1846 << 1847 /* << 1848 * Nothing to do: this vma is << 1849 * userfaultfd and with the r << 1850 */ << 1851 if (vma->vm_userfaultfd_ctx.c << 1852 (vma->vm_flags & vm_flags << 1853 goto skip; << 1854 << 1855 if (vma->vm_start > start) << 1856 start = vma->vm_start << 1857 vma_end = min(end, vma->vm_en << 1858 << 1859 new_flags = (vma->vm_flags & << 1860 vma = vma_modify_flags_uffd(& << 1861 n << 1862 ( << 1863 if (IS_ERR(vma)) << 1864 return PTR_ERR(vma); << 1865 << 1866 /* << 1867 * In the vma_merge() success << 1868 * the next vma was merged in << 1869 * the current one has not be << 1870 */ << 1871 userfaultfd_set_ctx(vma, ctx, << 1872 << 1873 if (is_vm_hugetlb_page(vma) & << 1874 hugetlb_unshare_all_p << 1875 << 1876 skip: << 1877 prev = vma; << 1878 start = vma->vm_end; << 1879 } << 1880 << 1881 return 0; << 1882 } << 1883 << 1884 void userfaultfd_release_new(struct userfault << 1885 { << 1886 struct mm_struct *mm = ctx->mm; << 1887 struct vm_area_struct *vma; << 1888 VMA_ITERATOR(vmi, mm, 0); << 1889 << 1890 /* the various vma->vm_userfaultfd_ct << 1891 mmap_write_lock(mm); << 1892 for_each_vma(vmi, vma) { << 1893 if (vma->vm_userfaultfd_ctx.c << 1894 userfaultfd_reset_ctx << 1895 } << 1896 mmap_write_unlock(mm); << 1897 } << 1898 << 1899 void userfaultfd_release_all(struct mm_struct << 1900 struct userfault << 1901 { << 1902 struct vm_area_struct *vma, *prev; << 1903 VMA_ITERATOR(vmi, mm, 0); << 1904 << 
1905 if (!mmget_not_zero(mm)) << 1906 return; << 1907 << 1908 /* << 1909 * Flush page faults out of all CPUs. << 1910 * must be retried without returning << 1911 * userfaultfd_ctx_get() succeeds but << 1912 * changes while handle_userfault rel << 1913 * it's critical that released is set << 1914 * taking the mmap_lock for writing. << 1915 */ << 1916 mmap_write_lock(mm); << 1917 prev = NULL; << 1918 for_each_vma(vmi, vma) { << 1919 cond_resched(); << 1920 BUG_ON(!!vma->vm_userfaultfd_ << 1921 !!(vma->vm_flags & __V << 1922 if (vma->vm_userfaultfd_ctx.c << 1923 prev = vma; << 1924 continue; << 1925 } << 1926 << 1927 vma = userfaultfd_clear_vma(& << 1928 v << 1929 prev = vma; << 1930 } << 1931 mmap_write_unlock(mm); << 1932 mmput(mm); << 1933 } 736 } 1934 737
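/*
 * Illustrative userspace counterpart (not part of mm/userfaultfd.c): the
 * UFFDIO_REGISTER / UFFDIO_UNREGISTER ioctls are what end up exercising
 * userfaultfd_register_range() and the context-clearing helpers above.
 * This is only a minimal sketch against the uffdio_register uAPI in
 * <linux/userfaultfd.h>; the helper names are made up, "uffd" is assumed to
 * be a userfaultfd file descriptor that has already completed the UFFDIO_API
 * handshake, and start/len are assumed page aligned.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int uffd_register_missing_wp(int uffd, void *start, uint64_t len)
{
	struct uffdio_register reg = {
		.range = { .start = (uint64_t)(uintptr_t)start, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP,
	};

	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return -1;	/* errno set, e.g. EINVAL for an unsupported vma */

	/* reg.ioctls now reports which resolve ioctls the range supports. */
	return 0;
}

static int uffd_unregister_range(int uffd, void *start, uint64_t len)
{
	struct uffdio_range range = {
		.start = (uint64_t)(uintptr_t)start,
		.len   = len,
	};

	/* The kernel clears the uffd context on the affected VMAs. */
	return ioctl(uffd, UFFDIO_UNREGISTER, &range);
}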
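/*
 * Illustrative userspace sketch (not part of mm/userfaultfd.c) of the
 * UFFDIO_MOVE contract documented above move_pages(): a "short" move is
 * retried with src+retval, dst+retval, len-retval; -EAGAIN means "re-read
 * state and retry"; UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES tolerates holes in the
 * source range.  As validate_move_areas() above requires, both ranges must
 * sit in private anonymous, writable VMAs with matching protections, and the
 * destination must be registered on this uffd.  The sketch assumes the
 * uffdio_move layout from <linux/userfaultfd.h>; the helper name
 * uffd_move_range() is hypothetical.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Move len bytes of non-shared anonymous memory from src to dst. */
static int uffd_move_range(int uffd, uint64_t dst, uint64_t src,
			   uint64_t len, bool allow_src_holes)
{
	while (len) {
		struct uffdio_move mv;

		memset(&mv, 0, sizeof(mv));
		mv.dst  = dst;
		mv.src  = src;
		mv.len  = len;
		mv.mode = allow_src_holes ? UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES : 0;

		if (!ioctl(uffd, UFFDIO_MOVE, &mv))
			return 0;		/* whole range moved */

		if (mv.move > 0) {		/* short move: advance and retry */
			dst += mv.move;
			src += mv.move;
			len -= mv.move;
			continue;
		}

		if (errno == EAGAIN)		/* address space changed, retry */
			continue;

		return -errno;			/* e.g. -EBUSY, -ENOENT, -EINVAL */
	}

	return 0;
}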