TOMOYO Linux Cross Reference
Linux/mm/userfaultfd.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"

static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
        /* Make sure that the dst range is fully within dst_vma. */
        if (dst_end > dst_vma->vm_end)
                return false;

        /*
         * Check the vma is registered in uffd, this is required to
         * enforce the VM_MAYWRITE check done at uffd registration
         * time.
         */
        if (!dst_vma->vm_userfaultfd_ctx.ctx)
                return false;

        return true;
}

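/*
 * For reference, a minimal userspace sketch of how a VMA ends up
 * uffd-registered in the first place (field names are from the uffd
 * uAPI; addr/len are the caller's page-aligned region and error
 * handling is omitted):
 *
 *      int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *      struct uffdio_api api = { .api = UFFD_API };
 *      ioctl(uffd, UFFDIO_API, &api);
 *      struct uffdio_register reg = {
 *              .range = { .start = (unsigned long)addr, .len = len },
 *              .mode  = UFFDIO_REGISTER_MODE_MISSING,
 *      };
 *      ioctl(uffd, UFFDIO_REGISTER, &reg);
 */
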
static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
                                                 unsigned long addr)
{
        struct vm_area_struct *vma;

        mmap_assert_locked(mm);
        vma = vma_lookup(mm, addr);
        if (!vma)
                vma = ERR_PTR(-ENOENT);
        else if (!(vma->vm_flags & VM_SHARED) &&
                 unlikely(anon_vma_prepare(vma)))
                vma = ERR_PTR(-ENOMEM);

        return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
                                       unsigned long address)
{
        struct vm_area_struct *vma;

        vma = lock_vma_under_rcu(mm, address);
        if (vma) {
                /*
                 * We know we're going to need to use anon_vma, so check
                 * that early.
                 */
                if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
                        vma_end_read(vma);
                else
                        return vma;
        }

        mmap_read_lock(mm);
        vma = find_vma_and_prepare_anon(mm, address);
        if (!IS_ERR(vma)) {
                /*
                 * We cannot use vma_start_read() as it may fail due to
                 * false locked (see comment in vma_start_read()). We
                 * can avoid that by directly locking vm_lock under
                 * mmap_lock, which guarantees that nobody can lock the
                 * vma for write (vma_start_write()) under us.
                 */
                down_read(&vma->vm_lock->lock);
        }

        mmap_read_unlock(mm);
        return vma;
}

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long len)
{
        struct vm_area_struct *dst_vma;

        dst_vma = uffd_lock_vma(dst_mm, dst_start);
        if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
                return dst_vma;

        vma_end_read(dst_vma);
        return ERR_PTR(-ENOENT);
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
        vma_end_read(vma);
}

#else

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long len)
{
        struct vm_area_struct *dst_vma;

        mmap_read_lock(dst_mm);
        dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
        if (IS_ERR(dst_vma))
                goto out_unlock;

        if (validate_dst_vma(dst_vma, dst_start + len))
                return dst_vma;

        dst_vma = ERR_PTR(-ENOENT);
out_unlock:
        mmap_read_unlock(dst_mm);
        return dst_vma;
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
        mmap_read_unlock(vma->vm_mm);
}
#endif

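/*
 * Note: in both configurations above, uffd_mfill_lock() returns the
 * destination VMA with a read-side lock held (the per-VMA lock when
 * CONFIG_PER_VMA_LOCK is enabled, otherwise mmap_lock), or an ERR_PTR;
 * uffd_mfill_unlock() drops whichever lock was taken.
 */
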
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
                                 unsigned long dst_addr)
{
        struct inode *inode;
        pgoff_t offset, max_off;

        if (!dst_vma->vm_file)
                return false;

        inode = dst_vma->vm_file->f_inode;
        offset = linear_page_index(dst_vma, dst_addr);
        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr, struct page *page,
                             bool newly_allocated, uffd_flags_t flags)
{
        int ret;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        pte_t _dst_pte, *dst_pte;
        bool writable = dst_vma->vm_flags & VM_WRITE;
        bool vm_shared = dst_vma->vm_flags & VM_SHARED;
        spinlock_t *ptl;
        struct folio *folio = page_folio(page);
        bool page_in_cache = folio_mapping(folio);

        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
        _dst_pte = pte_mkdirty(_dst_pte);
        if (page_in_cache && !vm_shared)
                writable = false;
        if (writable)
                _dst_pte = pte_mkwrite(_dst_pte, dst_vma);
        if (flags & MFILL_ATOMIC_WP)
                _dst_pte = pte_mkuffd_wp(_dst_pte);

        ret = -EAGAIN;
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (!dst_pte)
                goto out;

        if (mfill_file_over_size(dst_vma, dst_addr)) {
                ret = -EFAULT;
                goto out_unlock;
        }

        ret = -EEXIST;
        /*
         * We allow to overwrite a pte marker: consider when both MISSING|WP
         * registered, we firstly wr-protect a none pte which has no page cache
         * page backing it, then access the page.
         */
        if (!pte_none_mostly(ptep_get(dst_pte)))
                goto out_unlock;

        if (page_in_cache) {
                /* Usually, cache pages are already added to LRU */
                if (newly_allocated)
                        folio_add_lru(folio);
                folio_add_file_rmap_pte(folio, page, dst_vma);
        } else {
                folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
                folio_add_lru_vma(folio, dst_vma);
        }

        /*
         * Must happen after rmap, as mm_counter() checks mapping (via
         * PageAnon()), which is set by __page_set_anon_rmap().
         */
        inc_mm_counter(dst_mm, mm_counter(folio));

        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
out:
        return ret;
}

static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
                                 struct vm_area_struct *dst_vma,
                                 unsigned long dst_addr,
                                 unsigned long src_addr,
                                 uffd_flags_t flags,
                                 struct folio **foliop)
{
        void *kaddr;
        int ret;
        struct folio *folio;

        if (!*foliop) {
                ret = -ENOMEM;
                folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
                                        dst_addr, false);
                if (!folio)
                        goto out;

                kaddr = kmap_local_folio(folio, 0);
                /*
                 * The read mmap_lock is held here.  Despite the
                 * mmap_lock being read recursive a deadlock is still
                 * possible if a writer has taken a lock.  For example:
                 *
                 * process A thread 1 takes read lock on own mmap_lock
                 * process A thread 2 calls mmap, blocks taking write lock
                 * process B thread 1 takes page fault, read lock on own mmap lock
                 * process B thread 2 calls mmap, blocks taking write lock
                 * process A thread 1 blocks taking read lock on process B
                 * process B thread 1 blocks taking read lock on process A
                 *
                 * Disable page faults to prevent potential deadlock
                 * and retry the copy outside the mmap_lock.
                 */
                pagefault_disable();
                ret = copy_from_user(kaddr, (const void __user *) src_addr,
                                     PAGE_SIZE);
                pagefault_enable();
                kunmap_local(kaddr);

                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
                        ret = -ENOENT;
                        *foliop = folio;
                        /* don't free the page */
                        goto out;
                }

                flush_dcache_folio(folio);
        } else {
                folio = *foliop;
                *foliop = NULL;
        }

        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __folio_mark_uptodate(folio);

        ret = -ENOMEM;
        if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
                goto out_release;

        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                                       &folio->page, true, flags);
        if (ret)
                goto out_release;
out:
        return ret;
out_release:
        folio_put(folio);
        goto out;
}

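/*
 * For reference, the userspace side that drives the copy path above is
 * the UFFDIO_COPY ioctl (a minimal sketch; uffd, fault_addr, src_buf and
 * page_size are the caller's, error handling omitted):
 *
 *      struct uffdio_copy copy = {
 *              .dst = fault_addr & ~(page_size - 1),
 *              .src = (unsigned long)src_buf,
 *              .len = page_size,
 *              .mode = 0,
 *      };
 *      ioctl(uffd, UFFDIO_COPY, &copy);
 */
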
static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
                                         struct vm_area_struct *dst_vma,
                                         unsigned long dst_addr)
{
        struct folio *folio;
        int ret = -ENOMEM;

        folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
        if (!folio)
                return ret;

        if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
                goto out_put;

        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * zeroing out the folio become visible before mapping the page
         * using set_pte_at(). See do_anonymous_page().
         */
        __folio_mark_uptodate(folio);

        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                                       &folio->page, true, 0);
        if (ret)
                goto out_put;

        return 0;
out_put:
        folio_put(folio);
        return ret;
}

static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
                                     struct vm_area_struct *dst_vma,
                                     unsigned long dst_addr)
{
        pte_t _dst_pte, *dst_pte;
        spinlock_t *ptl;
        int ret;

        if (mm_forbids_zeropage(dst_vma->vm_mm))
                return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);

        _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
                                         dst_vma->vm_page_prot));
        ret = -EAGAIN;
        dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
        if (!dst_pte)
                goto out;
        if (mfill_file_over_size(dst_vma, dst_addr)) {
                ret = -EFAULT;
                goto out_unlock;
        }
        ret = -EEXIST;
        if (!pte_none(ptep_get(dst_pte)))
                goto out_unlock;
        set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
out:
        return ret;
}

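/*
 * Userspace reaches the zeropage path above via UFFDIO_ZEROPAGE
 * (a sketch; addr and page_size are the caller's):
 *
 *      struct uffdio_zeropage zp = {
 *              .range = { .start = addr, .len = page_size },
 *              .mode = 0,
 *      };
 *      ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
 */
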
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
                                     struct vm_area_struct *dst_vma,
                                     unsigned long dst_addr,
                                     uffd_flags_t flags)
{
        struct inode *inode = file_inode(dst_vma->vm_file);
        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
        struct folio *folio;
        struct page *page;
        int ret;

        ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
        /* Our caller expects us to return -EFAULT if we failed to find folio */
        if (ret == -ENOENT)
                ret = -EFAULT;
        if (ret)
                goto out;
        if (!folio) {
                ret = -EFAULT;
                goto out;
        }

        page = folio_file_page(folio, pgoff);
        if (PageHWPoison(page)) {
                ret = -EIO;
                goto out_release;
        }

        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                                       page, false, flags);
        if (ret)
                goto out_release;

        folio_unlock(folio);
        ret = 0;
out:
        return ret;
out_release:
        folio_unlock(folio);
        folio_put(folio);
        goto out;
}

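/*
 * The corresponding userspace call is UFFDIO_CONTINUE, used to resolve
 * minor faults where the page cache is already populated (a sketch;
 * addr and page_size are the caller's):
 *
 *      struct uffdio_continue cont = {
 *              .range = { .start = addr, .len = page_size },
 *              .mode = 0,
 *      };
 *      ioctl(uffd, UFFDIO_CONTINUE, &cont);
 */
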
/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
                                   struct vm_area_struct *dst_vma,
                                   unsigned long dst_addr,
                                   uffd_flags_t flags)
{
        int ret;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        pte_t _dst_pte, *dst_pte;
        spinlock_t *ptl;

        _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
        ret = -EAGAIN;
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (!dst_pte)
                goto out;

        if (mfill_file_over_size(dst_vma, dst_addr)) {
                ret = -EFAULT;
                goto out_unlock;
        }

        ret = -EEXIST;
        /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
        if (!pte_none(ptep_get(dst_pte)))
                goto out_unlock;

        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
out:
        return ret;
}

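/*
 * Userspace marks a range as poisoned via UFFDIO_POISON, so that later
 * accesses raise SIGBUS instead of being filled with a fresh page
 * (a sketch; addr and page_size are the caller's):
 *
 *      struct uffdio_poison poison = {
 *              .range = { .start = addr, .len = page_size },
 *              .mode = 0,
 *      };
 *      ioctl(uffd, UFFDIO_POISON, &poison);
 */
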
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;

        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return NULL;
        pud = pud_alloc(mm, p4d, address);
        if (!pud)
                return NULL;
        /*
         * Note that we didn't run this because the pmd was
         * missing, the *pmd may be already established and in
         * turn it may also be a trans_huge_pmd.
         */
        return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with either vma-lock or mmap_lock held, it will release the lock
 * before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
                                              struct userfaultfd_ctx *ctx,
                                              struct vm_area_struct *dst_vma,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
                                              uffd_flags_t flags)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        ssize_t err;
        pte_t *dst_pte;
        unsigned long src_addr, dst_addr;
        long copied;
        struct folio *folio;
        unsigned long vma_hpagesize;
        pgoff_t idx;
        u32 hash;
        struct address_space *mapping;

        /*
         * There is no default zero huge page for all huge page sizes as
         * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
         * by THP.  Since we can not reliably insert a zero page, this
         * feature is not supported.
         */
        if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
                up_read(&ctx->map_changing_lock);
                uffd_mfill_unlock(dst_vma);
                return -EINVAL;
        }

        src_addr = src_start;
        dst_addr = dst_start;
        copied = 0;
        folio = NULL;
        vma_hpagesize = vma_kernel_pagesize(dst_vma);

        /*
         * Validate alignment based on huge page size
         */
        err = -EINVAL;
        if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
                goto out_unlock;

retry:
        /*
         * On routine entry dst_vma is set.  If we had to drop mmap_lock and
         * retry, dst_vma will be set to NULL and we must lookup again.
         */
        if (!dst_vma) {
                dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
                if (IS_ERR(dst_vma)) {
                        err = PTR_ERR(dst_vma);
                        goto out;
                }

                err = -ENOENT;
                if (!is_vm_hugetlb_page(dst_vma))
                        goto out_unlock_vma;

                err = -EINVAL;
                if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
                        goto out_unlock_vma;

                /*
                 * If memory mappings are changing because of non-cooperative
                 * operation (e.g. mremap) running in parallel, bail out and
                 * request the user to retry later
                 */
                down_read(&ctx->map_changing_lock);
                err = -EAGAIN;
                if (atomic_read(&ctx->mmap_changing))
                        goto out_unlock;
        }

        while (src_addr < src_start + len) {
                BUG_ON(dst_addr >= dst_start + len);

                /*
                 * Serialize via vma_lock and hugetlb_fault_mutex.
                 * vma_lock ensures the dst_pte remains valid even
                 * in the case of shared pmds.  fault mutex prevents
                 * races with other faulting threads.
                 */
                idx = linear_page_index(dst_vma, dst_addr);
                mapping = dst_vma->vm_file->f_mapping;
                hash = hugetlb_fault_mutex_hash(mapping, idx);
                mutex_lock(&hugetlb_fault_mutex_table[hash]);
                hugetlb_vma_lock_read(dst_vma);

                err = -ENOMEM;
                dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
                if (!dst_pte) {
                        hugetlb_vma_unlock_read(dst_vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        goto out_unlock;
                }

                if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
                    !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
                        err = -EEXIST;
                        hugetlb_vma_unlock_read(dst_vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        goto out_unlock;
                }

                err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
                                               src_addr, flags, &folio);

                hugetlb_vma_unlock_read(dst_vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);

                cond_resched();

                if (unlikely(err == -ENOENT)) {
                        up_read(&ctx->map_changing_lock);
                        uffd_mfill_unlock(dst_vma);
                        BUG_ON(!folio);

                        err = copy_folio_from_user(folio,
                                                   (const void __user *)src_addr, true);
                        if (unlikely(err)) {
                                err = -EFAULT;
                                goto out;
                        }

                        dst_vma = NULL;
                        goto retry;
                } else
                        BUG_ON(folio);

                if (!err) {
                        dst_addr += vma_hpagesize;
                        src_addr += vma_hpagesize;
                        copied += vma_hpagesize;

                        if (fatal_signal_pending(current))
                                err = -EINTR;
                }
                if (err)
                        break;
        }

out_unlock:
        up_read(&ctx->map_changing_lock);
out_unlock_vma:
        uffd_mfill_unlock(dst_vma);
out:
        if (folio)
                folio_put(folio);
        BUG_ON(copied < 0);
        BUG_ON(err > 0);
        BUG_ON(!copied && !err);
        return copied ? copied : err;
}
646 #else /* !CONFIG_HUGETLB_PAGE */                  646 #else /* !CONFIG_HUGETLB_PAGE */
647 /* fail at build time if gcc attempts to use t    647 /* fail at build time if gcc attempts to use this */
648 extern ssize_t mfill_atomic_hugetlb(struct use    648 extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
649                                     struct vm_    649                                     struct vm_area_struct *dst_vma,
650                                     unsigned l    650                                     unsigned long dst_start,
651                                     unsigned l    651                                     unsigned long src_start,
652                                     unsigned l    652                                     unsigned long len,
653                                     uffd_flags    653                                     uffd_flags_t flags);
654 #endif /* CONFIG_HUGETLB_PAGE */                  654 #endif /* CONFIG_HUGETLB_PAGE */
655                                                   655 
656 static __always_inline ssize_t mfill_atomic_pt    656 static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
657                                                   657                                                 struct vm_area_struct *dst_vma,
658                                                   658                                                 unsigned long dst_addr,
659                                                   659                                                 unsigned long src_addr,
660                                                   660                                                 uffd_flags_t flags,
661                                                   661                                                 struct folio **foliop)
662 {                                                 662 {
663         ssize_t err;                              663         ssize_t err;
664                                                   664 
665         if (uffd_flags_mode_is(flags, MFILL_AT    665         if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
666                 return mfill_atomic_pte_contin    666                 return mfill_atomic_pte_continue(dst_pmd, dst_vma,
667                                                   667                                                  dst_addr, flags);
668         } else if (uffd_flags_mode_is(flags, M    668         } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
669                 return mfill_atomic_pte_poison    669                 return mfill_atomic_pte_poison(dst_pmd, dst_vma,
670                                                   670                                                dst_addr, flags);
671         }                                         671         }
672                                                   672 
673         /*                                        673         /*
674          * The normal page fault path for a sh    674          * The normal page fault path for a shmem will invoke the
675          * fault, fill the hole in the file an    675          * fault, fill the hole in the file and COW it right away. The
676          * result generates plain anonymous me    676          * result generates plain anonymous memory. So when we are
677                  * asked to fill a hole in a MAP_PRIVA    677                  * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
678          * generate anonymous memory directly     678          * generate anonymous memory directly without actually filling
679          * the hole. For the MAP_PRIVATE case     679          * the hole. For the MAP_PRIVATE case the robustness check
680          * only happens in the pagetable (to v    680          * only happens in the pagetable (to verify it's still none)
681          * and not in the radix tree.             681          * and not in the radix tree.
682          */                                       682          */
683         if (!(dst_vma->vm_flags & VM_SHARED))     683         if (!(dst_vma->vm_flags & VM_SHARED)) {
684                 if (uffd_flags_mode_is(flags,     684                 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
685                         err = mfill_atomic_pte    685                         err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
686                                                   686                                                     dst_addr, src_addr,
687                                                   687                                                     flags, foliop);
688                 else                              688                 else
689                         err = mfill_atomic_pte    689                         err = mfill_atomic_pte_zeropage(dst_pmd,
690                                                   690                                                  dst_vma, dst_addr);
691         } else {                                  691         } else {
692                 err = shmem_mfill_atomic_pte(d    692                 err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
693                                              d    693                                              dst_addr, src_addr,
694                                              f    694                                              flags, foliop);
695         }                                         695         }
696                                                   696 
697         return err;                               697         return err;
698 }                                                 698 }
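
The dispatch above works because uffd_flags_t carries both a small mode value (copy, zeropage, continue, poison) and independent flag bits such as MFILL_ATOMIC_WP. As a rough illustration of that kind of packing, here is a simplified, hypothetical sketch; it is not the kernel's actual encoding from userfaultfd_k.h:

/* Simplified, hypothetical illustration of packing a mode plus flag bits
 * into a single word; the real definitions live in userfaultfd_k.h. */
#include <stdio.h>

enum mfill_mode { MODE_COPY, MODE_ZEROPAGE, MODE_CONTINUE, MODE_POISON };

#define MODE_MASK  0x3u         /* low bits select the mode */
#define FLAG_WP    0x4u         /* independent flag, e.g. map write-protected */

static unsigned int set_mode(unsigned int flags, enum mfill_mode mode)
{
        return (flags & ~MODE_MASK) | (unsigned int)mode;
}

static int mode_is(unsigned int flags, enum mfill_mode mode)
{
        return (flags & MODE_MASK) == (unsigned int)mode;
}

int main(void)
{
        unsigned int flags = set_mode(FLAG_WP, MODE_COPY);

        printf("copy=%d wp=%d\n", mode_is(flags, MODE_COPY), !!(flags & FLAG_WP));
        return 0;
}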
699                                                   699 
700 static __always_inline ssize_t mfill_atomic(st    700 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
701                                             un    701                                             unsigned long dst_start,
702                                             un    702                                             unsigned long src_start,
703                                             un    703                                             unsigned long len,
704                                             uf    704                                             uffd_flags_t flags)
705 {                                                 705 {
706         struct mm_struct *dst_mm = ctx->mm;       706         struct mm_struct *dst_mm = ctx->mm;
707         struct vm_area_struct *dst_vma;           707         struct vm_area_struct *dst_vma;
708         ssize_t err;                              708         ssize_t err;
709         pmd_t *dst_pmd;                           709         pmd_t *dst_pmd;
710         unsigned long src_addr, dst_addr;         710         unsigned long src_addr, dst_addr;
711         long copied;                              711         long copied;
712         struct folio *folio;                      712         struct folio *folio;
713                                                   713 
714         /*                                        714         /*
715          * Sanitize the command parameters:       715          * Sanitize the command parameters:
716          */                                       716          */
717         BUG_ON(dst_start & ~PAGE_MASK);           717         BUG_ON(dst_start & ~PAGE_MASK);
718         BUG_ON(len & ~PAGE_MASK);                 718         BUG_ON(len & ~PAGE_MASK);
719                                                   719 
720         /* Does the address range wrap, or is     720         /* Does the address range wrap, or is the span zero-sized? */
721         BUG_ON(src_start + len <= src_start);     721         BUG_ON(src_start + len <= src_start);
722         BUG_ON(dst_start + len <= dst_start);     722         BUG_ON(dst_start + len <= dst_start);
723                                                   723 
724         src_addr = src_start;                     724         src_addr = src_start;
725         dst_addr = dst_start;                     725         dst_addr = dst_start;
726         copied = 0;                               726         copied = 0;
727         folio = NULL;                             727         folio = NULL;
728 retry:                                            728 retry:
729         /*                                        729         /*
730          * Make sure the vma is not shared, th    730          * Make sure the vma is not shared, that the dst range is
731          * both valid and fully within a singl    731          * both valid and fully within a single existing vma.
732          */                                       732          */
733         dst_vma = uffd_mfill_lock(dst_mm, dst_    733         dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
734         if (IS_ERR(dst_vma)) {                    734         if (IS_ERR(dst_vma)) {
735                 err = PTR_ERR(dst_vma);           735                 err = PTR_ERR(dst_vma);
736                 goto out;                         736                 goto out;
737         }                                         737         }
738                                                   738 
739         /*                                        739         /*
740          * If memory mappings are changing bec    740          * If memory mappings are changing because of non-cooperative
741          * operation (e.g. mremap) running in     741          * operation (e.g. mremap) running in parallel, bail out and
742          * request the user to retry later        742          * request the user to retry later
743          */                                       743          */
744         down_read(&ctx->map_changing_lock);       744         down_read(&ctx->map_changing_lock);
745         err = -EAGAIN;                            745         err = -EAGAIN;
746         if (atomic_read(&ctx->mmap_changing))     746         if (atomic_read(&ctx->mmap_changing))
747                 goto out_unlock;                  747                 goto out_unlock;
748                                                   748 
749         err = -EINVAL;                            749         err = -EINVAL;
750         /*                                        750         /*
751          * shmem_zero_setup is invoked in mmap    751          * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
752          * it will overwrite vm_ops, so vma_is    752          * it will overwrite vm_ops, so vma_is_anonymous must return false.
753          */                                       753          */
754         if (WARN_ON_ONCE(vma_is_anonymous(dst_    754         if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
755             dst_vma->vm_flags & VM_SHARED))       755             dst_vma->vm_flags & VM_SHARED))
756                 goto out_unlock;                  756                 goto out_unlock;
757                                                   757 
758         /*                                        758         /*
759          * validate 'mode' now that we know th    759          * validate 'mode' now that we know the dst_vma: don't allow
760          * a wrprotect copy if the userfaultfd    760          * a wrprotect copy if the userfaultfd didn't register as WP.
761          */                                       761          */
762         if ((flags & MFILL_ATOMIC_WP) && !(dst    762         if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
763                 goto out_unlock;                  763                 goto out_unlock;
764                                                   764 
765         /*                                        765         /*
766          * If this is a HUGETLB vma, pass off     766          * If this is a HUGETLB vma, pass off to appropriate routine
767          */                                       767          */
768         if (is_vm_hugetlb_page(dst_vma))          768         if (is_vm_hugetlb_page(dst_vma))
769                 return  mfill_atomic_hugetlb(c    769                 return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
770                                              s    770                                              src_start, len, flags);
771                                                   771 
772         if (!vma_is_anonymous(dst_vma) && !vma    772         if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
773                 goto out_unlock;                  773                 goto out_unlock;
774         if (!vma_is_shmem(dst_vma) &&             774         if (!vma_is_shmem(dst_vma) &&
775             uffd_flags_mode_is(flags, MFILL_AT    775             uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
776                 goto out_unlock;                  776                 goto out_unlock;
777                                                   777 
778         while (src_addr < src_start + len) {      778         while (src_addr < src_start + len) {
779                 pmd_t dst_pmdval;                 779                 pmd_t dst_pmdval;
780                                                   780 
781                 BUG_ON(dst_addr >= dst_start +    781                 BUG_ON(dst_addr >= dst_start + len);
782                                                   782 
783                 dst_pmd = mm_alloc_pmd(dst_mm,    783                 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
784                 if (unlikely(!dst_pmd)) {         784                 if (unlikely(!dst_pmd)) {
785                         err = -ENOMEM;            785                         err = -ENOMEM;
786                         break;                    786                         break;
787                 }                                 787                 }
788                                                   788 
789                 dst_pmdval = pmdp_get_lockless    789                 dst_pmdval = pmdp_get_lockless(dst_pmd);
790                 if (unlikely(pmd_none(dst_pmdv    790                 if (unlikely(pmd_none(dst_pmdval)) &&
791                     unlikely(__pte_alloc(dst_m    791                     unlikely(__pte_alloc(dst_mm, dst_pmd))) {
792                         err = -ENOMEM;            792                         err = -ENOMEM;
793                         break;                    793                         break;
794                 }                                 794                 }
795                 dst_pmdval = pmdp_get_lockless    795                 dst_pmdval = pmdp_get_lockless(dst_pmd);
796                 /*                                796                 /*
797                  * If the dst_pmd is THP don't    797                  * If the dst_pmd is THP don't override it and just be strict.
798                  * (This includes the case whe    798                  * (This includes the case where the PMD used to be THP and
799                  * changed back to none after     799                  * changed back to none after __pte_alloc().)
800                  */                               800                  */
801                 if (unlikely(!pmd_present(dst_    801                 if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
802                              pmd_devmap(dst_pm    802                              pmd_devmap(dst_pmdval))) {
803                         err = -EEXIST;            803                         err = -EEXIST;
804                         break;                    804                         break;
805                 }                                 805                 }
806                 if (unlikely(pmd_bad(dst_pmdva    806                 if (unlikely(pmd_bad(dst_pmdval))) {
807                         err = -EFAULT;            807                         err = -EFAULT;
808                         break;                    808                         break;
809                 }                                 809                 }
810                 /*                                810                 /*
811                  * For shmem mappings, khugepa    811                  * For shmem mappings, khugepaged is allowed to remove page
812                  * tables under us; pte_offset    812                  * tables under us; pte_offset_map_lock() will deal with that.
813                  */                               813                  */
814                                                   814 
815                 err = mfill_atomic_pte(dst_pmd    815                 err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
816                                        src_add    816                                        src_addr, flags, &folio);
817                 cond_resched();                   817                 cond_resched();
818                                                   818 
819                 if (unlikely(err == -ENOENT))     819                 if (unlikely(err == -ENOENT)) {
820                         void *kaddr;              820                         void *kaddr;
821                                                   821 
822                         up_read(&ctx->map_chan    822                         up_read(&ctx->map_changing_lock);
823                         uffd_mfill_unlock(dst_    823                         uffd_mfill_unlock(dst_vma);
824                         BUG_ON(!folio);           824                         BUG_ON(!folio);
825                                                   825 
826                         kaddr = kmap_local_fol    826                         kaddr = kmap_local_folio(folio, 0);
827                         err = copy_from_user(k    827                         err = copy_from_user(kaddr,
828                                              (    828                                              (const void __user *) src_addr,
829                                              P    829                                              PAGE_SIZE);
830                         kunmap_local(kaddr);      830                         kunmap_local(kaddr);
831                         if (unlikely(err)) {      831                         if (unlikely(err)) {
832                                 err = -EFAULT;    832                                 err = -EFAULT;
833                                 goto out;         833                                 goto out;
834                         }                         834                         }
835                         flush_dcache_folio(fol    835                         flush_dcache_folio(folio);
836                         goto retry;               836                         goto retry;
837                 } else                            837                 } else
838                         BUG_ON(folio);            838                         BUG_ON(folio);
839                                                   839 
840                 if (!err) {                       840                 if (!err) {
841                         dst_addr += PAGE_SIZE;    841                         dst_addr += PAGE_SIZE;
842                         src_addr += PAGE_SIZE;    842                         src_addr += PAGE_SIZE;
843                         copied += PAGE_SIZE;      843                         copied += PAGE_SIZE;
844                                                   844 
845                         if (fatal_signal_pendi    845                         if (fatal_signal_pending(current))
846                                 err = -EINTR;     846                                 err = -EINTR;
847                 }                                 847                 }
848                 if (err)                          848                 if (err)
849                         break;                    849                         break;
850         }                                         850         }
851                                                   851 
852 out_unlock:                                       852 out_unlock:
853         up_read(&ctx->map_changing_lock);         853         up_read(&ctx->map_changing_lock);
854         uffd_mfill_unlock(dst_vma);               854         uffd_mfill_unlock(dst_vma);
855 out:                                              855 out:
856         if (folio)                                856         if (folio)
857                 folio_put(folio);                 857                 folio_put(folio);
858         BUG_ON(copied < 0);                       858         BUG_ON(copied < 0);
859         BUG_ON(err > 0);                          859         BUG_ON(err > 0);
860         BUG_ON(!copied && !err);                  860         BUG_ON(!copied && !err);
861         return copied ? copied : err;             861         return copied ? copied : err;
862 }                                                 862 }
863                                                   863 
864 ssize_t mfill_atomic_copy(struct userfaultfd_c    864 ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
865                           unsigned long src_st    865                           unsigned long src_start, unsigned long len,
866                           uffd_flags_t flags)     866                           uffd_flags_t flags)
867 {                                                 867 {
868         return mfill_atomic(ctx, dst_start, sr    868         return mfill_atomic(ctx, dst_start, src_start, len,
869                             uffd_flags_set_mod    869                             uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
870 }                                                 870 }
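
mfill_atomic_copy() is what the UFFDIO_COPY ioctl handler in fs/userfaultfd.c ultimately calls. A minimal userspace sketch of driving it, assuming uffd is a userfaultfd already registered with UFFDIO_REGISTER_MODE_MISSING over the faulting range and src points to a populated, page-sized buffer; resolve_with_copy is a hypothetical helper name:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical helper: fill one missing page at fault_addr from src. */
static int resolve_with_copy(int uffd, unsigned long fault_addr,
                             const void *src, size_t page_size)
{
        struct uffdio_copy copy;

        memset(&copy, 0, sizeof(copy));
        copy.dst  = fault_addr;                 /* page-aligned destination */
        copy.src  = (uintptr_t)src;             /* buffer in our own address space */
        copy.len  = page_size;
        copy.mode = 0;                          /* or UFFDIO_COPY_MODE_WP */

        while (ioctl(uffd, UFFDIO_COPY, &copy) == -1) {
                if (errno != EAGAIN) {
                        perror("UFFDIO_COPY");
                        return -1;
                }
                /* -EAGAIN: concurrent mm changes, or partial progress reported
                 * via copy.copy (matching "return copied ? copied : err" above);
                 * skip what was already filled and retry the remainder. */
                if (copy.copy > 0) {
                        copy.dst += copy.copy;
                        copy.src += copy.copy;
                        copy.len -= copy.copy;
                }
                copy.copy = 0;
        }
        return 0;
}

UFFDIO_ZEROPAGE follows the same pattern with struct uffdio_zeropage when the destination only needs to be zero-filled.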
871                                                   871 
872 ssize_t mfill_atomic_zeropage(struct userfault    872 ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
873                               unsigned long st    873                               unsigned long start,
874                               unsigned long le    874                               unsigned long len)
875 {                                                 875 {
876         return mfill_atomic(ctx, start, 0, len    876         return mfill_atomic(ctx, start, 0, len,
877                             uffd_flags_set_mod    877                             uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
878 }                                                 878 }
879                                                   879 
880 ssize_t mfill_atomic_continue(struct userfault    880 ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
881                               unsigned long le    881                               unsigned long len, uffd_flags_t flags)
882 {                                                 882 {
883                                                   883 
884         /*                                        884         /*
885          * A caller might reasonably assume th    885          * A caller might reasonably assume that UFFDIO_CONTINUE contains an
886          * smp_wmb() to ensure that any writes    886          * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
887          * the thread doing the UFFDIO_CONTINU    887          * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
888          * subsequent loads from the page thro    888          * subsequent loads from the page through the newly mapped address range.
889          */                                       889          */
890         smp_wmb();                                890         smp_wmb();
891                                                   891 
892         return mfill_atomic(ctx, start, 0, len    892         return mfill_atomic(ctx, start, 0, len,
893                             uffd_flags_set_mod    893                             uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
894 }                                                 894 }
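
The smp_wmb() above pairs with the usual minor-fault flow: userspace first writes the page contents through a second, non-registered mapping of the same file, then asks the kernel to map the now-present page-cache page into the registered range. A minimal sketch, assuming uffd was registered with UFFDIO_REGISTER_MODE_MINOR and alias_addr aliases the same file page as fault_addr; resolve_minor_fault is a hypothetical helper:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical helper: populate the page via the alias mapping, then
 * install it into the registered mapping with UFFDIO_CONTINUE. */
static int resolve_minor_fault(int uffd, unsigned long fault_addr,
                               void *alias_addr, const void *data,
                               size_t page_size)
{
        struct uffdio_continue cont;

        /* Stores done through the alias are ordered before the new PTE by
         * the smp_wmb() in mfill_atomic_continue() above. */
        memcpy(alias_addr, data, page_size);

        memset(&cont, 0, sizeof(cont));
        cont.range.start = fault_addr;
        cont.range.len   = page_size;
        cont.mode        = 0;           /* or UFFDIO_CONTINUE_MODE_DONTWAKE */

        if (ioctl(uffd, UFFDIO_CONTINUE, &cont) == -1) {
                /* cont.mapped reports partial progress; EAGAIN means retry. */
                perror("UFFDIO_CONTINUE");
                return -1;
        }
        return 0;
}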
895                                                   895 
896 ssize_t mfill_atomic_poison(struct userfaultfd    896 ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
897                             unsigned long len,    897                             unsigned long len, uffd_flags_t flags)
898 {                                                 898 {
899         return mfill_atomic(ctx, start, 0, len    899         return mfill_atomic(ctx, start, 0, len,
900                             uffd_flags_set_mod    900                             uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
901 }                                                 901 }
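
mfill_atomic_poison() backs the UFFDIO_POISON ioctl on kernels that provide it: it installs poison markers so that later accesses to the range deliver SIGBUS instead of faulting into the handler again, e.g. when a VM migrates guest pages that had hardware memory errors. A minimal sketch; poison_range is a hypothetical helper:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical helper: make future accesses to [addr, addr+len) raise SIGBUS. */
static int poison_range(int uffd, unsigned long addr, size_t len)
{
        struct uffdio_poison poison;

        memset(&poison, 0, sizeof(poison));
        poison.range.start = addr;      /* page aligned */
        poison.range.len   = len;       /* multiple of the page size */
        poison.mode        = 0;         /* or UFFDIO_POISON_MODE_DONTWAKE */

        if (ioctl(uffd, UFFDIO_POISON, &poison) == -1) {
                /* poison.updated reports how many bytes were marked. */
                perror("UFFDIO_POISON");
                return -1;
        }
        return 0;
}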
902                                                   902 
903 long uffd_wp_range(struct vm_area_struct *dst_    903 long uffd_wp_range(struct vm_area_struct *dst_vma,
904                    unsigned long start, unsign    904                    unsigned long start, unsigned long len, bool enable_wp)
905 {                                                 905 {
906         unsigned int mm_cp_flags;                 906         unsigned int mm_cp_flags;
907         struct mmu_gather tlb;                    907         struct mmu_gather tlb;
908         long ret;                                 908         long ret;
909                                                   909 
910         VM_WARN_ONCE(start < dst_vma->vm_start    910         VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
911                         "The address range exc    911                         "The address range exceeds VMA boundary.\n");
912         if (enable_wp)                            912         if (enable_wp)
913                 mm_cp_flags = MM_CP_UFFD_WP;      913                 mm_cp_flags = MM_CP_UFFD_WP;
914         else                                      914         else
915                 mm_cp_flags = MM_CP_UFFD_WP_RE    915                 mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
916                                                   916 
917         /*                                        917         /*
918          * vma->vm_page_prot already reflects     918          * vma->vm_page_prot already reflects that uffd-wp is enabled for this
919          * VMA (see userfaultfd_set_vm_flags()    919          * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
920                  * to be write-protected by default wh    920                  * to be write-protected by default whenever protection changes.
921          * Try upgrading write permissions man    921          * Try upgrading write permissions manually.
922          */                                       922          */
923         if (!enable_wp && vma_wants_manual_pte    923         if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
924                 mm_cp_flags |= MM_CP_TRY_CHANG    924                 mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
925         tlb_gather_mmu(&tlb, dst_vma->vm_mm);     925         tlb_gather_mmu(&tlb, dst_vma->vm_mm);
926         ret = change_protection(&tlb, dst_vma,    926         ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
927         tlb_finish_mmu(&tlb);                     927         tlb_finish_mmu(&tlb);
928                                                   928 
929         return ret;                               929         return ret;
930 }                                                 930 }
931                                                   931 
932 int mwriteprotect_range(struct userfaultfd_ctx    932 int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
933                         unsigned long len, boo    933                         unsigned long len, bool enable_wp)
934 {                                                 934 {
935         struct mm_struct *dst_mm = ctx->mm;       935         struct mm_struct *dst_mm = ctx->mm;
936         unsigned long end = start + len;          936         unsigned long end = start + len;
937         unsigned long _start, _end;               937         unsigned long _start, _end;
938         struct vm_area_struct *dst_vma;           938         struct vm_area_struct *dst_vma;
939         unsigned long page_mask;                  939         unsigned long page_mask;
940         long err;                                 940         long err;
941         VMA_ITERATOR(vmi, dst_mm, start);         941         VMA_ITERATOR(vmi, dst_mm, start);
942                                                   942 
943         /*                                        943         /*
944          * Sanitize the command parameters:       944          * Sanitize the command parameters:
945          */                                       945          */
946         BUG_ON(start & ~PAGE_MASK);               946         BUG_ON(start & ~PAGE_MASK);
947         BUG_ON(len & ~PAGE_MASK);                 947         BUG_ON(len & ~PAGE_MASK);
948                                                   948 
949         /* Does the address range wrap, or is     949         /* Does the address range wrap, or is the span zero-sized? */
950         BUG_ON(start + len <= start);             950         BUG_ON(start + len <= start);
951                                                   951 
952         mmap_read_lock(dst_mm);                   952         mmap_read_lock(dst_mm);
953                                                   953 
954         /*                                        954         /*
955          * If memory mappings are changing bec    955          * If memory mappings are changing because of non-cooperative
956          * operation (e.g. mremap) running in     956          * operation (e.g. mremap) running in parallel, bail out and
957          * request the user to retry later        957          * request the user to retry later
958          */                                       958          */
959         down_read(&ctx->map_changing_lock);       959         down_read(&ctx->map_changing_lock);
960         err = -EAGAIN;                            960         err = -EAGAIN;
961         if (atomic_read(&ctx->mmap_changing))     961         if (atomic_read(&ctx->mmap_changing))
962                 goto out_unlock;                  962                 goto out_unlock;
963                                                   963 
964         err = -ENOENT;                            964         err = -ENOENT;
965         for_each_vma_range(vmi, dst_vma, end)     965         for_each_vma_range(vmi, dst_vma, end) {
966                                                   966 
967                 if (!userfaultfd_wp(dst_vma))     967                 if (!userfaultfd_wp(dst_vma)) {
968                         err = -ENOENT;            968                         err = -ENOENT;
969                         break;                    969                         break;
970                 }                                 970                 }
971                                                   971 
972                 if (is_vm_hugetlb_page(dst_vma    972                 if (is_vm_hugetlb_page(dst_vma)) {
973                         err = -EINVAL;            973                         err = -EINVAL;
974                         page_mask = vma_kernel    974                         page_mask = vma_kernel_pagesize(dst_vma) - 1;
975                         if ((start & page_mask    975                         if ((start & page_mask) || (len & page_mask))
976                                 break;            976                                 break;
977                 }                                 977                 }
978                                                   978 
979                 _start = max(dst_vma->vm_start    979                 _start = max(dst_vma->vm_start, start);
980                 _end = min(dst_vma->vm_end, en    980                 _end = min(dst_vma->vm_end, end);
981                                                   981 
982                 err = uffd_wp_range(dst_vma, _    982                 err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
983                                                   983 
984                 /* Return 0 on success, <0 on     984                 /* Return 0 on success, <0 on failures */
985                 if (err < 0)                      985                 if (err < 0)
986                         break;                    986                         break;
987                 err = 0;                          987                 err = 0;
988         }                                         988         }
989 out_unlock:                                       989 out_unlock:
990         up_read(&ctx->map_changing_lock);         990         up_read(&ctx->map_changing_lock);
991         mmap_read_unlock(dst_mm);                 991         mmap_read_unlock(dst_mm);
992         return err;                               992         return err;
993 }                                                 993 }
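
mwriteprotect_range() serves the UFFDIO_WRITEPROTECT ioctl. A minimal userspace sketch of toggling write protection, assuming uffd was registered with UFFDIO_REGISTER_MODE_WP over the range; set_write_protect is a hypothetical helper:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical helper: write-protect (enable != 0) or un-protect a range. */
static int set_write_protect(int uffd, unsigned long start, size_t len,
                             int enable)
{
        struct uffdio_writeprotect wp;

        memset(&wp, 0, sizeof(wp));
        wp.range.start = start;         /* page aligned */
        wp.range.len   = len;           /* multiple of the page size */
        wp.mode        = enable ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

        if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1) {
                perror("UFFDIO_WRITEPROTECT");
                return -1;              /* EAGAIN: mappings changed, retry */
        }
        return 0;
}

Calling it with enable == 0 is also how a handler resolves a write-protect fault after recording it; the call wakes the faulting thread unless UFFDIO_WRITEPROTECT_MODE_DONTWAKE is set.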
994                                                   994 
995                                                   995 
996 void double_pt_lock(spinlock_t *ptl1,             996 void double_pt_lock(spinlock_t *ptl1,
997                     spinlock_t *ptl2)             997                     spinlock_t *ptl2)
998         __acquires(ptl1)                          998         __acquires(ptl1)
999         __acquires(ptl2)                          999         __acquires(ptl2)
1000 {                                                1000 {
1001         if (ptl1 > ptl2)                         1001         if (ptl1 > ptl2)
1002                 swap(ptl1, ptl2);                1002                 swap(ptl1, ptl2);
1003         /* lock in virtual address order to a    1003         /* lock in virtual address order to avoid lock inversion */
1004         spin_lock(ptl1);                         1004         spin_lock(ptl1);
1005         if (ptl1 != ptl2)                        1005         if (ptl1 != ptl2)
1006                 spin_lock_nested(ptl2, SINGLE    1006                 spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
1007         else                                     1007         else
1008                 __acquire(ptl2);                 1008                 __acquire(ptl2);
1009 }                                                1009 }
1010                                                  1010 
1011 void double_pt_unlock(spinlock_t *ptl1,          1011 void double_pt_unlock(spinlock_t *ptl1,
1012                       spinlock_t *ptl2)          1012                       spinlock_t *ptl2)
1013         __releases(ptl1)                         1013         __releases(ptl1)
1014         __releases(ptl2)                         1014         __releases(ptl2)
1015 {                                                1015 {
1016         spin_unlock(ptl1);                       1016         spin_unlock(ptl1);
1017         if (ptl1 != ptl2)                        1017         if (ptl1 != ptl2)
1018                 spin_unlock(ptl2);               1018                 spin_unlock(ptl2);
1019         else                                     1019         else
1020                 __release(ptl2);                 1020                 __release(ptl2);
1021 }                                                1021 }
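
double_pt_lock() avoids ABBA deadlock between the two page-table locks by always taking the lower-addressed lock first, and taking it only once when both PTEs share a lock. The same idiom in plain userspace terms, as a hypothetical pthread-based analogue:

#include <pthread.h>

/* Hypothetical analogue of double_pt_lock()/double_pt_unlock(): order the
 * acquisitions by lock address so two threads locking the same pair can
 * never deadlock, and lock only once if both pointers are the same lock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a > b) {                    /* order by address, as the kernel does */
                pthread_mutex_t *tmp = a;
                a = b;
                b = tmp;
        }
        pthread_mutex_lock(a);
        if (a != b)
                pthread_mutex_lock(b);
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}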
1022                                                  1022 
1023                                                  1023 
1024 static int move_present_pte(struct mm_struct     1024 static int move_present_pte(struct mm_struct *mm,
1025                             struct vm_area_st    1025                             struct vm_area_struct *dst_vma,
1026                             struct vm_area_st    1026                             struct vm_area_struct *src_vma,
1027                             unsigned long dst    1027                             unsigned long dst_addr, unsigned long src_addr,
1028                             pte_t *dst_pte, p    1028                             pte_t *dst_pte, pte_t *src_pte,
1029                             pte_t orig_dst_pt    1029                             pte_t orig_dst_pte, pte_t orig_src_pte,
1030                             spinlock_t *dst_p    1030                             spinlock_t *dst_ptl, spinlock_t *src_ptl,
1031                             struct folio *src    1031                             struct folio *src_folio)
1032 {                                                1032 {
1033         int err = 0;                             1033         int err = 0;
1034                                                  1034 
1035         double_pt_lock(dst_ptl, src_ptl);        1035         double_pt_lock(dst_ptl, src_ptl);
1036                                                  1036 
1037         if (!pte_same(ptep_get(src_pte), orig    1037         if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1038             !pte_same(ptep_get(dst_pte), orig    1038             !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1039                 err = -EAGAIN;                   1039                 err = -EAGAIN;
1040                 goto out;                        1040                 goto out;
1041         }                                        1041         }
1042         if (folio_test_large(src_folio) ||       1042         if (folio_test_large(src_folio) ||
1043             folio_maybe_dma_pinned(src_folio)    1043             folio_maybe_dma_pinned(src_folio) ||
1044             !PageAnonExclusive(&src_folio->pa    1044             !PageAnonExclusive(&src_folio->page)) {
1045                 err = -EBUSY;                    1045                 err = -EBUSY;
1046                 goto out;                        1046                 goto out;
1047         }                                        1047         }
1048                                                  1048 
1049         orig_src_pte = ptep_clear_flush(src_v    1049         orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
1050         /* Folio got pinned from under us. Pu    1050         /* Folio got pinned from under us. Put it back and fail the move. */
1051         if (folio_maybe_dma_pinned(src_folio)    1051         if (folio_maybe_dma_pinned(src_folio)) {
1052                 set_pte_at(mm, src_addr, src_    1052                 set_pte_at(mm, src_addr, src_pte, orig_src_pte);
1053                 err = -EBUSY;                    1053                 err = -EBUSY;
1054                 goto out;                        1054                 goto out;
1055         }                                        1055         }
1056                                                  1056 
1057         folio_move_anon_rmap(src_folio, dst_v    1057         folio_move_anon_rmap(src_folio, dst_vma);
1058         src_folio->index = linear_page_index(    1058         src_folio->index = linear_page_index(dst_vma, dst_addr);
1059                                                  1059 
1060         orig_dst_pte = mk_pte(&src_folio->pag    1060         orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
1061         /* Follow mremap() behavior and treat    1061         /* Follow mremap() behavior and treat the entry as dirty after the move */
1062         orig_dst_pte = pte_mkwrite(pte_mkdirt    1062         orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
1063                                                  1063 
1064         set_pte_at(mm, dst_addr, dst_pte, ori    1064         set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
1065 out:                                             1065 out:
1066         double_pt_unlock(dst_ptl, src_ptl);      1066         double_pt_unlock(dst_ptl, src_ptl);
1067         return err;                              1067         return err;
1068 }                                                1068 }
1069                                                  1069 
1070 static int move_swap_pte(struct mm_struct *mm    1070 static int move_swap_pte(struct mm_struct *mm,
1071                          unsigned long dst_ad    1071                          unsigned long dst_addr, unsigned long src_addr,
1072                          pte_t *dst_pte, pte_    1072                          pte_t *dst_pte, pte_t *src_pte,
1073                          pte_t orig_dst_pte,     1073                          pte_t orig_dst_pte, pte_t orig_src_pte,
1074                          spinlock_t *dst_ptl,    1074                          spinlock_t *dst_ptl, spinlock_t *src_ptl)
1075 {                                                1075 {
1076         if (!pte_swp_exclusive(orig_src_pte))    1076         if (!pte_swp_exclusive(orig_src_pte))
1077                 return -EBUSY;                   1077                 return -EBUSY;
1078                                                  1078 
1079         double_pt_lock(dst_ptl, src_ptl);        1079         double_pt_lock(dst_ptl, src_ptl);
1080                                                  1080 
1081         if (!pte_same(ptep_get(src_pte), orig    1081         if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1082             !pte_same(ptep_get(dst_pte), orig    1082             !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1083                 double_pt_unlock(dst_ptl, src    1083                 double_pt_unlock(dst_ptl, src_ptl);
1084                 return -EAGAIN;                  1084                 return -EAGAIN;
1085         }                                        1085         }
1086                                                  1086 
1087         orig_src_pte = ptep_get_and_clear(mm,    1087         orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
1088         set_pte_at(mm, dst_addr, dst_pte, ori    1088         set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
1089         double_pt_unlock(dst_ptl, src_ptl);      1089         double_pt_unlock(dst_ptl, src_ptl);
1090                                                  1090 
1091         return 0;                                1091         return 0;
1092 }                                                1092 }
1093                                                  1093 
1094 static int move_zeropage_pte(struct mm_struct    1094 static int move_zeropage_pte(struct mm_struct *mm,
1095                              struct vm_area_s    1095                              struct vm_area_struct *dst_vma,
1096                              struct vm_area_s    1096                              struct vm_area_struct *src_vma,
1097                              unsigned long ds    1097                              unsigned long dst_addr, unsigned long src_addr,
1098                              pte_t *dst_pte,     1098                              pte_t *dst_pte, pte_t *src_pte,
1099                              pte_t orig_dst_p    1099                              pte_t orig_dst_pte, pte_t orig_src_pte,
1100                              spinlock_t *dst_    1100                              spinlock_t *dst_ptl, spinlock_t *src_ptl)
1101 {                                                1101 {
1102         pte_t zero_pte;                          1102         pte_t zero_pte;
1103                                                  1103 
1104         double_pt_lock(dst_ptl, src_ptl);        1104         double_pt_lock(dst_ptl, src_ptl);
1105         if (!pte_same(ptep_get(src_pte), orig    1105         if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1106             !pte_same(ptep_get(dst_pte), orig    1106             !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1107                 double_pt_unlock(dst_ptl, src    1107                 double_pt_unlock(dst_ptl, src_ptl);
1108                 return -EAGAIN;                  1108                 return -EAGAIN;
1109         }                                        1109         }
1110                                                  1110 
1111         zero_pte = pte_mkspecial(pfn_pte(my_z    1111         zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
1112                                          dst_    1112                                          dst_vma->vm_page_prot));
1113         ptep_clear_flush(src_vma, src_addr, s    1113         ptep_clear_flush(src_vma, src_addr, src_pte);
1114         set_pte_at(mm, dst_addr, dst_pte, zer    1114         set_pte_at(mm, dst_addr, dst_pte, zero_pte);
1115         double_pt_unlock(dst_ptl, src_ptl);      1115         double_pt_unlock(dst_ptl, src_ptl);
1116                                                  1116 
1117         return 0;                                1117         return 0;
1118 }                                                1118 }
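
These three helpers (present, swap and zero-page PTEs) are the per-PTE legs of the UFFDIO_MOVE ioctl, which remaps an existing anonymous page into the destination instead of copying it. A minimal userspace sketch, assuming a kernel new enough to provide UFFDIO_MOVE and that uffd is registered over the destination range; move_one_page is a hypothetical helper:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical helper: remap one anonymous page from src to dst. */
static int move_one_page(int uffd, unsigned long dst, unsigned long src,
                         size_t page_size)
{
        struct uffdio_move mv;

        memset(&mv, 0, sizeof(mv));
        mv.dst  = dst;                  /* page aligned, currently unmapped */
        mv.src  = src;                  /* page aligned, exclusive anonymous page */
        mv.len  = page_size;
        mv.mode = 0;                    /* or UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES */

        if (ioctl(uffd, UFFDIO_MOVE, &mv) == -1) {
                if (errno == EBUSY)
                        /* Page is shared or pinned (see the PageAnonExclusive and
                         * folio_maybe_dma_pinned checks above): fall back to copy. */
                        fprintf(stderr, "UFFDIO_MOVE: page busy, use UFFDIO_COPY\n");
                else
                        perror("UFFDIO_MOVE");
                return -1;
        }
        return 0;
}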
1119                                                  1119 
1120                                                  1120 
1121 /*                                               1121 /*
1122  * The mmap_lock for reading is held by the c    1122  * The mmap_lock for reading is held by the caller. Just move the page
1123  * from src_pmd to dst_pmd if possible, and r    1123  * from src_pmd to dst_pmd if possible, and return 0 if it succeeded
1124  * in moving the page, otherwise a negative     1124  * in moving the page, otherwise a negative error code.
1125  */                                              1125  */
1126 static int move_pages_pte(struct mm_struct *m    1126 static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
1127                           struct vm_area_stru    1127                           struct vm_area_struct *dst_vma,
1128                           struct vm_area_stru    1128                           struct vm_area_struct *src_vma,
1129                           unsigned long dst_a    1129                           unsigned long dst_addr, unsigned long src_addr,
1130                           __u64 mode)            1130                           __u64 mode)
1131 {                                                1131 {
1132         swp_entry_t entry;                       1132         swp_entry_t entry;
1133         pte_t orig_src_pte, orig_dst_pte;        1133         pte_t orig_src_pte, orig_dst_pte;
1134         pte_t src_folio_pte;                     1134         pte_t src_folio_pte;
1135         spinlock_t *src_ptl, *dst_ptl;           1135         spinlock_t *src_ptl, *dst_ptl;
1136         pte_t *src_pte = NULL;                   1136         pte_t *src_pte = NULL;
1137         pte_t *dst_pte = NULL;                   1137         pte_t *dst_pte = NULL;
1138                                                  1138 
1139         struct folio *src_folio = NULL;          1139         struct folio *src_folio = NULL;
1140         struct anon_vma *src_anon_vma = NULL;    1140         struct anon_vma *src_anon_vma = NULL;
1141         struct mmu_notifier_range range;         1141         struct mmu_notifier_range range;
1142         int err = 0;                             1142         int err = 0;
1143                                                  1143 
1144         flush_cache_range(src_vma, src_addr,     1144         flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
1145         mmu_notifier_range_init(&range, MMU_N    1145         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1146                                 src_addr, src    1146                                 src_addr, src_addr + PAGE_SIZE);
1147         mmu_notifier_invalidate_range_start(&    1147         mmu_notifier_invalidate_range_start(&range);
1148 retry:                                           1148 retry:
1149         dst_pte = pte_offset_map_nolock(mm, d    1149         dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
1150                                                  1150 
1151         /* Retry if a huge pmd materialized f    1151         /* Retry if a huge pmd materialized from under us */
1152         if (unlikely(!dst_pte)) {                1152         if (unlikely(!dst_pte)) {
1153                 err = -EAGAIN;                   1153                 err = -EAGAIN;
1154                 goto out;                        1154                 goto out;
1155         }                                        1155         }
1156                                                  1156 
1157         src_pte = pte_offset_map_nolock(mm, s    1157         src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
1158                                                  1158 
1159         /*                                       1159         /*
1160          * We held the mmap_lock for reading     1160          * We held the mmap_lock for reading so MADV_DONTNEED
1161          * can zap transparent huge pages und    1161          * can zap transparent huge pages under us, or the
1162          * transparent huge page fault can es    1162          * transparent huge page fault can establish new
1163          * transparent huge pages under us.      1163          * transparent huge pages under us.
1164          */                                      1164          */
1165         if (unlikely(!src_pte)) {                1165         if (unlikely(!src_pte)) {
1166                 err = -EAGAIN;                   1166                 err = -EAGAIN;
1167                 goto out;                        1167                 goto out;
1168         }                                        1168         }
1169                                                  1169 
1170         /* Sanity checks before the operation    1170         /* Sanity checks before the operation */
1171         if (WARN_ON_ONCE(pmd_none(*dst_pmd))     1171         if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
1172             WARN_ON_ONCE(pmd_trans_huge(*dst_    1172             WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
1173                 err = -EINVAL;                   1173                 err = -EINVAL;
1174                 goto out;                        1174                 goto out;
1175         }                                        1175         }
1176                                                  1176 
1177         spin_lock(dst_ptl);                      1177         spin_lock(dst_ptl);
1178         orig_dst_pte = ptep_get(dst_pte);        1178         orig_dst_pte = ptep_get(dst_pte);
1179         spin_unlock(dst_ptl);                    1179         spin_unlock(dst_ptl);
1180         if (!pte_none(orig_dst_pte)) {           1180         if (!pte_none(orig_dst_pte)) {
1181                 err = -EEXIST;                   1181                 err = -EEXIST;
1182                 goto out;                        1182                 goto out;
1183         }                                        1183         }
1184                                                  1184 
1185         spin_lock(src_ptl);                      1185         spin_lock(src_ptl);
1186         orig_src_pte = ptep_get(src_pte);        1186         orig_src_pte = ptep_get(src_pte);
1187         spin_unlock(src_ptl);                    1187         spin_unlock(src_ptl);
1188         if (pte_none(orig_src_pte)) {            1188         if (pte_none(orig_src_pte)) {
1189                 if (!(mode & UFFDIO_MOVE_MODE    1189                 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
1190                         err = -ENOENT;           1190                         err = -ENOENT;
1191                 else /* nothing to do to move    1191                 else /* nothing to do to move a hole */
1192                         err = 0;                 1192                         err = 0;
1193                 goto out;                        1193                 goto out;
1194         }                                        1194         }
1195                                                  1195 
1196         /* If PTE changed after we locked the    1196         /* If PTE changed after we locked the folio then start over */
1197         if (src_folio && unlikely(!pte_same(s    1197         if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
1198                 err = -EAGAIN;                   1198                 err = -EAGAIN;
1199                 goto out;                        1199                 goto out;
1200         }                                        1200         }
1201                                                  1201 
1202         if (pte_present(orig_src_pte)) {         1202         if (pte_present(orig_src_pte)) {
1203                 if (is_zero_pfn(pte_pfn(orig_    1203                 if (is_zero_pfn(pte_pfn(orig_src_pte))) {
1204                         err = move_zeropage_p    1204                         err = move_zeropage_pte(mm, dst_vma, src_vma,
1205                                                  1205                                                dst_addr, src_addr, dst_pte, src_pte,
1206                                                  1206                                                orig_dst_pte, orig_src_pte,
1207                                                  1207                                                dst_ptl, src_ptl);
1208                         goto out;                1208                         goto out;
1209                 }                                1209                 }
1210                                                  1210 
1211                 /*                               1211                 /*
1212                  * Pin and lock both source f    1212                  * Pin and lock both source folio and anon_vma. Since we are in
1213                  * an RCU read section, we can't    1213                  * an RCU read section, we can't block, so on contention we have to
1214                  * unmap the ptes, obtain the    1214                  * unmap the ptes, obtain the lock and retry.
1215                  */                              1215                  */
1216                 if (!src_folio) {                1216                 if (!src_folio) {
1217                         struct folio *folio;     1217                         struct folio *folio;
1218                                                  1218 
1219                         /*                       1219                         /*
1220                          * Pin the page while    1220                          * Pin the page while holding the lock to be sure the
1221                          * page isn't freed u    1221                          * page isn't freed under us
1222                          */                      1222                          */
1223                         spin_lock(src_ptl);      1223                         spin_lock(src_ptl);
1224                         if (!pte_same(orig_sr    1224                         if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
1225                                 spin_unlock(s    1225                                 spin_unlock(src_ptl);
1226                                 err = -EAGAIN    1226                                 err = -EAGAIN;
1227                                 goto out;        1227                                 goto out;
1228                         }                        1228                         }
1229                                                  1229 
1230                         folio = vm_normal_fol    1230                         folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
1231                         if (!folio || !PageAn    1231                         if (!folio || !PageAnonExclusive(&folio->page)) {
1232                                 spin_unlock(s    1232                                 spin_unlock(src_ptl);
1233                                 err = -EBUSY;    1233                                 err = -EBUSY;
1234                                 goto out;        1234                                 goto out;
1235                         }                        1235                         }
1236                                                  1236 
1237                         folio_get(folio);        1237                         folio_get(folio);
1238                         src_folio = folio;       1238                         src_folio = folio;
1239                         src_folio_pte = orig_    1239                         src_folio_pte = orig_src_pte;
1240                         spin_unlock(src_ptl);    1240                         spin_unlock(src_ptl);
1241                                                  1241 
1242                         if (!folio_trylock(sr    1242                         if (!folio_trylock(src_folio)) {
1243                                 pte_unmap(&or    1243                                 pte_unmap(&orig_src_pte);
1244                                 pte_unmap(&or    1244                                 pte_unmap(&orig_dst_pte);
1245                                 src_pte = dst    1245                                 src_pte = dst_pte = NULL;
1246                                 /* now we can    1246                                 /* now we can block and wait */
1247                                 folio_lock(sr    1247                                 folio_lock(src_folio);
1248                                 goto retry;      1248                                 goto retry;
1249                         }                        1249                         }
1250                                                  1250 
1251                         if (WARN_ON_ONCE(!fol    1251                         if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
1252                                 err = -EBUSY;    1252                                 err = -EBUSY;
1253                                 goto out;        1253                                 goto out;
1254                         }                        1254                         }
1255                 }                                1255                 }
1256                                                  1256 
1257                 /* at this point we have src_    1257                 /* at this point we have src_folio locked */
1258                 if (folio_test_large(src_foli    1258                 if (folio_test_large(src_folio)) {
1259                         /* split_folio() can     1259                         /* split_folio() can block */
1260                         pte_unmap(&orig_src_p    1260                         pte_unmap(&orig_src_pte);
1261                         pte_unmap(&orig_dst_p    1261                         pte_unmap(&orig_dst_pte);
1262                         src_pte = dst_pte = N    1262                         src_pte = dst_pte = NULL;
1263                         err = split_folio(src    1263                         err = split_folio(src_folio);
1264                         if (err)                 1264                         if (err)
1265                                 goto out;        1265                                 goto out;
1266                         /* have to reacquire     1266                         /* have to reacquire the folio after it got split */
1267                         folio_unlock(src_foli    1267                         folio_unlock(src_folio);
1268                         folio_put(src_folio);    1268                         folio_put(src_folio);
1269                         src_folio = NULL;        1269                         src_folio = NULL;
1270                         goto retry;              1270                         goto retry;
1271                 }                                1271                 }
1272                                                  1272 
1273                 if (!src_anon_vma) {             1273                 if (!src_anon_vma) {
1274                         /*                       1274                         /*
1275                          * folio_referenced w    1275                          * folio_referenced walks the anon_vma chain
1276                          * without the folio     1276                          * without the folio lock. Serialize against it with
1277                          * the anon_vma lock,    1277                          * the anon_vma lock, the folio lock is not enough.
1278                          */                      1278                          */
1279                         src_anon_vma = folio_    1279                         src_anon_vma = folio_get_anon_vma(src_folio);
1280                         if (!src_anon_vma) {     1280                         if (!src_anon_vma) {
1281                                 /* page was u    1281                                 /* page was unmapped from under us */
1282                                 err = -EAGAIN    1282                                 err = -EAGAIN;
1283                                 goto out;        1283                                 goto out;
1284                         }                        1284                         }
1285                         if (!anon_vma_trylock    1285                         if (!anon_vma_trylock_write(src_anon_vma)) {
1286                                 pte_unmap(&or    1286                                 pte_unmap(&orig_src_pte);
1287                                 pte_unmap(&or    1287                                 pte_unmap(&orig_dst_pte);
1288                                 src_pte = dst    1288                                 src_pte = dst_pte = NULL;
1289                                 /* now we can    1289                                 /* now we can block and wait */
1290                                 anon_vma_lock    1290                                 anon_vma_lock_write(src_anon_vma);
1291                                 goto retry;      1291                                 goto retry;
1292                         }                        1292                         }
1293                 }                                1293                 }
1294                                                  1294 
1295                 err = move_present_pte(mm,  d    1295                 err = move_present_pte(mm,  dst_vma, src_vma,
1296                                        dst_ad    1296                                        dst_addr, src_addr, dst_pte, src_pte,
1297                                        orig_d    1297                                        orig_dst_pte, orig_src_pte,
1298                                        dst_pt    1298                                        dst_ptl, src_ptl, src_folio);
1299         } else {                                 1299         } else {
1300                 entry = pte_to_swp_entry(orig    1300                 entry = pte_to_swp_entry(orig_src_pte);
1301                 if (non_swap_entry(entry)) {     1301                 if (non_swap_entry(entry)) {
1302                         if (is_migration_entr    1302                         if (is_migration_entry(entry)) {
1303                                 pte_unmap(&or    1303                                 pte_unmap(&orig_src_pte);
1304                                 pte_unmap(&or    1304                                 pte_unmap(&orig_dst_pte);
1305                                 src_pte = dst    1305                                 src_pte = dst_pte = NULL;
1306                                 migration_ent    1306                                 migration_entry_wait(mm, src_pmd, src_addr);
1307                                 err = -EAGAIN    1307                                 err = -EAGAIN;
1308                         } else                   1308                         } else
1309                                 err = -EFAULT    1309                                 err = -EFAULT;
1310                         goto out;                1310                         goto out;
1311                 }                                1311                 }
1312                                                  1312 
1313                 err = move_swap_pte(mm, dst_a    1313                 err = move_swap_pte(mm, dst_addr, src_addr,
1314                                     dst_pte,     1314                                     dst_pte, src_pte,
1315                                     orig_dst_    1315                                     orig_dst_pte, orig_src_pte,
1316                                     dst_ptl,     1316                                     dst_ptl, src_ptl);
1317         }                                        1317         }
1318                                                  1318 
1319 out:                                             1319 out:
1320         if (src_anon_vma) {                      1320         if (src_anon_vma) {
1321                 anon_vma_unlock_write(src_ano    1321                 anon_vma_unlock_write(src_anon_vma);
1322                 put_anon_vma(src_anon_vma);      1322                 put_anon_vma(src_anon_vma);
1323         }                                        1323         }
1324         if (src_folio) {                         1324         if (src_folio) {
1325                 folio_unlock(src_folio);         1325                 folio_unlock(src_folio);
1326                 folio_put(src_folio);            1326                 folio_put(src_folio);
1327         }                                        1327         }
1328         if (dst_pte)                             1328         if (dst_pte)
1329                 pte_unmap(dst_pte);              1329                 pte_unmap(dst_pte);
1330         if (src_pte)                             1330         if (src_pte)
1331                 pte_unmap(src_pte);              1331                 pte_unmap(src_pte);
1332         mmu_notifier_invalidate_range_end(&ra    1332         mmu_notifier_invalidate_range_end(&range);
1333                                                  1333 
1334         return err;                              1334         return err;
1335 }                                                1335 }
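
The trylock-then-retry dance above is a general pattern rather than anything userfaultfd-specific: while the PTEs are mapped we are in a non-sleepable context, so only folio_trylock()/anon_vma_trylock_write() may be attempted; on contention the PTE maps are dropped, the lock is taken sleeping, and everything is revalidated from the top. Below is a minimal user-space sketch of the same idea using pthreads; map_resources()/unmap_resources() and do_move() are hypothetical stand-ins for the PTE mapping and the actual move, not functions from this file.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical stand-ins: enter/leave a context in which sleeping is not allowed. */
static void map_resources(void)   { }
static void unmap_resources(void) { }
static bool do_move(void)         { return true; }

static bool move_one(void)
{
	bool have_lock = false;
	bool done;

retry:
	map_resources();				/* no sleeping from here on */
	if (!have_lock && pthread_mutex_trylock(&object_lock)) {
		unmap_resources();			/* drop the non-sleepable state */
		pthread_mutex_lock(&object_lock);	/* now we may block */
		have_lock = true;
		goto retry;				/* revalidate with the lock held */
	}
	have_lock = true;
	done = do_move();
	pthread_mutex_unlock(&object_lock);
	unmap_resources();
	return done;
}

int main(void)
{
	return move_one() ? 0 : 1;
}

In the kernel function the retry path additionally re-maps and re-checks the PTEs (pte_same() against the cached values), since they may have changed while we slept.
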
1336                                                  1336 
1337 #ifdef CONFIG_TRANSPARENT_HUGEPAGE               1337 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1338 static inline bool move_splits_huge_pmd(unsig    1338 static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1339                                         unsig    1339                                         unsigned long src_addr,
1340                                         unsig    1340                                         unsigned long src_end)
1341 {                                                1341 {
1342         return (src_addr & ~HPAGE_PMD_MASK) |    1342         return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
1343                 src_end - src_addr < HPAGE_PM    1343                 src_end - src_addr < HPAGE_PMD_SIZE;
1344 }                                                1344 }
1345 #else                                            1345 #else
1346 static inline bool move_splits_huge_pmd(unsig    1346 static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1347                                         unsig    1347                                         unsigned long src_addr,
1348                                         unsig    1348                                         unsigned long src_end)
1349 {                                                1349 {
1350         /* This is unreachable anyway, just t    1350         /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
1351         return false;                            1351         return false;
1352 }                                                1352 }
1353 #endif                                           1353 #endif
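
To make the check above concrete, here is a stand-alone copy of the helper with a few worked cases. HPAGE_PMD_SIZE is assumed to be the common x86-64 value of 2 MiB (the real constant comes from the kernel headers). A PMD-level move is only attempted when both addresses are huge-page aligned and at least one full huge page of the source range remains; otherwise the huge PMD would have to be split and the pages are moved at PTE granularity instead.

#include <assert.h>
#include <stdbool.h>

/* Assumed x86-64 value; in the kernel this comes from <linux/huge_mm.h>. */
#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

static bool move_splits_huge_pmd(unsigned long dst_addr,
				 unsigned long src_addr,
				 unsigned long src_end)
{
	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
		src_end - src_addr < HPAGE_PMD_SIZE;
}

int main(void)
{
	/* Both addresses 2 MiB-aligned and a full 2 MiB left: no split needed. */
	assert(!move_splits_huge_pmd(0x400000, 0x200000, 0x200000 + HPAGE_PMD_SIZE));
	/* Source misaligned by one page: the huge PMD would have to be split. */
	assert(move_splits_huge_pmd(0x400000, 0x201000, 0x201000 + HPAGE_PMD_SIZE));
	/* Aligned, but less than 2 MiB of source range remains: split. */
	assert(move_splits_huge_pmd(0x400000, 0x200000, 0x200000 + HPAGE_PMD_SIZE - 0x1000));
	return 0;
}
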
1354                                                  1354 
1355 static inline bool vma_move_compatible(struct    1355 static inline bool vma_move_compatible(struct vm_area_struct *vma)
1356 {                                                1356 {
1357         return !(vma->vm_flags & (VM_PFNMAP |    1357         return !(vma->vm_flags & (VM_PFNMAP | VM_IO |  VM_HUGETLB |
1358                                   VM_MIXEDMAP    1358                                   VM_MIXEDMAP | VM_SHADOW_STACK));
1359 }                                                1359 }
1360                                                  1360 
1361 static int validate_move_areas(struct userfau    1361 static int validate_move_areas(struct userfaultfd_ctx *ctx,
1362                                struct vm_area    1362                                struct vm_area_struct *src_vma,
1363                                struct vm_area    1363                                struct vm_area_struct *dst_vma)
1364 {                                                1364 {
1365         /* Only allow moving if both have the    1365         /* Only allow moving if both have the same access and protection */
1366         if ((src_vma->vm_flags & VM_ACCESS_FL    1366         if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
1367             pgprot_val(src_vma->vm_page_prot)    1367             pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
1368                 return -EINVAL;                  1368                 return -EINVAL;
1369                                                  1369 
1370         /* Only allow moving if both are mloc    1370         /* Only allow moving if both are mlocked or both aren't */
1371         if ((src_vma->vm_flags & VM_LOCKED) !    1371         if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
1372                 return -EINVAL;                  1372                 return -EINVAL;
1373                                                  1373 
1374         /*                                       1374         /*
1375          * For now, we keep it simple and onl    1375          * For now, we keep it simple and only move between writable VMAs.
1376          * Access flags are equal, therefore     1376          * Access flags are equal, therefore checking only the source is enough.
1377          */                                      1377          */
1378         if (!(src_vma->vm_flags & VM_WRITE))     1378         if (!(src_vma->vm_flags & VM_WRITE))
1379                 return -EINVAL;                  1379                 return -EINVAL;
1380                                                  1380 
1381         /* Check if vma flags indicate conten    1381         /* Check if vma flags indicate content which can be moved */
1382         if (!vma_move_compatible(src_vma) ||     1382         if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
1383                 return -EINVAL;                  1383                 return -EINVAL;
1384                                                  1384 
1385         /* Ensure dst_vma is registered in uf    1385         /* Ensure dst_vma is registered in uffd we are operating on */
1386         if (!dst_vma->vm_userfaultfd_ctx.ctx     1386         if (!dst_vma->vm_userfaultfd_ctx.ctx ||
1387             dst_vma->vm_userfaultfd_ctx.ctx !    1387             dst_vma->vm_userfaultfd_ctx.ctx != ctx)
1388                 return -EINVAL;                  1388                 return -EINVAL;
1389                                                  1389 
1390         /* Only allow moving across anonymous    1390         /* Only allow moving across anonymous vmas */
1391         if (!vma_is_anonymous(src_vma) || !vm    1391         if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
1392                 return -EINVAL;                  1392                 return -EINVAL;
1393                                                  1393 
1394         return 0;                                1394         return 0;
1395 }                                                1395 }
1396                                                  1396 
1397 static __always_inline                           1397 static __always_inline
1398 int find_vmas_mm_locked(struct mm_struct *mm,    1398 int find_vmas_mm_locked(struct mm_struct *mm,
1399                         unsigned long dst_sta    1399                         unsigned long dst_start,
1400                         unsigned long src_sta    1400                         unsigned long src_start,
1401                         struct vm_area_struct    1401                         struct vm_area_struct **dst_vmap,
1402                         struct vm_area_struct    1402                         struct vm_area_struct **src_vmap)
1403 {                                                1403 {
1404         struct vm_area_struct *vma;              1404         struct vm_area_struct *vma;
1405                                                  1405 
1406         mmap_assert_locked(mm);                  1406         mmap_assert_locked(mm);
1407         vma = find_vma_and_prepare_anon(mm, d    1407         vma = find_vma_and_prepare_anon(mm, dst_start);
1408         if (IS_ERR(vma))                         1408         if (IS_ERR(vma))
1409                 return PTR_ERR(vma);             1409                 return PTR_ERR(vma);
1410                                                  1410 
1411         *dst_vmap = vma;                         1411         *dst_vmap = vma;
1412         /* Skip finding src_vma if src_start     1412         /* Skip finding src_vma if src_start is in dst_vma */
1413         if (src_start >= vma->vm_start && src    1413         if (src_start >= vma->vm_start && src_start < vma->vm_end)
1414                 goto out_success;                1414                 goto out_success;
1415                                                  1415 
1416         vma = vma_lookup(mm, src_start);         1416         vma = vma_lookup(mm, src_start);
1417         if (!vma)                                1417         if (!vma)
1418                 return -ENOENT;                  1418                 return -ENOENT;
1419 out_success:                                     1419 out_success:
1420         *src_vmap = vma;                         1420         *src_vmap = vma;
1421         return 0;                                1421         return 0;
1422 }                                                1422 }
1423                                                  1423 
1424 #ifdef CONFIG_PER_VMA_LOCK                       1424 #ifdef CONFIG_PER_VMA_LOCK
1425 static int uffd_move_lock(struct mm_struct *m    1425 static int uffd_move_lock(struct mm_struct *mm,
1426                           unsigned long dst_s    1426                           unsigned long dst_start,
1427                           unsigned long src_s    1427                           unsigned long src_start,
1428                           struct vm_area_stru    1428                           struct vm_area_struct **dst_vmap,
1429                           struct vm_area_stru    1429                           struct vm_area_struct **src_vmap)
1430 {                                                1430 {
1431         struct vm_area_struct *vma;              1431         struct vm_area_struct *vma;
1432         int err;                                 1432         int err;
1433                                                  1433 
1434         vma = uffd_lock_vma(mm, dst_start);      1434         vma = uffd_lock_vma(mm, dst_start);
1435         if (IS_ERR(vma))                         1435         if (IS_ERR(vma))
1436                 return PTR_ERR(vma);             1436                 return PTR_ERR(vma);
1437                                                  1437 
1438         *dst_vmap = vma;                         1438         *dst_vmap = vma;
1439         /*                                       1439         /*
1440          * Skip finding src_vma if src_start     1440          * Skip finding src_vma if src_start is in dst_vma. This also ensures
1441          * that we don't lock the same vma tw    1441          * that we don't lock the same vma twice.
1442          */                                      1442          */
1443         if (src_start >= vma->vm_start && src    1443         if (src_start >= vma->vm_start && src_start < vma->vm_end) {
1444                 *src_vmap = vma;                 1444                 *src_vmap = vma;
1445                 return 0;                        1445                 return 0;
1446         }                                        1446         }
1447                                                  1447 
1448         /*                                       1448         /*
1449          * Using uffd_lock_vma() to get src_v    1449          * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
1450          *                                       1450          *
1451          * Thread1                               1451          * Thread1                              Thread2
1452          * -------                               1452          * -------                              -------
1453          * vma_start_read(dst_vma)               1453          * vma_start_read(dst_vma)
1454          *                                       1454          *                                      mmap_write_lock(mm)
1455          *                                       1455          *                                      vma_start_write(src_vma)
1456          * vma_start_read(src_vma)               1456          * vma_start_read(src_vma)
1457          * mmap_read_lock(mm)                    1457          * mmap_read_lock(mm)
1458          *                                       1458          *                                      vma_start_write(dst_vma)
1459          */                                      1459          */
1460         *src_vmap = lock_vma_under_rcu(mm, sr    1460         *src_vmap = lock_vma_under_rcu(mm, src_start);
1461         if (likely(*src_vmap))                   1461         if (likely(*src_vmap))
1462                 return 0;                        1462                 return 0;
1463                                                  1463 
1464         /* Undo any locking and retry in mmap    1464         /* Undo any locking and retry in mmap_lock critical section */
1465         vma_end_read(*dst_vmap);                 1465         vma_end_read(*dst_vmap);
1466                                                  1466 
1467         mmap_read_lock(mm);                      1467         mmap_read_lock(mm);
1468         err = find_vmas_mm_locked(mm, dst_sta    1468         err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1469         if (!err) {                              1469         if (!err) {
1470                 /*                               1470                 /*
1471                  * See comment in uffd_lock_v    1471                  * See comment in uffd_lock_vma() as to why not using
1472                  * vma_start_read() here.        1472                  * vma_start_read() here.
1473                  */                              1473                  */
1474                 down_read(&(*dst_vmap)->vm_lo    1474                 down_read(&(*dst_vmap)->vm_lock->lock);
1475                 if (*dst_vmap != *src_vmap)      1475                 if (*dst_vmap != *src_vmap)
1476                         down_read_nested(&(*s    1476                         down_read_nested(&(*src_vmap)->vm_lock->lock,
1477                                          SING    1477                                          SINGLE_DEPTH_NESTING);
1478         }                                        1478         }
1479         mmap_read_unlock(mm);                    1479         mmap_read_unlock(mm);
1480         return err;                              1480         return err;
1481 }                                                1481 }
1482                                                  1482 
1483 static void uffd_move_unlock(struct vm_area_s    1483 static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1484                              struct vm_area_s    1484                              struct vm_area_struct *src_vma)
1485 {                                                1485 {
1486         vma_end_read(src_vma);                   1486         vma_end_read(src_vma);
1487         if (src_vma != dst_vma)                  1487         if (src_vma != dst_vma)
1488                 vma_end_read(dst_vma);           1488                 vma_end_read(dst_vma);
1489 }                                                1489 }
1490                                                  1490 
1491 #else                                            1491 #else
1492                                                  1492 
1493 static int uffd_move_lock(struct mm_struct *m    1493 static int uffd_move_lock(struct mm_struct *mm,
1494                           unsigned long dst_s    1494                           unsigned long dst_start,
1495                           unsigned long src_s    1495                           unsigned long src_start,
1496                           struct vm_area_stru    1496                           struct vm_area_struct **dst_vmap,
1497                           struct vm_area_stru    1497                           struct vm_area_struct **src_vmap)
1498 {                                                1498 {
1499         int err;                                 1499         int err;
1500                                                  1500 
1501         mmap_read_lock(mm);                      1501         mmap_read_lock(mm);
1502         err = find_vmas_mm_locked(mm, dst_sta    1502         err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1503         if (err)                                 1503         if (err)
1504                 mmap_read_unlock(mm);            1504                 mmap_read_unlock(mm);
1505         return err;                              1505         return err;
1506 }                                                1506 }
1507                                                  1507 
1508 static void uffd_move_unlock(struct vm_area_s    1508 static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1509                              struct vm_area_s    1509                              struct vm_area_struct *src_vma)
1510 {                                                1510 {
1511         mmap_assert_locked(src_vma->vm_mm);      1511         mmap_assert_locked(src_vma->vm_mm);
1512         mmap_read_unlock(dst_vma->vm_mm);        1512         mmap_read_unlock(dst_vma->vm_mm);
1513 }                                                1513 }
1514 #endif                                           1514 #endif
1515                                                  1515 
1516 /**                                              1516 /**
1517  * move_pages - move arbitrary anonymous page    1517  * move_pages - move arbitrary anonymous pages of an existing vma
1518  * @ctx: pointer to the userfaultfd context      1518  * @ctx: pointer to the userfaultfd context
1519  * @dst_start: start of the destination virtu    1519  * @dst_start: start of the destination virtual memory range
1520  * @src_start: start of the source virtual me    1520  * @src_start: start of the source virtual memory range
1521  * @len: length of the virtual memory range      1521  * @len: length of the virtual memory range
1522  * @mode: flags from uffdio_move.mode            1522  * @mode: flags from uffdio_move.mode
1523  *                                               1523  *
1524  * It will either use the mmap_lock in read m    1524  * It will either use the mmap_lock in read mode or per-vma locks
1525  *                                               1525  *
1526  * move_pages() remaps arbitrary anonymous pa    1526  * move_pages() remaps arbitrary anonymous pages atomically in zero
1527  * copy. It only works on non shared anonymou    1527  * copy. It only works on non shared anonymous pages because those can
1528  * be relocated without generating non linear    1528  * be relocated without generating non linear anon_vmas in the rmap
1529  * code.                                         1529  * code.
1530  *                                               1530  *
1531  * It provides a zero copy mechanism to handl    1531  * It provides a zero copy mechanism to handle userspace page faults.
1532  * The source vma pages should have mapcount     1532  * The source vma pages should have mapcount == 1, which can be
1533  * enforced by using madvise(MADV_DONTFORK) o    1533  * enforced by using madvise(MADV_DONTFORK) on src vma.
1534  *                                               1534  *
1535  * The thread receiving the page during the u    1535  * The thread receiving the page during the userland page fault
1536  * will receive the faulting page in the sour    1536  * will receive the faulting page in the source vma through the network,
1537  * storage or any other I/O device (MADV_DONT    1537  * storage or any other I/O device (MADV_DONTFORK in the source vma
1538  * prevents move_pages() from failing with -EBUSY if    1538  * prevents move_pages() from failing with -EBUSY if the process forks before
1539  * move_pages() is called), then it will call    1539  * move_pages() is called), then it will call move_pages() to map the
1540  * page in the faulting address in the destin    1540  * page in the faulting address in the destination vma.
1541  *                                               1541  *
1542  * This userfaultfd command works purely via     1542  * This userfaultfd command works purely via pagetables, so it's the
1543  * most efficient way to move physical non sh    1543  * most efficient way to move physical non shared anonymous pages
1544  * across different virtual addresses. Unlike    1544  * across different virtual addresses. Unlike mremap()/mmap()/munmap()
1545  * it does not create any new vmas. The mappi    1545  * it does not create any new vmas. The mapping in the destination
1546  * address is atomic.                            1546  * address is atomic.
1547  *                                               1547  *
1548  * It only works if the vma protection bits a    1548  * It only works if the vma protection bits are identical between the
1549  * source and destination vma.                   1549  * source and destination vma.
1550  *                                               1550  *
1551  * It can remap non shared anonymous pages wi    1551  * It can remap non shared anonymous pages within the same vma too.
1552  *                                               1552  *
1553  * If the source virtual memory range has any    1553  * If the source virtual memory range has any unmapped holes, or if
1554  * the destination virtual memory range is no    1554  * the destination virtual memory range is not a whole unmapped hole,
1555  * move_pages() will fail respectively with -    1555  * move_pages() will fail respectively with -ENOENT or -EEXIST. This
1556  * provides a very strict behavior to avoid a    1556  * provides a very strict behavior to avoid any chance of memory
1557  * corruption going unnoticed if there are us    1557  * corruption going unnoticed if there are userland race conditions.
1558  * Only one thread should resolve the userlan    1558  * Only one thread should resolve the userland page fault at any given
1559  * time for any given faulting address. This     1559  * time for any given faulting address. This means that if two threads
1560  * both try to call move_pages() on the same     1560  * both try to call move_pages() on the same destination address at the
1561  * same time, the second thread will get an e    1561  * same time, the second thread will get an explicit error from this
1562  * command.                                      1562  * command.
1563  *                                               1563  *
1564  * The command retval will return "len" if su    1564  * The command retval will return "len" if successful. The command
1565  * however can be interrupted by fatal signal    1565  * however can be interrupted by fatal signals or errors. If
1566  * interrupted it will return the number of b    1566  * interrupted it will return the number of bytes successfully
1567  * remapped before the interruption if any, o    1567  * remapped before the interruption if any, or the negative error if
1568  * none. It will never return zero. Either it    1568  * none. It will never return zero. Either it will return an error or
1569  * an amount of bytes successfully moved. If     1569  * an amount of bytes successfully moved. If the retval reports a
1570  * "short" remap, the move_pages() command sh    1570  * "short" remap, the move_pages() command should be repeated by
1571  * userland with src+retval, dst+retval, len-r    1571  * userland with src+retval, dst+retval, len-retval if it wants to know
1572  * about the error that interrupted it.          1572  * about the error that interrupted it.
1573  *                                               1573  *
1574  * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag     1574  * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
1575  * prevent -ENOENT errors from materializing if t    1575  * prevent -ENOENT errors from materializing if there are holes in the
1576  * source virtual range that is being remappe    1576  * source virtual range that is being remapped. The holes will be
1577  * accounted as successfully remapped in the     1577  * accounted as successfully remapped in the retval of the
1578  * command. This is mostly useful to remap hu    1578  * command. This is mostly useful to remap hugepage naturally aligned
1579  * virtual regions without knowing if there a    1579  * virtual regions without knowing if there are transparent hugepages
1580  * in the regions or not, but preventing the     1580  * in the regions or not, but preventing the risk of having to split
1581  * the hugepmd during the remap.                 1581  * the hugepmd during the remap.
1582  *                                               1582  *
1583  * If there's any rmap walk that is taking th    1583  * If there's any rmap walk that is taking the anon_vma locks without
1584  * first obtaining the folio lock (the only c    1584  * first obtaining the folio lock (the only current instance is
1585  * folio_referenced), they will have to verif    1585  * folio_referenced), they will have to verify if the folio->mapping
1586  * has changed after taking the anon_vma lock    1586  * has changed after taking the anon_vma lock. If it changed they
1587  * should release the lock and retry obtainin    1587  * should release the lock and retry obtaining a new anon_vma, because
1588  * it means the anon_vma was changed by move_    1588  * it means the anon_vma was changed by move_pages() before the lock
1589  * could be obtained. This is the only additi    1589  * could be obtained. This is the only additional complexity added to
1590  * the rmap code to provide this anonymous pa    1590  * the rmap code to provide this anonymous page remapping functionality.
1591  */                                              1591  */
1592 ssize_t move_pages(struct userfaultfd_ctx *ct    1592 ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
1593                    unsigned long src_start, u    1593                    unsigned long src_start, unsigned long len, __u64 mode)
1594 {                                                1594 {
1595         struct mm_struct *mm = ctx->mm;          1595         struct mm_struct *mm = ctx->mm;
1596         struct vm_area_struct *src_vma, *dst_    1596         struct vm_area_struct *src_vma, *dst_vma;
1597         unsigned long src_addr, dst_addr;        1597         unsigned long src_addr, dst_addr;
1598         pmd_t *src_pmd, *dst_pmd;                1598         pmd_t *src_pmd, *dst_pmd;
1599         long err = -EINVAL;                      1599         long err = -EINVAL;
1600         ssize_t moved = 0;                       1600         ssize_t moved = 0;
1601                                                  1601 
1602         /* Sanitize the command parameters. *    1602         /* Sanitize the command parameters. */
1603         if (WARN_ON_ONCE(src_start & ~PAGE_MA    1603         if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
1604             WARN_ON_ONCE(dst_start & ~PAGE_MA    1604             WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
1605             WARN_ON_ONCE(len & ~PAGE_MASK))      1605             WARN_ON_ONCE(len & ~PAGE_MASK))
1606                 goto out;                        1606                 goto out;
1607                                                  1607 
1608         /* Does the address range wrap, or is    1608         /* Does the address range wrap, or is the span zero-sized? */
1609         if (WARN_ON_ONCE(src_start + len <= s    1609         if (WARN_ON_ONCE(src_start + len <= src_start) ||
1610             WARN_ON_ONCE(dst_start + len <= d    1610             WARN_ON_ONCE(dst_start + len <= dst_start))
1611                 goto out;                        1611                 goto out;
1612                                                  1612 
1613         err = uffd_move_lock(mm, dst_start, s    1613         err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
1614         if (err)                                 1614         if (err)
1615                 goto out;                        1615                 goto out;
1616                                                  1616 
1617         /* Re-check after taking map_changing    1617         /* Re-check after taking map_changing_lock */
1618         err = -EAGAIN;                           1618         err = -EAGAIN;
1619         down_read(&ctx->map_changing_lock);      1619         down_read(&ctx->map_changing_lock);
1620         if (likely(atomic_read(&ctx->mmap_cha    1620         if (likely(atomic_read(&ctx->mmap_changing)))
1621                 goto out_unlock;                 1621                 goto out_unlock;
1622         /*                                       1622         /*
1623          * Make sure the vma is not shared, t    1623          * Make sure the vma is not shared, that the src and dst remap
1624          * ranges are both valid and fully wi    1624          * ranges are both valid and fully within a single existing
1625          * vma.                                  1625          * vma.
1626          */                                      1626          */
1627         err = -EINVAL;                           1627         err = -EINVAL;
1628         if (src_vma->vm_flags & VM_SHARED)       1628         if (src_vma->vm_flags & VM_SHARED)
1629                 goto out_unlock;                 1629                 goto out_unlock;
1630         if (src_start + len > src_vma->vm_end    1630         if (src_start + len > src_vma->vm_end)
1631                 goto out_unlock;                 1631                 goto out_unlock;
1632                                                  1632 
1633         if (dst_vma->vm_flags & VM_SHARED)       1633         if (dst_vma->vm_flags & VM_SHARED)
1634                 goto out_unlock;                 1634                 goto out_unlock;
1635         if (dst_start + len > dst_vma->vm_end    1635         if (dst_start + len > dst_vma->vm_end)
1636                 goto out_unlock;                 1636                 goto out_unlock;
1637                                                  1637 
1638         err = validate_move_areas(ctx, src_vm    1638         err = validate_move_areas(ctx, src_vma, dst_vma);
1639         if (err)                                 1639         if (err)
1640                 goto out_unlock;                 1640                 goto out_unlock;
1641                                                  1641 
1642         for (src_addr = src_start, dst_addr =    1642         for (src_addr = src_start, dst_addr = dst_start;
1643              src_addr < src_start + len;) {      1643              src_addr < src_start + len;) {
1644                 spinlock_t *ptl;                 1644                 spinlock_t *ptl;
1645                 pmd_t dst_pmdval;                1645                 pmd_t dst_pmdval;
1646                 unsigned long step_size;         1646                 unsigned long step_size;
1647                                                  1647 
1648                 /*                               1648                 /*
1649                  * Below works because an anonym    1649                  * Below works because an anonymous area would not have a
1650                  * transparent huge PUD. If f    1650                  * transparent huge PUD. If file-backed support is added,
1651                  * that case would need to be    1651                  * that case would need to be handled here.
1652                  */                              1652                  */
1653                 src_pmd = mm_find_pmd(mm, src    1653                 src_pmd = mm_find_pmd(mm, src_addr);
1654                 if (unlikely(!src_pmd)) {        1654                 if (unlikely(!src_pmd)) {
1655                         if (!(mode & UFFDIO_M    1655                         if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1656                                 err = -ENOENT    1656                                 err = -ENOENT;
1657                                 break;           1657                                 break;
1658                         }                        1658                         }
1659                         src_pmd = mm_alloc_pm    1659                         src_pmd = mm_alloc_pmd(mm, src_addr);
1660                         if (unlikely(!src_pmd    1660                         if (unlikely(!src_pmd)) {
1661                                 err = -ENOMEM    1661                                 err = -ENOMEM;
1662                                 break;           1662                                 break;
1663                         }                        1663                         }
1664                 }                                1664                 }
1665                 dst_pmd = mm_alloc_pmd(mm, ds    1665                 dst_pmd = mm_alloc_pmd(mm, dst_addr);
1666                 if (unlikely(!dst_pmd)) {        1666                 if (unlikely(!dst_pmd)) {
1667                         err = -ENOMEM;           1667                         err = -ENOMEM;
1668                         break;                   1668                         break;
1669                 }                                1669                 }
1670                                                  1670 
1671                 dst_pmdval = pmdp_get_lockles    1671                 dst_pmdval = pmdp_get_lockless(dst_pmd);
1672                 /*                               1672                 /*
1673                  * If the dst_pmd is mapped a    1673                  * If the dst_pmd is mapped as THP don't override it and just
1674                  * be strict. If dst_pmd chan    1674                  * be strict. If dst_pmd changes into THP after this check, the
1675                  * move_pages_huge_pmd() will    1675                  * move_pages_huge_pmd() will detect the change and retry
1676                  * while move_pages_pte() wil    1676                  * while move_pages_pte() will detect the change and fail.
1677                  */                              1677                  */
1678                 if (unlikely(pmd_trans_huge(d    1678                 if (unlikely(pmd_trans_huge(dst_pmdval))) {
1679                         err = -EEXIST;           1679                         err = -EEXIST;
1680                         break;                   1680                         break;
1681                 }                                1681                 }
1682                                                  1682 
1683                 ptl = pmd_trans_huge_lock(src    1683                 ptl = pmd_trans_huge_lock(src_pmd, src_vma);
1684                 if (ptl) {                       1684                 if (ptl) {
1685                         if (pmd_devmap(*src_p    1685                         if (pmd_devmap(*src_pmd)) {
1686                                 spin_unlock(p    1686                                 spin_unlock(ptl);
1687                                 err = -ENOENT    1687                                 err = -ENOENT;
1688                                 break;           1688                                 break;
1689                         }                        1689                         }
1690                                                  1690 
1691                         /* Check if we can mo    1691                         /* Check if we can move the pmd without splitting it. */
1692                         if (move_splits_huge_    1692                         if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
1693                             !pmd_none(dst_pmd    1693                             !pmd_none(dst_pmdval)) {
1694                                 struct folio     1694                                 struct folio *folio = pmd_folio(*src_pmd);
1695                                                  1695 
1696                                 if (!folio ||    1696                                 if (!folio || (!is_huge_zero_folio(folio) &&
1697                                                  1697                                                !PageAnonExclusive(&folio->page))) {
1698                                         spin_    1698                                         spin_unlock(ptl);
1699                                         err =    1699                                         err = -EBUSY;
1700                                         break    1700                                         break;
1701                                 }                1701                                 }
1702                                                  1702 
1703                                 spin_unlock(p    1703                                 spin_unlock(ptl);
1704                                 split_huge_pm    1704                                 split_huge_pmd(src_vma, src_pmd, src_addr);
1705                                 /* The folio     1705                                 /* The folio will be split by move_pages_pte() */
1706                                 continue;        1706                                 continue;
1707                         }                        1707                         }
1708                                                  1708 
1709                         err = move_pages_huge    1709                         err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
1710                                                  1710                                                   dst_pmdval, dst_vma, src_vma,
1711                                                  1711                                                   dst_addr, src_addr);
1712                         step_size = HPAGE_PMD    1712                         step_size = HPAGE_PMD_SIZE;
1713                 } else {                         1713                 } else {
1714                         if (pmd_none(*src_pmd    1714                         if (pmd_none(*src_pmd)) {
1715                                 if (!(mode &     1715                                 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1716                                         err =    1716                                         err = -ENOENT;
1717                                         break    1717                                         break;
1718                                 }                1718                                 }
1719                                 if (unlikely(    1719                                 if (unlikely(__pte_alloc(mm, src_pmd))) {
1720                                         err =    1720                                         err = -ENOMEM;
1721                                         break    1721                                         break;
1722                                 }                1722                                 }
1723                         }                        1723                         }
1724                                                  1724 
1725                         if (unlikely(pte_allo    1725                         if (unlikely(pte_alloc(mm, dst_pmd))) {
1726                                 err = -ENOMEM    1726                                 err = -ENOMEM;
1727                                 break;           1727                                 break;
1728                         }                        1728                         }
1729                                                  1729 
1730                         err = move_pages_pte(    1730                         err = move_pages_pte(mm, dst_pmd, src_pmd,
1731                                                  1731                                              dst_vma, src_vma,
1732                                                  1732                                              dst_addr, src_addr, mode);
1733                         step_size = PAGE_SIZE    1733                         step_size = PAGE_SIZE;
1734                 }                                1734                 }
1735                                                  1735 
1736                 cond_resched();                  1736                 cond_resched();
1737                                                  1737 
1738                 if (fatal_signal_pending(curr    1738                 if (fatal_signal_pending(current)) {
1739                         /* Do not override an    1739                         /* Do not override an error */
1740                         if (!err || err == -E    1740                         if (!err || err == -EAGAIN)
1741                                 err = -EINTR;    1741                                 err = -EINTR;
1742                         break;                   1742                         break;
1743                 }                                1743                 }
1744                                                  1744 
1745                 if (err) {                       1745                 if (err) {
1746                         if (err == -EAGAIN)      1746                         if (err == -EAGAIN)
1747                                 continue;        1747                                 continue;
1748                         break;                   1748                         break;
1749                 }                                1749                 }
1750                                                  1750 
1751                 /* Proceed to the next page *    1751                 /* Proceed to the next page */
1752                 dst_addr += step_size;           1752                 dst_addr += step_size;
1753                 src_addr += step_size;           1753                 src_addr += step_size;
1754                 moved += step_size;              1754                 moved += step_size;
1755         }                                        1755         }
1756                                                  1756 
1757 out_unlock:                                      1757 out_unlock:
1758         up_read(&ctx->map_changing_lock);        1758         up_read(&ctx->map_changing_lock);
1759         uffd_move_unlock(dst_vma, src_vma);      1759         uffd_move_unlock(dst_vma, src_vma);
1760 out:                                             1760 out:
1761         VM_WARN_ON(moved < 0);                   1761         VM_WARN_ON(moved < 0);
1762         VM_WARN_ON(err > 0);                     1762         VM_WARN_ON(err > 0);
1763         VM_WARN_ON(!moved && !err);              1763         VM_WARN_ON(!moved && !err);
1764         return moved ? moved : err;              1764         return moved ? moved : err;
1765 }                                                1765 }
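
For reference, the retry protocol spelled out in the comment block above move_pages() looks roughly as follows from user space. This is a hedged sketch, not code from this file: uffd is assumed to be a userfaultfd file descriptor already registered over the destination range, src is assumed to point at an anonymous staging area (madvise(MADV_DONTFORK)'ed, as recommended above) that already holds the data, and struct uffdio_move/UFFDIO_MOVE come from <linux/userfaultfd.h> (v6.8+).

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>	/* struct uffdio_move, UFFDIO_MOVE (v6.8+) */

/*
 * Move 'len' bytes from 'src' to 'dst' via UFFDIO_MOVE, repeating the ioctl
 * after a "short" move with src+moved, dst+moved, len-moved, as the comment
 * above move_pages() prescribes.  Returns 0 on success, -1 on a hard error
 * (errno and mv.move then describe what went wrong).
 */
static int uffd_move_range(int uffd, uint64_t dst, uint64_t src, uint64_t len)
{
	while (len) {
		struct uffdio_move mv;

		memset(&mv, 0, sizeof(mv));
		mv.dst = dst;
		mv.src = src;
		mv.len = len;
		/* Illustrative choice: count source holes as moved instead of
		 * failing with -ENOENT; omit the flag for the strict behavior. */
		mv.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;

		if (ioctl(uffd, UFFDIO_MOVE, &mv) == 0)
			return 0;		/* whole remaining range moved */
		if (mv.move <= 0)
			return -1;		/* error before anything was moved */

		/* Short move: continue after the bytes already remapped. */
		dst += mv.move;
		src += mv.move;
		len -= mv.move;
	}
	return 0;
}

A real fault-handling thread would typically also retry on EAGAIN (reported while the address space is changing under the userfaultfd context) rather than giving up at the first failure.
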
1766                                                  1766 
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;

	vm_flags_reset(vma, flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static void userfaultfd_set_ctx(struct vm_area_struct *vma,
				struct userfaultfd_ctx *ctx,
				unsigned long flags)
{
	vma_start_write(vma);
	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
	userfaultfd_set_vm_flags(vma,
				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
}

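/*
 * Illustrative sketch (not part of this file): userfaultfd_set_ctx() always
 * clears the old __VM_UFFD_FLAGS bits before OR-ing in the new mode, so a
 * re-registration cannot leave stale mode bits behind, and the XOR in
 * userfaultfd_set_vm_flags() only fires when VM_UFFD_WP actually toggles.
 * A worked example with made-up flag values, kept under #if 0:
 */
#if 0
static void uffd_flag_composition_demo(struct vm_area_struct *vma)
{
	/* Suppose the VMA is currently registered for missing-fault tracking. */
	unsigned long cur = vma->vm_flags | VM_UFFD_MISSING;

	/* Re-registering for write-protect only: MISSING must not survive. */
	unsigned long next = (cur & ~__VM_UFFD_FLAGS) | VM_UFFD_WP;

	WARN_ON(next & VM_UFFD_MISSING);
	WARN_ON(!(next & VM_UFFD_WP));

	/* VM_UFFD_WP toggled, so vm_page_prot would be recomputed. */
	WARN_ON(!((cur ^ next) & VM_UFFD_WP));
}
#endif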
void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
	userfaultfd_set_ctx(vma, NULL, 0);
}

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *ret;

	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);

	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
				    vma->vm_flags & ~__VM_UFFD_FLAGS,
				    NULL_VM_UFFD_CTX);

	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	return ret;
}

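/*
 * Illustrative sketch (not part of this file): how a caller such as the
 * UFFDIO_UNREGISTER path might walk a range and strip userfaultfd state
 * with userfaultfd_clear_vma(). The helper name and the simplified error
 * handling are assumptions; the block is kept under #if 0.
 */
#if 0
static int uffd_unregister_range_sketch(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	VMA_ITERATOR(vmi, mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	struct vm_area_struct *vma;

	/* Caller is assumed to hold the mmap write lock. */
	for_each_vma_range(vmi, vma, end) {
		unsigned long s = max(start, vma->vm_start);
		unsigned long e = min(end, vma->vm_end);

		vma = userfaultfd_clear_vma(&vmi, prev, vma, s, e);
		if (IS_ERR(vma))
			return PTR_ERR(vma);
		prev = vma;
	}
	return 0;
}
#endif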
/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       unsigned long vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async)
{
	VMA_ITERATOR(vmi, ctx->mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	unsigned long vma_end;
	unsigned long new_flags;

	if (vma->vm_start < start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx});
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_ctx(vma, ctx, vm_flags);

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

skip:
		prev = vma;
		start = vma->vm_end;
	}

	return 0;
}

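/*
 * Illustrative sketch (not part of mm/userfaultfd.c): the userspace side
 * that eventually reaches userfaultfd_register_range() via UFFDIO_REGISTER.
 * Error handling is minimal and the mode choice is an example; kept under
 * #if 0 so the listing stays buildable.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

static int register_missing_faults(void *area, size_t len)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
	    ioctl(uffd, UFFDIO_REGISTER, &reg))
		return -1;

	/*
	 * reg.ioctls now advertises which resolve ioctls (COPY, ZEROPAGE,
	 * ...) are usable on this registered range.
	 */
	return uffd;
}
#endif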
void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* the various vma->vm_userfaultfd_ctx still points to it */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
	}
	mmap_write_unlock(mm);
}

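/*
 * Illustrative sketch (not part of this file): userfaultfd_release_new() is
 * the lightweight cleanup for a context that never became visible to
 * userspace (e.g. a ctx duplicated for a fork event that could not be
 * delivered). It only detaches the per-VMA pointers; userfaultfd_release_all()
 * below additionally rewrites VMA flags and lets neighbouring VMAs merge.
 * Hypothetical caller shape, kept under #if 0; userfaultfd_ctx_put() is an
 * assumed refcount-drop helper on the fs/userfaultfd.c side.
 */
#if 0
static void uffd_fork_event_failed(struct userfaultfd_ctx *new_ctx)
{
	/* Nothing in userspace holds a reference, so just unhook the VMAs. */
	userfaultfd_release_new(new_ctx);
	userfaultfd_ctx_put(new_ctx);
}
#endif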
void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, 0);

	if (!mmget_not_zero(mm))
		return;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}

		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    vma->vm_start, vma->vm_end);
		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
}
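/*
 * Illustrative sketch (not part of this file): the mmget_not_zero()/mmput()
 * pairing above is the usual pattern for a path that must not touch VMAs
 * once every user of the address space has exited and the page tables are
 * being (or have been) torn down. Hypothetical helper name, kept under #if 0:
 */
#if 0
static void do_work_on_live_mm(struct mm_struct *mm)
{
	/* Bail out if all users already exited. */
	if (!mmget_not_zero(mm))
		return;

	mmap_write_lock(mm);
	/* ... walk and modify VMAs here ... */
	mmap_write_unlock(mm);

	mmput(mm);	/* drop the temporary user reference */
}
#endif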
