TOMOYO Linux Cross Reference
Linux/mm/userfaultfd.c

Diff markup

Differences between /mm/userfaultfd.c (Version linux-6.12-rc7) and /mm/userfaultfd.c (Version linux-4.13.16)
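
Both trees in this diff implement the kernel side of the userfaultfd fill ioctls: UFFDIO_COPY and UFFDIO_ZEROPAGE in linux-4.13.16 (mcopy_atomic_pte(), mfill_zeropage_pte(), __mcopy_atomic()), plus UFFDIO_CONTINUE and UFFDIO_POISON in linux-6.12-rc7 (mfill_atomic_pte_continue(), mfill_atomic_pte_poison(), mfill_atomic()). As a reminder of the userspace path that reaches this file, here is a minimal sketch of a UFFDIO_COPY call on a MISSING-registered range. It is illustrative only and not code from either tree: error handling is omitted, and the one-page sizes and 0xaa fill pattern are arbitrary.

/*
 * Minimal userspace sketch (illustrative): a UFFDIO_COPY ioctl on a
 * range registered for missing faults is what ends up in
 * __mcopy_atomic() (right column) / mfill_atomic() (left column) below.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        struct uffdio_api api = { .api = UFFD_API };
        ioctl(uffd, UFFDIO_API, &api);

        /* Destination range whose missing pages the kernel will fill. */
        char *dst = mmap(NULL, page, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct uffdio_register reg = {
                .range = { .start = (unsigned long)dst, .len = page },
                .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };
        ioctl(uffd, UFFDIO_REGISTER, &reg);

        /* Source buffer copied atomically into the registered range. */
        char *src = mmap(NULL, page, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        memset(src, 0xaa, page);

        struct uffdio_copy copy = {
                .dst  = (unsigned long)dst,
                .src  = (unsigned long)src,
                .len  = page,
                .mode = 0,
        };
        ioctl(uffd, UFFDIO_COPY, &copy);  /* resolves the missing page(s) */

        close(uffd);
        return 0;
}
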


  1 // SPDX-License-Identifier: GPL-2.0-only       << 
  2 /*                                                  1 /*
  3  *  mm/userfaultfd.c                                2  *  mm/userfaultfd.c
  4  *                                                  3  *
  5  *  Copyright (C) 2015  Red Hat, Inc.               4  *  Copyright (C) 2015  Red Hat, Inc.
                                                   >>   5  *
                                                   >>   6  *  This work is licensed under the terms of the GNU GPL, version 2. See
                                                   >>   7  *  the COPYING file in the top-level directory.
  6  */                                                 8  */
  7                                                     9 
  8 #include <linux/mm.h>                              10 #include <linux/mm.h>
  9 #include <linux/sched/signal.h>                    11 #include <linux/sched/signal.h>
 10 #include <linux/pagemap.h>                         12 #include <linux/pagemap.h>
 11 #include <linux/rmap.h>                            13 #include <linux/rmap.h>
 12 #include <linux/swap.h>                            14 #include <linux/swap.h>
 13 #include <linux/swapops.h>                         15 #include <linux/swapops.h>
 14 #include <linux/userfaultfd_k.h>                   16 #include <linux/userfaultfd_k.h>
 15 #include <linux/mmu_notifier.h>                    17 #include <linux/mmu_notifier.h>
 16 #include <linux/hugetlb.h>                         18 #include <linux/hugetlb.h>
                                                   >>  19 #include <linux/pagemap.h>
 17 #include <linux/shmem_fs.h>                        20 #include <linux/shmem_fs.h>
 18 #include <asm/tlbflush.h>                          21 #include <asm/tlbflush.h>
 19 #include <asm/tlb.h>                           << 
 20 #include "internal.h"                              22 #include "internal.h"
 21                                                    23 
 22 static __always_inline                         !!  24 static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 23 bool validate_dst_vma(struct vm_area_struct *d !!  25                             pmd_t *dst_pmd,
 24 {                                              !!  26                             struct vm_area_struct *dst_vma,
 25         /* Make sure that the dst range is ful !!  27                             unsigned long dst_addr,
 26         if (dst_end > dst_vma->vm_end)         !!  28                             unsigned long src_addr,
 27                 return false;                  !!  29                             struct page **pagep)
 28                                                << 
 29         /*                                     << 
 30          * Check the vma is registered in uffd << 
 31          * enforce the VM_MAYWRITE check done  << 
 32          * time.                               << 
 33          */                                    << 
 34         if (!dst_vma->vm_userfaultfd_ctx.ctx)  << 
 35                 return false;                  << 
 36                                                << 
 37         return true;                           << 
 38 }                                              << 
 39                                                << 
 40 static __always_inline                         << 
 41 struct vm_area_struct *find_vma_and_prepare_an << 
 42                                                << 
 43 {                                              << 
 44         struct vm_area_struct *vma;            << 
 45                                                << 
 46         mmap_assert_locked(mm);                << 
 47         vma = vma_lookup(mm, addr);            << 
 48         if (!vma)                              << 
 49                 vma = ERR_PTR(-ENOENT);        << 
 50         else if (!(vma->vm_flags & VM_SHARED)  << 
 51                  unlikely(anon_vma_prepare(vma << 
 52                 vma = ERR_PTR(-ENOMEM);        << 
 53                                                << 
 54         return vma;                            << 
 55 }                                              << 
 56                                                << 
 57 #ifdef CONFIG_PER_VMA_LOCK                     << 
 58 /*                                             << 
 59  * uffd_lock_vma() - Lookup and lock vma corre << 
 60  * @mm: mm to search vma in.                   << 
 61  * @address: address that the vma should conta << 
 62  *                                             << 
 63  * Should be called without holding mmap_lock. << 
 64  *                                             << 
 65  * Return: A locked vma containing @address, - << 
 66  * -ENOMEM if anon_vma couldn't be allocated.  << 
 67  */                                            << 
 68 static struct vm_area_struct *uffd_lock_vma(st << 
 69                                        unsigne << 
 70 {                                              << 
 71         struct vm_area_struct *vma;            << 
 72                                                << 
 73         vma = lock_vma_under_rcu(mm, address); << 
 74         if (vma) {                             << 
 75                 /*                             << 
 76                  * We know we're going to need << 
 77                  * that early.                 << 
 78                  */                            << 
 79                 if (!(vma->vm_flags & VM_SHARE << 
 80                         vma_end_read(vma);     << 
 81                 else                           << 
 82                         return vma;            << 
 83         }                                      << 
 84                                                << 
 85         mmap_read_lock(mm);                    << 
 86         vma = find_vma_and_prepare_anon(mm, ad << 
 87         if (!IS_ERR(vma)) {                    << 
 88                 /*                             << 
 89                  * We cannot use vma_start_rea << 
 90                  * false locked (see comment i << 
 91                  * can avoid that by directly  << 
 92                  * mmap_lock, which guarantees << 
 93                  * vma for write (vma_start_wr << 
 94                  */                            << 
 95                 down_read(&vma->vm_lock->lock) << 
 96         }                                      << 
 97                                                << 
 98         mmap_read_unlock(mm);                  << 
 99         return vma;                            << 
100 }                                              << 
101                                                << 
102 static struct vm_area_struct *uffd_mfill_lock( << 
103                                                << 
104                                                << 
105 {                                              << 
106         struct vm_area_struct *dst_vma;        << 
107                                                << 
108         dst_vma = uffd_lock_vma(dst_mm, dst_st << 
109         if (IS_ERR(dst_vma) || validate_dst_vm << 
110                 return dst_vma;                << 
111                                                << 
112         vma_end_read(dst_vma);                 << 
113         return ERR_PTR(-ENOENT);               << 
114 }                                              << 
115                                                << 
116 static void uffd_mfill_unlock(struct vm_area_s << 
117 {                                              << 
118         vma_end_read(vma);                     << 
119 }                                              << 
120                                                << 
121 #else                                          << 
122                                                << 
123 static struct vm_area_struct *uffd_mfill_lock( << 
124                                                << 
125                                                << 
126 {                                              << 
127         struct vm_area_struct *dst_vma;        << 
128                                                << 
129         mmap_read_lock(dst_mm);                << 
130         dst_vma = find_vma_and_prepare_anon(ds << 
131         if (IS_ERR(dst_vma))                   << 
132                 goto out_unlock;               << 
133                                                << 
134         if (validate_dst_vma(dst_vma, dst_star << 
135                 return dst_vma;                << 
136                                                << 
137         dst_vma = ERR_PTR(-ENOENT);            << 
138 out_unlock:                                    << 
139         mmap_read_unlock(dst_mm);              << 
140         return dst_vma;                        << 
141 }                                              << 
142                                                << 
143 static void uffd_mfill_unlock(struct vm_area_s << 
144 {                                              << 
145         mmap_read_unlock(vma->vm_mm);          << 
146 }                                              << 
147 #endif                                         << 
148                                                << 
149 /* Check if dst_addr is outside of file's size << 
150 static bool mfill_file_over_size(struct vm_are << 
151                                  unsigned long << 
152 {                                              << 
153         struct inode *inode;                   << 
154         pgoff_t offset, max_off;               << 
155                                                << 
156         if (!dst_vma->vm_file)                 << 
157                 return false;                  << 
158                                                << 
159         inode = dst_vma->vm_file->f_inode;     << 
160         offset = linear_page_index(dst_vma, ds << 
161         max_off = DIV_ROUND_UP(i_size_read(ino << 
162         return offset >= max_off;              << 
163 }                                              << 
164                                                << 
165 /*                                             << 
166  * Install PTEs, to map dst_addr (within dst_v << 
167  *                                             << 
168  * This function handles both MCOPY_ATOMIC_NOR << 
169  * and anon, and for both shared and private V << 
170  */                                            << 
171 int mfill_atomic_install_pte(pmd_t *dst_pmd,   << 
172                              struct vm_area_st << 
173                              unsigned long dst << 
174                              bool newly_alloca << 
175 {                                                  30 {
176         int ret;                               !!  31         struct mem_cgroup *memcg;
177         struct mm_struct *dst_mm = dst_vma->vm << 
178         pte_t _dst_pte, *dst_pte;                  32         pte_t _dst_pte, *dst_pte;
179         bool writable = dst_vma->vm_flags & VM << 
180         bool vm_shared = dst_vma->vm_flags & V << 
181         spinlock_t *ptl;                           33         spinlock_t *ptl;
182         struct folio *folio = page_folio(page) !!  34         void *page_kaddr;
183         bool page_in_cache = folio_mapping(fol << 
184                                                << 
185         _dst_pte = mk_pte(page, dst_vma->vm_pa << 
186         _dst_pte = pte_mkdirty(_dst_pte);      << 
187         if (page_in_cache && !vm_shared)       << 
188                 writable = false;              << 
189         if (writable)                          << 
190                 _dst_pte = pte_mkwrite(_dst_pt << 
191         if (flags & MFILL_ATOMIC_WP)           << 
192                 _dst_pte = pte_mkuffd_wp(_dst_ << 
193                                                << 
194         ret = -EAGAIN;                         << 
195         dst_pte = pte_offset_map_lock(dst_mm,  << 
196         if (!dst_pte)                          << 
197                 goto out;                      << 
198                                                << 
199         if (mfill_file_over_size(dst_vma, dst_ << 
200                 ret = -EFAULT;                 << 
201                 goto out_unlock;               << 
202         }                                      << 
203                                                << 
204         ret = -EEXIST;                         << 
205         /*                                     << 
206          * We allow to overwrite a pte marker: << 
207          * registered, we firstly wr-protect a << 
208          * page backing it, then access the pa << 
209          */                                    << 
210         if (!pte_none_mostly(ptep_get(dst_pte) << 
211                 goto out_unlock;               << 
212                                                << 
213         if (page_in_cache) {                   << 
214                 /* Usually, cache pages are al << 
215                 if (newly_allocated)           << 
216                         folio_add_lru(folio);  << 
217                 folio_add_file_rmap_pte(folio, << 
218         } else {                               << 
219                 folio_add_new_anon_rmap(folio, << 
220                 folio_add_lru_vma(folio, dst_v << 
221         }                                      << 
222                                                << 
223         /*                                     << 
224          * Must happen after rmap, as mm_count << 
225          * PageAnon()), which is set by __page << 
226          */                                    << 
227         inc_mm_counter(dst_mm, mm_counter(foli << 
228                                                << 
229         set_pte_at(dst_mm, dst_addr, dst_pte,  << 
230                                                << 
231         /* No need to invalidate - it was non- << 
232         update_mmu_cache(dst_vma, dst_addr, ds << 
233         ret = 0;                               << 
234 out_unlock:                                    << 
235         pte_unmap_unlock(dst_pte, ptl);        << 
236 out:                                           << 
237         return ret;                            << 
238 }                                              << 
239                                                << 
240 static int mfill_atomic_pte_copy(pmd_t *dst_pm << 
241                                  struct vm_are << 
242                                  unsigned long << 
243                                  unsigned long << 
244                                  uffd_flags_t  << 
245                                  struct folio  << 
246 {                                              << 
247         void *kaddr;                           << 
248         int ret;                                   35         int ret;
249         struct folio *folio;                   !!  36         struct page *page;
250                                                    37 
251         if (!*foliop) {                        !!  38         if (!*pagep) {
252                 ret = -ENOMEM;                     39                 ret = -ENOMEM;
253                 folio = vma_alloc_folio(GFP_HI !!  40                 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
254                                         dst_ad !!  41                 if (!page)
255                 if (!folio)                    << 
256                         goto out;                  42                         goto out;
257                                                    43 
258                 kaddr = kmap_local_folio(folio !!  44                 page_kaddr = kmap_atomic(page);
259                 /*                             !!  45                 ret = copy_from_user(page_kaddr,
260                  * The read mmap_lock is held  !!  46                                      (const void __user *) src_addr,
261                  * mmap_lock being read recurs << 
262                  * possible if a writer has ta << 
263                  *                             << 
264                  * process A thread 1 takes re << 
265                  * process A thread 2 calls mm << 
266                  * process B thread 1 takes pa << 
267                  * process B thread 2 calls mm << 
268                  * process A thread 1 blocks t << 
269                  * process B thread 1 blocks t << 
270                  *                             << 
271                  * Disable page faults to prev << 
272                  * and retry the copy outside  << 
273                  */                            << 
274                 pagefault_disable();           << 
275                 ret = copy_from_user(kaddr, (c << 
276                                      PAGE_SIZE     47                                      PAGE_SIZE);
277                 pagefault_enable();            !!  48                 kunmap_atomic(page_kaddr);
278                 kunmap_local(kaddr);           << 
279                                                    49 
280                 /* fallback to copy_from_user  !!  50                 /* fallback to copy_from_user outside mmap_sem */
281                 if (unlikely(ret)) {               51                 if (unlikely(ret)) {
282                         ret = -ENOENT;         !!  52                         ret = -EFAULT;
283                         *foliop = folio;       !!  53                         *pagep = page;
284                         /* don't free the page     54                         /* don't free the page */
285                         goto out;                  55                         goto out;
286                 }                                  56                 }
287                                                << 
288                 flush_dcache_folio(folio);     << 
289         } else {                                   57         } else {
290                 folio = *foliop;               !!  58                 page = *pagep;
291                 *foliop = NULL;                !!  59                 *pagep = NULL;
292         }                                          60         }
293                                                    61 
294         /*                                         62         /*
295          * The memory barrier inside __folio_m !!  63          * The memory barrier inside __SetPageUptodate makes sure that
296          * preceding stores to the page conten !!  64          * preceeding stores to the page contents become visible before
297          * the set_pte_at() write.                 65          * the set_pte_at() write.
298          */                                        66          */
299         __folio_mark_uptodate(folio);          !!  67         __SetPageUptodate(page);
300                                                    68 
301         ret = -ENOMEM;                             69         ret = -ENOMEM;
302         if (mem_cgroup_charge(folio, dst_vma-> !!  70         if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
303                 goto out_release;                  71                 goto out_release;
304                                                    72 
305         ret = mfill_atomic_install_pte(dst_pmd !!  73         _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
306                                        &folio- !!  74         if (dst_vma->vm_flags & VM_WRITE)
307         if (ret)                               !!  75                 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
308                 goto out_release;              << 
309 out:                                           << 
310         return ret;                            << 
311 out_release:                                   << 
312         folio_put(folio);                      << 
313         goto out;                              << 
314 }                                              << 
315                                                << 
316 static int mfill_atomic_pte_zeroed_folio(pmd_t << 
317                                          struc << 
318                                          unsig << 
319 {                                              << 
320         struct folio *folio;                   << 
321         int ret = -ENOMEM;                     << 
322                                                << 
323         folio = vma_alloc_zeroed_movable_folio << 
324         if (!folio)                            << 
325                 return ret;                    << 
326                                                << 
327         if (mem_cgroup_charge(folio, dst_vma-> << 
328                 goto out_put;                  << 
329                                                << 
330         /*                                     << 
331          * The memory barrier inside __folio_m << 
332          * zeroing out the folio become visibl << 
333          * using set_pte_at(). See do_anonymou << 
334          */                                    << 
335         __folio_mark_uptodate(folio);          << 
336                                                    76 
337         ret = mfill_atomic_install_pte(dst_pmd !!  77         ret = -EEXIST;
338                                        &folio- !!  78         dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
339         if (ret)                               !!  79         if (!pte_none(*dst_pte))
340                 goto out_put;                  !!  80                 goto out_release_uncharge_unlock;
341                                                << 
342         return 0;                              << 
343 out_put:                                       << 
344         folio_put(folio);                      << 
345         return ret;                            << 
346 }                                              << 
347                                                    81 
348 static int mfill_atomic_pte_zeropage(pmd_t *ds !!  82         inc_mm_counter(dst_mm, MM_ANONPAGES);
349                                      struct vm !!  83         page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
350                                      unsigned  !!  84         mem_cgroup_commit_charge(page, memcg, false, false);
351 {                                              !!  85         lru_cache_add_active_or_unevictable(page, dst_vma);
352         pte_t _dst_pte, *dst_pte;              << 
353         spinlock_t *ptl;                       << 
354         int ret;                               << 
355                                                    86 
356         if (mm_forbids_zeropage(dst_vma->vm_mm !!  87         set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
357                 return mfill_atomic_pte_zeroed << 
358                                                    88 
359         _dst_pte = pte_mkspecial(pfn_pte(my_ze << 
360                                          dst_v << 
361         ret = -EAGAIN;                         << 
362         dst_pte = pte_offset_map_lock(dst_vma- << 
363         if (!dst_pte)                          << 
364                 goto out;                      << 
365         if (mfill_file_over_size(dst_vma, dst_ << 
366                 ret = -EFAULT;                 << 
367                 goto out_unlock;               << 
368         }                                      << 
369         ret = -EEXIST;                         << 
370         if (!pte_none(ptep_get(dst_pte)))      << 
371                 goto out_unlock;               << 
372         set_pte_at(dst_vma->vm_mm, dst_addr, d << 
373         /* No need to invalidate - it was non-     89         /* No need to invalidate - it was non-present before */
374         update_mmu_cache(dst_vma, dst_addr, ds     90         update_mmu_cache(dst_vma, dst_addr, dst_pte);
375         ret = 0;                               << 
376 out_unlock:                                    << 
377         pte_unmap_unlock(dst_pte, ptl);        << 
378 out:                                           << 
379         return ret;                            << 
380 }                                              << 
381                                                << 
382 /* Handles UFFDIO_CONTINUE for all shmem VMAs  << 
383 static int mfill_atomic_pte_continue(pmd_t *ds << 
384                                      struct vm << 
385                                      unsigned  << 
386                                      uffd_flag << 
387 {                                              << 
388         struct inode *inode = file_inode(dst_v << 
389         pgoff_t pgoff = linear_page_index(dst_ << 
390         struct folio *folio;                   << 
391         struct page *page;                     << 
392         int ret;                               << 
393                                                << 
394         ret = shmem_get_folio(inode, pgoff, 0, << 
395         /* Our caller expects us to return -EF << 
396         if (ret == -ENOENT)                    << 
397                 ret = -EFAULT;                 << 
398         if (ret)                               << 
399                 goto out;                      << 
400         if (!folio) {                          << 
401                 ret = -EFAULT;                 << 
402                 goto out;                      << 
403         }                                      << 
404                                                    91 
405         page = folio_file_page(folio, pgoff);  !!  92         pte_unmap_unlock(dst_pte, ptl);
406         if (PageHWPoison(page)) {              << 
407                 ret = -EIO;                    << 
408                 goto out_release;              << 
409         }                                      << 
410                                                << 
411         ret = mfill_atomic_install_pte(dst_pmd << 
412                                        page, f << 
413         if (ret)                               << 
414                 goto out_release;              << 
415                                                << 
416         folio_unlock(folio);                   << 
417         ret = 0;                                   93         ret = 0;
418 out:                                               94 out:
419         return ret;                                95         return ret;
                                                   >>  96 out_release_uncharge_unlock:
                                                   >>  97         pte_unmap_unlock(dst_pte, ptl);
                                                   >>  98         mem_cgroup_cancel_charge(page, memcg, false);
420 out_release:                                       99 out_release:
421         folio_unlock(folio);                   !! 100         put_page(page);
422         folio_put(folio);                      << 
423         goto out;                                 101         goto out;
424 }                                                 102 }
425                                                   103 
426 /* Handles UFFDIO_POISON for all non-hugetlb V !! 104 static int mfill_zeropage_pte(struct mm_struct *dst_mm,
427 static int mfill_atomic_pte_poison(pmd_t *dst_ !! 105                               pmd_t *dst_pmd,
428                                    struct vm_a !! 106                               struct vm_area_struct *dst_vma,
429                                    unsigned lo !! 107                               unsigned long dst_addr)
430                                    uffd_flags_ << 
431 {                                                 108 {
432         int ret;                               << 
433         struct mm_struct *dst_mm = dst_vma->vm << 
434         pte_t _dst_pte, *dst_pte;                 109         pte_t _dst_pte, *dst_pte;
435         spinlock_t *ptl;                          110         spinlock_t *ptl;
                                                   >> 111         int ret;
436                                                   112 
437         _dst_pte = make_pte_marker(PTE_MARKER_ !! 113         _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
438         ret = -EAGAIN;                         !! 114                                          dst_vma->vm_page_prot));
439         dst_pte = pte_offset_map_lock(dst_mm,  << 
440         if (!dst_pte)                          << 
441                 goto out;                      << 
442                                                << 
443         if (mfill_file_over_size(dst_vma, dst_ << 
444                 ret = -EFAULT;                 << 
445                 goto out_unlock;               << 
446         }                                      << 
447                                                << 
448         ret = -EEXIST;                            115         ret = -EEXIST;
449         /* Refuse to overwrite any PTE, even a !! 116         dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
450         if (!pte_none(ptep_get(dst_pte)))      !! 117         if (!pte_none(*dst_pte))
451                 goto out_unlock;                  118                 goto out_unlock;
452                                                << 
453         set_pte_at(dst_mm, dst_addr, dst_pte,     119         set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
454                                                << 
455         /* No need to invalidate - it was non-    120         /* No need to invalidate - it was non-present before */
456         update_mmu_cache(dst_vma, dst_addr, ds    121         update_mmu_cache(dst_vma, dst_addr, dst_pte);
457         ret = 0;                                  122         ret = 0;
458 out_unlock:                                       123 out_unlock:
459         pte_unmap_unlock(dst_pte, ptl);           124         pte_unmap_unlock(dst_pte, ptl);
460 out:                                           << 
461         return ret;                               125         return ret;
462 }                                                 126 }
463                                                   127 
464 static pmd_t *mm_alloc_pmd(struct mm_struct *m    128 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
465 {                                                 129 {
466         pgd_t *pgd;                               130         pgd_t *pgd;
467         p4d_t *p4d;                               131         p4d_t *p4d;
468         pud_t *pud;                               132         pud_t *pud;
469                                                   133 
470         pgd = pgd_offset(mm, address);            134         pgd = pgd_offset(mm, address);
471         p4d = p4d_alloc(mm, pgd, address);        135         p4d = p4d_alloc(mm, pgd, address);
472         if (!p4d)                                 136         if (!p4d)
473                 return NULL;                      137                 return NULL;
474         pud = pud_alloc(mm, p4d, address);        138         pud = pud_alloc(mm, p4d, address);
475         if (!pud)                                 139         if (!pud)
476                 return NULL;                      140                 return NULL;
477         /*                                        141         /*
478          * Note that we didn't run this becaus    142          * Note that we didn't run this because the pmd was
479          * missing, the *pmd may be already es    143          * missing, the *pmd may be already established and in
480          * turn it may also be a trans_huge_pm    144          * turn it may also be a trans_huge_pmd.
481          */                                       145          */
482         return pmd_alloc(mm, pud, address);       146         return pmd_alloc(mm, pud, address);
483 }                                                 147 }
484                                                   148 
485 #ifdef CONFIG_HUGETLB_PAGE                        149 #ifdef CONFIG_HUGETLB_PAGE
486 /*                                                150 /*
487  * mfill_atomic processing for HUGETLB vmas.   !! 151  * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
488  * called with either vma-lock or mmap_lock he !! 152  * called with mmap_sem held, it will release mmap_sem before returning.
489  * before returning.                           << 
490  */                                               153  */
491 static __always_inline ssize_t mfill_atomic_hu !! 154 static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
492                                                << 
493                                                   155                                               struct vm_area_struct *dst_vma,
494                                                   156                                               unsigned long dst_start,
495                                                   157                                               unsigned long src_start,
496                                                   158                                               unsigned long len,
497                                                !! 159                                               bool zeropage)
498 {                                                 160 {
499         struct mm_struct *dst_mm = dst_vma->vm !! 161         int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
                                                   >> 162         int vm_shared = dst_vma->vm_flags & VM_SHARED;
500         ssize_t err;                              163         ssize_t err;
501         pte_t *dst_pte;                           164         pte_t *dst_pte;
502         unsigned long src_addr, dst_addr;         165         unsigned long src_addr, dst_addr;
503         long copied;                              166         long copied;
504         struct folio *folio;                   !! 167         struct page *page;
                                                   >> 168         struct hstate *h;
505         unsigned long vma_hpagesize;              169         unsigned long vma_hpagesize;
506         pgoff_t idx;                              170         pgoff_t idx;
507         u32 hash;                                 171         u32 hash;
508         struct address_space *mapping;            172         struct address_space *mapping;
509                                                   173 
510         /*                                        174         /*
511          * There is no default zero huge page     175          * There is no default zero huge page for all huge page sizes as
512          * supported by hugetlb.  A PMD_SIZE h    176          * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
513          * by THP.  Since we can not reliably     177          * by THP.  Since we can not reliably insert a zero page, this
514          * feature is not supported.              178          * feature is not supported.
515          */                                       179          */
516         if (uffd_flags_mode_is(flags, MFILL_AT !! 180         if (zeropage) {
517                 up_read(&ctx->map_changing_loc !! 181                 up_read(&dst_mm->mmap_sem);
518                 uffd_mfill_unlock(dst_vma);    << 
519                 return -EINVAL;                   182                 return -EINVAL;
520         }                                         183         }
521                                                   184 
522         src_addr = src_start;                     185         src_addr = src_start;
523         dst_addr = dst_start;                     186         dst_addr = dst_start;
524         copied = 0;                               187         copied = 0;
525         folio = NULL;                          !! 188         page = NULL;
526         vma_hpagesize = vma_kernel_pagesize(ds    189         vma_hpagesize = vma_kernel_pagesize(dst_vma);
527                                                   190 
528         /*                                        191         /*
529          * Validate alignment based on huge pa    192          * Validate alignment based on huge page size
530          */                                       193          */
531         err = -EINVAL;                            194         err = -EINVAL;
532         if (dst_start & (vma_hpagesize - 1) ||    195         if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
533                 goto out_unlock;                  196                 goto out_unlock;
534                                                   197 
535 retry:                                            198 retry:
536         /*                                        199         /*
537          * On routine entry dst_vma is set.  I !! 200          * On routine entry dst_vma is set.  If we had to drop mmap_sem and
538          * retry, dst_vma will be set to NULL     201          * retry, dst_vma will be set to NULL and we must lookup again.
539          */                                       202          */
540         if (!dst_vma) {                           203         if (!dst_vma) {
541                 dst_vma = uffd_mfill_lock(dst_ << 
542                 if (IS_ERR(dst_vma)) {         << 
543                         err = PTR_ERR(dst_vma) << 
544                         goto out;              << 
545                 }                              << 
546                                                << 
547                 err = -ENOENT;                    204                 err = -ENOENT;
548                 if (!is_vm_hugetlb_page(dst_vm !! 205                 dst_vma = find_vma(dst_mm, dst_start);
549                         goto out_unlock_vma;   !! 206                 if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
                                                   >> 207                         goto out_unlock;
                                                   >> 208                 /*
                                                   >> 209                  * Only allow __mcopy_atomic_hugetlb on userfaultfd
                                                   >> 210                  * registered ranges.
                                                   >> 211                  */
                                                   >> 212                 if (!dst_vma->vm_userfaultfd_ctx.ctx)
                                                   >> 213                         goto out_unlock;
                                                   >> 214 
                                                   >> 215                 if (dst_start < dst_vma->vm_start ||
                                                   >> 216                     dst_start + len > dst_vma->vm_end)
                                                   >> 217                         goto out_unlock;
550                                                   218 
551                 err = -EINVAL;                    219                 err = -EINVAL;
552                 if (vma_hpagesize != vma_kerne    220                 if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
553                         goto out_unlock_vma;   !! 221                         goto out_unlock;
554                                                   222 
555                 /*                             !! 223                 vm_shared = dst_vma->vm_flags & VM_SHARED;
556                  * If memory mappings are chan !! 224         }
557                  * operation (e.g. mremap) run !! 225 
558                  * request the user to retry l !! 226         if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
559                  */                            !! 227                     (len - copied) & (vma_hpagesize - 1)))
560                 down_read(&ctx->map_changing_l !! 228                 goto out_unlock;
561                 err = -EAGAIN;                 !! 229 
562                 if (atomic_read(&ctx->mmap_cha !! 230         /*
                                                   >> 231          * If not shared, ensure the dst_vma has a anon_vma.
                                                   >> 232          */
                                                   >> 233         err = -ENOMEM;
                                                   >> 234         if (!vm_shared) {
                                                   >> 235                 if (unlikely(anon_vma_prepare(dst_vma)))
563                         goto out_unlock;          236                         goto out_unlock;
564         }                                         237         }
565                                                   238 
                                                   >> 239         h = hstate_vma(dst_vma);
                                                   >> 240 
566         while (src_addr < src_start + len) {      241         while (src_addr < src_start + len) {
                                                   >> 242                 pte_t dst_pteval;
                                                   >> 243 
567                 BUG_ON(dst_addr >= dst_start +    244                 BUG_ON(dst_addr >= dst_start + len);
                                                   >> 245                 VM_BUG_ON(dst_addr & ~huge_page_mask(h));
568                                                   246 
569                 /*                                247                 /*
570                  * Serialize via vma_lock and  !! 248                  * Serialize via hugetlb_fault_mutex
571                  * vma_lock ensures the dst_pt << 
572                  * in the case of shared pmds. << 
573                  * races with other faulting t << 
574                  */                               249                  */
575                 idx = linear_page_index(dst_vm    250                 idx = linear_page_index(dst_vma, dst_addr);
576                 mapping = dst_vma->vm_file->f_    251                 mapping = dst_vma->vm_file->f_mapping;
577                 hash = hugetlb_fault_mutex_has !! 252                 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
                                                   >> 253                                                                 idx, dst_addr);
578                 mutex_lock(&hugetlb_fault_mute    254                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
579                 hugetlb_vma_lock_read(dst_vma) << 
580                                                   255 
581                 err = -ENOMEM;                    256                 err = -ENOMEM;
582                 dst_pte = huge_pte_alloc(dst_m !! 257                 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
583                 if (!dst_pte) {                   258                 if (!dst_pte) {
584                         hugetlb_vma_unlock_rea << 
585                         mutex_unlock(&hugetlb_    259                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
586                         goto out_unlock;          260                         goto out_unlock;
587                 }                                 261                 }
588                                                   262 
589                 if (!uffd_flags_mode_is(flags, !! 263                 err = -EEXIST;
590                     !huge_pte_none_mostly(huge !! 264                 dst_pteval = huge_ptep_get(dst_pte);
591                         err = -EEXIST;         !! 265                 if (!huge_pte_none(dst_pteval)) {
592                         hugetlb_vma_unlock_rea << 
593                         mutex_unlock(&hugetlb_    266                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
594                         goto out_unlock;          267                         goto out_unlock;
595                 }                                 268                 }
596                                                   269 
597                 err = hugetlb_mfill_atomic_pte !! 270                 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
598                                                !! 271                                                 dst_addr, src_addr, &page);
599                                                   272 
600                 hugetlb_vma_unlock_read(dst_vm << 
601                 mutex_unlock(&hugetlb_fault_mu    273                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                                                   >> 274                 vm_alloc_shared = vm_shared;
602                                                   275 
603                 cond_resched();                   276                 cond_resched();
604                                                   277 
605                 if (unlikely(err == -ENOENT))  !! 278                 if (unlikely(err == -EFAULT)) {
606                         up_read(&ctx->map_chan !! 279                         up_read(&dst_mm->mmap_sem);
607                         uffd_mfill_unlock(dst_ !! 280                         BUG_ON(!page);
608                         BUG_ON(!folio);        !! 281 
609                                                !! 282                         err = copy_huge_page_from_user(page,
610                         err = copy_folio_from_ !! 283                                                 (const void __user *)src_addr,
611                                                !! 284                                                 pages_per_huge_page(h), true);
612                         if (unlikely(err)) {      285                         if (unlikely(err)) {
613                                 err = -EFAULT;    286                                 err = -EFAULT;
614                                 goto out;         287                                 goto out;
615                         }                         288                         }
                                                   >> 289                         down_read(&dst_mm->mmap_sem);
616                                                   290 
617                         dst_vma = NULL;           291                         dst_vma = NULL;
618                         goto retry;               292                         goto retry;
619                 } else                            293                 } else
620                         BUG_ON(folio);         !! 294                         BUG_ON(page);
621                                                   295 
622                 if (!err) {                       296                 if (!err) {
623                         dst_addr += vma_hpages    297                         dst_addr += vma_hpagesize;
624                         src_addr += vma_hpages    298                         src_addr += vma_hpagesize;
625                         copied += vma_hpagesiz    299                         copied += vma_hpagesize;
626                                                   300 
627                         if (fatal_signal_pendi    301                         if (fatal_signal_pending(current))
628                                 err = -EINTR;     302                                 err = -EINTR;
629                 }                                 303                 }
630                 if (err)                          304                 if (err)
631                         break;                    305                         break;
632         }                                         306         }
633                                                   307 
634 out_unlock:                                       308 out_unlock:
635         up_read(&ctx->map_changing_lock);      !! 309         up_read(&dst_mm->mmap_sem);
636 out_unlock_vma:                                << 
637         uffd_mfill_unlock(dst_vma);            << 
638 out:                                              310 out:
639         if (folio)                             !! 311         if (page) {
640                 folio_put(folio);              !! 312                 /*
                                                   >> 313                  * We encountered an error and are about to free a newly
                                                   >> 314                  * allocated huge page.
                                                   >> 315                  *
                                                   >> 316                  * Reservation handling is very subtle, and is different for
                                                   >> 317                  * private and shared mappings.  See the routine
                                                   >> 318                  * restore_reserve_on_error for details.  Unfortunately, we
                                                   >> 319                  * can not call restore_reserve_on_error now as it would
                                                   >> 320                  * require holding mmap_sem.
                                                   >> 321                  *
                                                   >> 322                  * If a reservation for the page existed in the reservation
                                                   >> 323                  * map of a private mapping, the map was modified to indicate
                                                   >> 324                  * the reservation was consumed when the page was allocated.
                                                   >> 325                  * We clear the PagePrivate flag now so that the global
                                                   >> 326                  * reserve count will not be incremented in free_huge_page.
                                                   >> 327                  * The reservation map will still indicate the reservation
                                                   >> 328                  * was consumed and possibly prevent later page allocation.
                                                   >> 329                  * This is better than leaking a global reservation.  If no
                                                   >> 330                  * reservation existed, it is still safe to clear PagePrivate
                                                   >> 331                  * as no adjustments to reservation counts were made during
                                                   >> 332                  * allocation.
                                                   >> 333                  *
                                                   >> 334                  * The reservation map for shared mappings indicates which
                                                   >> 335                  * pages have reservations.  When a huge page is allocated
                                                   >> 336                  * for an address with a reservation, no change is made to
                                                   >> 337                  * the reserve map.  In this case PagePrivate will be set
                                                   >> 338                  * to indicate that the global reservation count should be
                                                   >> 339                  * incremented when the page is freed.  This is the desired
                                                   >> 340                  * behavior.  However, when a huge page is allocated for an
                                                   >> 341                  * address without a reservation a reservation entry is added
                                                   >> 342                  * to the reservation map, and PagePrivate will not be set.
                                                   >> 343                  * When the page is freed, the global reserve count will NOT
                                                   >> 344                  * be incremented and it will appear as though we have leaked
                                                   >> 345                  * reserved page.  In this case, set PagePrivate so that the
                                                   >> 346                  * global reserve count will be incremented to match the
                                                   >> 347                  * reservation map entry which was created.
                                                   >> 348                  *
                                                   >> 349                  * Note that vm_alloc_shared is based on the flags of the vma
                                                   >> 350                  * for which the page was originally allocated.  dst_vma could
                                                   >> 351                  * be different or NULL on error.
                                                   >> 352                  */
                                                   >> 353                 if (vm_alloc_shared)
                                                   >> 354                         SetPagePrivate(page);
                                                   >> 355                 else
                                                   >> 356                         ClearPagePrivate(page);
                                                   >> 357                 put_page(page);
                                                   >> 358         }
641         BUG_ON(copied < 0);                       359         BUG_ON(copied < 0);
642         BUG_ON(err > 0);                          360         BUG_ON(err > 0);
643         BUG_ON(!copied && !err);                  361         BUG_ON(!copied && !err);
644         return copied ? copied : err;             362         return copied ? copied : err;
645 }                                                 363 }
646 #else /* !CONFIG_HUGETLB_PAGE */                  364 #else /* !CONFIG_HUGETLB_PAGE */
647 /* fail at build time if gcc attempts to use t    365 /* fail at build time if gcc attempts to use this */
648 extern ssize_t mfill_atomic_hugetlb(struct use !! 366 extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
649                                     struct vm_ !! 367                                       struct vm_area_struct *dst_vma,
650                                     unsigned l !! 368                                       unsigned long dst_start,
651                                     unsigned l !! 369                                       unsigned long src_start,
652                                     unsigned l !! 370                                       unsigned long len,
653                                     uffd_flags !! 371                                       bool zeropage);
654 #endif /* CONFIG_HUGETLB_PAGE */                  372 #endif /* CONFIG_HUGETLB_PAGE */
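
The CONFIG_HUGETLB_PAGE block above (mfill_atomic_hugetlb() on the left, __mcopy_atomic_hugetlb() on the right) is taken when the registered range is hugetlb-backed; both versions reject the zeropage mode with -EINVAL and require hugepage-aligned start and length. A minimal registration sketch for that path follows. It is an assumption-laden illustration, not kernel code: it presumes MAP_HUGETLB with a 2 MiB default hugepage size and enough reserved hugepages, and the helper name is hypothetical.

/*
 * Illustrative only: map and register one 2 MiB hugetlb page for MISSING
 * faults, assuming the default hugepage size is 2 MiB and hugepages are
 * reserved. A UFFDIO_COPY on this range takes the hugetlb path above.
 */
#define _GNU_SOURCE
#include <linux/userfaultfd.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define HPAGE_SZ (2UL << 20)

static int register_hugetlb_range(int uffd, void **out) /* hypothetical helper */
{
        void *area = mmap(NULL, HPAGE_SZ, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (area == MAP_FAILED)
                return -1;

        struct uffdio_register reg = {
                .range = { .start = (unsigned long)area, .len = HPAGE_SZ },
                .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };
        if (ioctl(uffd, UFFDIO_REGISTER, &reg))
                return -1;

        *out = area;
        return 0;
}
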
655                                                   373 
656 static __always_inline ssize_t mfill_atomic_pt !! 374 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
657                                                !! 375                                               unsigned long dst_start,
658                                                !! 376                                               unsigned long src_start,
659                                                !! 377                                               unsigned long len,
660                                                !! 378                                               bool zeropage)
661                                                << 
662 {                                              << 
663         ssize_t err;                           << 
664                                                << 
665         if (uffd_flags_mode_is(flags, MFILL_AT << 
666                 return mfill_atomic_pte_contin << 
667                                                << 
668         } else if (uffd_flags_mode_is(flags, M << 
669                 return mfill_atomic_pte_poison << 
670                                                << 
671         }                                      << 
672                                                << 
673         /*                                     << 
674          * The normal page fault path for a sh << 
675          * fault, fill the hole in the file an << 
676          * result generates plain anonymous me << 
677          * asked to fill a hole in a MAP_PRIVA <<
678          * generate anonymous memory directly  << 
679          * the hole. For the MAP_PRIVATE case  << 
680          * only happens in the pagetable (to v << 
681          * and not in the radix tree.          << 
682          */                                    << 
683         if (!(dst_vma->vm_flags & VM_SHARED))  << 
684                 if (uffd_flags_mode_is(flags,  << 
685                         err = mfill_atomic_pte << 
686                                                << 
687                                                << 
688                 else                           << 
689                         err = mfill_atomic_pte << 
690                                                << 
691         } else {                               << 
692                 err = shmem_mfill_atomic_pte(d << 
693                                              d << 
694                                              f << 
695         }                                      << 
696                                                << 
697         return err;                            << 
698 }                                              << 
699                                                << 
700 static __always_inline ssize_t mfill_atomic(st << 
701                                             un << 
702                                             un << 
703                                             un << 
704                                             uf << 
705 {                                                 379 {
706         struct mm_struct *dst_mm = ctx->mm;    << 
707         struct vm_area_struct *dst_vma;           380         struct vm_area_struct *dst_vma;
708         ssize_t err;                              381         ssize_t err;
709         pmd_t *dst_pmd;                           382         pmd_t *dst_pmd;
710         unsigned long src_addr, dst_addr;         383         unsigned long src_addr, dst_addr;
711         long copied;                              384         long copied;
712         struct folio *folio;                   !! 385         struct page *page;
713                                                   386 
714         /*                                        387         /*
715          * Sanitize the command parameters:       388          * Sanitize the command parameters:
716          */                                       389          */
717         BUG_ON(dst_start & ~PAGE_MASK);           390         BUG_ON(dst_start & ~PAGE_MASK);
718         BUG_ON(len & ~PAGE_MASK);                 391         BUG_ON(len & ~PAGE_MASK);
719                                                   392 
720         /* Does the address range wrap, or is     393         /* Does the address range wrap, or is the span zero-sized? */
721         BUG_ON(src_start + len <= src_start);     394         BUG_ON(src_start + len <= src_start);
722         BUG_ON(dst_start + len <= dst_start);     395         BUG_ON(dst_start + len <= dst_start);
723                                                   396 
724         src_addr = src_start;                     397         src_addr = src_start;
725         dst_addr = dst_start;                     398         dst_addr = dst_start;
726         copied = 0;                               399         copied = 0;
727         folio = NULL;                          !! 400         page = NULL;
728 retry:                                            401 retry:
                                                   >> 402         down_read(&dst_mm->mmap_sem);
                                                   >> 403 
729         /*                                        404         /*
730          * Make sure the vma is not shared, th    405          * Make sure the vma is not shared, that the dst range is
731          * both valid and fully within a singl    406          * both valid and fully within a single existing vma.
732          */                                       407          */
733         dst_vma = uffd_mfill_lock(dst_mm, dst_ !! 408         err = -ENOENT;
734         if (IS_ERR(dst_vma)) {                 !! 409         dst_vma = find_vma(dst_mm, dst_start);
735                 err = PTR_ERR(dst_vma);        !! 410         if (!dst_vma)
736                 goto out;                      !! 411                 goto out_unlock;
737         }                                      << 
738                                                << 
739         /*                                        412         /*
740          * If memory mappings are changing bec !! 413          * Be strict and only allow __mcopy_atomic on userfaultfd
741          * operation (e.g. mremap) running in  !! 414          * registered ranges to prevent userland errors going
742          * request the user to retry later     !! 415          * unnoticed. As far as the VM consistency is concerned, it
                                                   >> 416          * would be perfectly safe to remove this check, but there's
                                                   >> 417          * no useful usage for __mcopy_atomic outside of userfaultfd
                                                   >> 418          * registered ranges. This is after all why these are ioctls
                                                   >> 419          * belonging to the userfaultfd and not syscalls.
743          */                                       420          */
744         down_read(&ctx->map_changing_lock);    !! 421         if (!dst_vma->vm_userfaultfd_ctx.ctx)
745         err = -EAGAIN;                         !! 422                 goto out_unlock;
746         if (atomic_read(&ctx->mmap_changing))  !! 423 
                                                   >> 424         if (dst_start < dst_vma->vm_start ||
                                                   >> 425             dst_start + len > dst_vma->vm_end)
747                 goto out_unlock;                  426                 goto out_unlock;
748                                                   427 
749         err = -EINVAL;                            428         err = -EINVAL;
750         /*                                        429         /*
751          * shmem_zero_setup is invoked in mmap    430          * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
752          * it will overwrite vm_ops, so vma_is    431          * it will overwrite vm_ops, so vma_is_anonymous must return false.
753          */                                       432          */
754         if (WARN_ON_ONCE(vma_is_anonymous(dst_    433         if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
755             dst_vma->vm_flags & VM_SHARED))       434             dst_vma->vm_flags & VM_SHARED))
756                 goto out_unlock;                  435                 goto out_unlock;
757                                                   436 
758         /*                                        437         /*
759          * validate 'mode' now that we know th << 
760          * a wrprotect copy if the userfaultfd << 
761          */                                    << 
762         if ((flags & MFILL_ATOMIC_WP) && !(dst << 
763                 goto out_unlock;               << 
764                                                << 
765         /*                                     << 
766          * If this is a HUGETLB vma, pass off     438          * If this is a HUGETLB vma, pass off to appropriate routine
767          */                                       439          */
768         if (is_vm_hugetlb_page(dst_vma))          440         if (is_vm_hugetlb_page(dst_vma))
769                 return  mfill_atomic_hugetlb(c !! 441                 return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
770                                              s !! 442                                                 src_start, len, zeropage);
771                                                   443 
772         if (!vma_is_anonymous(dst_vma) && !vma    444         if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
773                 goto out_unlock;                  445                 goto out_unlock;
774         if (!vma_is_shmem(dst_vma) &&          !! 446 
775             uffd_flags_mode_is(flags, MFILL_AT !! 447         /*
                                                   >> 448          * Ensure the dst_vma has an anon_vma or this page
                                                   >> 449          * would get a NULL anon_vma when moved in the
                                                   >> 450          * dst_vma.
                                                   >> 451          */
                                                   >> 452         err = -ENOMEM;
                                                   >> 453         if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
776                 goto out_unlock;                  454                 goto out_unlock;
777                                                   455 
778         while (src_addr < src_start + len) {      456         while (src_addr < src_start + len) {
779                 pmd_t dst_pmdval;                 457                 pmd_t dst_pmdval;
780                                                   458 
781                 BUG_ON(dst_addr >= dst_start +    459                 BUG_ON(dst_addr >= dst_start + len);
782                                                   460 
783                 dst_pmd = mm_alloc_pmd(dst_mm,    461                 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
784                 if (unlikely(!dst_pmd)) {         462                 if (unlikely(!dst_pmd)) {
785                         err = -ENOMEM;            463                         err = -ENOMEM;
786                         break;                    464                         break;
787                 }                                 465                 }
788                                                   466 
789                 dst_pmdval = pmdp_get_lockless !! 467                 dst_pmdval = pmd_read_atomic(dst_pmd);
790                 if (unlikely(pmd_none(dst_pmdv << 
791                     unlikely(__pte_alloc(dst_m << 
792                         err = -ENOMEM;         << 
793                         break;                 << 
794                 }                              << 
795                 dst_pmdval = pmdp_get_lockless << 
796                 /*                                468                 /*
797                  * If the dst_pmd is THP don't !! 469                  * If the dst_pmd is mapped as THP don't
798                  * (This includes the case whe !! 470                  * override it and just be strict.
799                  * changed back to none after  << 
800                  */                               471                  */
801                 if (unlikely(!pmd_present(dst_ !! 472                 if (unlikely(pmd_trans_huge(dst_pmdval))) {
802                              pmd_devmap(dst_pm << 
803                         err = -EEXIST;            473                         err = -EEXIST;
804                         break;                    474                         break;
805                 }                                 475                 }
806                 if (unlikely(pmd_bad(dst_pmdva !! 476                 if (unlikely(pmd_none(dst_pmdval)) &&
                                                   >> 477                     unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
                                                   >> 478                         err = -ENOMEM;
                                                   >> 479                         break;
                                                   >> 480                 }
                                                   >> 481                 /* If a huge pmd materialized from under us, fail */
                                                   >> 482                 if (unlikely(pmd_trans_huge(*dst_pmd))) {
807                         err = -EFAULT;            483                         err = -EFAULT;
808                         break;                    484                         break;
809                 }                                 485                 }
810                 /*                             << 
811                  * For shmem mappings, khugepa << 
812                  * tables under us; pte_offset << 
813                  */                            << 
814                                                   486 
815                 err = mfill_atomic_pte(dst_pmd !! 487                 BUG_ON(pmd_none(*dst_pmd));
816                                        src_add !! 488                 BUG_ON(pmd_trans_huge(*dst_pmd));
                                                   >> 489 
                                                   >> 490                 if (vma_is_anonymous(dst_vma)) {
                                                   >> 491                         if (!zeropage)
                                                   >> 492                                 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
                                                   >> 493                                                        dst_addr, src_addr,
                                                   >> 494                                                        &page);
                                                   >> 495                         else
                                                   >> 496                                 err = mfill_zeropage_pte(dst_mm, dst_pmd,
                                                   >> 497                                                          dst_vma, dst_addr);
                                                   >> 498                 } else {
                                                   >> 499                         err = -EINVAL; /* if zeropage is true return -EINVAL */
                                                   >> 500                         if (likely(!zeropage))
                                                   >> 501                                 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
                                                   >> 502                                                              dst_vma, dst_addr,
                                                   >> 503                                                              src_addr, &page);
                                                   >> 504                 }
                                                   >> 505 
817                 cond_resched();                   506                 cond_resched();
818                                                   507 
819                 if (unlikely(err == -ENOENT))  !! 508                 if (unlikely(err == -EFAULT)) {
820                         void *kaddr;           !! 509                         void *page_kaddr;
821                                                   510 
822                         up_read(&ctx->map_chan !! 511                         up_read(&dst_mm->mmap_sem);
823                         uffd_mfill_unlock(dst_ !! 512                         BUG_ON(!page);
824                         BUG_ON(!folio);        << 
825                                                   513 
826                         kaddr = kmap_local_fol !! 514                         page_kaddr = kmap(page);
827                         err = copy_from_user(k !! 515                         err = copy_from_user(page_kaddr,
828                                              (    516                                              (const void __user *) src_addr,
829                                              P    517                                              PAGE_SIZE);
830                         kunmap_local(kaddr);   !! 518                         kunmap(page);
831                         if (unlikely(err)) {      519                         if (unlikely(err)) {
832                                 err = -EFAULT;    520                                 err = -EFAULT;
833                                 goto out;         521                                 goto out;
834                         }                         522                         }
835                         flush_dcache_folio(fol << 
836                         goto retry;               523                         goto retry;
837                 } else                            524                 } else
838                         BUG_ON(folio);         !! 525                         BUG_ON(page);
839                                                   526 
840                 if (!err) {                       527                 if (!err) {
841                         dst_addr += PAGE_SIZE;    528                         dst_addr += PAGE_SIZE;
842                         src_addr += PAGE_SIZE;    529                         src_addr += PAGE_SIZE;
843                         copied += PAGE_SIZE;      530                         copied += PAGE_SIZE;
844                                                   531 
845                         if (fatal_signal_pendi    532                         if (fatal_signal_pending(current))
846                                 err = -EINTR;     533                                 err = -EINTR;
847                 }                                 534                 }
848                 if (err)                          535                 if (err)
849                         break;                    536                         break;
850         }                                         537         }
851                                                   538 
852 out_unlock:                                       539 out_unlock:
853         up_read(&ctx->map_changing_lock);      !! 540         up_read(&dst_mm->mmap_sem);
854         uffd_mfill_unlock(dst_vma);            << 
855 out:                                              541 out:
856         if (folio)                             !! 542         if (page)
857                 folio_put(folio);              !! 543                 put_page(page);
858         BUG_ON(copied < 0);                       544         BUG_ON(copied < 0);
859         BUG_ON(err > 0);                          545         BUG_ON(err > 0);
860         BUG_ON(!copied && !err);                  546         BUG_ON(!copied && !err);
861         return copied ? copied : err;             547         return copied ? copied : err;
862 }                                                 548 }
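
mfill_atomic() returns "copied ? copied : err", and a bailout such as the mmap_changing check above surfaces to userspace as a short UFFDIO_COPY: the ioctl's copy field reports the bytes already filled and the caller is expected to retry the remainder. A hedged sketch of such a retry loop follows; uffd_copy_all() is a hypothetical helper, and uffd is assumed to be an already-initialised userfaultfd with the destination range registered for missing faults.

/* Hedged sketch: drive UFFDIO_COPY until the whole range is filled,
 * retrying the EAGAIN/partial-copy cases described above. */
#include <errno.h>
#include <linux/userfaultfd.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int uffd_copy_all(int uffd, uintptr_t dst, uintptr_t src, uint64_t len)
{
        long page = sysconf(_SC_PAGESIZE);
        uint64_t done = 0;

        while (done < len) {
                struct uffdio_copy copy = {
                        .dst = dst + done,
                        .src = src + done,
                        .len = len - done,
                        .mode = 0,
                };

                if (ioctl(uffd, UFFDIO_COPY, &copy) == 0) {
                        done += copy.copy;      /* whole remainder copied */
                        continue;
                }
                if (errno == EAGAIN) {
                        /* Partial copy (copy.copy > 0) or mmap_changing was
                         * set; a real handler might back off before retrying. */
                        if (copy.copy > 0)
                                done += copy.copy;
                        continue;
                }
                if (errno == EEXIST) {
                        /* First missing page was already resolved by someone
                         * else; skip it (assumes base-page granularity). */
                        done += page;
                        continue;
                }
                return -1;
        }
        return 0;
}
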
863                                                   549 
864 ssize_t mfill_atomic_copy(struct userfaultfd_c !! 550 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
865                           unsigned long src_st !! 551                      unsigned long src_start, unsigned long len)
866                           uffd_flags_t flags)  << 
867 {                                                 552 {
868         return mfill_atomic(ctx, dst_start, sr !! 553         return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
869                             uffd_flags_set_mod << 
870 }                                                 554 }
871                                                   555 
872 ssize_t mfill_atomic_zeropage(struct userfault !! 556 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
873                               unsigned long st !! 557                        unsigned long len)
874                               unsigned long le << 
875 {                                                 558 {
876         return mfill_atomic(ctx, start, 0, len !! 559         return __mcopy_atomic(dst_mm, start, 0, len, true);
877                             uffd_flags_set_mod << 
878 }                                              << 
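
mfill_atomic_zeropage() backs the UFFDIO_ZEROPAGE ioctl, which resolves a missing fault with zero-filled memory instead of copied data. A minimal hedged snippet; uffd_zero() is a hypothetical helper, and the uffd setup and registration are assumed to have happened elsewhere.

/* Hedged sketch: resolve a missing fault with zeroes instead of data. */
#include <linux/userfaultfd.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int uffd_zero(int uffd, uintptr_t addr, uint64_t len)
{
        struct uffdio_zeropage zp = {
                .range = { .start = addr, .len = len },
                .mode = 0,      /* or UFFDIO_ZEROPAGE_MODE_DONTWAKE */
        };

        /* On success, zp.zeropage reports the bytes that were zero-mapped. */
        return ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
}
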
879                                                << 
880 ssize_t mfill_atomic_continue(struct userfault << 
881                               unsigned long le << 
882 {                                              << 
883                                                << 
884         /*                                     << 
885          * A caller might reasonably assume th << 
886          * smp_wmb() to ensure that any writes << 
887          * the thread doing the UFFDIO_CONTINU << 
888          * subsequent loads from the page thro << 
889          */                                    << 
890         smp_wmb();                             << 
891                                                << 
892         return mfill_atomic(ctx, start, 0, len << 
893                             uffd_flags_set_mod << 
894 }                                              << 
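
mfill_atomic_continue() backs UFFDIO_CONTINUE, used for minor faults: the page cache already holds the contents and the ioctl only installs the mapping. The smp_wmb() above is what lets the handler populate the page through a second mapping and rely on the faulting thread seeing those writes once the fault is resolved. A hedged sketch of that flow; resolve_minor_fault() is a hypothetical helper, 'backing' is assumed to be a second, non-registered mapping of the same page of the backing file, and the faulting range is assumed to be registered with UFFDIO_REGISTER_MODE_MINOR.

/* Hedged sketch: minor-fault handling with UFFDIO_CONTINUE.
 * 'backing' is a second, non-registered mapping of the same file page. */
#include <linux/userfaultfd.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

static int resolve_minor_fault(int uffd, uintptr_t fault_addr, size_t page_size,
                               void *backing, const void *data)
{
        struct uffdio_continue cont;

        /* 1. Put the final contents into the page cache via the other mapping. */
        memcpy(backing, data, page_size);

        /* 2. Ask the kernel to map the now-present page at the faulting
         *    address; the kernel's smp_wmb() orders our writes above against
         *    the faulting thread's reads once it is woken. */
        cont.range.start = fault_addr & ~(uintptr_t)(page_size - 1);
        cont.range.len = page_size;
        cont.mode = 0;
        return ioctl(uffd, UFFDIO_CONTINUE, &cont);
}
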
895                                                << 
896 ssize_t mfill_atomic_poison(struct userfaultfd << 
897                             unsigned long len, << 
898 {                                              << 
899         return mfill_atomic(ctx, start, 0, len << 
900                             uffd_flags_set_mod << 
901 }                                              << 
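
mfill_atomic_poison() backs UFFDIO_POISON, which installs a poison marker so that later access to the range delivers SIGBUS, e.g. to stand in for memory lost during migration. A hedged snippet; uffd_poison() is a hypothetical helper and the exact feature/ioctl availability depends on the running kernel, which is assumed here.

/* Hedged sketch: mark a range so that future access raises SIGBUS. */
#include <linux/userfaultfd.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int uffd_poison(int uffd, uintptr_t addr, uint64_t len)
{
        struct uffdio_poison p = {
                .range = { .start = addr, .len = len },
                .mode = 0,      /* or UFFDIO_POISON_MODE_DONTWAKE */
        };

        /* On success, p.updated reports how many bytes were marked. */
        return ioctl(uffd, UFFDIO_POISON, &p);
}
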
902                                                << 
903 long uffd_wp_range(struct vm_area_struct *dst_ << 
904                    unsigned long start, unsign << 
905 {                                              << 
906         unsigned int mm_cp_flags;              << 
907         struct mmu_gather tlb;                 << 
908         long ret;                              << 
909                                                << 
910         VM_WARN_ONCE(start < dst_vma->vm_start << 
911                         "The address range exc << 
912         if (enable_wp)                         << 
913                 mm_cp_flags = MM_CP_UFFD_WP;   << 
914         else                                   << 
915                 mm_cp_flags = MM_CP_UFFD_WP_RE << 
916                                                << 
917         /*                                     << 
918          * vma->vm_page_prot already reflects  << 
919          * VMA (see userfaultfd_set_vm_flags() << 
920          * to be write-protected as default wh << 
921          * Try upgrading write permissions man << 
922          */                                    << 
923         if (!enable_wp && vma_wants_manual_pte << 
924                 mm_cp_flags |= MM_CP_TRY_CHANG << 
925         tlb_gather_mmu(&tlb, dst_vma->vm_mm);  << 
926         ret = change_protection(&tlb, dst_vma, << 
927         tlb_finish_mmu(&tlb);                  << 
928                                                << 
929         return ret;                            << 
930 }                                              << 
931                                                << 
932 int mwriteprotect_range(struct userfaultfd_ctx << 
933                         unsigned long len, boo << 
934 {                                              << 
935         struct mm_struct *dst_mm = ctx->mm;    << 
936         unsigned long end = start + len;       << 
937         unsigned long _start, _end;            << 
938         struct vm_area_struct *dst_vma;        << 
939         unsigned long page_mask;               << 
940         long err;                              << 
941         VMA_ITERATOR(vmi, dst_mm, start);      << 
942                                                << 
943         /*                                     << 
944          * Sanitize the command parameters:    << 
945          */                                    << 
946         BUG_ON(start & ~PAGE_MASK);            << 
947         BUG_ON(len & ~PAGE_MASK);              << 
948                                                << 
949         /* Does the address range wrap, or is  << 
950         BUG_ON(start + len <= start);          << 
951                                                << 
952         mmap_read_lock(dst_mm);                << 
953                                                << 
954         /*                                     << 
955          * If memory mappings are changing bec << 
956          * operation (e.g. mremap) running in  << 
957          * request the user to retry later     << 
958          */                                    << 
959         down_read(&ctx->map_changing_lock);    << 
960         err = -EAGAIN;                         << 
961         if (atomic_read(&ctx->mmap_changing))  << 
962                 goto out_unlock;               << 
963                                                << 
964         err = -ENOENT;                         << 
965         for_each_vma_range(vmi, dst_vma, end)  << 
966                                                << 
967                 if (!userfaultfd_wp(dst_vma))  << 
968                         err = -ENOENT;         << 
969                         break;                 << 
970                 }                              << 
971                                                << 
972                 if (is_vm_hugetlb_page(dst_vma << 
973                         err = -EINVAL;         << 
974                         page_mask = vma_kernel << 
975                         if ((start & page_mask << 
976                                 break;         << 
977                 }                              << 
978                                                << 
979                 _start = max(dst_vma->vm_start << 
980                 _end = min(dst_vma->vm_end, en << 
981                                                << 
982                 err = uffd_wp_range(dst_vma, _ << 
983                                                << 
984                 /* Return 0 on success, <0 on  << 
985                 if (err < 0)                   << 
986                         break;                 << 
987                 err = 0;                       << 
988         }                                      << 
989 out_unlock:                                    << 
990         up_read(&ctx->map_changing_lock);      << 
991         mmap_read_unlock(dst_mm);              << 
992         return err;                            << 
993 }                                              << 
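
mwriteprotect_range() is the backend of UFFDIO_WRITEPROTECT: it walks every VMA overlapping the range, failing with -ENOENT if one of them is not uffd-wp registered, and applies or clears the write-protect bits. A hedged userspace sketch; uffd_wp() is a hypothetical helper and the range is assumed to be registered with UFFDIO_REGISTER_MODE_WP.

/* Hedged sketch: toggle uffd write-protection on a registered range. */
#include <linux/userfaultfd.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int uffd_wp(int uffd, uintptr_t addr, uint64_t len, int enable)
{
        struct uffdio_writeprotect wp = {
                .range = { .start = addr, .len = len },
                /* Clearing protection (enable == 0) is also how a write fault
                 * is resolved, so it wakes waiters unless DONTWAKE is set. */
                .mode = enable ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
        };

        return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
}
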
994                                                << 
995                                                << 
996 void double_pt_lock(spinlock_t *ptl1,          << 
997                     spinlock_t *ptl2)          << 
998         __acquires(ptl1)                       << 
999         __acquires(ptl2)                       << 
1000 {                                             << 
1001         if (ptl1 > ptl2)                      << 
1002                 swap(ptl1, ptl2);             << 
1003         /* lock in virtual address order to a << 
1004         spin_lock(ptl1);                      << 
1005         if (ptl1 != ptl2)                     << 
1006                 spin_lock_nested(ptl2, SINGLE << 
1007         else                                  << 
1008                 __acquire(ptl2);              << 
1009 }                                             << 
1010                                               << 
1011 void double_pt_unlock(spinlock_t *ptl1,       << 
1012                       spinlock_t *ptl2)       << 
1013         __releases(ptl1)                      << 
1014         __releases(ptl2)                      << 
1015 {                                             << 
1016         spin_unlock(ptl1);                    << 
1017         if (ptl1 != ptl2)                     << 
1018                 spin_unlock(ptl2);            << 
1019         else                                  << 
1020                 __release(ptl2);              << 
1021 }                                             << 
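
double_pt_lock()/double_pt_unlock() take the two page-table locks in address order, and take only one of them when both PTEs share a lock, which is the standard way to avoid ABBA deadlock between concurrent movers. The same idiom in plain userspace form, purely as an illustration; the names are hypothetical.

/* Hedged, illustrative userspace analogue of double_pt_lock():
 * always acquire the lower-addressed lock first, and take only one
 * lock when the caller hands in the same lock twice. */
#include <pthread.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a > b) {                    /* order by address, avoids ABBA */
                pthread_mutex_t *tmp = a;

                a = b;
                b = tmp;
        }
        pthread_mutex_lock(a);
        if (a != b)
                pthread_mutex_lock(b);
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}
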
1022                                               << 
1023                                               << 
1024 static int move_present_pte(struct mm_struct  << 
1025                             struct vm_area_st << 
1026                             struct vm_area_st << 
1027                             unsigned long dst << 
1028                             pte_t *dst_pte, p << 
1029                             pte_t orig_dst_pt << 
1030                             spinlock_t *dst_p << 
1031                             struct folio *src << 
1032 {                                             << 
1033         int err = 0;                          << 
1034                                               << 
1035         double_pt_lock(dst_ptl, src_ptl);     << 
1036                                               << 
1037         if (!pte_same(ptep_get(src_pte), orig << 
1038             !pte_same(ptep_get(dst_pte), orig << 
1039                 err = -EAGAIN;                << 
1040                 goto out;                     << 
1041         }                                     << 
1042         if (folio_test_large(src_folio) ||    << 
1043             folio_maybe_dma_pinned(src_folio) << 
1044             !PageAnonExclusive(&src_folio->pa << 
1045                 err = -EBUSY;                 << 
1046                 goto out;                     << 
1047         }                                     << 
1048                                               << 
1049         orig_src_pte = ptep_clear_flush(src_v << 
1050         /* Folio got pinned from under us. Pu << 
1051         if (folio_maybe_dma_pinned(src_folio) << 
1052                 set_pte_at(mm, src_addr, src_ << 
1053                 err = -EBUSY;                 << 
1054                 goto out;                     << 
1055         }                                     << 
1056                                               << 
1057         folio_move_anon_rmap(src_folio, dst_v << 
1058         src_folio->index = linear_page_index( << 
1059                                               << 
1060         orig_dst_pte = mk_pte(&src_folio->pag << 
1061         /* Follow mremap() behavior and treat << 
1062         orig_dst_pte = pte_mkwrite(pte_mkdirt << 
1063                                               << 
1064         set_pte_at(mm, dst_addr, dst_pte, ori << 
1065 out:                                          << 
1066         double_pt_unlock(dst_ptl, src_ptl);   << 
1067         return err;                           << 
1068 }                                             << 
1069                                               << 
1070 static int move_swap_pte(struct mm_struct *mm << 
1071                          unsigned long dst_ad << 
1072                          pte_t *dst_pte, pte_ << 
1073                          pte_t orig_dst_pte,  << 
1074                          spinlock_t *dst_ptl, << 
1075 {                                             << 
1076         if (!pte_swp_exclusive(orig_src_pte)) << 
1077                 return -EBUSY;                << 
1078                                               << 
1079         double_pt_lock(dst_ptl, src_ptl);     << 
1080                                               << 
1081         if (!pte_same(ptep_get(src_pte), orig << 
1082             !pte_same(ptep_get(dst_pte), orig << 
1083                 double_pt_unlock(dst_ptl, src << 
1084                 return -EAGAIN;               << 
1085         }                                     << 
1086                                               << 
1087         orig_src_pte = ptep_get_and_clear(mm, << 
1088         set_pte_at(mm, dst_addr, dst_pte, ori << 
1089         double_pt_unlock(dst_ptl, src_ptl);   << 
1090                                               << 
1091         return 0;                             << 
1092 }                                             << 
1093                                               << 
1094 static int move_zeropage_pte(struct mm_struct << 
1095                              struct vm_area_s << 
1096                              struct vm_area_s << 
1097                              unsigned long ds << 
1098                              pte_t *dst_pte,  << 
1099                              pte_t orig_dst_p << 
1100                              spinlock_t *dst_ << 
1101 {                                             << 
1102         pte_t zero_pte;                       << 
1103                                               << 
1104         double_pt_lock(dst_ptl, src_ptl);     << 
1105         if (!pte_same(ptep_get(src_pte), orig << 
1106             !pte_same(ptep_get(dst_pte), orig << 
1107                 double_pt_unlock(dst_ptl, src << 
1108                 return -EAGAIN;               << 
1109         }                                     << 
1110                                               << 
1111         zero_pte = pte_mkspecial(pfn_pte(my_z << 
1112                                          dst_ << 
1113         ptep_clear_flush(src_vma, src_addr, s << 
1114         set_pte_at(mm, dst_addr, dst_pte, zer << 
1115         double_pt_unlock(dst_ptl, src_ptl);   << 
1116                                               << 
1117         return 0;                             << 
1118 }                                             << 
1119                                               << 
1120                                               << 
1121 /*                                            << 
1122  * The mmap_lock for reading is held by the c << 
1123  * from src_pmd to dst_pmd if possible, and r << 
1124  * in moving the page.                        << 
1125  */                                           << 
1126 static int move_pages_pte(struct mm_struct *m << 
1127                           struct vm_area_stru << 
1128                           struct vm_area_stru << 
1129                           unsigned long dst_a << 
1130                           __u64 mode)         << 
1131 {                                             << 
1132         swp_entry_t entry;                    << 
1133         pte_t orig_src_pte, orig_dst_pte;     << 
1134         pte_t src_folio_pte;                  << 
1135         spinlock_t *src_ptl, *dst_ptl;        << 
1136         pte_t *src_pte = NULL;                << 
1137         pte_t *dst_pte = NULL;                << 
1138                                               << 
1139         struct folio *src_folio = NULL;       << 
1140         struct anon_vma *src_anon_vma = NULL; << 
1141         struct mmu_notifier_range range;      << 
1142         int err = 0;                          << 
1143                                               << 
1144         flush_cache_range(src_vma, src_addr,  << 
1145         mmu_notifier_range_init(&range, MMU_N << 
1146                                 src_addr, src << 
1147         mmu_notifier_invalidate_range_start(& << 
1148 retry:                                        << 
1149         dst_pte = pte_offset_map_nolock(mm, d << 
1150                                               << 
1151         /* Retry if a huge pmd materialized f << 
1152         if (unlikely(!dst_pte)) {             << 
1153                 err = -EAGAIN;                << 
1154                 goto out;                     << 
1155         }                                     << 
1156                                               << 
1157         src_pte = pte_offset_map_nolock(mm, s << 
1158                                               << 
1159         /*                                    << 
1160          * We held the mmap_lock for reading  << 
1161          * can zap transparent huge pages und << 
1162          * transparent huge page fault can es << 
1163          * transparent huge pages under us.   << 
1164          */                                   << 
1165         if (unlikely(!src_pte)) {             << 
1166                 err = -EAGAIN;                << 
1167                 goto out;                     << 
1168         }                                     << 
1169                                               << 
1170         /* Sanity checks before the operation << 
1171         if (WARN_ON_ONCE(pmd_none(*dst_pmd))  << 
1172             WARN_ON_ONCE(pmd_trans_huge(*dst_ << 
1173                 err = -EINVAL;                << 
1174                 goto out;                     << 
1175         }                                     << 
1176                                               << 
1177         spin_lock(dst_ptl);                   << 
1178         orig_dst_pte = ptep_get(dst_pte);     << 
1179         spin_unlock(dst_ptl);                 << 
1180         if (!pte_none(orig_dst_pte)) {        << 
1181                 err = -EEXIST;                << 
1182                 goto out;                     << 
1183         }                                     << 
1184                                               << 
1185         spin_lock(src_ptl);                   << 
1186         orig_src_pte = ptep_get(src_pte);     << 
1187         spin_unlock(src_ptl);                 << 
1188         if (pte_none(orig_src_pte)) {         << 
1189                 if (!(mode & UFFDIO_MOVE_MODE << 
1190                         err = -ENOENT;        << 
1191                 else /* nothing to do to move << 
1192                         err = 0;              << 
1193                 goto out;                     << 
1194         }                                     << 
1195                                               << 
1196         /* If PTE changed after we locked the << 
1197         if (src_folio && unlikely(!pte_same(s << 
1198                 err = -EAGAIN;                << 
1199                 goto out;                     << 
1200         }                                     << 
1201                                               << 
1202         if (pte_present(orig_src_pte)) {      << 
1203                 if (is_zero_pfn(pte_pfn(orig_ << 
1204                         err = move_zeropage_p << 
1205                                               << 
1206                                               << 
1207                                               << 
1208                         goto out;             << 
1209                 }                             << 
1210                                               << 
1211                 /*                            << 
1212                  * Pin and lock both source f << 
1213                  * RCU read section, we can't << 
1214                  * unmap the ptes, obtain the << 
1215                  */                           << 
1216                 if (!src_folio) {             << 
1217                         struct folio *folio;  << 
1218                                               << 
1219                         /*                    << 
1220                          * Pin the page while << 
1221                          * page isn't freed u << 
1222                          */                   << 
1223                         spin_lock(src_ptl);   << 
1224                         if (!pte_same(orig_sr << 
1225                                 spin_unlock(s << 
1226                                 err = -EAGAIN << 
1227                                 goto out;     << 
1228                         }                     << 
1229                                               << 
1230                         folio = vm_normal_fol << 
1231                         if (!folio || !PageAn << 
1232                                 spin_unlock(s << 
1233                                 err = -EBUSY; << 
1234                                 goto out;     << 
1235                         }                     << 
1236                                               << 
1237                         folio_get(folio);     << 
1238                         src_folio = folio;    << 
1239                         src_folio_pte = orig_ << 
1240                         spin_unlock(src_ptl); << 
1241                                               << 
1242                         if (!folio_trylock(sr << 
1243                                 pte_unmap(&or << 
1244                                 pte_unmap(&or << 
1245                                 src_pte = dst << 
1246                                 /* now we can << 
1247                                 folio_lock(sr << 
1248                                 goto retry;   << 
1249                         }                     << 
1250                                               << 
1251                         if (WARN_ON_ONCE(!fol << 
1252                                 err = -EBUSY; << 
1253                                 goto out;     << 
1254                         }                     << 
1255                 }                             << 
1256                                               << 
1257                 /* at this point we have src_ << 
1258                 if (folio_test_large(src_foli << 
1259                         /* split_folio() can  << 
1260                         pte_unmap(&orig_src_p << 
1261                         pte_unmap(&orig_dst_p << 
1262                         src_pte = dst_pte = N << 
1263                         err = split_folio(src << 
1264                         if (err)              << 
1265                                 goto out;     << 
1266                         /* have to reacquire  << 
1267                         folio_unlock(src_foli << 
1268                         folio_put(src_folio); << 
1269                         src_folio = NULL;     << 
1270                         goto retry;           << 
1271                 }                             << 
1272                                               << 
1273                 if (!src_anon_vma) {          << 
1274                         /*                    << 
1275                          * folio_referenced w << 
1276                          * without the folio  << 
1277                          * the anon_vma lock, << 
1278                          */                   << 
1279                         src_anon_vma = folio_ << 
1280                         if (!src_anon_vma) {  << 
1281                                 /* page was u << 
1282                                 err = -EAGAIN << 
1283                                 goto out;     << 
1284                         }                     << 
1285                         if (!anon_vma_trylock << 
1286                                 pte_unmap(&or << 
1287                                 pte_unmap(&or << 
1288                                 src_pte = dst << 
1289                                 /* now we can << 
1290                                 anon_vma_lock << 
1291                                 goto retry;   << 
1292                         }                     << 
1293                 }                             << 
1294                                               << 
1295                 err = move_present_pte(mm,  d << 
1296                                        dst_ad << 
1297                                        orig_d << 
1298                                        dst_pt << 
1299         } else {                              << 
1300                 entry = pte_to_swp_entry(orig << 
1301                 if (non_swap_entry(entry)) {  << 
1302                         if (is_migration_entr << 
1303                                 pte_unmap(&or << 
1304                                 pte_unmap(&or << 
1305                                 src_pte = dst << 
1306                                 migration_ent << 
1307                                 err = -EAGAIN << 
1308                         } else                << 
1309                                 err = -EFAULT << 
1310                         goto out;             << 
1311                 }                             << 
1312                                               << 
1313                 err = move_swap_pte(mm, dst_a << 
1314                                     dst_pte,  << 
1315                                     orig_dst_ << 
1316                                     dst_ptl,  << 
1317         }                                     << 
1318                                               << 
1319 out:                                          << 
1320         if (src_anon_vma) {                   << 
1321                 anon_vma_unlock_write(src_ano << 
1322                 put_anon_vma(src_anon_vma);   << 
1323         }                                     << 
1324         if (src_folio) {                      << 
1325                 folio_unlock(src_folio);      << 
1326                 folio_put(src_folio);         << 
1327         }                                     << 
1328         if (dst_pte)                          << 
1329                 pte_unmap(dst_pte);           << 
1330         if (src_pte)                          << 
1331                 pte_unmap(src_pte);           << 
1332         mmu_notifier_invalidate_range_end(&ra << 
1333                                               << 
1334         return err;                           << 
1335 }                                             << 
1336                                               << 
1337 #ifdef CONFIG_TRANSPARENT_HUGEPAGE            << 
1338 static inline bool move_splits_huge_pmd(unsig << 
1339                                         unsig << 
1340                                         unsig << 
1341 {                                             << 
1342         return (src_addr & ~HPAGE_PMD_MASK) | << 
1343                 src_end - src_addr < HPAGE_PM << 
1344 }                                             << 
1345 #else                                         << 
1346 static inline bool move_splits_huge_pmd(unsig << 
1347                                         unsig << 
1348                                         unsig << 
1349 {                                             << 
1350         /* This is unreachable anyway, just t << 
1351         return false;                         << 
1352 }                                             << 
1353 #endif                                        << 
1354                                               << 
1355 static inline bool vma_move_compatible(struct << 
1356 {                                             << 
1357         return !(vma->vm_flags & (VM_PFNMAP | << 
1358                                   VM_MIXEDMAP << 
1359 }                                             << 
1360                                               << 
1361 static int validate_move_areas(struct userfau << 
1362                                struct vm_area << 
1363                                struct vm_area << 
1364 {                                             << 
1365         /* Only allow moving if both have the << 
1366         if ((src_vma->vm_flags & VM_ACCESS_FL << 
1367             pgprot_val(src_vma->vm_page_prot) << 
1368                 return -EINVAL;               << 
1369                                               << 
1370         /* Only allow moving if both are mloc << 
1371         if ((src_vma->vm_flags & VM_LOCKED) ! << 
1372                 return -EINVAL;               << 
1373                                               << 
1374         /*                                    << 
1375          * For now, we keep it simple and onl << 
1376          * Access flags are equal, therefore  << 
1377          */                                   << 
1378         if (!(src_vma->vm_flags & VM_WRITE))  << 
1379                 return -EINVAL;               << 
1380                                               << 
1381         /* Check if vma flags indicate conten << 
1382         if (!vma_move_compatible(src_vma) ||  << 
1383                 return -EINVAL;               << 
1384                                               << 
1385         /* Ensure dst_vma is registered in uf << 
1386         if (!dst_vma->vm_userfaultfd_ctx.ctx  << 
1387             dst_vma->vm_userfaultfd_ctx.ctx ! << 
1388                 return -EINVAL;               << 
1389                                               << 
1390         /* Only allow moving across anonymous << 
1391         if (!vma_is_anonymous(src_vma) || !vm << 
1392                 return -EINVAL;               << 
1393                                               << 
1394         return 0;                             << 
1395 }                                             << 
1396                                               << 
1397 static __always_inline                        << 
1398 int find_vmas_mm_locked(struct mm_struct *mm, << 
1399                         unsigned long dst_sta << 
1400                         unsigned long src_sta << 
1401                         struct vm_area_struct << 
1402                         struct vm_area_struct << 
1403 {                                             << 
1404         struct vm_area_struct *vma;           << 
1405                                               << 
1406         mmap_assert_locked(mm);               << 
1407         vma = find_vma_and_prepare_anon(mm, d << 
1408         if (IS_ERR(vma))                      << 
1409                 return PTR_ERR(vma);          << 
1410                                               << 
1411         *dst_vmap = vma;                      << 
1412         /* Skip finding src_vma if src_start  << 
1413         if (src_start >= vma->vm_start && src << 
1414                 goto out_success;             << 
1415                                               << 
1416         vma = vma_lookup(mm, src_start);      << 
1417         if (!vma)                             << 
1418                 return -ENOENT;               << 
1419 out_success:                                  << 
1420         *src_vmap = vma;                      << 
1421         return 0;                             << 
1422 }                                             << 
1423                                               << 
1424 #ifdef CONFIG_PER_VMA_LOCK                    << 
1425 static int uffd_move_lock(struct mm_struct *m << 
1426                           unsigned long dst_s << 
1427                           unsigned long src_s << 
1428                           struct vm_area_stru << 
1429                           struct vm_area_stru << 
1430 {                                             << 
1431         struct vm_area_struct *vma;           << 
1432         int err;                              << 
1433                                               << 
1434         vma = uffd_lock_vma(mm, dst_start);   << 
1435         if (IS_ERR(vma))                      << 
1436                 return PTR_ERR(vma);          << 
1437                                               << 
1438         *dst_vmap = vma;                      << 
1439         /*                                    << 
1440          * Skip finding src_vma if src_start  << 
1441          * that we don't lock the same vma tw << 
1442          */                                   << 
1443         if (src_start >= vma->vm_start && src << 
1444                 *src_vmap = vma;              << 
1445                 return 0;                     << 
1446         }                                     << 
1447                                               << 
1448         /*                                    << 
1449          * Using uffd_lock_vma() to get src_v << 
1450          *                                    << 
1451          * Thread1                            << 
1452          * -------                            << 
1453          * vma_start_read(dst_vma)            << 
1454          *                                    << 
1455          *                                    << 
1456          * vma_start_read(src_vma)            << 
1457          * mmap_read_lock(mm)                 << 
1458          *                                    << 
1459          */                                   << 
1460         *src_vmap = lock_vma_under_rcu(mm, sr << 
1461         if (likely(*src_vmap))                << 
1462                 return 0;                     << 
1463                                               << 
1464         /* Undo any locking and retry in mmap << 
1465         vma_end_read(*dst_vmap);              << 
1466                                               << 
1467         mmap_read_lock(mm);                   << 
1468         err = find_vmas_mm_locked(mm, dst_sta << 
1469         if (!err) {                           << 
1470                 /*                            << 
1471                  * See comment in uffd_lock_v << 
1472                  * vma_start_read() here.     << 
1473                  */                           << 
1474                 down_read(&(*dst_vmap)->vm_lo << 
1475                 if (*dst_vmap != *src_vmap)   << 
1476                         down_read_nested(&(*s << 
1477                                          SING << 
1478         }                                     << 
1479         mmap_read_unlock(mm);                 << 
1480         return err;                           << 
1481 }                                             << 
1482                                               << 
1483 static void uffd_move_unlock(struct vm_area_s << 
1484                              struct vm_area_s << 
1485 {                                             << 
1486         vma_end_read(src_vma);                << 
1487         if (src_vma != dst_vma)               << 
1488                 vma_end_read(dst_vma);        << 
1489 }                                             << 
1490                                               << 
1491 #else                                         << 
1492                                               << 
1493 static int uffd_move_lock(struct mm_struct *m << 
1494                           unsigned long dst_s << 
1495                           unsigned long src_s << 
1496                           struct vm_area_stru << 
1497                           struct vm_area_stru << 
1498 {                                             << 
1499         int err;                              << 
1500                                               << 
1501         mmap_read_lock(mm);                   << 
1502         err = find_vmas_mm_locked(mm, dst_sta << 
1503         if (err)                              << 
1504                 mmap_read_unlock(mm);         << 
1505         return err;                           << 
1506 }                                             << 
1507                                               << 
1508 static void uffd_move_unlock(struct vm_area_s << 
1509                              struct vm_area_s << 
1510 {                                             << 
1511         mmap_assert_locked(src_vma->vm_mm);   << 
1512         mmap_read_unlock(dst_vma->vm_mm);     << 
1513 }                                             << 
1514 #endif                                        << 
1515                                               << 
/**
 * move_pages - move arbitrary anonymous pages of an existing vma
 * @ctx: pointer to the userfaultfd context
 * @dst_start: start of the destination virtual memory range
 * @src_start: start of the source virtual memory range
 * @len: length of the virtual memory range
 * @mode: flags from uffdio_move.mode
 *
 * It will either use the mmap_lock in read mode or per-vma locks.
 *
 * move_pages() remaps arbitrary anonymous pages atomically in zero
 * copy. It only works on non shared anonymous pages because those can
 * be relocated without generating non linear anon_vmas in the rmap
 * code.
 *
 * It provides a zero copy mechanism to handle userspace page faults.
 * The source vma pages should have mapcount == 1, which can be
 * enforced by using madvise(MADV_DONTFORK) on the src vma.
 *
 * The thread receiving the page during the userland page fault
 * will receive the faulting page in the source vma through the network,
 * storage or any other I/O device (MADV_DONTFORK in the source vma
 * avoids move_pages() failing with -EBUSY if the process forks before
 * move_pages() is called), then it will call move_pages() to map the
 * page in the faulting address in the destination vma.
 *
 * This userfaultfd command works purely via pagetables, so it's the
 * most efficient way to move physical non shared anonymous pages
 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
 * it does not create any new vmas. The mapping in the destination
 * address is atomic.
 *
 * It only works if the vma protection bits are identical between the
 * source and destination vma.
 *
 * It can remap non shared anonymous pages within the same vma too.
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will return "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepages
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
 *
 * If there's any rmap walk that is taking the anon_vma locks without
 * first obtaining the folio lock (the only current instance is
 * folio_referenced), they will have to verify if the folio->mapping
 * has changed after taking the anon_vma lock. If it changed they
 * should release the lock and retry obtaining a new anon_vma, because
 * it means the anon_vma was changed by move_pages() before the lock
 * could be obtained. This is the only additional complexity added to
 * the rmap code to provide this anonymous page remapping functionality.
 *
 * (An illustrative userspace sketch of driving this command is included
 * after the function body below.)
 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
	ssize_t moved = 0;

	/* Sanitize the command parameters. */
	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
	    WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
	    WARN_ON_ONCE(len & ~PAGE_MASK))
		goto out;

	/* Does the address range wrap, or is the span zero-sized? */
	if (WARN_ON_ONCE(src_start + len <= src_start) ||
	    WARN_ON_ONCE(dst_start + len <= dst_start))
		goto out;

	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
	if (err)
		goto out;

	/* Re-check after taking map_changing_lock */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (likely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;

	for (src_addr = src_start, dst_addr = dst_start;
	     src_addr < src_start + len;) {
		spinlock_t *ptl;
		pmd_t dst_pmdval;
		unsigned long step_size;

		/*
		 * Below works because anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}
		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_pte() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}

		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			if (pmd_devmap(*src_pmd)) {
				spin_unlock(ptl);
				err = -ENOENT;
				break;
			}

			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				struct folio *folio = pmd_folio(*src_pmd);

				if (!folio || (!is_huge_zero_folio(folio) &&
					       !PageAnonExclusive(&folio->page))) {
					spin_unlock(ptl);
					err = -EBUSY;
					break;
				}

				spin_unlock(ptl);
				split_huge_pmd(src_vma, src_pmd, src_addr);
				/* The folio will be split by move_pages_pte() */
				continue;
			}

			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
						  dst_pmdval, dst_vma, src_vma,
						  dst_addr, src_addr);
			step_size = HPAGE_PMD_SIZE;
		} else {
			if (pmd_none(*src_pmd)) {
				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
					err = -ENOENT;
					break;
				}
				if (unlikely(__pte_alloc(mm, src_pmd))) {
					err = -ENOMEM;
					break;
				}
			}

			if (unlikely(pte_alloc(mm, dst_pmd))) {
				err = -ENOMEM;
				break;
			}

			err = move_pages_pte(mm, dst_pmd, src_pmd,
					     dst_vma, src_vma,
					     dst_addr, src_addr, mode);
			step_size = PAGE_SIZE;
		}

		cond_resched();

		if (fatal_signal_pending(current)) {
			/* Do not override an error */
			if (!err || err == -EAGAIN)
				err = -EINTR;
			break;
		}

		if (err) {
			if (err == -EAGAIN)
				continue;
			break;
		}

		/* Proceed to the next page */
		dst_addr += step_size;
		src_addr += step_size;
		moved += step_size;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_move_unlock(dst_vma, src_vma);
out:
	VM_WARN_ON(moved < 0);
	VM_WARN_ON(err > 0);
	VM_WARN_ON(!moved && !err);
	return moved ? moved : err;
}
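
/*
 * Illustrative userspace sketch, not part of this file and excluded from the
 * build: it shows one way the UFFDIO_MOVE ioctl that ends up in move_pages()
 * can be driven from userland, including the "short move" retry described in
 * the comment above. It assumes "uffd" is a userfaultfd file descriptor that
 * has completed the UFFDIO_API handshake and registered the ranges involved;
 * uffd_move_range() is a made-up helper name and error handling is minimal.
 */
#if 0	/* example only */
#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int64_t uffd_move_range(int uffd, uint64_t dst, uint64_t src,
			       uint64_t len)
{
	int64_t total = 0;

	while (len) {
		struct uffdio_move move = {
			.dst = dst,
			.src = src,
			.len = len,
			.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES,
		};

		/* A zero return means the whole remaining length was moved. */
		if (ioctl(uffd, UFFDIO_MOVE, &move) == 0)
			return total + (int64_t)len;

		/* "move" holds the bytes moved before the interruption. */
		if (move.move > 0) {
			dst += move.move;
			src += move.move;
			len -= move.move;
			total += move.move;
			continue;
		}

		/* No progress at all: report the error, or earlier progress. */
		return total ? total : -errno;
	}
	return total;
}
#endif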

static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;

	vm_flags_reset(vma, flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static void userfaultfd_set_ctx(struct vm_area_struct *vma,
				struct userfaultfd_ctx *ctx,
				unsigned long flags)
{
	vma_start_write(vma);
	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
	userfaultfd_set_vm_flags(vma,
				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
}

void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
	userfaultfd_set_ctx(vma, NULL, 0);
}

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *ret;

	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);

	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
				    vma->vm_flags & ~__VM_UFFD_FLAGS,
				    NULL_VM_UFFD_CTX);

	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	return ret;
}

/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       unsigned long vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async)
{
	VMA_ITERATOR(vmi, ctx->mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	unsigned long vma_end;
	unsigned long new_flags;

	if (vma->vm_start < start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx});
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_ctx(vma, ctx, vm_flags);

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

skip:
		prev = vma;
		start = vma->vm_end;
	}

	return 0;
}
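
/*
 * Illustrative userspace sketch, not part of this file and excluded from the
 * build: the registration path that reaches userfaultfd_register_range()
 * starts with a UFFDIO_REGISTER ioctl. The snippet creates a userfaultfd,
 * negotiates the API and registers a page-aligned anonymous range for
 * missing-page tracking; uffd_register_missing() is a made-up helper name
 * and error handling is minimal.
 */
#if 0	/* example only */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int uffd_register_missing(void *addr, unsigned long len)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (uffd < 0)
		return -1;

	/* The UFFDIO_API handshake must precede any UFFDIO_REGISTER. */
	if (ioctl(uffd, UFFDIO_API, &api) ||
	    ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		close(uffd);
		return -1;
	}

	/* On success, reg.ioctls reports the ioctls valid for this range. */
	return uffd;
}
#endif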

void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* the various vma->vm_userfaultfd_ctx still points to it */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
	}
	mmap_write_unlock(mm);
}

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, 0);

	if (!mmget_not_zero(mm))
		return;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (see above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}

		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    vma->vm_start, vma->vm_end);
		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
}
