TOMOYO Linux Cross Reference
Linux/mm/memory.c

Diff markup

Differences between /mm/memory.c (Version linux-6.12-rc7) and /mm/memory.c (Version linux-6.1.114)


  1                                                     1 
  2 // SPDX-License-Identifier: GPL-2.0-only          
  3 /*                                                
  4  *  linux/mm/memory.c                             
  5  *                                                
  6  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  7  */                                               
  8                                                   
  9 /*                                                
 10  * demand-loading started 01.12.91 - seems it is high on the list of
 11  * things wanted, and it should be easy to implement. - Linus
 12  */                                               
 13                                                   
 14 /*                                                
 15  * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 16  * pages started 02.12.91, seems to work. - Linus.
 17  *
 18  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 19  * would have taken more than the 6M I have free, but it worked well as
 20  * far as I could see.
 21  *
 22  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 23  */                                               
 24                                                   
 25 /*                                                
 26  * Real VM (paging to/from disk) started 18.12.91. Much more work and
 27  * thought has to go into this. Oh, well..
 28  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 29  *              Found it. Everything seems to work now.
 30  * 20.12.91  -  Ok, making the swap-device changeable like the root.
 31  */                                               
 32                                                   
 33 /*                                                
 34  * 05.04.94  -  Multi-page memory management added for v1.1.
 35  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 36  *
 37  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 38  *              (Gerhard.Wichert@pdb.siemens.de)
 39  *
 40  * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 41  */                                               
 42                                                   
 43 #include <linux/kernel_stat.h>                    
 44 #include <linux/mm.h>                             
 45 #include <linux/mm_inline.h>                      
 46 #include <linux/sched/mm.h>                       
 47 #include <linux/sched/coredump.h>                 
 48 #include <linux/sched/numa_balancing.h>           
 49 #include <linux/sched/task.h>                     
 50 #include <linux/hugetlb.h>                        
 51 #include <linux/mman.h>                           
 52 #include <linux/swap.h>                           
 53 #include <linux/highmem.h>                        
 54 #include <linux/pagemap.h>                        
 55 #include <linux/memremap.h>                       
 56 #include <linux/kmsan.h>                          
 57 #include <linux/ksm.h>                            
 58 #include <linux/rmap.h>                           
 59 #include <linux/export.h>                         
 60 #include <linux/delayacct.h>                      
 61 #include <linux/init.h>                           
 62 #include <linux/pfn_t.h>                          
 63 #include <linux/writeback.h>                      
 64 #include <linux/memcontrol.h>                     
 65 #include <linux/mmu_notifier.h>                   
 66 #include <linux/swapops.h>                        
 67 #include <linux/elf.h>                            
 68 #include <linux/gfp.h>                            
 69 #include <linux/migrate.h>                        
 70 #include <linux/string.h>                         
 71 #include <linux/memory-tiers.h>                   
 72 #include <linux/debugfs.h>                        
 73 #include <linux/userfaultfd_k.h>                  
 74 #include <linux/dax.h>                            
 75 #include <linux/oom.h>                            
 76 #include <linux/numa.h>                           
 77 #include <linux/perf_event.h>                     
 78 #include <linux/ptrace.h>                         
 79 #include <linux/vmalloc.h>                        
 80 #include <linux/sched/sysctl.h>                   
 81                                                   
 82 #include <trace/events/kmem.h>                    
 83                                                   
 84 #include <asm/io.h>                               
 85 #include <asm/mmu_context.h>                      
 86 #include <asm/pgalloc.h>                          
 87 #include <linux/uaccess.h>                        
 88 #include <asm/tlb.h>                              
 89 #include <asm/tlbflush.h>                         
 90                                                   
 91 #include "pgalloc-track.h"                        
 92 #include "internal.h"                             
 93 #include "swap.h"                                 
 94                                                   
 95 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) &&     
 96 #warning Unfortunate NUMA and NUMA Balancing c    
 97 #endif                                            
 98                                                   
 99 #ifndef CONFIG_NUMA                               
100 unsigned long max_mapnr;                          
101 EXPORT_SYMBOL(max_mapnr);                         
102                                                   
103 struct page *mem_map;                             
104 EXPORT_SYMBOL(mem_map);                           
105 #endif                                            
106                                                   
107 static vm_fault_t do_fault(struct vm_fault *vmf);
108 static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
109 static bool vmf_pte_changed(struct vm_fault *vmf);
110                                                   
111 /*                                                
112  * Return true if the original pte was a uffd-wp pte marker (so the pte was
113  * wr-protected).                                 
114  */                                               
115 static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
116 {                                                 
117         if (!userfaultfd_wp(vmf->vma))            
118                 return false;                     
119         if (!(vmf->flags & FAULT_FLAG_ORIG_PTE    
120                 return false;                     
121                                                   
122         return pte_marker_uffd_wp(vmf->orig_pt    
123 }                                                 
124                                                   
125 /*                                                
126  * A number of key systems in x86 including io    
127  * that high_memory defines the upper bound on    
128  * of ZONE_NORMAL.                                
129  */                                               
130 void *high_memory;                                
131 EXPORT_SYMBOL(high_memory);                       
132                                                   
133 /*                                                
134  * Randomize the address space (stacks, mmaps, brk, etc.).
135  *
136  * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
137  *   as ancient (libc5 based) binaries can segfault. )
138  */                                               
139 int randomize_va_space __read_mostly =            
140 #ifdef CONFIG_COMPAT_BRK                          
141                                         1;        
142 #else                                             
143                                         2;        
144 #endif                                            
145                                                   
146 #ifndef arch_wants_old_prefaulted_pte             
147 static inline bool arch_wants_old_prefaulted_pte(void)
148 {                                                 
149         /*                                        
150          * Transitioning a PTE from 'old' to '    
151          * some architectures, even if it's pe    
152          * default, "false" means prefaulted e    
153          */                                       
154         return false;                             
155 }                                                 
156 #endif                                            
157                                                   
158 static int __init disable_randmaps(char *s)       
159 {                                                 
160         randomize_va_space = 0;                   
161         return 1;                                 
162 }                                                 
163 __setup("norandmaps", disable_randmaps);          
164                                                   
165 unsigned long zero_pfn __read_mostly;             
166 EXPORT_SYMBOL(zero_pfn);                          
167                                                   
168 unsigned long highest_memmap_pfn __read_mostly    
169                                                   
170 /*                                                
171  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
172  */                                               
173 static int __init init_zero_pfn(void)             
174 {                                                 
175         zero_pfn = page_to_pfn(ZERO_PAGE(0));     
176         return 0;                                 
177 }                                                 
178 early_initcall(init_zero_pfn);                    
179                                                   
180 void mm_trace_rss_stat(struct mm_struct *mm, int member)
181 {                                                 
182         trace_rss_stat(mm, member);               
183 }                                                 
184                                                   
185 /*                                                
186  * Note: this doesn't free the actual pages themselves. That
187  * has been handled earlier when unmapping all the memory regions.
188  */                                               
189 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
190                            unsigned long addr)    
191 {                                                 
192         pgtable_t token = pmd_pgtable(*pmd);      
193         pmd_clear(pmd);                           
194         pte_free_tlb(tlb, token, addr);           
195         mm_dec_nr_ptes(tlb->mm);                  
196 }                                                 
197                                                   
198 static inline void free_pmd_range(struct mmu_g    
199                                 unsigned long     
200                                 unsigned long     
201 {                                                 
202         pmd_t *pmd;                               
203         unsigned long next;                       
204         unsigned long start;                      
205                                                   
206         start = addr;                             
207         pmd = pmd_offset(pud, addr);              
208         do {                                      
209                 next = pmd_addr_end(addr, end)    
210                 if (pmd_none_or_clear_bad(pmd)    
211                         continue;                 
212                 free_pte_range(tlb, pmd, addr)    
213         } while (pmd++, addr = next, addr != e    
214                                                   
215         start &= PUD_MASK;                        
216         if (start < floor)                        
217                 return;                           
218         if (ceiling) {                            
219                 ceiling &= PUD_MASK;              
220                 if (!ceiling)                     
221                         return;                   
222         }                                         
223         if (end - 1 > ceiling - 1)                
224                 return;                           
225                                                   
226         pmd = pmd_offset(pud, start);             
227         pud_clear(pud);                           
228         pmd_free_tlb(tlb, pmd, start);            
229         mm_dec_nr_pmds(tlb->mm);                  
230 }                                                 
231                                                   
232 static inline void free_pud_range(struct mmu_g    
233                                 unsigned long     
234                                 unsigned long     
235 {                                                 
236         pud_t *pud;                               
237         unsigned long next;                       
238         unsigned long start;                      
239                                                   
240         start = addr;                             
241         pud = pud_offset(p4d, addr);              
242         do {                                      
243                 next = pud_addr_end(addr, end)    
244                 if (pud_none_or_clear_bad(pud)    
245                         continue;                 
246                 free_pmd_range(tlb, pud, addr,    
247         } while (pud++, addr = next, addr != e    
248                                                   
249         start &= P4D_MASK;                        
250         if (start < floor)                        
251                 return;                           
252         if (ceiling) {                            
253                 ceiling &= P4D_MASK;              
254                 if (!ceiling)                     
255                         return;                   
256         }                                         
257         if (end - 1 > ceiling - 1)                
258                 return;                           
259                                                   
260         pud = pud_offset(p4d, start);             
261         p4d_clear(p4d);                           
262         pud_free_tlb(tlb, pud, start);            
263         mm_dec_nr_puds(tlb->mm);                  
264 }                                                 
265                                                   
266 static inline void free_p4d_range(struct mmu_g    
267                                 unsigned long     
268                                 unsigned long     
269 {                                                 
270         p4d_t *p4d;                               
271         unsigned long next;                       
272         unsigned long start;                      
273                                                   
274         start = addr;                             
275         p4d = p4d_offset(pgd, addr);              
276         do {                                      
277                 next = p4d_addr_end(addr, end)    
278                 if (p4d_none_or_clear_bad(p4d)    
279                         continue;                 
280                 free_pud_range(tlb, p4d, addr,    
281         } while (p4d++, addr = next, addr != e    
282                                                   
283         start &= PGDIR_MASK;                      
284         if (start < floor)                        
285                 return;                           
286         if (ceiling) {                            
287                 ceiling &= PGDIR_MASK;            
288                 if (!ceiling)                     
289                         return;                   
290         }                                         
291         if (end - 1 > ceiling - 1)                
292                 return;                           
293                                                   
294         p4d = p4d_offset(pgd, start);             
295         pgd_clear(pgd);                           
296         p4d_free_tlb(tlb, p4d, start);            
297 }                                                 
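
free_pte_range() through free_p4d_range() above all share one walk skeleton: clamp each step at the next slot boundary with *_addr_end(), skip empty entries with *_none_or_clear_bad(), and descend into populated ones. A stand-alone sketch of just that shape, with SLOT_SIZE and the two callbacks as illustrative stand-ins for a single page-table level:

/* Generic shape of the per-level walk used by free_pmd_range() and friends:
 * clamp each step at the next slot boundary, skip empty slots, descend into
 * populated ones. Constants and callbacks here are illustrative only. */
#define SLOT_SIZE  (1UL << 21)
#define SLOT_MASK  (~(SLOT_SIZE - 1))

static unsigned long slot_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + SLOT_SIZE) & SLOT_MASK;

	return (boundary - 1 < end - 1) ? boundary : end;
}

static void walk_range(unsigned long addr, unsigned long end,
		       int (*slot_empty)(unsigned long),
		       void (*descend)(unsigned long, unsigned long))
{
	unsigned long next;

	do {
		next = slot_addr_end(addr, end);
		if (slot_empty(addr))
			continue;		/* like pmd_none_or_clear_bad() */
		descend(addr, next);
	} while (addr = next, addr != end);
}
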
298                                                   
299 /*                                                
300  * This function frees user-level page tables     
301  */                                               
302 void free_pgd_range(struct mmu_gather *tlb,       
303                         unsigned long addr, un    
304                         unsigned long floor, u    
305 {                                                 
306         pgd_t *pgd;                               
307         unsigned long next;                       
308                                                   
309         /*                                        
310          * The next few lines have given us lo    
311          *                                        
312          * Why are we testing PMD* at this top    
313          * there will be no work to do at all,    
314          * go all the way down to the bottom j    
315          *                                        
316          * Why all these "- 1"s?  Because 0 re    
317          * of the address space and the top of    
318          * top wouldn't help much: the masks w    
319          * The rule is that addr 0 and floor 0    
320          * the address space, but end 0 and ce    
321          * Comparisons need to use "end - 1" a    
322          * that end 0 case should be mythical)    
323          *                                        
324          * Wherever addr is brought up or ceil    
325          * be careful to reject "the opposite     
326          * subsequent tests.  But what about w    
327          * by PMD_SIZE below? no, end can't go    
328          *                                        
329          * Whereas we round start (addr) and c    
330          * masks at different levels, in order    
331          * now has no other vmas using it, so     
332          * bother to round floor or end up - t    
333          */                                       
334                                                   
335         addr &= PMD_MASK;                         
336         if (addr < floor) {                       
337                 addr += PMD_SIZE;                 
338                 if (!addr)                        
339                         return;                   
340         }                                         
341         if (ceiling) {                            
342                 ceiling &= PMD_MASK;              
343                 if (!ceiling)                     
344                         return;                   
345         }                                         
346         if (end - 1 > ceiling - 1)                
347                 end -= PMD_SIZE;                  
348         if (addr > end - 1)                       
349                 return;                           
350         /*                                        
351          * We add page table cache pages with     
352          * (see pte_free_tlb()), flush the tlb    
353          */                                       
354         tlb_change_page_size(tlb, PAGE_SIZE);     
355         pgd = pgd_offset(tlb->mm, addr);          
356         do {                                      
357                 next = pgd_addr_end(addr, end)    
358                 if (pgd_none_or_clear_bad(pgd)    
359                         continue;                 
360                 free_p4d_range(tlb, pgd, addr,    
361         } while (pgd++, addr = next, addr != e    
362 }                                                 
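
The clamping at the top of free_pgd_range() is what the long comment is about: ceiling == 0 means "top of the address space", so comparisons are done on end - 1 and ceiling - 1 to survive the wrap-around. A minimal user-space sketch of just that arithmetic, with the SKETCH_* constants standing in for PMD_SIZE/PMD_MASK:

#include <stdbool.h>
#include <stdio.h>

#define SKETCH_PMD_SIZE  (1UL << 21)		/* assume a 2 MiB PMD span */
#define SKETCH_PMD_MASK  (~(SKETCH_PMD_SIZE - 1))

/* Returns true if [addr, end) still covers at least one whole PMD slot after
 * clamping to [floor, ceiling), where ceiling == 0 means "top of address space". */
static bool pmd_range_worth_freeing(unsigned long addr, unsigned long end,
				    unsigned long floor, unsigned long ceiling)
{
	addr &= SKETCH_PMD_MASK;
	if (addr < floor) {
		addr += SKETCH_PMD_SIZE;
		if (!addr)			/* wrapped past the top */
			return false;
	}
	if (ceiling) {
		ceiling &= SKETCH_PMD_MASK;
		if (!ceiling)			/* rounded down to 0: nothing usable below the top */
			return false;
	}
	if (end - 1 > ceiling - 1)		/* the "- 1"s make ceiling == 0 act as the highest address */
		end -= SKETCH_PMD_SIZE;
	return addr <= end - 1;
}

int main(void)
{
	/* 4 MiB range, unbounded floor/ceiling: worth walking. */
	printf("%d\n", pmd_range_worth_freeing(0x400000, 0x800000, 0, 0));
	return 0;
}
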
363                                                   
364 void free_pgtables(struct mmu_gather *tlb, str    
365                    struct vm_area_struct *vma,    
366                    unsigned long ceiling, bool    
367 {                                                 
368         struct unlink_vma_file_batch vb;          
369                                                   
370         do {                                      
371                 unsigned long addr = vma->vm_s    
372                 struct vm_area_struct *next;      
373                                                   
374                 /*                                
375                  * Note: USER_PGTABLES_CEILING    
376                  * be 0.  This will underflow     
377                  */                               
378                 next = mas_find(mas, ceiling -    
379                 if (unlikely(xa_is_zero(next))    
380                         next = NULL;              
381                                                   
382                 /*                                
383                  * Hide vma from rmap and trun    
384                  * pgtables                       
385                  */                               
386                 if (mm_wr_locked)                 
387                         vma_start_write(vma);     
388                 unlink_anon_vmas(vma);            
389                                                   
390                 if (is_vm_hugetlb_page(vma)) {    
391                         unlink_file_vma(vma);     
392                         hugetlb_free_pgd_range    
393                                 floor, next ?     
394                 } else {                          
395                         unlink_file_vma_batch_    
396                         unlink_file_vma_batch_    
397                                                   
398                         /*                        
399                          * Optimization: gathe    
400                          */                       
401                         while (next && next->v    
402                                && !is_vm_huget    
403                                 vma = next;       
404                                 next = mas_fin    
405                                 if (unlikely(x    
406                                         next =    
407                                 if (mm_wr_lock    
408                                         vma_st    
409                                 unlink_anon_vm    
410                                 unlink_file_vm    
411                         }                         
412                         unlink_file_vma_batch_    
413                         free_pgd_range(tlb, ad    
414                                 floor, next ?     
415                 }                                 
416                 vma = next;                       
417         } while (vma);                            
418 }                                                 
419                                                   
420 void pmd_install(struct mm_struct *mm, pmd_t *    
421 {                                                 
422         spinlock_t *ptl = pmd_lock(mm, pmd);      
423                                                   
424         if (likely(pmd_none(*pmd))) {   /* Has    
425                 mm_inc_nr_ptes(mm);               
426                 /*                                
427                  * Ensure all pte setup (eg. p    
428                  * visible before the pte is m    
429                  * put into page tables.          
430                  *                                
431                  * The other side of the story    
432                  * table walking code (when wa    
433                  * ie. most of the time). Fort    
434                  * of a chain of data-dependen    
435                  * being the notable exception    
436                  * seen in-order. See the alph    
437                  * smp_rmb() barriers in page     
438                  */                               
439                 smp_wmb(); /* Could be smp_wmb    
440                 pmd_populate(mm, pmd, *pte);      
441                 *pte = NULL;                      
442         }                                         
443         spin_unlock(ptl);                         
444 }                                                 
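
The smp_wmb() in pmd_install() is the classic initialise-then-publish pattern: every store that fills the new page table must become visible before the pointer to the table does. A user-space analogy using C11 atomics (not kernel code; the release store plays the role of smp_wmb() + pmd_populate(), and the dependent load on the reader side plays the role of the page-table walk):

#include <stdatomic.h>
#include <stdlib.h>

struct pagetable { long entries[512]; };

static _Atomic(struct pagetable *) published_table;

/* Writer side: fill in the table, then publish the pointer. */
static void publish_table(void)
{
	struct pagetable *pt = calloc(1, sizeof(*pt));	/* fully initialised first */

	if (pt)
		atomic_store_explicit(&published_table, pt, memory_order_release);
}

/* Reader side: the load of the pointer and the later load through it form a
 * data dependency, which is what page-table walkers rely on. */
static long read_entry(int idx)
{
	struct pagetable *pt = atomic_load_explicit(&published_table,
						    memory_order_consume);

	return pt ? pt->entries[idx] : -1;
}
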
445                                                   
446 int __pte_alloc(struct mm_struct *mm, pmd_t *p    
447 {                                                 
448         pgtable_t new = pte_alloc_one(mm);        
449         if (!new)                                 
450                 return -ENOMEM;                   
451                                                   
452         pmd_install(mm, pmd, &new);               
453         if (new)                                  
454                 pte_free(mm, new);                
455         return 0;                                 
456 }                                                 
457                                                   
458 int __pte_alloc_kernel(pmd_t *pmd)                
459 {                                                 
460         pte_t *new = pte_alloc_one_kernel(&ini    
461         if (!new)                                 
462                 return -ENOMEM;                   
463                                                   
464         spin_lock(&init_mm.page_table_lock);      
465         if (likely(pmd_none(*pmd))) {   /* Has    
466                 smp_wmb(); /* See comment in p    
467                 pmd_populate_kernel(&init_mm,     
468                 new = NULL;                       
469         }                                         
470         spin_unlock(&init_mm.page_table_lock);    
471         if (new)                                  
472                 pte_free_kernel(&init_mm, new)    
473         return 0;                                 
474 }                                                 
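
__pte_alloc() and __pte_alloc_kernel() follow the same allocate-outside-the-lock pattern: allocate a page table (which may sleep), install it under the lock only if nobody else populated the pmd in the meantime, and free the spare copy afterwards. A generic user-space sketch of that pattern, with a pthread mutex and calloc() standing in for the page-table lock and pte_alloc_one():

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *shared_table;

/* Allocate outside the lock, install under the lock only if we did not race
 * with another thread, and free the spare allocation otherwise. */
static int table_alloc(size_t size)
{
	void *new_tbl = calloc(1, size);

	if (!new_tbl)
		return -1;			/* -ENOMEM equivalent */

	pthread_mutex_lock(&table_lock);
	if (!shared_table) {			/* "Has another thread populated it?" */
		shared_table = new_tbl;
		new_tbl = NULL;
	}
	pthread_mutex_unlock(&table_lock);

	if (new_tbl)				/* lost the race: drop our allocation */
		free(new_tbl);
	return 0;
}
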
475                                                   
476 static inline void init_rss_vec(int *rss)         
477 {                                                 
478         memset(rss, 0, sizeof(int) * NR_MM_COU    
479 }                                                 
480                                                   
481 static inline void add_mm_rss_vec(struct mm_st    
482 {                                                 
483         int i;                                    
484                                                   
485         for (i = 0; i < NR_MM_COUNTERS; i++)      
486                 if (rss[i])                       
487                         add_mm_counter(mm, i,     
488 }                                                 
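
init_rss_vec() and add_mm_rss_vec() let the copy and zap paths accumulate RSS deltas in a local array while the page-table lock is held and apply them in a single batch. A stand-alone sketch of that accumulate-then-flush shape (the counter names and global_ctr[] below are illustrative, not the kernel's counters):

enum { CTR_FILEPAGES, CTR_ANONPAGES, CTR_SWAPENTS, NR_CTRS };	/* illustrative */

static long global_ctr[NR_CTRS];

static void process_batch(const int *kinds, int n)
{
	int rss[NR_CTRS] = { 0 };			/* init_rss_vec() equivalent */

	for (int i = 0; i < n; i++)
		rss[kinds[i]]++;			/* cheap local update per pte */

	for (int i = 0; i < NR_CTRS; i++)		/* add_mm_rss_vec() equivalent */
		if (rss[i])
			global_ctr[i] += rss[i];	/* one shared update per counter */
}
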
489                                                   
490 /*                                                
491  * This function is called to print an error w    
492  * is found. For example, we might have a PFN-    
493  * a region that doesn't allow it.                
494  *                                                
495  * The calling function must still handle the     
496  */                                               
497 static void print_bad_pte(struct vm_area_struc    
498                           pte_t pte, struct pa    
499 {                                                 
500         pgd_t *pgd = pgd_offset(vma->vm_mm, ad    
501         p4d_t *p4d = p4d_offset(pgd, addr);       
502         pud_t *pud = pud_offset(p4d, addr);       
503         pmd_t *pmd = pmd_offset(pud, addr);       
504         struct address_space *mapping;            
505         pgoff_t index;                            
506         static unsigned long resume;              
507         static unsigned long nr_shown;            
508         static unsigned long nr_unshown;          
509                                                   
510         /*                                        
511          * Allow a burst of 60 reports, then k    
512          * or allow a steady drip of one repor    
513          */                                       
514         if (nr_shown == 60) {                     
515                 if (time_before(jiffies, resum    
516                         nr_unshown++;             
517                         return;                   
518                 }                                 
519                 if (nr_unshown) {                 
520                         pr_alert("BUG: Bad pag    
521                                  nr_unshown);     
522                         nr_unshown = 0;           
523                 }                                 
524                 nr_shown = 0;                     
525         }                                         
526         if (nr_shown++ == 0)                      
527                 resume = jiffies + 60 * HZ;       
528                                                   
529         mapping = vma->vm_file ? vma->vm_file-    
530         index = linear_page_index(vma, addr);     
531                                                   
532         pr_alert("BUG: Bad page map in process    
533                  current->comm,                   
534                  (long long)pte_val(pte), (lon    
535         if (page)                                 
536                 dump_page(page, "bad pte");       
537         pr_alert("addr:%px vm_flags:%08lx anon    
538                  (void *)addr, vma->vm_flags,     
539         pr_alert("file:%pD fault:%ps mmap:%ps     
540                  vma->vm_file,                    
541                  vma->vm_ops ? vma->vm_ops->fa    
542                  vma->vm_file ? vma->vm_file->    
543                  mapping ? mapping->a_ops->rea    
544         dump_stack();                             
545         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_    
546 }                                                 
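
print_bad_pte() throttles itself: a burst of up to 60 reports is allowed, then at most one per minute, while counting how many reports were suppressed. The same shape as a stand-alone helper (wall-clock time(NULL) replaces jiffies; all names are hypothetical):

#include <stdbool.h>
#include <time.h>

static bool should_report(unsigned long *nr_shown, unsigned long *nr_unshown,
			  time_t *resume)
{
	time_t now = time(NULL);

	if (*nr_shown == 60) {
		if (now < *resume) {
			(*nr_unshown)++;	/* still inside the quiet period */
			return false;
		}
		/* here the real code reports how many messages were suppressed */
		*nr_unshown = 0;
		*nr_shown = 0;
	}
	if ((*nr_shown)++ == 0)
		*resume = now + 60;		/* first report of a burst opens a 60 s window */
	return true;
}
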
547                                                   
548 /*                                                
549  * vm_normal_page -- This function gets the "s    
550  *                                                
551  * "Special" mappings do not wish to be associ    
552  * it doesn't exist, or it exists but they don    
553  * case, NULL is returned here. "Normal" mappi    
554  *                                                
555  * There are 2 broad cases. Firstly, an archit    
556  * pte bit, in which case this function is tri    
557  * may not have a spare pte bit, which require    
558  * described below.                               
559  *                                                
560  * A raw VM_PFNMAP mapping (ie. one that is no    
561  * special mapping (even if there are underlyi    
562  * COWed pages of a VM_PFNMAP are always norma    
563  *                                                
564  * The way we recognize COWed pages within VM_    
565  * rules set up by "remap_pfn_range()": the vm    
566  * set, and the vm_pgoff will point to the fir    
567  * mapping will always honor the rule             
568  *                                                
569  *      pfn_of_page == vma->vm_pgoff + ((addr     
570  *                                                
571  * And for normal mappings this is false.         
572  *                                                
573  * This restricts such mappings to be a linear    
574  * to pfn. To get around this restriction, we     
575  * as the vma is not a COW mapping; in that ca    
576  * special (because none can have been COWed).    
577  *                                                
578  *                                                
579  * In order to support COW of arbitrary specia    
580  *                                                
581  * VM_MIXEDMAP mappings can likewise contain m    
582  * page" backing, however the difference is th    
583  * page (that is, those where pfn_valid is tru    
584  * normal pages by the VM. The only exception     
585  * *never* refcounted.                            
586  *                                                
587  * The disadvantage is that pages are refcount    
588  * simply not an option for some PFNMAP users)    
589  * don't have to follow the strict linearity r    
590  * order to support COWable mappings.             
591  *                                                
592  */                                               
593 struct page *vm_normal_page(struct vm_area_str    
594                             pte_t pte)            
595 {                                                 
596         unsigned long pfn = pte_pfn(pte);         
597                                                   
598         if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPE    
599                 if (likely(!pte_special(pte)))    
600                         goto check_pfn;           
601                 if (vma->vm_ops && vma->vm_ops    
602                         return vma->vm_ops->fi    
603                 if (vma->vm_flags & (VM_PFNMAP    
604                         return NULL;              
605                 if (is_zero_pfn(pfn))             
606                         return NULL;              
607                 if (pte_devmap(pte))              
608                 /*                                
609                  * NOTE: New users of ZONE_DEV    
610                  * and will have refcounts inc    
611                  * when they are inserted into    
612                  * return here. Legacy ZONE_DE    
613                  * do not have refcounts. Exam    
614                  * MEMORY_DEVICE_FS_DAX type i    
615                  */                               
616                         return NULL;              
617                                                   
618                 print_bad_pte(vma, addr, pte,     
619                 return NULL;                      
620         }                                         
621                                                   
622         /* !CONFIG_ARCH_HAS_PTE_SPECIAL case f    
623                                                   
624         if (unlikely(vma->vm_flags & (VM_PFNMA    
625                 if (vma->vm_flags & VM_MIXEDMA    
626                         if (!pfn_valid(pfn))      
627                                 return NULL;      
628                         if (is_zero_pfn(pfn))     
629                                 return NULL;      
630                         goto out;                 
631                 } else {                          
632                         unsigned long off;        
633                         off = (addr - vma->vm_    
634                         if (pfn == vma->vm_pgo    
635                                 return NULL;      
636                         if (!is_cow_mapping(vm    
637                                 return NULL;      
638                 }                                 
639         }                                         
640                                                   
641         if (is_zero_pfn(pfn))                     
642                 return NULL;                      
643                                                   
644 check_pfn:                                        
645         if (unlikely(pfn > highest_memmap_pfn)    
646                 print_bad_pte(vma, addr, pte,     
647                 return NULL;                      
648         }                                         
649                                                   
650         /*                                        
651          * NOTE! We still have PageReserved()     
652          * eg. VDSO mappings can cause them to    
653          */                                       
654 out:                                              
655         VM_WARN_ON_ONCE(is_zero_pfn(pfn));        
656         return pfn_to_page(pfn);                  
657 }                                                 
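
For the !CONFIG_ARCH_HAS_PTE_SPECIAL branch above, the linearity rule quoted in the comment is the whole trick: a raw remap_pfn_range() mapping always satisfies pfn == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), so a pte that breaks the rule inside a COW VM_PFNMAP mapping must be a COWed, hence normal, page. The rule written out on plain unsigned longs, purely for illustration:

/* The linearity rule from the comment block above, on plain unsigned longs. */
static int pfnmap_pte_is_linear(unsigned long pfn, unsigned long addr,
				unsigned long vm_start, unsigned long vm_pgoff,
				unsigned int page_shift)
{
	unsigned long off = (addr - vm_start) >> page_shift;

	return pfn == vm_pgoff + off;
}
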
658                                                   
659 struct folio *vm_normal_folio(struct vm_area_s    
660                             pte_t pte)            
661 {                                                 
662         struct page *page = vm_normal_page(vma    
663                                                   
664         if (page)                                 
665                 return page_folio(page);          
666         return NULL;                              
667 }                                                 
668                                                   
669 #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES             
670 struct page *vm_normal_page_pmd(struct vm_area    
671                                 pmd_t pmd)        
672 {                                                 
673         unsigned long pfn = pmd_pfn(pmd);         
674                                                   
675         /* Currently it's only used for huge p    
676         if (unlikely(pmd_special(pmd)))           
677                 return NULL;                      
678                                                   
679         if (unlikely(vma->vm_flags & (VM_PFNMA    
680                 if (vma->vm_flags & VM_MIXEDMA    
681                         if (!pfn_valid(pfn))      
682                                 return NULL;      
683                         goto out;                 
684                 } else {                          
685                         unsigned long off;        
686                         off = (addr - vma->vm_    
687                         if (pfn == vma->vm_pgo    
688                                 return NULL;      
689                         if (!is_cow_mapping(vm    
690                                 return NULL;      
691                 }                                 
692         }                                         
693                                                   
694         if (pmd_devmap(pmd))                      
695                 return NULL;                      
696         if (is_huge_zero_pmd(pmd))                
697                 return NULL;                      
698         if (unlikely(pfn > highest_memmap_pfn)    
699                 return NULL;                      
700                                                   
701         /*                                        
702          * NOTE! We still have PageReserved()     
703          * eg. VDSO mappings can cause them to    
704          */                                       
705 out:                                              
706         return pfn_to_page(pfn);                  
707 }                                                 
708                                                   
709 struct folio *vm_normal_folio_pmd(struct vm_ar    
710                                   unsigned lon    
711 {                                                 
712         struct page *page = vm_normal_page_pmd    
713                                                   
714         if (page)                                 
715                 return page_folio(page);          
716         return NULL;                              
717 }                                                 
718 #endif                                            
719                                                   
720 static void restore_exclusive_pte(struct vm_ar    
721                                   struct page     
722                                   pte_t *ptep)    
723 {                                                 
724         struct folio *folio = page_folio(page)    
725         pte_t orig_pte;                           
726         pte_t pte;                                
727         swp_entry_t entry;                        
728                                                   
729         orig_pte = ptep_get(ptep);                
730         pte = pte_mkold(mk_pte(page, READ_ONCE    
731         if (pte_swp_soft_dirty(orig_pte))         
732                 pte = pte_mksoft_dirty(pte);      
733                                                   
734         entry = pte_to_swp_entry(orig_pte);       
735         if (pte_swp_uffd_wp(orig_pte))            
736                 pte = pte_mkuffd_wp(pte);         
737         else if (is_writable_device_exclusive_    
738                 pte = maybe_mkwrite(pte_mkdirt    
739                                                   
740         VM_BUG_ON_FOLIO(pte_write(pte) && (!fo    
741                                            Pag    
742                                                   
743         /*                                        
744          * No need to take a page reference as    
745          * created when the swap entry was mad    
746          */                                       
747         if (folio_test_anon(folio))               
748                 folio_add_anon_rmap_pte(folio,    
749         else                                      
750                 /*                                
751                  * Currently device exclusive     
752                  * memory so the entry shouldn    
753                  */                               
754                 WARN_ON_ONCE(1);                  
755                                                   
756         set_pte_at(vma->vm_mm, address, ptep,     
757                                                   
758         /*                                        
759          * No need to invalidate - it was non-    
760          * secondary CPUs may have mappings th    
761          */                                       
762         update_mmu_cache(vma, address, ptep);     
763 }                                                 
764                                                   
765 /*                                                
766  * Tries to restore an exclusive pte if the pa    
767  * sleeping.                                      
768  */                                               
769 static int                                        
770 try_restore_exclusive_pte(pte_t *src_pte, stru    
771                         unsigned long addr)       
772 {                                                 
773         swp_entry_t entry = pte_to_swp_entry(p    
774         struct page *page = pfn_swap_entry_to_    
775                                                   
776         if (trylock_page(page)) {                 
777                 restore_exclusive_pte(vma, pag    
778                 unlock_page(page);                
779                 return 0;                         
780         }                                         
781                                                   
782         return -EBUSY;                            
783 }                                                 
784                                                   
785 /*                                                
786  * copy one vm_area from one task to the other    
787  * already present in the new task to be clear    
788  * covered by this vma.                           
789  */                                               
790                                                   
791 static unsigned long                              
792 copy_nonpresent_pte(struct mm_struct *dst_mm,     
793                 pte_t *dst_pte, pte_t *src_pte    
794                 struct vm_area_struct *src_vma    
795 {                                                 
796         unsigned long vm_flags = dst_vma->vm_f    
797         pte_t orig_pte = ptep_get(src_pte);       
798         pte_t pte = orig_pte;                     
799         struct folio *folio;                      
800         struct page *page;                        
801         swp_entry_t entry = pte_to_swp_entry(o    
802                                                   
803         if (likely(!non_swap_entry(entry))) {     
804                 if (swap_duplicate(entry) < 0)    
805                         return -EIO;              
806                                                   
807                 /* make sure dst_mm is on swap    
808                 if (unlikely(list_empty(&dst_m    
809                         spin_lock(&mmlist_lock    
810                         if (list_empty(&dst_mm    
811                                 list_add(&dst_    
812                                                   
813                         spin_unlock(&mmlist_lo    
814                 }                                 
815                 /* Mark the swap entry as shar    
816                 if (pte_swp_exclusive(orig_pte    
817                         pte = pte_swp_clear_ex    
818                         set_pte_at(src_mm, add    
819                 }                                 
820                 rss[MM_SWAPENTS]++;               
821         } else if (is_migration_entry(entry))     
822                 folio = pfn_swap_entry_folio(e    
823                                                   
824                 rss[mm_counter(folio)]++;         
825                                                   
826                 if (!is_readable_migration_ent    
827                                 is_cow_mapping    
828                         /*                        
829                          * COW mappings requir    
830                          * to be set to read.     
831                          * now shared.            
832                          */                       
833                         entry = make_readable_    
834                                                   
835                         pte = swp_entry_to_pte    
836                         if (pte_swp_soft_dirty    
837                                 pte = pte_swp_    
838                         if (pte_swp_uffd_wp(or    
839                                 pte = pte_swp_    
840                         set_pte_at(src_mm, add    
841                 }                                 
842         } else if (is_device_private_entry(ent    
843                 page = pfn_swap_entry_to_page(    
844                 folio = page_folio(page);         
845                                                   
846                 /*                                
847                  * Update rss count even for u    
848                  * they should treated just li    
849                  * respect.                       
850                  *                                
851                  * We will likely want to have    
852                  * for unaddressable pages, at    
853                  * keep things as they are.       
854                  */                               
855                 folio_get(folio);                 
856                 rss[mm_counter(folio)]++;         
857                 /* Cannot fail as these pages     
858                 folio_try_dup_anon_rmap_pte(fo    
859                                                   
860                 /*                                
861                  * We do not preserve soft-dir    
862                  * far, checkpoint/restore is     
863                  * requires that. And checkpoi    
864                  * when a device driver is inv    
865                  * save and restore device dri    
866                  */                               
867                 if (is_writable_device_private    
868                     is_cow_mapping(vm_flags))     
869                         entry = make_readable_    
870                                                   
871                         pte = swp_entry_to_pte    
872                         if (pte_swp_uffd_wp(or    
873                                 pte = pte_swp_    
874                         set_pte_at(src_mm, add    
875                 }                                 
876         } else if (is_device_exclusive_entry(e    
877                 /*                                
878                  * Make device exclusive entri    
879                  * original entry then copying    
880                  * exclusive entries currently    
881                  * (ie. COW) mappings.            
882                  */                               
883                 VM_BUG_ON(!is_cow_mapping(src_    
884                 if (try_restore_exclusive_pte(    
885                         return -EBUSY;            
886                 return -ENOENT;                   
887         } else if (is_pte_marker_entry(entry))    
888                 pte_marker marker = copy_pte_m    
889                                                   
890                 if (marker)                       
891                         set_pte_at(dst_mm, add    
892                                    make_pte_ma    
893                 return 0;                         
894         }                                         
895         if (!userfaultfd_wp(dst_vma))             
896                 pte = pte_swp_clear_uffd_wp(pt    
897         set_pte_at(dst_mm, addr, dst_pte, pte)    
898         return 0;                                 
899 }                                                 
900                                                   
901 /*                                                
902  * Copy a present and normal page.                
903  *                                                
904  * NOTE! The usual case is that this isn't req    
905  * instead, the caller can just increase the p    
906  * and re-use the pte the traditional way.        
907  *                                                
908  * And if we need a pre-allocated page but don    
909  * one, return a negative error to let the pre    
910  * code know so that it can do so outside the     
911  * lock.                                          
912  */                                               
913 static inline int                                 
914 copy_present_page(struct vm_area_struct *dst_v    
915                   pte_t *dst_pte, pte_t *src_p    
916                   struct folio **prealloc, str    
917 {                                                 
918         struct folio *new_folio;                  
919         pte_t pte;                                
920                                                   
921         new_folio = *prealloc;                    
922         if (!new_folio)                           
923                 return -EAGAIN;                   
924                                                   
925         /*                                        
926          * We have a prealloc page, all good!     
927          * over and copy the page & arm it.       
928          */                                       
929                                                   
930         if (copy_mc_user_highpage(&new_folio->    
931                 return -EHWPOISON;                
932                                                   
933         *prealloc = NULL;                         
934         __folio_mark_uptodate(new_folio);         
935         folio_add_new_anon_rmap(new_folio, dst    
936         folio_add_lru_vma(new_folio, dst_vma);    
937         rss[MM_ANONPAGES]++;                      
938                                                   
939         /* All done, just insert the new page     
940         pte = mk_pte(&new_folio->page, dst_vma    
941         pte = maybe_mkwrite(pte_mkdirty(pte),     
942         if (userfaultfd_pte_wp(dst_vma, ptep_g    
943                 /* Uffd-wp needs to be deliver    
944                 pte = pte_mkuffd_wp(pte);         
945         set_pte_at(dst_vma->vm_mm, addr, dst_p    
946         return 0;                                 
947 }                                                 
948                                                   
949 static __always_inline void __copy_present_pte    
950                 struct vm_area_struct *src_vma    
951                 pte_t pte, unsigned long addr,    
952 {                                                 
953         struct mm_struct *src_mm = src_vma->vm    
954                                                   
955         /* If it's a COW mapping, write protec    
956         if (is_cow_mapping(src_vma->vm_flags)     
957                 wrprotect_ptes(src_mm, addr, s    
958                 pte = pte_wrprotect(pte);         
959         }                                         
960                                                   
961         /* If it's a shared mapping, mark it c    
962         if (src_vma->vm_flags & VM_SHARED)        
963                 pte = pte_mkclean(pte);           
964         pte = pte_mkold(pte);                     
965                                                   
966         if (!userfaultfd_wp(dst_vma))             
967                 pte = pte_clear_uffd_wp(pte);     
968                                                   
969         set_ptes(dst_vma->vm_mm, addr, dst_pte    
970 }                                                 
971                                                   
972 /*                                                
973  * Copy one present PTE, trying to batch-proce    
974  * consecutive pages of the same folio by copy    
975  *                                                
976  * Returns -EAGAIN if one preallocated page is    
977  * Otherwise, returns the number of copied PTE    
978  */                                               
979 static inline int                                 
980 copy_present_ptes(struct vm_area_struct *dst_v    
981                  pte_t *dst_pte, pte_t *src_pt    
982                  int max_nr, int *rss, struct     
983 {                                                 
984         struct page *page;                        
985         struct folio *folio;                      
986         bool any_writable;                        
987         fpb_t flags = 0;                          
988         int err, nr;                              
989                                                   
990         page = vm_normal_page(src_vma, addr, p    
991         if (unlikely(!page))                      
992                 goto copy_pte;                    
993                                                   
994         folio = page_folio(page);                 
995                                                   
996         /*                                        
997          * If we likely have to copy, just don    
998          * sure that the common "small folio"     
999          * by keeping the batching logic separ    
1000          */                                      
1001         if (unlikely(!*prealloc && folio_test    
1002                 if (src_vma->vm_flags & VM_SH    
1003                         flags |= FPB_IGNORE_D    
1004                 if (!vma_soft_dirty_enabled(s    
1005                         flags |= FPB_IGNORE_S    
1006                                                  
1007                 nr = folio_pte_batch(folio, a    
1008                                      &any_wri    
1009                 folio_ref_add(folio, nr);        
1010                 if (folio_test_anon(folio)) {    
1011                         if (unlikely(folio_tr    
1012                                                  
1013                                 folio_ref_sub    
1014                                 return -EAGAI    
1015                         }                        
1016                         rss[MM_ANONPAGES] +=     
1017                         VM_WARN_ON_FOLIO(Page    
1018                 } else {                         
1019                         folio_dup_file_rmap_p    
1020                         rss[mm_counter_file(f    
1021                 }                                
1022                 if (any_writable)                
1023                         pte = pte_mkwrite(pte    
1024                 __copy_present_ptes(dst_vma,     
1025                                     addr, nr)    
1026                 return nr;                       
1027         }                                        
1028                                                  
1029         folio_get(folio);                        
1030         if (folio_test_anon(folio)) {            
1031                 /*                               
1032                  * If this page may have been    
1033                  * copy the page immediately     
1034                  * guarantee the pinned page     
1035                  * future.                       
1036                  */                              
1037                 if (unlikely(folio_try_dup_an    
1038                         /* Page may be pinned    
1039                         folio_put(folio);        
1040                         err = copy_present_pa    
1041                                                  
1042                         return err ? err : 1;    
1043                 }                                
1044                 rss[MM_ANONPAGES]++;             
1045                 VM_WARN_ON_FOLIO(PageAnonExcl    
1046         } else {                                 
1047                 folio_dup_file_rmap_pte(folio    
1048                 rss[mm_counter_file(folio)]++    
1049         }                                        
1050                                                  
1051 copy_pte:                                        
1052         __copy_present_ptes(dst_vma, src_vma,    
1053         return 1;                                
1054 }                                                
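
The batching step above depends on detecting a run of PTEs that map physically consecutive pages of one large folio. A rough standalone sketch of that idea follows; toy_pte_batch() and the pfns[] array are invented for illustration, and the real folio_pte_batch() additionally honours writable/dirty/soft-dirty bits.

#include <stdio.h>

/*
 * Count how many consecutive slots, starting at index i, map physically
 * consecutive pages that all lie inside the same (toy) folio.
 */
static int toy_pte_batch(const unsigned long *pfns, int i, int max_nr,
			 unsigned long folio_pfn, int folio_nr_pages)
{
	int nr = 0;

	while (nr < max_nr &&
	       pfns[i + nr] == pfns[i] + nr &&
	       pfns[i + nr] >= folio_pfn &&
	       pfns[i + nr] <  folio_pfn + folio_nr_pages)
		nr++;
	return nr;
}

int main(void)
{
	/* four contiguous pages of a 4-page folio at pfn 100, then a gap */
	unsigned long pfns[] = { 100, 101, 102, 103, 500, 501 };

	printf("batch at 0: %d\n", toy_pte_batch(pfns, 0, 6, 100, 4)); /* 4 */
	printf("batch at 4: %d\n", toy_pte_batch(pfns, 4, 2, 500, 4)); /* 2 */
	return 0;
}
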
1055                                                  
1056 static inline struct folio *folio_prealloc(st    
1057                 struct vm_area_struct *vma, u    
1058 {                                                
1059         struct folio *new_folio;                 
1060                                                  
1061         if (need_zero)                           
1062                 new_folio = vma_alloc_zeroed_    
1063         else                                     
1064                 new_folio = vma_alloc_folio(G    
1065                                             a    
1066                                                  
1067         if (!new_folio)                          
1068                 return NULL;                     
1069                                                  
1070         if (mem_cgroup_charge(new_folio, src_    
1071                 folio_put(new_folio);            
1072                 return NULL;                     
1073         }                                        
1074         folio_throttle_swaprate(new_folio, GF    
1075                                                  
1076         return new_folio;                        
1077 }                                                
1078                                                  
1079 static int                                       
1080 copy_pte_range(struct vm_area_struct *dst_vma    
1081                pmd_t *dst_pmd, pmd_t *src_pmd    
1082                unsigned long end)                
1083 {                                                
1084         struct mm_struct *dst_mm = dst_vma->v    
1085         struct mm_struct *src_mm = src_vma->v    
1086         pte_t *orig_src_pte, *orig_dst_pte;      
1087         pte_t *src_pte, *dst_pte;                
1088         pte_t ptent;                             
1089         spinlock_t *src_ptl, *dst_ptl;           
1090         int progress, max_nr, ret = 0;           
1091         int rss[NR_MM_COUNTERS];                 
1092         swp_entry_t entry = (swp_entry_t){0};    
1093         struct folio *prealloc = NULL;           
1094         int nr;                                  
1095                                                  
1096 again:                                           
1097         progress = 0;                            
1098         init_rss_vec(rss);                       
1099                                                  
1100         /*                                       
1101          * copy_pmd_range()'s prior pmd_none_    
1102          * error handling here, assume that e    
1103          * protects anon from unexpected THP     
1104          * protected by mmap_lock-less collap    
1105          * (whereas vma_needs_copy() skips ar    
1106          * can remove such assumptions later,    
1107          */                                      
1108         dst_pte = pte_alloc_map_lock(dst_mm,     
1109         if (!dst_pte) {                          
1110                 ret = -ENOMEM;                   
1111                 goto out;                        
1112         }                                        
1113         src_pte = pte_offset_map_nolock(src_m    
1114         if (!src_pte) {                          
1115                 pte_unmap_unlock(dst_pte, dst    
1116                 /* ret == 0 */                   
1117                 goto out;                        
1118         }                                        
1119         spin_lock_nested(src_ptl, SINGLE_DEPT    
1120         orig_src_pte = src_pte;                  
1121         orig_dst_pte = dst_pte;                  
1122         arch_enter_lazy_mmu_mode();              
1123                                                  
1124         do {                                     
1125                 nr = 1;                          
1126                                                  
1127                 /*                               
1128                  * We are holding two locks a    
1129                  * could generate latencies i    
1130                  */                              
1131                 if (progress >= 32) {            
1132                         progress = 0;            
1133                         if (need_resched() ||    
1134                             spin_needbreak(sr    
1135                                 break;           
1136                 }                                
1137                 ptent = ptep_get(src_pte);       
1138                 if (pte_none(ptent)) {           
1139                         progress++;              
1140                         continue;                
1141                 }                                
1142                 if (unlikely(!pte_present(ptent))) {
1143                         ret = copy_nonpresent    
1144                                                  
1145                                                  
1146                                                  
1147                         if (ret == -EIO) {       
1148                                 entry = pte_t    
1149                                 break;           
1150                         } else if (ret == -EB    
1151                                 break;           
1152                         } else if (!ret) {       
1153                                 progress += 8    
1154                                 continue;        
1155                         }                        
1156                         ptent = ptep_get(src_    
1157                         VM_WARN_ON_ONCE(!pte_present(ptent));
1158                                                  
1159                         /*                       
1160                          * Device exclusive e    
1161                          * the now present pt    
1162                          */                      
1163                         WARN_ON_ONCE(ret != -    
1164                 }                                
1165                 /* copy_present_ptes() will c    
1166                 max_nr = (end - addr) / PAGE_    
1167                 ret = copy_present_ptes(dst_v    
1168                                         ptent    
1169                 /*                               
1170                  * If we need a pre-allocated    
1171                  * locks, allocate, and try a    
1172                  * If copy failed due to hwpo    
1173                  */                              
1174                 if (unlikely(ret == -EAGAIN |    
1175                         break;                   
1176                 if (unlikely(prealloc)) {        
1177                         /*                       
1178                          * pre-alloc page can    
1179                          * to strictly follow    
1180                          * will allocate page    
1181                          * could only happen     
1182                          */                      
1183                         folio_put(prealloc);     
1184                         prealloc = NULL;         
1185                 }                                
1186                 nr = ret;                        
1187                 progress += 8 * nr;              
1188         } while (dst_pte += nr, src_pte += nr    
1189                  addr != end);                   
1190                                                  
1191         arch_leave_lazy_mmu_mode();              
1192         pte_unmap_unlock(orig_src_pte, src_pt    
1193         add_mm_rss_vec(dst_mm, rss);             
1194         pte_unmap_unlock(orig_dst_pte, dst_pt    
1195         cond_resched();                          
1196                                                  
1197         if (ret == -EIO) {                       
1198                 VM_WARN_ON_ONCE(!entry.val);     
1199                 if (add_swap_count_continuati    
1200                         ret = -ENOMEM;           
1201                         goto out;                
1202                 }                                
1203                 entry.val = 0;                   
1204         } else if (ret == -EBUSY || unlikely(    
1205                 goto out;                        
1206         } else if (ret ==  -EAGAIN) {            
1207                 prealloc = folio_prealloc(src    
1208                 if (!prealloc)                   
1209                         return -ENOMEM;          
1210         } else if (ret < 0) {                    
1211                 VM_WARN_ON_ONCE(1);              
1212         }                                        
1213                                                  
1214         /* We've captured and resolved the er    
1215         ret = 0;                                 
1216                                                  
1217         if (addr != end)                         
1218                 goto again;                      
1219 out:                                             
1220         if (unlikely(prealloc))                  
1221                 folio_put(prealloc);             
1222         return ret;                              
1223 }                                                
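
copy_pte_range() cannot allocate memory while both page-table locks are held, so when a copy needs a fresh page it returns -EAGAIN, the locks are dropped, a folio is preallocated, and the loop restarts at the same address. A minimal standalone sketch of that drop-locks/preallocate/retry shape follows; all names are invented, and "odd items need a buffer" merely stands in for the must-copy COW case.

#include <stdio.h>
#include <stdlib.h>

/* Pretend odd items need a scratch buffer (the must-copy case). */
static int process_one(int item, void **prealloc)
{
	if ((item & 1) && !*prealloc)
		return -1;		/* caller must preallocate and retry */
	free(*prealloc);		/* consume/drop any leftover buffer  */
	*prealloc = NULL;
	return 0;
}

int main(void)
{
	int items[] = { 2, 4, 5, 6, 7 };
	int i = 0, n = 5;
	void *prealloc = NULL;

again:
	/* "locks held": no allocations allowed inside this loop */
	for (; i < n; i++) {
		if (process_one(items[i], &prealloc) < 0)
			break;		/* stop, allocate with locks dropped */
	}
	/* "locks dropped": safe to allocate, then resume at the same item */
	if (i < n) {
		prealloc = malloc(64);
		if (!prealloc)
			return 1;
		goto again;
	}
	free(prealloc);
	printf("copied %d items\n", n);
	return 0;
}
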
1224                                                  
1225 static inline int                                
1226 copy_pmd_range(struct vm_area_struct *dst_vma    
1227                pud_t *dst_pud, pud_t *src_pud    
1228                unsigned long end)                
1229 {                                                
1230         struct mm_struct *dst_mm = dst_vma->v    
1231         struct mm_struct *src_mm = src_vma->v    
1232         pmd_t *src_pmd, *dst_pmd;                
1233         unsigned long next;                      
1234                                                  
1235         dst_pmd = pmd_alloc(dst_mm, dst_pud,     
1236         if (!dst_pmd)                            
1237                 return -ENOMEM;                  
1238         src_pmd = pmd_offset(src_pud, addr);     
1239         do {                                     
1240                 next = pmd_addr_end(addr, end    
1241                 if (is_swap_pmd(*src_pmd) ||     
1242                         || pmd_devmap(*src_pm    
1243                         int err;                 
1244                         VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1245                         err = copy_huge_pmd(d    
1246                                             a    
1247                         if (err == -ENOMEM)      
1248                                 return -ENOMEM;
1249                         if (!err)                
1250                                 continue;        
1251                         /* fall through */       
1252                 }                                
1253                 if (pmd_none_or_clear_bad(src    
1254                         continue;                
1255                 if (copy_pte_range(dst_vma, s    
1256                                    addr, next    
1257                         return -ENOMEM;          
1258         } while (dst_pmd++, src_pmd++, addr =    
1259         return 0;                                
1260 }                                                
1261                                                  
1262 static inline int                                
1263 copy_pud_range(struct vm_area_struct *dst_vma    
1264                p4d_t *dst_p4d, p4d_t *src_p4d    
1265                unsigned long end)                
1266 {                                                
1267         struct mm_struct *dst_mm = dst_vma->v    
1268         struct mm_struct *src_mm = src_vma->v    
1269         pud_t *src_pud, *dst_pud;                
1270         unsigned long next;                      
1271                                                  
1272         dst_pud = pud_alloc(dst_mm, dst_p4d,     
1273         if (!dst_pud)                            
1274                 return -ENOMEM;                  
1275         src_pud = pud_offset(src_p4d, addr);     
1276         do {                                     
1277                 next = pud_addr_end(addr, end    
1278                 if (pud_trans_huge(*src_pud)     
1279                         int err;                 
1280                                                  
1281                         VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1282                         err = copy_huge_pud(d    
1283                                             d    
1284                         if (err == -ENOMEM)      
1285                                 return -ENOMEM;
1286                         if (!err)                
1287                                 continue;        
1288                         /* fall through */       
1289                 }                                
1290                 if (pud_none_or_clear_bad(src    
1291                         continue;                
1292                 if (copy_pmd_range(dst_vma, s    
1293                                    addr, next    
1294                         return -ENOMEM;          
1295         } while (dst_pud++, src_pud++, addr =    
1296         return 0;                                
1297 }                                                
1298                                                  
1299 static inline int                                
1300 copy_p4d_range(struct vm_area_struct *dst_vma    
1301                pgd_t *dst_pgd, pgd_t *src_pgd    
1302                unsigned long end)                
1303 {                                                
1304         struct mm_struct *dst_mm = dst_vma->v    
1305         p4d_t *src_p4d, *dst_p4d;                
1306         unsigned long next;                      
1307                                                  
1308         dst_p4d = p4d_alloc(dst_mm, dst_pgd,     
1309         if (!dst_p4d)                            
1310                 return -ENOMEM;                  
1311         src_p4d = p4d_offset(src_pgd, addr);     
1312         do {                                     
1313                 next = p4d_addr_end(addr, end    
1314                 if (p4d_none_or_clear_bad(src    
1315                         continue;                
1316                 if (copy_pud_range(dst_vma, s    
1317                                    addr, next    
1318                         return -ENOMEM;          
1319         } while (dst_p4d++, src_p4d++, addr =    
1320         return 0;                                
1321 }                                                
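
copy_p4d_range() is the last of four nearly identical walkers: each level clamps the range to its own table boundary with pXX_addr_end() and hands each sub-range to the level below. Here is a standalone sketch of that clamping loop for one made-up level; TOY_SPAN and toy_addr_end() are invented, and the real helpers also guard against address wrap-around.

#include <stdio.h>

#define TOY_SPAN (1UL << 21)	/* pretend one lower-level table covers 2 MiB */

static unsigned long toy_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + TOY_SPAN) & ~(TOY_SPAN - 1);

	return boundary < end ? boundary : end;
}

int main(void)
{
	unsigned long addr = 0x00100000, end = 0x00700000, next;

	do {
		next = toy_addr_end(addr, end);
		/* the real code would descend one level for [addr, next) here */
		printf("handle [%#010lx, %#010lx)\n", addr, next);
	} while (addr = next, addr != end);

	return 0;
}
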
1322                                                  
1323 /*                                               
1324  * Return true if the vma needs to copy the p    
1325  * false when we can speed up fork() by allow    
1326  * when the child accesses the memory range.     
1327  */                                              
1328 static bool                                      
1329 vma_needs_copy(struct vm_area_struct *dst_vma    
1330 {                                                
1331         /*                                       
1332          * Always copy pgtables when dst_vma     
1333          * file-backed (e.g. shmem). Because     
1334          * contains uffd-wp protection inform    
1335          * retrieve from page cache, and skip    
1336          */                                      
1337         if (userfaultfd_wp(dst_vma))             
1338                 return true;                     
1339                                                  
1340         if (src_vma->vm_flags & (VM_PFNMAP |     
1341                 return true;                     
1342                                                  
1343         if (src_vma->anon_vma)                   
1344                 return true;                     
1345                                                  
1346         /*                                       
1347          * Don't copy ptes where a page fault    
1348          * becomes much lighter when there ar    
1349          * mappings. The tradeoff is that cop    
1350          * than faulting.                        
1351          */                                      
1352         return false;                            
1353 }                                                
1354                                                  
1355 int                                              
1356 copy_page_range(struct vm_area_struct *dst_vm    
1357 {                                                
1358         pgd_t *src_pgd, *dst_pgd;                
1359         unsigned long next;                      
1360         unsigned long addr = src_vma->vm_star    
1361         unsigned long end = src_vma->vm_end;     
1362         struct mm_struct *dst_mm = dst_vma->v    
1363         struct mm_struct *src_mm = src_vma->v    
1364         struct mmu_notifier_range range;         
1365         bool is_cow;                             
1366         int ret;                                 
1367                                                  
1368         if (!vma_needs_copy(dst_vma, src_vma)    
1369                 return 0;                        
1370                                                  
1371         if (is_vm_hugetlb_page(src_vma))         
1372                 return copy_hugetlb_page_rang    
1373                                                  
1374         if (unlikely(src_vma->vm_flags & VM_P    
1375                 /*                               
1376                  * We do not free on error ca    
1377                  * gets called on error from     
1378                  */                              
1379                 ret = track_pfn_copy(src_vma)    
1380                 if (ret)                         
1381                         return ret;              
1382         }                                        
1383                                                  
1384         /*                                       
1385          * We need to invalidate the secondar    
1386          * there could be a permission downgr    
1387          * parent mm. And a permission downgr    
1388          * is_cow_mapping() returns true.        
1389          */                                      
1390         is_cow = is_cow_mapping(src_vma->vm_f    
1391                                                  
1392         if (is_cow) {                            
1393                 mmu_notifier_range_init(&rang    
1394                                         0, sr    
1395                 mmu_notifier_invalidate_range    
1396                 /*                               
1397                  * Disabling preemption is no    
1398                  * the read side doesn't spin    
1399                  *                               
1400                  * Use the raw variant of the    
1401                  * lockdep complaining about     
1402                  */                              
1403                 vma_assert_write_locked(src_v    
1404                 raw_write_seqcount_begin(&src    
1405         }                                        
1406                                                  
1407         ret = 0;                                 
1408         dst_pgd = pgd_offset(dst_mm, addr);      
1409         src_pgd = pgd_offset(src_mm, addr);      
1410         do {                                     
1411                 next = pgd_addr_end(addr, end    
1412                 if (pgd_none_or_clear_bad(src    
1413                         continue;                
1414                 if (unlikely(copy_p4d_range(d    
1415                                             a    
1416                         untrack_pfn_clear(dst    
1417                         ret = -ENOMEM;           
1418                         break;                   
1419                 }                                
1420         } while (dst_pgd++, src_pgd++, addr =    
1421                                                  
1422         if (is_cow) {                            
1423                 raw_write_seqcount_end(&src_m    
1424                 mmu_notifier_invalidate_range    
1425         }                                        
1426         return ret;                              
1427 }                                                
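
The net effect of the copy path above is ordinary copy-on-write fork semantics: for a private mapping, parent and child share read-only pages until one of them writes, and that write stays invisible to the other. A small standalone userspace demonstration (not part of this file):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	pid_t pid;

	if (p == MAP_FAILED)
		return 1;
	strcpy(p, "parent data");

	pid = fork();
	if (pid < 0)
		return 1;
	if (pid == 0) {
		strcpy(p, "child data");	/* write fault: page is copied */
		_exit(0);
	}
	wait(NULL);
	printf("parent sees: %s\n", p);		/* still "parent data" */
	munmap(p, 4096);
	return 0;
}

Running it prints "parent sees: parent data": the child's write landed in its own copy of the page, exactly what the write-protection applied at fork time is there to guarantee.
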
1428                                                  
1429 /* Whether we should zap all COWed (private)     
1430 static inline bool should_zap_cows(struct zap    
1431 {                                                
1432         /* By default, zap all pages */          
1433         if (!details)                            
1434                 return true;                     
1435                                                  
1436         /* Or, we zap COWed pages only if the    
1437         return details->even_cows;               
1438 }                                                
1439                                                  
1440 /* Decides whether we should zap this folio w    
1441 static inline bool should_zap_folio(struct za    
1442                                     struct fo    
1443 {                                                
1444         /* If we can make a decision without     
1445         if (should_zap_cows(details))            
1446                 return true;                     
1447                                                  
1448         /* Otherwise we should only zap non-a    
1449         return !folio_test_anon(folio);          
1450 }                                                
1451                                                  
1452 static inline bool zap_drop_file_uffd_wp(stru    
1453 {                                                
1454         if (!details)                            
1455                 return false;                    
1456                                                  
1457         return details->zap_flags & ZAP_FLAG_    
1458 }                                                
1459                                                  
1460 /*                                               
1461  * This function makes sure that we'll replac    
1462  * swap special pte marker when necessary. Mu    
1463  */                                              
1464 static inline void                               
1465 zap_install_uffd_wp_if_needed(struct vm_area_    
1466                               unsigned long a    
1467                               struct zap_deta    
1468 {                                                
1469         /* Zap on anonymous always means drop    
1470         if (vma_is_anonymous(vma))               
1471                 return;                          
1472                                                  
1473         if (zap_drop_file_uffd_wp(details))      
1474                 return;                          
1475                                                  
1476         for (;;) {                               
1477                 /* the PFN in the PTE is irre    
1478                 pte_install_uffd_wp_if_needed    
1479                 if (--nr == 0)                   
1480                         break;                   
1481                 pte++;                           
1482                 addr += PAGE_SIZE;               
1483         }                                        
1484 }                                                
1485                                                  
1486 static __always_inline void zap_present_folio    
1487                 struct vm_area_struct *vma, s    
1488                 struct page *page, pte_t *pte    
1489                 unsigned long addr, struct za    
1490                 bool *force_flush, bool *forc    
1491 {                                                
1492         struct mm_struct *mm = tlb->mm;          
1493         bool delay_rmap = false;                 
1494                                                  
1495         if (!folio_test_anon(folio)) {           
1496                 ptent = get_and_clear_full_pt    
1497                 if (pte_dirty(ptent)) {          
1498                         folio_mark_dirty(foli    
1499                         if (tlb_delay_rmap(tl    
1500                                 delay_rmap =     
1501                                 *force_flush     
1502                         }                        
1503                 }                                
1504                 if (pte_young(ptent) && likel    
1505                         folio_mark_accessed(f    
1506                 rss[mm_counter(folio)] -= nr;    
1507         } else {                                 
1508                 /* We don't need up-to-date a    
1509                 clear_full_ptes(mm, addr, pte    
1510                 rss[MM_ANONPAGES] -= nr;         
1511         }                                        
1512         /* Checking a single PTE in a batch i    
1513         arch_check_zapped_pte(vma, ptent);       
1514         tlb_remove_tlb_entries(tlb, pte, nr,     
1515         if (unlikely(userfaultfd_pte_wp(vma,     
1516                 zap_install_uffd_wp_if_needed    
1517                                                  
1518                                                  
1519         if (!delay_rmap) {                       
1520                 folio_remove_rmap_ptes(folio,    
1521                                                  
1522                 if (unlikely(folio_mapcount(f    
1523                         print_bad_pte(vma, ad    
1524         }                                        
1525         if (unlikely(__tlb_remove_folio_pages    
1526                 *force_flush = true;             
1527                 *force_break = true;             
1528         }                                        
1529 }                                                
1530                                                  
1531 /*                                               
1532  * Zap or skip at least one present PTE, tryi    
1533  * PTEs that map consecutive pages of the sam    
1534  *                                               
1535  * Returns the number of processed (skipped o    
1536  */                                              
1537 static inline int zap_present_ptes(struct mmu    
1538                 struct vm_area_struct *vma, p    
1539                 unsigned int max_nr, unsigned    
1540                 struct zap_details *details,     
1541                 bool *force_break)               
1542 {                                                
1543         const fpb_t fpb_flags = FPB_IGNORE_DI    
1544         struct mm_struct *mm = tlb->mm;          
1545         struct folio *folio;                     
1546         struct page *page;                       
1547         int nr;                                  
1548                                                  
1549         page = vm_normal_page(vma, addr, pten    
1550         if (!page) {                             
1551                 /* We don't need up-to-date a    
1552                 ptep_get_and_clear_full(mm, a    
1553                 arch_check_zapped_pte(vma, pt    
1554                 tlb_remove_tlb_entry(tlb, pte    
1555                 if (userfaultfd_pte_wp(vma, p    
1556                         zap_install_uffd_wp_i    
1557                                                  
1558                 ksm_might_unmap_zero_page(mm,    
1559                 return 1;                        
1560         }                                        
1561                                                  
1562         folio = page_folio(page);                
1563         if (unlikely(!should_zap_folio(details, folio)))
1564                 return 1;                        
1565                                                  
1566         /*                                       
1567          * Make sure that the common "small f    
1568          * by keeping the batching logic sepa    
1569          */                                      
1570         if (unlikely(folio_test_large(folio)     
1571                 nr = folio_pte_batch(folio, a    
1572                                      NULL, NU    
1573                                                  
1574                 zap_present_folio_ptes(tlb, v    
1575                                        addr,     
1576                                        force_    
1577                 return nr;                       
1578         }                                        
1579         zap_present_folio_ptes(tlb, vma, foli    
1580                                details, rss,     
1581         return 1;                                
1582 }                                                
1583                                                  
1584 static unsigned long zap_pte_range(struct mmu    
1585                                 struct vm_are    
1586                                 unsigned long    
1587                                 struct zap_de    
1588 {                                                
1589         bool force_flush = false, force_break    
1590         struct mm_struct *mm = tlb->mm;          
1591         int rss[NR_MM_COUNTERS];                 
1592         spinlock_t *ptl;                         
1593         pte_t *start_pte;                        
1594         pte_t *pte;                              
1595         swp_entry_t entry;                       
1596         int nr;                                  
1597                                                  
1598         tlb_change_page_size(tlb, PAGE_SIZE);    
1599         init_rss_vec(rss);                       
1600         start_pte = pte = pte_offset_map_lock    
1601         if (!pte)                                
1602                 return addr;                     
1603                                                  
1604         flush_tlb_batched_pending(mm);           
1605         arch_enter_lazy_mmu_mode();              
1606         do {                                     
1607                 pte_t ptent = ptep_get(pte);     
1608                 struct folio *folio;             
1609                 struct page *page;               
1610                 int max_nr;                      
1611                                                  
1612                 nr = 1;                          
1613                 if (pte_none(ptent))             
1614                         continue;                
1615                                                  
1616                 if (need_resched())              
1617                         break;                   
1618                                                  
1619                 if (pte_present(ptent)) {        
1620                         max_nr = (end - addr)    
1621                         nr = zap_present_ptes    
1622                                                  
1623                                                  
1624                         if (unlikely(force_br    
1625                                 addr += nr *     
1626                                 break;           
1627                         }                        
1628                         continue;                
1629                 }                                
1630                                                  
1631                 entry = pte_to_swp_entry(pten    
1632                 if (is_device_private_entry(e    
1633                     is_device_exclusive_entry    
1634                         page = pfn_swap_entry    
1635                         folio = page_folio(pa    
1636                         if (unlikely(!should_zap_folio(details, folio)))
1637                                 continue;        
1638                         /*                       
1639                          * Both device privat    
1640                          * work with anonymou    
1641                          * consider uffd-wp b    
1642                          * see zap_install_uf    
1643                          */                      
1644                         WARN_ON_ONCE(!vma_is_    
1645                         rss[mm_counter(folio)    
1646                         if (is_device_private    
1647                                 folio_remove_    
1648                         folio_put(folio);        
1649                 } else if (!non_swap_entry(en    
1650                         max_nr = (end - addr)    
1651                         nr = swap_pte_batch(p    
1652                         /* Genuine swap entri    
1653                         if (!should_zap_cows(    
1654                                 continue;        
1655                         rss[MM_SWAPENTS] -= n    
1656                         free_swap_and_cache_n    
1657                 } else if (is_migration_entry    
1658                         folio = pfn_swap_entr    
1659                         if (!should_zap_folio    
1660                                 continue;        
1661                         rss[mm_counter(folio)    
1662                 } else if (pte_marker_entry_u    
1663                         /*                       
1664                          * For anon: always d    
1665                          * drop the marker if    
1666                          */                      
1667                         if (!vma_is_anonymous    
1668                             !zap_drop_file_uf    
1669                                 continue;        
1670                 } else if (is_hwpoison_entry(    
1671                            is_poisoned_swp_en    
1672                         if (!should_zap_cows(    
1673                                 continue;        
1674                 } else {                         
1675                         /* We should have cov    
1676                         pr_alert("unrecognize    
1677                         WARN_ON_ONCE(1);         
1678                 }                                
1679                 clear_not_present_full_ptes(m    
1680                 zap_install_uffd_wp_if_needed    
1681         } while (pte += nr, addr += PAGE_SIZE    
1682                                                  
1683         add_mm_rss_vec(mm, rss);                 
1684         arch_leave_lazy_mmu_mode();              
1685                                                  
1686         /* Do the actual TLB flush before dro    
1687         if (force_flush) {                       
1688                 tlb_flush_mmu_tlbonly(tlb);      
1689                 tlb_flush_rmaps(tlb, vma);       
1690         }                                        
1691         pte_unmap_unlock(start_pte, ptl);        
1692                                                  
1693         /*                                       
1694          * If we forced a TLB flush (either d    
1695          * batch buffers or because we needed    
1696          * entries before releasing the ptl),    
1697          * memory too. Come back again if we     
1698          */                                      
1699         if (force_flush)                         
1700                 tlb_flush_mmu(tlb);              
1701                                                  
1702         return addr;                             
1703 }                                                
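
While zap_pte_range() holds the page-table lock it only accumulates RSS deltas in a local array and publishes them once at the end (the init_rss_vec()/add_mm_rss_vec() pair), rather than touching the shared mm counters for every PTE. A standalone sketch of that batching pattern; the counter names and types here are invented.

#include <stdio.h>

enum { TOY_FILEPAGES, TOY_ANONPAGES, TOY_SWAPENTS, TOY_NR_COUNTERS };

/* Stand-in for the shared, per-mm counters (atomics in the real code). */
static long mm_counters[TOY_NR_COUNTERS] = { 10, 20, 5 };

static void add_rss_vec(const long *rss)
{
	for (int i = 0; i < TOY_NR_COUNTERS; i++)
		if (rss[i])
			mm_counters[i] += rss[i];	/* one update per counter */
}

int main(void)
{
	long rss[TOY_NR_COUNTERS] = { 0 };

	/* pretend the "locked" loop zapped 3 anon pages and 2 swap entries */
	rss[TOY_ANONPAGES] -= 3;
	rss[TOY_SWAPENTS]  -= 2;

	add_rss_vec(rss);	/* publish the deltas once, after the loop */

	printf("file=%ld anon=%ld swap=%ld\n", mm_counters[TOY_FILEPAGES],
	       mm_counters[TOY_ANONPAGES], mm_counters[TOY_SWAPENTS]);
	return 0;
}
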
1704                                                  
1705 static inline unsigned long zap_pmd_range(str    
1706                                 struct vm_are    
1707                                 unsigned long    
1708                                 struct zap_de    
1709 {                                                
1710         pmd_t *pmd;                              
1711         unsigned long next;                      
1712                                                  
1713         pmd = pmd_offset(pud, addr);             
1714         do {                                     
1715                 next = pmd_addr_end(addr, end    
1716                 if (is_swap_pmd(*pmd) || pmd_    
1717                         if (next - addr != HP    
1718                                 __split_huge_    
1719                         else if (zap_huge_pmd    
1720                                 addr = next;     
1721                                 continue;        
1722                         }                        
1723                         /* fall through */       
1724                 } else if (details && details    
1725                            folio_test_pmd_map    
1726                            next - addr == HPA    
1727                         spinlock_t *ptl = pmd    
1728                         /*                       
1729                          * Take and drop THP     
1730                          * prematurely, while    
1731                          * but not yet decrem    
1732                          */                      
1733                         spin_unlock(ptl);        
1734                 }                                
1735                 if (pmd_none(*pmd)) {            
1736                         addr = next;             
1737                         continue;                
1738                 }                                
1739                 addr = zap_pte_range(tlb, vma    
1740                 if (addr != next)                
1741                         pmd--;                   
1742         } while (pmd++, cond_resched(), addr     
1743                                                  
1744         return addr;                             
1745 }                                                
1746                                                  
1747 static inline unsigned long zap_pud_range(str    
1748                                 struct vm_are    
1749                                 unsigned long    
1750                                 struct zap_de    
1751 {                                                
1752         pud_t *pud;                              
1753         unsigned long next;                      
1754                                                  
1755         pud = pud_offset(p4d, addr);             
1756         do {                                     
1757                 next = pud_addr_end(addr, end    
1758                 if (pud_trans_huge(*pud) || p    
1759                         if (next - addr != HP    
1760                                 mmap_assert_l    
1761                                 split_huge_pu    
1762                         } else if (zap_huge_p    
1763                                 goto next;       
1764                         /* fall through */       
1765                 }                                
1766                 if (pud_none_or_clear_bad(pud    
1767                         continue;                
1768                 next = zap_pmd_range(tlb, vma    
1769 next:                                            
1770                 cond_resched();                  
1771         } while (pud++, addr = next, addr !=     
1772                                                  
1773         return addr;                             
1774 }                                                
1775                                                  
1776 static inline unsigned long zap_p4d_range(str    
1777                                 struct vm_are    
1778                                 unsigned long    
1779                                 struct zap_de    
1780 {                                                
1781         p4d_t *p4d;                              
1782         unsigned long next;                      
1783                                                  
1784         p4d = p4d_offset(pgd, addr);             
1785         do {                                     
1786                 next = p4d_addr_end(addr, end    
1787                 if (p4d_none_or_clear_bad(p4d    
1788                         continue;                
1789                 next = zap_pud_range(tlb, vma    
1790         } while (p4d++, addr = next, addr !=     
1791                                                  
1792         return addr;                             
1793 }                                                
1794                                                  
1795 void unmap_page_range(struct mmu_gather *tlb,    
1796                              struct vm_area_s    
1797                              unsigned long ad    
1798                              struct zap_detai    
1799 {                                                
1800         pgd_t *pgd;                              
1801         unsigned long next;                      
1802                                                  
1803         BUG_ON(addr >= end);                     
1804         tlb_start_vma(tlb, vma);                 
1805         pgd = pgd_offset(vma->vm_mm, addr);      
1806         do {                                     
1807                 next = pgd_addr_end(addr, end    
1808                 if (pgd_none_or_clear_bad(pgd    
1809                         continue;                
1810                 next = zap_p4d_range(tlb, vma    
1811         } while (pgd++, addr = next, addr !=     
1812         tlb_end_vma(tlb, vma);                   
1813 }                                                
1814                                                  
1815                                                  
1816 static void unmap_single_vma(struct mmu_gathe    
1817                 struct vm_area_struct *vma, u    
1818                 unsigned long end_addr,          
1819                 struct zap_details *details,     
1820 {                                                
1821         unsigned long start = max(vma->vm_sta    
1822         unsigned long end;                       
1823                                                  
1824         if (start >= vma->vm_end)                
1825                 return;                          
1826         end = min(vma->vm_end, end_addr);        
1827         if (end <= vma->vm_start)                
1828                 return;                          
1829                                                  
1830         if (vma->vm_file)                        
1831                 uprobe_munmap(vma, start, end    
1832                                                  
1833         if (unlikely(vma->vm_flags & VM_PFNMA    
1834                 untrack_pfn(vma, 0, 0, mm_wr_    
1835                                                  
1836         if (start != end) {                      
1837                 if (unlikely(is_vm_hugetlb_pa    
1838                         /*                       
1839                          * It is undesirable     
1840                          * should be non-null    
1841                          * However, vm_file w    
1842                          * cleanup path of mm    
1843                          * hugetlbfs ->mmap m    
1844                          * mmap_region() null    
1845                          * before calling thi    
1846                          * Since no pte has a    
1847                          * safe to do nothing    
1848                          */                      
1849                         if (vma->vm_file) {      
1850                                 zap_flags_t z    
1851                                     details->    
1852                                 __unmap_hugep    
1853                                                  
1854                         }                        
1855                 } else                           
1856                         unmap_page_range(tlb,    
1857         }                                        
1858 }                                                
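
unmap_single_vma() first clips the caller's [start_addr, end_addr) range to the VMA it was handed and bails out when nothing overlaps. That clipping is easy to get wrong, so here is a standalone sketch of just that step; clip_range_to_vma() is an invented name.

#include <stdbool.h>
#include <stdio.h>

/* Clip [start_addr, end_addr) to [vm_start, vm_end); false if no overlap. */
static bool clip_range_to_vma(unsigned long vm_start, unsigned long vm_end,
			      unsigned long start_addr, unsigned long end_addr,
			      unsigned long *start, unsigned long *end)
{
	*start = start_addr > vm_start ? start_addr : vm_start;
	if (*start >= vm_end)
		return false;
	*end = end_addr < vm_end ? end_addr : vm_end;
	return *end > *start;
}

int main(void)
{
	unsigned long start, end;

	if (clip_range_to_vma(0x1000, 0x9000, 0x4000, 0x20000, &start, &end))
		printf("unmap [%#lx, %#lx)\n", start, end);	/* [0x4000, 0x9000) */
	if (!clip_range_to_vma(0x1000, 0x9000, 0xa000, 0x20000, &start, &end))
		printf("no overlap\n");
	return 0;
}
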
1859                                                  
1860 /**                                              
1861  * unmap_vmas - unmap a range of memory cover    
1862  * @tlb: address of the caller's struct mmu_g    
1863  * @mas: the maple state                         
1864  * @vma: the starting vma                        
1865  * @start_addr: virtual address at which to s    
1866  * @end_addr: virtual address at which to end    
1867  * @tree_end: The maximum index to check         
1868  * @mm_wr_locked: lock flag                      
1869  *                                               
1870  * Unmap all pages in the vma list.              
1871  *                                               
1872  * Only addresses between `start' and `end' w    
1873  *                                               
1874  * The VMA list must be sorted in ascending v    
1875  *                                               
1876  * unmap_vmas() assumes that the caller will     
1877  * range after unmap_vmas() returns.  So the     
1878  * ensure that any thus-far unmapped pages ar    
1879  * drops the lock and schedules.                 
1880  */                                              
1881 void unmap_vmas(struct mmu_gather *tlb, struc    
1882                 struct vm_area_struct *vma, u    
1883                 unsigned long end_addr, unsig    
1884                 bool mm_wr_locked)               
1885 {                                                
1886         struct mmu_notifier_range range;         
1887         struct zap_details details = {           
1888                 .zap_flags = ZAP_FLAG_DROP_MA    
1889                 /* Careful - we need to zap p    
1890                 .even_cows = true,               
1891         };                                       
1892                                                  
1893         mmu_notifier_range_init(&range, MMU_N    
1894                                 start_addr, e    
1895         mmu_notifier_invalidate_range_start(&    
1896         do {                                     
1897                 unsigned long start = start_a    
1898                 unsigned long end = end_addr;    
1899                 hugetlb_zap_begin(vma, &start    
1900                 unmap_single_vma(tlb, vma, st    
1901                                  mm_wr_locked    
1902                 hugetlb_zap_end(vma, &details    
1903                 vma = mas_find(mas, tree_end     
1904         } while (vma && likely(!xa_is_zero(vm    
1905         mmu_notifier_invalidate_range_end(&ra    
1906 }                                                
1907                                                  
1908 /**                                              
1909  * zap_page_range_single - remove user pages     
1910  * @vma: vm_area_struct holding the applicabl    
1911  * @address: starting address of pages to zap    
1912  * @size: number of bytes to zap                 
1913  * @details: details of shared cache invalida    
1914  *                                               
1915  * The range must fit into one VMA.              
1916  */                                              
1917 void zap_page_range_single(struct vm_area_str    
1918                 unsigned long size, struct za    
1919 {                                                
1920         const unsigned long end = address + s    
1921         struct mmu_notifier_range range;         
1922         struct mmu_gather tlb;                   
1923                                                  
1924         lru_add_drain();                         
1925         mmu_notifier_range_init(&range, MMU_N    
1926                                 address, end)    
1927         hugetlb_zap_begin(vma, &range.start,     
1928         tlb_gather_mmu(&tlb, vma->vm_mm);        
1929         update_hiwater_rss(vma->vm_mm);          
1930         mmu_notifier_invalidate_range_start(&    
1931         /*                                       
1932          * unmap 'address-end' not 'range.sta    
1933          * could have been expanded for huget    
1934          */                                      
1935         unmap_single_vma(&tlb, vma, address,     
1936         mmu_notifier_invalidate_range_end(&ra    
1937         tlb_finish_mmu(&tlb);                    
1938         hugetlb_zap_end(vma, details);           
1939 }                                                
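
One userspace-visible consumer of this kind of single-range zap is madvise(MADV_DONTNEED): on a private anonymous mapping it discards the backing pages, and the next read faults the range back in as zeroes. A small standalone demonstration (not part of this file):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	memset(p, 'x', len);
	printf("before madvise: %c\n", p[0]);	/* 'x' */

	if (madvise(p, len, MADV_DONTNEED))	/* drop the backing pages */
		return 1;

	printf("after madvise:  %d\n", p[0]);	/* 0: refaulted as zeroes */
	munmap(p, len);
	return 0;
}
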
1940                                                  
1941 /**                                              
1942  * zap_vma_ptes - remove ptes mapping the vma    
1943  * @vma: vm_area_struct holding ptes to be za    
1944  * @address: starting address of pages to zap    
1945  * @size: number of bytes to zap                 
1946  *                                               
1947  * This function only unmaps ptes assigned to    
1948  *                                               
1949  * The entire address range must be fully con    
1950  *                                               
1951  */                                              
1952 void zap_vma_ptes(struct vm_area_struct *vma,    
1953                 unsigned long size)              
1954 {                                                
1955         if (!range_in_vma(vma, address, address + size) ||
1956                         !(vma->vm_flags & VM_PFNMAP))
1957                 return;                          
1958                                                  
1959         zap_page_range_single(vma, address, s    
1960 }                                                
1961 EXPORT_SYMBOL_GPL(zap_vma_ptes);                 
1962                                                  
1963 static pmd_t *walk_to_pmd(struct mm_struct *m    
1964 {                                                
1965         pgd_t *pgd;                              
1966         p4d_t *p4d;                              
1967         pud_t *pud;                              
1968         pmd_t *pmd;                              
1969                                                  
1970         pgd = pgd_offset(mm, addr);              
1971         p4d = p4d_alloc(mm, pgd, addr);          
1972         if (!p4d)                                
1973                 return NULL;                     
1974         pud = pud_alloc(mm, p4d, addr);          
1975         if (!pud)                                
1976                 return NULL;                     
1977         pmd = pmd_alloc(mm, pud, addr);          
1978         if (!pmd)                                
1979                 return NULL;                     
1980                                                  
1981         VM_BUG_ON(pmd_trans_huge(*pmd));         
1982         return pmd;                              
1983 }                                                
1984                                                  
1985 pte_t *__get_locked_pte(struct mm_struct *mm,    
1986                         spinlock_t **ptl)        
1987 {                                                
1988         pmd_t *pmd = walk_to_pmd(mm, addr);      
1989                                                  
1990         if (!pmd)                                
1991                 return NULL;                     
1992         return pte_alloc_map_lock(mm, pmd, ad    
1993 }                                                
1994                                                  
1995 static bool vm_mixed_zeropage_allowed(struct     
1996 {                                                
1997         VM_WARN_ON_ONCE(vma->vm_flags & VM_PF    
1998         /*                                       
1999          * Whoever wants to forbid the zeropa    
2000          * might already have been mapped has    
2001          * bail out on any zeropages. Zeropag    
2002          * be unshared using FAULT_FLAG_UNSHA    
2003          */                                      
2004         if (mm_forbids_zeropage(vma->vm_mm))     
2005                 return false;                    
2006         /* zeropages in COW mappings are comm    
2007         if (is_cow_mapping(vma->vm_flags))       
2008                 return true;                     
2009         /* Mappings that do not allow for wri    
2010         if (!(vma->vm_flags & (VM_WRITE | VM_    
2011                 return true;                     
2012         /*                                       
2013          * Why not allow any VMA that has vm_    
2014          * find the shared zeropage and longt    
2015          * be problematic as soon as the zero    
2016          * page due to vma->vm_ops->pfn_mkwri    
2017          * now differ to what GUP looked up.     
2018          * FOLL_LONGTERM and VM_IO is incompa    
2019          * check_vma_flags).                     
2020          */                                      
2021         return vma->vm_ops && vma->vm_ops->pf    
2022                (vma_is_fsdax(vma) || vma->vm_    
2023 }                                                
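
The zeropage checks above matter because read-only faults on untouched private anonymous memory are normally backed by the shared zero page, which is why merely reading a large mapping barely grows the resident set while writing it does. A rough standalone demonstration using /proc/self/statm; the exact numbers will vary by system and configuration.

#include <stdio.h>
#include <sys/mman.h>

static long resident_pages(void)
{
	long size = 0, resident = -1;
	FILE *f = fopen("/proc/self/statm", "r");

	if (f) {
		if (fscanf(f, "%ld %ld", &size, &resident) != 2)
			resident = -1;
		fclose(f);
	}
	return resident;
}

int main(void)
{
	size_t len = 64UL << 20;		/* 64 MiB of private anon memory */
	volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	long sink = 0;

	if (p == MAP_FAILED)
		return 1;

	for (size_t i = 0; i < len; i += 4096)
		sink += p[i];			/* read faults: shared zero page */
	printf("after reads:  %ld resident pages (sum %ld)\n", resident_pages(), sink);

	for (size_t i = 0; i < len; i += 4096)
		p[i] = 1;			/* write faults: real pages allocated */
	printf("after writes: %ld resident pages\n", resident_pages());

	munmap((void *)p, len);
	return 0;
}
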
2024                                                  
2025 static int validate_page_before_insert(struct    
2026                                        struct    
2027 {                                                
2028         struct folio *folio = page_folio(page    
2029                                                  
2030         if (!folio_ref_count(folio))             
2031                 return -EINVAL;                  
2032         if (unlikely(is_zero_folio(folio))) {    
2033                 if (!vm_mixed_zeropage_allowe    
2034                         return -EINVAL;          
2035                 return 0;                        
2036         }                                        
2037         if (folio_test_anon(folio) || folio_test_slab(folio) ||
2038             page_has_type(page))                 
2039                 return -EINVAL;                  
2040         flush_dcache_folio(folio);               
2041         return 0;                                
2042 }                                                
2043                                                  
2044 static int insert_page_into_pte_locked(struct    
2045                         unsigned long addr, s    
2046 {                                                
2047         struct folio *folio = page_folio(page    
2048         pte_t pteval;                            
2049                                                  
2050         if (!pte_none(ptep_get(pte)))            
2051                 return -EBUSY;                   
2052         /* Ok, finally just insert the thing.    
2053         pteval = mk_pte(page, prot);             
2054         if (unlikely(is_zero_folio(folio))) {    
2055                 pteval = pte_mkspecial(pteval    
2056         } else {                                 
2057                 folio_get(folio);                
2058                 inc_mm_counter(vma->vm_mm, mm    
2059                 folio_add_file_rmap_pte(folio    
2060         }                                        
2061         set_pte_at(vma->vm_mm, addr, pte, pte    
2062         return 0;                                
2063 }                                                
2064                                                  
2065 static int insert_page(struct vm_area_struct     
2066                         struct page *page, pg    
2067 {                                                
2068         int retval;                              
2069         pte_t *pte;                              
2070         spinlock_t *ptl;                         
2071                                                  
2072         retval = validate_page_before_insert(    
2073         if (retval)                              
2074                 goto out;                        
2075         retval = -ENOMEM;                        
2076         pte = get_locked_pte(vma->vm_mm, addr    
2077         if (!pte)                                
2078                 goto out;                        
2079         retval = insert_page_into_pte_locked(    
2080         pte_unmap_unlock(pte, ptl);              
2081 out:                                             
2082         return retval;                           
2083 }                                                
2084                                                  
2085 static int insert_page_in_batch_locked(struct    
2086                         unsigned long addr, s    
2087 {                                                
2088         int err;                                 
2089                                                  
2090         err = validate_page_before_insert(vma    
2091         if (err)                                 
2092                 return err;                      
2093         return insert_page_into_pte_locked(vm    
2094 }                                                
2095                                                  
2096 /* insert_pages() amortizes the cost of spinlock operations
2097  * when inserting pages in a loop.               
2098  */                                              
2099 static int insert_pages(struct vm_area_struct    
2100                         struct page **pages,     
2101 {                                                
2102         pmd_t *pmd = NULL;                       
2103         pte_t *start_pte, *pte;                  
2104         spinlock_t *pte_lock;                    
2105         struct mm_struct *const mm = vma->vm_    
2106         unsigned long curr_page_idx = 0;         
2107         unsigned long remaining_pages_total =    
2108         unsigned long pages_to_write_in_pmd;     
2109         int ret;                                 
2110 more:                                            
2111         ret = -EFAULT;                           
2112         pmd = walk_to_pmd(mm, addr);             
2113         if (!pmd)                                
2114                 goto out;                        
2115                                                  
2116         pages_to_write_in_pmd = min_t(unsigne    
2117                 remaining_pages_total, PTRS_P    
2118                                                  
2119         /* Allocate the PTE if necessary; takes PMD lock once only. */
2120         ret = -ENOMEM;                           
2121         if (pte_alloc(mm, pmd))                  
2122                 goto out;                        
2123                                                  
2124         while (pages_to_write_in_pmd) {          
2125                 int pte_idx = 0;                 
2126                 const int batch_size = min_t(    
2127                                                  
2128                 start_pte = pte_offset_map_lo    
2129                 if (!start_pte) {                
2130                         ret = -EFAULT;           
2131                         goto out;                
2132                 }                                
2133                 for (pte = start_pte; pte_idx    
2134                         int err = insert_page    
2135                                 addr, pages[c    
2136                         if (unlikely(err)) {     
2137                                 pte_unmap_unl    
2138                                 ret = err;       
2139                                 remaining_pag    
2140                                 goto out;        
2141                         }                        
2142                         addr += PAGE_SIZE;       
2143                         ++curr_page_idx;         
2144                 }                                
2145                 pte_unmap_unlock(start_pte, p    
2146                 pages_to_write_in_pmd -= batc    
2147                 remaining_pages_total -= batc    
2148         }                                        
2149         if (remaining_pages_total)               
2150                 goto more;                       
2151         ret = 0;                                 
2152 out:                                             
2153         *num = remaining_pages_total;            
2154         return ret;                              
2155 }                                                
2156                                                  
2157 /**
2158  * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
2159  * @vma: user vma to map to
2160  * @addr: target start user address of these pages
2161  * @pages: source kernel pages
2162  * @num: in: number of pages to map. out: number of pages that were *not*
2163  * mapped. (0 means all pages were successfully mapped).
2164  *
2165  * Preferred over vm_insert_page() when inserting multiple pages.
2166  *
2167  * In case of error, we may have mapped a subset of the provided
2168  * pages. It is the caller's responsibility to account for this case.
2169  *
2170  * The same restrictions apply as in vm_insert_page().
2171  */
2172 int vm_insert_pages(struct vm_area_struct *vm    
2173                         struct page **pages,     
2174 {                                                
2175         const unsigned long end_addr = addr +    
2176                                                  
2177         if (addr < vma->vm_start || end_addr     
2178                 return -EFAULT;                  
2179         if (!(vma->vm_flags & VM_MIXEDMAP)) {    
2180                 BUG_ON(mmap_read_trylock(vma-    
2181                 BUG_ON(vma->vm_flags & VM_PFN    
2182                 vm_flags_set(vma, VM_MIXEDMAP    
2183         }                                        
2184         /* Defer page refcount checking till we're about to map that page. */
2185         return insert_pages(vma, addr, pages,    
2186 }                                                
2187 EXPORT_SYMBOL(vm_insert_pages);                  
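/*
 * Example (illustrative sketch, not part of the kernel tree): an mmap
 * handler that maps a driver-owned array of order-0 pages in a single
 * call, letting vm_insert_pages() batch the PTE lock instead of taking
 * it once per page via vm_insert_page().  "exdev_buf" and its fields
 * are hypothetical driver state; <linux/mm.h> and <linux/fs.h> are
 * assumed to be included.
 */
struct exdev_buf {
	struct page **pages;		/* order-0 pages owned by the driver */
	unsigned long npages;
};

static int exdev_buf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct exdev_buf *buf = file->private_data;
	unsigned long num = vma_pages(vma);
	int err;

	/* Refuse to map more than the buffer actually holds. */
	if (num > buf->npages)
		return -EINVAL;

	/* On return, 'num' holds the number of pages that were NOT mapped. */
	err = vm_insert_pages(vma, vma->vm_start, buf->pages, &num);
	if (err)
		return err;	/* a failing mmap() tears the vma down again */
	return 0;
}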
2188                                                  
2189 /**
2190  * vm_insert_page - insert single page into user vma
2191  * @vma: user vma to map to
2192  * @addr: target user address of this page
2193  * @page: source kernel page
2194  *
2195  * This allows drivers to insert individual pages they've allocated
2196  * into a user vma. The zeropage is supported in some VMAs,
2197  * see vm_mixed_zeropage_allowed().
2198  *
2199  * The page has to be a nice clean _individual_ kernel allocation.
2200  * If you allocate a compound page, you need to have marked it as
2201  * such (__GFP_COMP), or manually just split the page up yourself
2202  * (see split_page()).
2203  *
2204  * NOTE! Traditionally this was done with "remap_pfn_range()" which
2205  * took an arbitrary page protection parameter. This doesn't allow
2206  * that. Your vma protection will have to be set up correctly, which
2207  * means that if you want a shared writable mapping, you'd better
2208  * ask for a shared writable mapping!
2209  *
2210  * The page does not need to be reserved.
2211  *
2212  * Usually this function is called from f_op->mmap() handler
2213  * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
2214  * Caller must set VM_MIXEDMAP on vma if it wants to call this
2215  * function from other places, for example from page-fault handler.
2216  *
2217  * Return: %0 on success, negative error code otherwise.
2218  */
2219 int vm_insert_page(struct vm_area_struct *vma    
2220                         struct page *page)       
2221 {                                                
2222         if (addr < vma->vm_start || addr >= v    
2223                 return -EFAULT;                  
2224         if (!(vma->vm_flags & VM_MIXEDMAP)) {    
2225                 BUG_ON(mmap_read_trylock(vma-    
2226                 BUG_ON(vma->vm_flags & VM_PFN    
2227                 vm_flags_set(vma, VM_MIXEDMAP    
2228         }                                        
2229         return insert_page(vma, addr, page, v    
2230 }                                                
2231 EXPORT_SYMBOL(vm_insert_page);                   
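/*
 * Example (illustrative sketch, not part of the kernel tree): mapping a
 * single driver-allocated, non-compound page from an mmap handler, as
 * described above.  The protection comes from the vma itself, so a
 * process that wants to write must have asked for a shared writable
 * mapping.  Storing the page in file->private_data is a hypothetical
 * convention.
 */
static int exdev_mmap_one_page(struct file *file, struct vm_area_struct *vma)
{
	struct page *exdev_page = file->private_data;	/* e.g. from alloc_page() */

	if (vma_pages(vma) != 1)
		return -EINVAL;

	/* Called under mmap_lock write-lock, so VM_MIXEDMAP may be set for us. */
	return vm_insert_page(vma, vma->vm_start, exdev_page);
}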
2232                                                  
2233 /*
2234  * __vm_map_pages - maps range of kernel pages into user vma
2235  * @vma: user vma to map to
2236  * @pages: pointer to array of source kernel pages
2237  * @num: number of pages in page array
2238  * @offset: user's requested vm_pgoff
2239  *
2240  * This allows drivers to map range of kernel pages into a user vma.
2241  * The zeropage is supported in some VMAs, see
2242  * vm_mixed_zeropage_allowed().
2243  *
2244  * Return: 0 on success and error code otherwise.
2245  */
2246 static int __vm_map_pages(struct vm_area_stru    
2247                                 unsigned long    
2248 {                                                
2249         unsigned long count = vma_pages(vma);    
2250         unsigned long uaddr = vma->vm_start;     
2251         int ret, i;                              
2252                                                  
2253         /* Fail if the user requested offset is beyond the end of the object */
2254         if (offset >= num)                       
2255                 return -ENXIO;                   
2256                                                  
2257         /* Fail if the user requested size exceeds available object size */
2258         if (count > num - offset)                
2259                 return -ENXIO;                   
2260                                                  
2261         for (i = 0; i < count; i++) {            
2262                 ret = vm_insert_page(vma, uad    
2263                 if (ret < 0)                     
2264                         return ret;              
2265                 uaddr += PAGE_SIZE;              
2266         }                                        
2267                                                  
2268         return 0;                                
2269 }                                                
2270                                                  
2271 /**
2272  * vm_map_pages - maps range of kernel pages starts with non zero offset
2273  * @vma: user vma to map to
2274  * @pages: pointer to array of source kernel pages
2275  * @num: number of pages in page array
2276  *
2277  * Maps an object consisting of @num pages, catering for the user's
2278  * requested vm_pgoff
2279  *
2280  * If we fail to insert any page into the vma, the function will return
2281  * immediately leaving any previously inserted pages present.  Callers
2282  * from the mmap handler may immediately return the error as their caller
2283  * will destroy the vma, removing any successfully inserted pages. Other
2284  * callers should make their own arrangements for calling zap_page_range().
2285  *
2286  * Context: Process context. Called by mmap handlers.
2287  * Return: 0 on success and error code otherwise.
2288  */
2289 int vm_map_pages(struct vm_area_struct *vma,     
2290                                 unsigned long    
2291 {                                                
2292         return __vm_map_pages(vma, pages, num    
2293 }                                                
2294 EXPORT_SYMBOL(vm_map_pages);                     
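/*
 * Example (illustrative sketch, not part of the kernel tree): a
 * V4L2-style exporter where userspace selects the start of the mapping
 * through the mmap offset.  vm_map_pages() validates vma->vm_pgoff and
 * the requested length against the page array, so the handler only has
 * to pass the whole array.  "exbuf" is a hypothetical buffer object.
 */
struct exbuf {
	struct page **pages;
	unsigned long npages;
};

static int exbuf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct exbuf *buf = file->private_data;

	/* Offset and size checks happen inside vm_map_pages(). */
	return vm_map_pages(vma, buf->pages, buf->npages);
}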
2295                                                  
2296 /**
2297  * vm_map_pages_zero - map range of kernel pages starts with zero offset
2298  * @vma: user vma to map to
2299  * @pages: pointer to array of source kernel pages
2300  * @num: number of pages in page array
2301  *
2302  * Similar to vm_map_pages(), except that it explicitly sets the offset
2303  * to 0. This function is intended for the drivers that did not consider
2304  * vm_pgoff.
2305  *
2306  * Context: Process context. Called by mmap handlers.
2307  * Return: 0 on success and error code otherwise.
2308  */
2309 int vm_map_pages_zero(struct vm_area_struct *    
2310                                 unsigned long    
2311 {                                                
2312         return __vm_map_pages(vma, pages, num    
2313 }                                                
2314 EXPORT_SYMBOL(vm_map_pages_zero);                
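/*
 * Example (illustrative sketch, not part of the kernel tree): the same
 * handler for a driver that ignores the mmap offset entirely, so the
 * mapping always starts at page 0 of the hypothetical "exbuf" array
 * shown above.
 */
static int exbuf_mmap_zero(struct file *file, struct vm_area_struct *vma)
{
	struct exbuf *buf = file->private_data;

	return vm_map_pages_zero(vma, buf->pages, buf->npages);
}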
2315                                                  
2316 static vm_fault_t insert_pfn(struct vm_area_s    
2317                         pfn_t pfn, pgprot_t p    
2318 {                                                
2319         struct mm_struct *mm = vma->vm_mm;       
2320         pte_t *pte, entry;                       
2321         spinlock_t *ptl;                         
2322                                                  
2323         pte = get_locked_pte(mm, addr, &ptl);    
2324         if (!pte)                                
2325                 return VM_FAULT_OOM;             
2326         entry = ptep_get(pte);                   
2327         if (!pte_none(entry)) {                  
2328                 if (mkwrite) {                   
2329                         /*
2330                          * For read faults on private mappings the PFN passed
2331                          * in may not match the PFN we have mapped if the
2332                          * mapped PFN is a writeable COW page.  In the mkwrite
2333                          * case we are creating a writable PTE for a shared
2334                          * mapping and we expect the PFNs to match. If they
2335                          * don't match, we are likely racing with block
2336                          * allocation and mapping invalidation so just skip the
2337                          * update.
2338                          */
2339                         if (pte_pfn(entry) !=    
2340                                 WARN_ON_ONCE(    
2341                                 goto out_unlo    
2342                         }                        
2343                         entry = pte_mkyoung(e    
2344                         entry = maybe_mkwrite    
2345                         if (ptep_set_access_f    
2346                                 update_mmu_ca    
2347                 }                                
2348                 goto out_unlock;                 
2349         }                                        
2350                                                  
2351         /* Ok, finally just insert the thing.    
2352         if (pfn_t_devmap(pfn))                   
2353                 entry = pte_mkdevmap(pfn_t_pt    
2354         else                                     
2355                 entry = pte_mkspecial(pfn_t_p    
2356                                                  
2357         if (mkwrite) {                           
2358                 entry = pte_mkyoung(entry);      
2359                 entry = maybe_mkwrite(pte_mkd    
2360         }                                        
2361                                                  
2362         set_pte_at(mm, addr, pte, entry);        
2363         update_mmu_cache(vma, addr, pte); /*     
2364                                                  
2365 out_unlock:                                      
2366         pte_unmap_unlock(pte, ptl);              
2367         return VM_FAULT_NOPAGE;                  
2368 }                                                
2369                                                  
2370 /**
2371  * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
2372  * @vma: user vma to map to
2373  * @addr: target user address of this page
2374  * @pfn: source kernel pfn
2375  * @pgprot: pgprot flags for the inserted page
2376  *
2377  * This is exactly like vmf_insert_pfn(), except that it allows drivers
2378  * to override pgprot on a per-page basis.
2379  *
2380  * This only makes sense for IO mappings, and it makes no sense for
2381  * COW mappings.  In general, using multiple vmas is preferable;
2382  * vmf_insert_pfn_prot should only be used if using multiple VMAs is
2383  * impractical.
2384  *
2385  * pgprot typically only differs from @vma->vm_page_prot when drivers set
2386  * caching- and encryption bits different than those of @vma->vm_page_prot,
2387  * because the caching- or encryption mode may not be known at mmap() time.
2388  *
2389  * This is ok as long as @vma->vm_page_prot is not used by the core vm
2390  * to set caching and encryption bits for those vmas (except for COW pages).
2391  * This is ensured by core vm only modifying these page table entries using
2392  * functions that don't touch caching- or encryption bits, using pte_modify()
2393  * if needed. (See for example mprotect()).
2394  *
2395  * Also when new page-table entries are created, this is only done using the
2396  * fault() callback, and never using the value of vma->vm_page_prot,
2397  * except for page-table entries that point to anonymous pages as the result
2398  * of COW.
2399  *
2400  * Context: Process context.  May allocate using %GFP_KERNEL.
2401  * Return: vm_fault_t value.
2402  */
2403 vm_fault_t vmf_insert_pfn_prot(struct vm_area    
2404                         unsigned long pfn, pg    
2405 {                                                
2406         /*
2407          * Technically, architectures with pte_special can avoid all these
2408          * restrictions (same for remap_pfn_range).  However we would like
2409          * consistency in testing and feature parity among all, so we should
2410          * try to keep these invariants in place for everybody.
2411          */
2412         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V    
2413         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM    
2414                                                  
2415         BUG_ON((vma->vm_flags & VM_PFNMAP) &&    
2416         BUG_ON((vma->vm_flags & VM_MIXEDMAP)     
2417                                                  
2418         if (addr < vma->vm_start || addr >= v    
2419                 return VM_FAULT_SIGBUS;          
2420                                                  
2421         if (!pfn_modify_allowed(pfn, pgprot))    
2422                 return VM_FAULT_SIGBUS;          
2423                                                  
2424         track_pfn_insert(vma, &pgprot, __pfn_    
2425                                                  
2426         return insert_pfn(vma, addr, __pfn_to    
2427                         false);                  
2428 }                                                
2429 EXPORT_SYMBOL(vmf_insert_pfn_prot);              
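/*
 * Example (illustrative sketch, not part of the kernel tree): a fault
 * handler that inserts one PFN of a device aperture with
 * write-combining, overriding the vma's default protection for just
 * this page.  The vma is assumed to have been set up as VM_IO |
 * VM_PFNMAP at mmap() time; "exbar" and "bar_pfn" are hypothetical.
 */
struct exbar {
	unsigned long bar_pfn;		/* first PFN of the device aperture */
};

static vm_fault_t exbar_wc_fault(struct vm_fault *vmf)
{
	struct exbar *bar = vmf->vma->vm_private_data;
	pgprot_t prot = pgprot_writecombine(vmf->vma->vm_page_prot);

	return vmf_insert_pfn_prot(vmf->vma, vmf->address,
				   bar->bar_pfn + vmf->pgoff, prot);
}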
2430                                                  
2431 /**
2432  * vmf_insert_pfn - insert single pfn into user vma
2433  * @vma: user vma to map to
2434  * @addr: target user address of this page
2435  * @pfn: source kernel pfn
2436  *
2437  * Similar to vm_insert_page, this allows drivers to insert individual pages
2438  * they've allocated into a user vma. Same comments apply.
2439  *
2440  * This function should only be called from a vm_ops->fault handler, and
2441  * in that case the handler should return the result of this function.
2442  *
2443  * vma cannot be a COW mapping.
2444  *
2445  * As this is called only for pages that do not currently exist, we
2446  * do not need to flush old virtual caches or the TLB.
2447  *
2448  * Context: Process context.  May allocate using %GFP_KERNEL.
2449  * Return: vm_fault_t value.
2450  */
2451 vm_fault_t vmf_insert_pfn(struct vm_area_stru    
2452                         unsigned long pfn)       
2453 {                                                
2454         return vmf_insert_pfn_prot(vma, addr,    
2455 }                                                
2456 EXPORT_SYMBOL(vmf_insert_pfn);                   
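/*
 * Example (illustrative sketch, not part of the kernel tree): the usual
 * shape of a ->fault handler for a VM_PFNMAP vma; it simply returns
 * whatever vmf_insert_pfn() returns, as requested above.  Stashing the
 * base PFN in vm_private_data is a hypothetical convention.
 */
static vm_fault_t exmem_fault(struct vm_fault *vmf)
{
	unsigned long base_pfn = (unsigned long)vmf->vma->vm_private_data;

	return vmf_insert_pfn(vmf->vma, vmf->address, base_pfn + vmf->pgoff);
}

static const struct vm_operations_struct exmem_vm_ops = {
	.fault = exmem_fault,
};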
2457                                                  
2458 static bool vm_mixed_ok(struct vm_area_struct    
2459 {                                                
2460         if (unlikely(is_zero_pfn(pfn_t_to_pfn    
2461             (mkwrite || !vm_mixed_zeropage_al    
2462                 return false;                    
2463         /* these checks mirror the abort conditions in vm_normal_page */
2464         if (vma->vm_flags & VM_MIXEDMAP)         
2465                 return true;                     
2466         if (pfn_t_devmap(pfn))                   
2467                 return true;                     
2468         if (pfn_t_special(pfn))                  
2469                 return true;                     
2470         if (is_zero_pfn(pfn_t_to_pfn(pfn)))      
2471                 return true;                     
2472         return false;                            
2473 }                                                
2474                                                  
2475 static vm_fault_t __vm_insert_mixed(struct vm    
2476                 unsigned long addr, pfn_t pfn    
2477 {                                                
2478         pgprot_t pgprot = vma->vm_page_prot;     
2479         int err;                                 
2480                                                  
2481         if (!vm_mixed_ok(vma, pfn, mkwrite))     
2482                 return VM_FAULT_SIGBUS;          
2483                                                  
2484         if (addr < vma->vm_start || addr >= v    
2485                 return VM_FAULT_SIGBUS;          
2486                                                  
2487         track_pfn_insert(vma, &pgprot, pfn);     
2488                                                  
2489         if (!pfn_modify_allowed(pfn_t_to_pfn(    
2490                 return VM_FAULT_SIGBUS;          
2491                                                  
2492         /*                                       
2493          * If we don't have pte special, then    
2494          * based VM_MIXEDMAP scheme (see vm_n    
2495          * refcount the page if pfn_valid is     
2496          * than insert_pfn).  If a zero_pfn w    
2497          * without pte special, it would ther    
2498          */                                      
2499         if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_S    
2500             !pfn_t_devmap(pfn) && pfn_t_valid    
2501                 struct page *page;               
2502                                                  
2503                 /*
2504                  * At this point we are committed to insert_page()
2505                  * regardless of whether the caller specified flags that
2506                  * result in pfn_t_has_page() == false.
2507                  */
2508                 page = pfn_to_page(pfn_t_to_p    
2509                 err = insert_page(vma, addr,     
2510         } else {                                 
2511                 return insert_pfn(vma, addr,     
2512         }                                        
2513                                                  
2514         if (err == -ENOMEM)                      
2515                 return VM_FAULT_OOM;             
2516         if (err < 0 && err != -EBUSY)            
2517                 return VM_FAULT_SIGBUS;          
2518                                                  
2519         return VM_FAULT_NOPAGE;                  
2520 }                                                
2521                                                  
2522 vm_fault_t vmf_insert_mixed(struct vm_area_st    
2523                 pfn_t pfn)                       
2524 {                                                
2525         return __vm_insert_mixed(vma, addr, p    
2526 }                                                
2527 EXPORT_SYMBOL(vmf_insert_mixed);                 
2528                                                  
2529 /*
2530  *  If the insertion of PTE failed because someone else already added a
2531  *  different entry in the mean time, we treat that as success as we assume
2532  *  the same entry was actually inserted.
2533  */
2534 vm_fault_t vmf_insert_mixed_mkwrite(struct vm    
2535                 unsigned long addr, pfn_t pfn    
2536 {                                                
2537         return __vm_insert_mixed(vma, addr, p    
2538 }                                                
2539                                                  
2540 /*
2541  * maps a range of physical memory into the requested pages. the old
2542  * mappings are removed. any references to nonexistent pages results
2543  * in null mappings (currently treated as "copy-on-access")
2544  */
2545 static int remap_pte_range(struct mm_struct *    
2546                         unsigned long addr, u    
2547                         unsigned long pfn, pg    
2548 {                                                
2549         pte_t *pte, *mapped_pte;                 
2550         spinlock_t *ptl;                         
2551         int err = 0;                             
2552                                                  
2553         mapped_pte = pte = pte_alloc_map_lock    
2554         if (!pte)                                
2555                 return -ENOMEM;                  
2556         arch_enter_lazy_mmu_mode();              
2557         do {                                     
2558                 BUG_ON(!pte_none(ptep_get(pte    
2559                 if (!pfn_modify_allowed(pfn,     
2560                         err = -EACCES;           
2561                         break;                   
2562                 }                                
2563                 set_pte_at(mm, addr, pte, pte    
2564                 pfn++;                           
2565         } while (pte++, addr += PAGE_SIZE, ad    
2566         arch_leave_lazy_mmu_mode();              
2567         pte_unmap_unlock(mapped_pte, ptl);       
2568         return err;                              
2569 }                                                
2570                                                  
2571 static inline int remap_pmd_range(struct mm_s    
2572                         unsigned long addr, u    
2573                         unsigned long pfn, pg    
2574 {                                                
2575         pmd_t *pmd;                              
2576         unsigned long next;                      
2577         int err;                                 
2578                                                  
2579         pfn -= addr >> PAGE_SHIFT;               
2580         pmd = pmd_alloc(mm, pud, addr);          
2581         if (!pmd)                                
2582                 return -ENOMEM;                  
2583         VM_BUG_ON(pmd_trans_huge(*pmd));         
2584         do {                                     
2585                 next = pmd_addr_end(addr, end    
2586                 err = remap_pte_range(mm, pmd    
2587                                 pfn + (addr >    
2588                 if (err)                         
2589                         return err;              
2590         } while (pmd++, addr = next, addr !=     
2591         return 0;                                
2592 }                                                
2593                                                  
2594 static inline int remap_pud_range(struct mm_s    
2595                         unsigned long addr, u    
2596                         unsigned long pfn, pg    
2597 {                                                
2598         pud_t *pud;                              
2599         unsigned long next;                      
2600         int err;                                 
2601                                                  
2602         pfn -= addr >> PAGE_SHIFT;               
2603         pud = pud_alloc(mm, p4d, addr);          
2604         if (!pud)                                
2605                 return -ENOMEM;                  
2606         do {                                     
2607                 next = pud_addr_end(addr, end    
2608                 err = remap_pmd_range(mm, pud    
2609                                 pfn + (addr >    
2610                 if (err)                         
2611                         return err;              
2612         } while (pud++, addr = next, addr !=     
2613         return 0;                                
2614 }                                                
2615                                                  
2616 static inline int remap_p4d_range(struct mm_s    
2617                         unsigned long addr, u    
2618                         unsigned long pfn, pg    
2619 {                                                
2620         p4d_t *p4d;                              
2621         unsigned long next;                      
2622         int err;                                 
2623                                                  
2624         pfn -= addr >> PAGE_SHIFT;               
2625         p4d = p4d_alloc(mm, pgd, addr);          
2626         if (!p4d)                                
2627                 return -ENOMEM;                  
2628         do {                                     
2629                 next = p4d_addr_end(addr, end    
2630                 err = remap_pud_range(mm, p4d    
2631                                 pfn + (addr >    
2632                 if (err)                         
2633                         return err;              
2634         } while (p4d++, addr = next, addr !=     
2635         return 0;                                
2636 }                                                
2637                                                  
2638 static int remap_pfn_range_internal(struct vm    
2639                 unsigned long pfn, unsigned l    
2640 {                                                
2641         pgd_t *pgd;                              
2642         unsigned long next;                      
2643         unsigned long end = addr + PAGE_ALIGN    
2644         struct mm_struct *mm = vma->vm_mm;       
2645         int err;                                 
2646                                                  
2647         if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))    
2648                 return -EINVAL;                  
2649                                                  
2650         /*
2651          * Physically remapped pages are special. Tell the
2652          * rest of the world about it:
2653          *   VM_IO tells people not to look at these pages
2654          *      (accesses can have side effects).
2655          *   VM_PFNMAP tells the core MM that the base pages are just
2656          *      raw PFN mappings, and do not have a "struct page" associated
2657          *      with them.
2658          *   VM_DONTEXPAND
2659          *      Disable vma merging and expanding with mremap().
2660          *   VM_DONTDUMP
2661          *      Omit vma from core dump, even when VM_IO turned off.
2662          *
2663          * There's a horrible special case to handle copy-on-write
2664          * behaviour that some programs depend on. We mark the "original"
2665          * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2666          * See vm_normal_page() for details.
2667          */
2668         if (is_cow_mapping(vma->vm_flags)) {     
2669                 if (addr != vma->vm_start ||     
2670                         return -EINVAL;          
2671                 vma->vm_pgoff = pfn;             
2672         }                                        
2673                                                  
2674         vm_flags_set(vma, VM_IO | VM_PFNMAP |    
2675                                                  
2676         BUG_ON(addr >= end);                     
2677         pfn -= addr >> PAGE_SHIFT;               
2678         pgd = pgd_offset(mm, addr);              
2679         flush_cache_range(vma, addr, end);       
2680         do {                                     
2681                 next = pgd_addr_end(addr, end    
2682                 err = remap_p4d_range(mm, pgd    
2683                                 pfn + (addr >    
2684                 if (err)                         
2685                         return err;              
2686         } while (pgd++, addr = next, addr !=     
2687                                                  
2688         return 0;                                
2689 }                                                
2690                                                  
2691 /*
2692  * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
2693  * must have pre-validated the caching bits of the pgprot_t.
2694  */
2695 int remap_pfn_range_notrack(struct vm_area_st    
2696                 unsigned long pfn, unsigned l    
2697 {                                                
2698         int error = remap_pfn_range_internal(    
2699                                                  
2700         if (!error)                              
2701                 return 0;                        
2702                                                  
2703         /*
2704          * A partial pfn range mapping is dangerous: it does not
2705          * maintain page reference counts, and callers may free
2706          * pages due to the error. So zap it early as soon as possible.
2707          */
2708         zap_page_range_single(vma, addr, size    
2709         return error;                            
2710 }                                                
2711                                                  
2712 /**
2713  * remap_pfn_range - remap kernel memory to userspace
2714  * @vma: user vma to map to
2715  * @addr: target page aligned user address to start at
2716  * @pfn: page frame number of kernel physical memory address
2717  * @size: size of mapping area
2718  * @prot: page protection flags for this mapping
2719  *
2720  * Note: this is only safe if the mm semaphore is held when called.
2721  *
2722  * Return: %0 on success, negative error code otherwise.
2723  */
2724 int remap_pfn_range(struct vm_area_struct *vm    
2725                     unsigned long pfn, unsign    
2726 {                                                
2727         int err;                                 
2728                                                  
2729         err = track_pfn_remap(vma, &prot, pfn    
2730         if (err)                                 
2731                 return -EINVAL;                  
2732                                                  
2733         err = remap_pfn_range_notrack(vma, ad    
2734         if (err)                                 
2735                 untrack_pfn(vma, pfn, PAGE_AL    
2736         return err;                              
2737 }                                                
2738 EXPORT_SYMBOL(remap_pfn_range);                  
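/*
 * Example (illustrative sketch, not part of the kernel tree): the
 * classic whole-range mmap handler for device registers: choose a
 * protection, then hand the physical range to remap_pfn_range() in one
 * call.  Keeping the physical base address "exio_phys" in private_data
 * is a hypothetical convention.
 */
static int exio_mmap(struct file *file, struct vm_area_struct *vma)
{
	phys_addr_t exio_phys = (phys_addr_t)(uintptr_t)file->private_data;
	unsigned long size = vma->vm_end - vma->vm_start;

	/* Device registers should not be cached or prefetched. */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	return remap_pfn_range(vma, vma->vm_start,
			       (exio_phys >> PAGE_SHIFT) + vma->vm_pgoff,
			       size, vma->vm_page_prot);
}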
2739                                                  
2740 /**
2741  * vm_iomap_memory - remap memory to userspace
2742  * @vma: user vma to map to
2743  * @start: start of the physical memory to be mapped
2744  * @len: size of area
2745  *
2746  * This is a simplified io_remap_pfn_range() for common driver use. The
2747  * driver just needs to give us the physical memory range to be mapped,
2748  * we'll figure out the rest from the vma information.
2749  *
2750  * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
2751  * whatever write-combining details or similar.
2752  *
2753  * Return: %0 on success, negative error code otherwise.
2754  */
2755 int vm_iomap_memory(struct vm_area_struct *vm    
2756 {                                                
2757         unsigned long vm_len, pfn, pages;        
2758                                                  
2759         /* Check that the physical memory area passed in looks valid */
2760         if (start + len < start)                 
2761                 return -EINVAL;                  
2762         /*
2763          * You *really* shouldn't map things that aren't page-aligned,
2764          * but we've historically allowed it because IO memory might
2765          * just have smaller alignment.
2766          */
2767         len += start & ~PAGE_MASK;               
2768         pfn = start >> PAGE_SHIFT;               
2769         pages = (len + ~PAGE_MASK) >> PAGE_SH    
2770         if (pfn + pages < pfn)                   
2771                 return -EINVAL;                  
2772                                                  
2773         /* We start the mapping 'vm_pgoff' pages into the area */
2774         if (vma->vm_pgoff > pages)               
2775                 return -EINVAL;                  
2776         pfn += vma->vm_pgoff;                    
2777         pages -= vma->vm_pgoff;                  
2778                                                  
2779         /* Can we fit all of the mapping? */     
2780         vm_len = vma->vm_end - vma->vm_start;    
2781         if (vm_len >> PAGE_SHIFT > pages)        
2782                 return -EINVAL;                  
2783                                                  
2784         /* Ok, let it rip */                     
2785         return io_remap_pfn_range(vma, vma->v    
2786 }                                                
2787 EXPORT_SYMBOL(vm_iomap_memory);                  
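/*
 * Example (illustrative sketch, not part of the kernel tree): the
 * simplified form described above; the driver only supplies a physical
 * range and vm_iomap_memory() derives pfn, size and offset from the
 * vma.  "exfb_phys" and "exfb_len" are hypothetical device properties.
 */
static int exfb_mmap(struct file *file, struct vm_area_struct *vma)
{
	phys_addr_t exfb_phys = 0x90000000;	/* hypothetical aperture base */
	unsigned long exfb_len = 8UL << 20;	/* hypothetical 8 MiB aperture */

	/* Optionally tweak caching first, as the NOTE above suggests. */
	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	return vm_iomap_memory(vma, exfb_phys, exfb_len);
}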
2788                                                  
2789 static int apply_to_pte_range(struct mm_struc    
2790                                      unsigned    
2791                                      pte_fn_t    
2792                                      pgtbl_mo    
2793 {                                                
2794         pte_t *pte, *mapped_pte;                 
2795         int err = 0;                             
2796         spinlock_t *ptl;                         
2797                                                  
2798         if (create) {                            
2799                 mapped_pte = pte = (mm == &in    
2800                         pte_alloc_kernel_trac    
2801                         pte_alloc_map_lock(mm    
2802                 if (!pte)                        
2803                         return -ENOMEM;          
2804         } else {                                 
2805                 mapped_pte = pte = (mm == &in    
2806                         pte_offset_kernel(pmd    
2807                         pte_offset_map_lock(m    
2808                 if (!pte)                        
2809                         return -EINVAL;          
2810         }                                        
2811                                                  
2812         arch_enter_lazy_mmu_mode();              
2813                                                  
2814         if (fn) {                                
2815                 do {                             
2816                         if (create || !pte_no    
2817                                 err = fn(pte+    
2818                                 if (err)         
2819                                         break    
2820                         }                        
2821                 } while (addr += PAGE_SIZE, a    
2822         }                                        
2823         *mask |= PGTBL_PTE_MODIFIED;             
2824                                                  
2825         arch_leave_lazy_mmu_mode();              
2826                                                  
2827         if (mm != &init_mm)                      
2828                 pte_unmap_unlock(mapped_pte,     
2829         return err;                              
2830 }                                                
2831                                                  
2832 static int apply_to_pmd_range(struct mm_struc    
2833                                      unsigned    
2834                                      pte_fn_t    
2835                                      pgtbl_mo    
2836 {                                                
2837         pmd_t *pmd;                              
2838         unsigned long next;                      
2839         int err = 0;                             
2840                                                  
2841         BUG_ON(pud_leaf(*pud));                  
2842                                                  
2843         if (create) {                            
2844                 pmd = pmd_alloc_track(mm, pud    
2845                 if (!pmd)                        
2846                         return -ENOMEM;          
2847         } else {                                 
2848                 pmd = pmd_offset(pud, addr);     
2849         }                                        
2850         do {                                     
2851                 next = pmd_addr_end(addr, end    
2852                 if (pmd_none(*pmd) && !create    
2853                         continue;                
2854                 if (WARN_ON_ONCE(pmd_leaf(*pm    
2855                         return -EINVAL;          
2856                 if (!pmd_none(*pmd) && WARN_O    
2857                         if (!create)             
2858                                 continue;        
2859                         pmd_clear_bad(pmd);      
2860                 }                                
2861                 err = apply_to_pte_range(mm,     
2862                                          fn,     
2863                 if (err)                         
2864                         break;                   
2865         } while (pmd++, addr = next, addr !=     
2866                                                  
2867         return err;                              
2868 }                                                
2869                                                  
2870 static int apply_to_pud_range(struct mm_struc    
2871                                      unsigned    
2872                                      pte_fn_t    
2873                                      pgtbl_mo    
2874 {                                                
2875         pud_t *pud;                              
2876         unsigned long next;                      
2877         int err = 0;                             
2878                                                  
2879         if (create) {                            
2880                 pud = pud_alloc_track(mm, p4d    
2881                 if (!pud)                        
2882                         return -ENOMEM;          
2883         } else {                                 
2884                 pud = pud_offset(p4d, addr);     
2885         }                                        
2886         do {                                     
2887                 next = pud_addr_end(addr, end    
2888                 if (pud_none(*pud) && !create    
2889                         continue;                
2890                 if (WARN_ON_ONCE(pud_leaf(*pu    
2891                         return -EINVAL;          
2892                 if (!pud_none(*pud) && WARN_O    
2893                         if (!create)             
2894                                 continue;        
2895                         pud_clear_bad(pud);      
2896                 }                                
2897                 err = apply_to_pmd_range(mm,     
2898                                          fn,     
2899                 if (err)                         
2900                         break;                   
2901         } while (pud++, addr = next, addr !=     
2902                                                  
2903         return err;                              
2904 }                                                
2905                                                  
2906 static int apply_to_p4d_range(struct mm_struc    
2907                                      unsigned    
2908                                      pte_fn_t    
2909                                      pgtbl_mo    
2910 {                                                
2911         p4d_t *p4d;                              
2912         unsigned long next;                      
2913         int err = 0;                             
2914                                                  
2915         if (create) {                            
2916                 p4d = p4d_alloc_track(mm, pgd    
2917                 if (!p4d)                        
2918                         return -ENOMEM;          
2919         } else {                                 
2920                 p4d = p4d_offset(pgd, addr);     
2921         }                                        
2922         do {                                     
2923                 next = p4d_addr_end(addr, end    
2924                 if (p4d_none(*p4d) && !create    
2925                         continue;                
2926                 if (WARN_ON_ONCE(p4d_leaf(*p4    
2927                         return -EINVAL;          
2928                 if (!p4d_none(*p4d) && WARN_O    
2929                         if (!create)             
2930                                 continue;        
2931                         p4d_clear_bad(p4d);      
2932                 }                                
2933                 err = apply_to_pud_range(mm,     
2934                                          fn,     
2935                 if (err)                         
2936                         break;                   
2937         } while (p4d++, addr = next, addr !=     
2938                                                  
2939         return err;                              
2940 }                                                
2941                                                  
2942 static int __apply_to_page_range(struct mm_st    
2943                                  unsigned lon    
2944                                  void *data,     
2945 {                                                
2946         pgd_t *pgd;                              
2947         unsigned long start = addr, next;        
2948         unsigned long end = addr + size;         
2949         pgtbl_mod_mask mask = 0;                 
2950         int err = 0;                             
2951                                                  
2952         if (WARN_ON(addr >= end))                
2953                 return -EINVAL;                  
2954                                                  
2955         pgd = pgd_offset(mm, addr);              
2956         do {                                     
2957                 next = pgd_addr_end(addr, end    
2958                 if (pgd_none(*pgd) && !create    
2959                         continue;                
2960                 if (WARN_ON_ONCE(pgd_leaf(*pg    
2961                         return -EINVAL;          
2962                 if (!pgd_none(*pgd) && WARN_O    
2963                         if (!create)             
2964                                 continue;        
2965                         pgd_clear_bad(pgd);      
2966                 }                                
2967                 err = apply_to_p4d_range(mm,     
2968                                          fn,     
2969                 if (err)                         
2970                         break;                   
2971         } while (pgd++, addr = next, addr !=     
2972                                                  
2973         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)    
2974                 arch_sync_kernel_mappings(sta    
2975                                                  
2976         return err;                              
2977 }                                                
2978                                                  
2979 /*
2980  * Scan a region of virtual memory, filling in page tables as necessary
2981  * and calling a provided function on each leaf page table.
2982  */
2983 int apply_to_page_range(struct mm_struct *mm,    
2984                         unsigned long size, p    
2985 {                                                
2986         return __apply_to_page_range(mm, addr    
2987 }                                                
2988 EXPORT_SYMBOL_GPL(apply_to_page_range);          
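/*
 * Example (illustrative sketch, not part of the kernel tree): a pattern
 * used by some existing callers of apply_to_page_range(): walk a kernel
 * virtual range (allocating page tables as needed) and install one page
 * per PTE slot from the callback.  The caller is assumed to own the
 * range, e.g. obtained from get_vm_area(); "ex_map_ctx" and the helpers
 * below are hypothetical.
 */
struct ex_map_ctx {
	struct page **pages;
	unsigned long idx;
};

static int ex_install_pte(pte_t *pte, unsigned long addr, void *data)
{
	struct ex_map_ctx *ctx = data;

	/* Called once per PTE slot, in ascending address order. */
	set_pte_at(&init_mm, addr, pte,
		   mk_pte(ctx->pages[ctx->idx++], PAGE_KERNEL));
	return 0;			/* a non-zero return aborts the walk */
}

static int ex_map_pages(unsigned long start, struct page **pages,
			unsigned long nr)
{
	struct ex_map_ctx ctx = { .pages = pages, .idx = 0 };

	return apply_to_page_range(&init_mm, start, nr * PAGE_SIZE,
				   ex_install_pte, &ctx);
}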
2989                                                  
2990 /*
2991  * Scan a region of virtual memory, calling a provided function on
2992  * each leaf page table where it exists.
2993  *
2994  * Unlike apply_to_page_range, this does _not_ fill in page tables
2995  * where they are absent.
2996  */
2997 int apply_to_existing_page_range(struct mm_st    
2998                                  unsigned lon    
2999 {                                                
3000         return __apply_to_page_range(mm, addr    
3001 }                                                
3002 EXPORT_SYMBOL_GPL(apply_to_existing_page_rang    
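/*
 * Example (illustrative sketch, not part of the kernel tree): a
 * read-only walk with apply_to_existing_page_range(), which visits only
 * ranges whose page tables already exist and never allocates new ones.
 * Here the callback merely counts populated entries; "ex_count_pte" and
 * "ex_count_mapped" are hypothetical helpers.
 */
static int ex_count_pte(pte_t *pte, unsigned long addr, void *data)
{
	unsigned long *count = data;

	/* Count only populated entries (defensive; absent tables are skipped). */
	if (!pte_none(ptep_get(pte)))
		(*count)++;
	return 0;
}

static unsigned long ex_count_mapped(unsigned long start, unsigned long size)
{
	unsigned long count = 0;

	apply_to_existing_page_range(&init_mm, start, PAGE_ALIGN(size),
				     ex_count_pte, &count);
	return count;
}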
3003                                                  
3004 /*
3005  * handle_pte_fault chooses page fault handler according to an entry which was
3006  * read non-atomically.  Before making any commitment, on those architectures
3007  * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
3008  * parts, do_swap_page must check under lock before unmapping the pte and
3009  * proceeding (but do_wp_page is only called after already making such a check;
3010  * and do_anonymous_page can safely check later on).
3011  */
3012 static inline int pte_unmap_same(struct vm_fa    
3013 {                                                
3014         int same = 1;                            
3015 #if defined(CONFIG_SMP) || defined(CONFIG_PRE    
3016         if (sizeof(pte_t) > sizeof(unsigned l    
3017                 spin_lock(vmf->ptl);             
3018                 same = pte_same(ptep_get(vmf-    
3019                 spin_unlock(vmf->ptl);           
3020         }                                        
3021 #endif                                           
3022         pte_unmap(vmf->pte);                     
3023         vmf->pte = NULL;                         
3024         return same;                             
3025 }                                                
3026                                                  
3027 /*
3028  * Return:
3029  *      0:              copy succeeded
3030  *      -EHWPOISON:     copy failed due to hwpoison in source page
3031  *      -EAGAIN:        copy failed (some other reason)
3032  */
3033 static inline int __wp_page_copy_user(struct     
3034                                       struct     
3035 {                                                
3036         int ret;                                 
3037         void *kaddr;                             
3038         void __user *uaddr;                      
3039         struct vm_area_struct *vma = vmf->vma    
3040         struct mm_struct *mm = vma->vm_mm;       
3041         unsigned long addr = vmf->address;       
3042                                                  
3043         if (likely(src)) {                       
3044                 if (copy_mc_user_highpage(dst    
3045                         return -EHWPOISON;       
3046                 return 0;                        
3047         }                                        
3048                                                  
3049         /*
3050          * If the source page was a PFN mapping, we don't have
3051          * a "struct page" for it. We do a best-effort copy by
3052          * just copying from the original user address. If that
3053          * fails, we just zero-fill it. Live with it.
3054          */
3055         kaddr = kmap_local_page(dst);            
3056         pagefault_disable();                     
3057         uaddr = (void __user *)(addr & PAGE_M    
3058                                                  
3059         /*
3060          * On architectures with software "accessed" bits, we would
3061          * take a double page fault, so mark it accessed here.
3062          */
3063         vmf->pte = NULL;                         
3064         if (!arch_has_hw_pte_young() && !pte_    
3065                 pte_t entry;                     
3066                                                  
3067                 vmf->pte = pte_offset_map_loc    
3068                 if (unlikely(!vmf->pte || !pt    
3069                         /*
3070                          * Other thread has already handled the fault
3071                          * and update local tlb only
3072                          */
3073                         if (vmf->pte)            
3074                                 update_mmu_tl    
3075                         ret = -EAGAIN;           
3076                         goto pte_unlock;         
3077                 }                                
3078                                                  
3079                 entry = pte_mkyoung(vmf->orig    
3080                 if (ptep_set_access_flags(vma    
3081                         update_mmu_cache_rang    
3082         }                                        
3083                                                  
3084         /*                                       
3085          * This really shouldn't fail, becaus    
3086          * in the page tables. But it might j    
3087          * in which case we just give up and     
3088          * zeroes.                               
3089          */                                      
3090         if (__copy_from_user_inatomic(kaddr,     
3091                 if (vmf->pte)                    
3092                         goto warn;               
3093                                                  
3094                 /* Re-validate under PTL if t    
3095                 vmf->pte = pte_offset_map_loc    
3096                 if (unlikely(!vmf->pte || !pt    
3097                         /* The PTE changed un    
3098                         if (vmf->pte)            
3099                                 update_mmu_tl    
3100                         ret = -EAGAIN;           
3101                         goto pte_unlock;         
3102                 }                                
3103                                                  
3104                 /*                               
3105                  * The same page can be mappe    
3106                  * Try to copy again under PT    
3107                  */                              
3108                 if (__copy_from_user_inatomic    
3109                         /*                       
3110                          * Give a warn in cas    
3111                          * use-case              
3112                          */                      
3113 warn:                                            
3114                         WARN_ON_ONCE(1);         
3115                         clear_page(kaddr);       
3116                 }                                
3117         }                                        
3118                                                  
3119         ret = 0;                                 
3120                                                  
3121 pte_unlock:                                      
3122         if (vmf->pte)                            
3123                 pte_unmap_unlock(vmf->pte, vm    
3124         pagefault_enable();                      
3125         kunmap_local(kaddr);                     
3126         flush_dcache_page(dst);                  
3127                                                  
3128         return ret;                              
3129 }                                                
3130                                                  
3131 static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
3132 {
3133         struct file *vm_file = vma->vm_file;
3134 
3135         if (vm_file)
3136                 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
3137 
3138         /*
3139          * Special mappings (e.g. VDSO) do not have any file so we need
3140          * a default GFP_KERNEL for them.
3141          */
3142         return GFP_KERNEL;
3143 }
3144                                                  
3145 /*
3146  * Notify the address space that the page is about to become writable so that
3147  * it can prohibit this or wait for the page to get into an appropriate state.
3148  *
3149  * We do this without the lock held, so that it can sleep if it needs to.
3150  */
3151 static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
3152 {
3153         vm_fault_t ret;
3154         unsigned int old_flags = vmf->flags;
3155 
3156         vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3157 
3158         if (vmf->vma->vm_file &&
3159             IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
3160                 return VM_FAULT_SIGBUS;
3161 
3162         ret = vmf->vma->vm_ops->page_mkwrite(vmf);
3163         /* Restore original flags so that caller is not surprised */
3164         vmf->flags = old_flags;
3165         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
3166                 return ret;
3167         if (unlikely(!(ret & VM_FAULT_LOCKED))) {
3168                 folio_lock(folio);
3169                 if (!folio->mapping) {
3170                         folio_unlock(folio);
3171                         return 0; /* retry */
3172                 }
3173                 ret |= VM_FAULT_LOCKED;
3174         } else
3175                 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3176         return ret;
3177 }
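
/*
 * [Editorial illustration, not part of mm/memory.c] A minimal sketch of the
 * other side of the contract that do_page_mkwrite() above relies on: a
 * hypothetical driver/filesystem ->page_mkwrite() handler (the name
 * "example_page_mkwrite" is invented).  It either returns with the folio
 * locked and VM_FAULT_LOCKED set, or lets the caller lock it; returning
 * VM_FAULT_NOPAGE tells the fault path the page went away (e.g. truncation).
 */
static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
        struct folio *folio = page_folio(vmf->page);
        struct inode *inode = file_inode(vmf->vma->vm_file);

        folio_lock(folio);
        if (folio->mapping != inode->i_mapping) {
                /* Raced with truncate/invalidate: page is no longer ours. */
                folio_unlock(folio);
                return VM_FAULT_NOPAGE;
        }
        /* Mark the folio dirty and wait for any in-flight writeback. */
        folio_mark_dirty(folio);
        folio_wait_stable(folio);
        return VM_FAULT_LOCKED;
}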
3178                                                  
3179 /*                                               
3180  * Handle dirtying of a page in shared file m    
3181  *                                               
3182  * The function expects the page to be locked    
3183  */                                              
3184 static vm_fault_t fault_dirty_shared_page(str    
3185 {                                                
3186         struct vm_area_struct *vma = vmf->vma    
3187         struct address_space *mapping;           
3188         struct folio *folio = page_folio(vmf-    
3189         bool dirtied;                            
3190         bool page_mkwrite = vma->vm_ops && vm    
3191                                                  
3192         dirtied = folio_mark_dirty(folio);       
3193         VM_BUG_ON_FOLIO(folio_test_anon(folio    
3194         /*                                       
3195          * Take a local copy of the address_s    
3196          * by truncate after folio_unlock().     
3197          * pinned by vma->vm_file's reference    
3198          * release semantics to prevent the c    
3199          */                                      
3200         mapping = folio_raw_mapping(folio);      
3201         folio_unlock(folio);                     
3202                                                  
3203         if (!page_mkwrite)                       
3204                 file_update_time(vma->vm_file    
3205                                                  
3206         /*                                       
3207          * Throttle page dirtying rate down t    
3208          *                                       
3209          * mapping may be NULL here because s    
3210          * set page.mapping but still dirty t    
3211          *                                       
3212          * Drop the mmap_lock before waiting     
3213          * is pinning the mapping, as per abo    
3214          */                                      
3215         if ((dirtied || page_mkwrite) && mapp    
3216                 struct file *fpin;               
3217                                                  
3218                 fpin = maybe_unlock_mmap_for_    
3219                 balance_dirty_pages_ratelimit    
3220                 if (fpin) {                      
3221                         fput(fpin);              
3222                         return VM_FAULT_COMPL    
3223                 }                                
3224         }                                        
3225                                                  
3226         return 0;                                
3227 }                                                
3228                                                  
3229 /*
3230  * Handle write page faults for pages that can be reused in the current vma
3231  *
3232  * This can happen either due to the mapping being with the VM_SHARED flag,
3233  * or due to us being the last reference standing to the page. In either
3234  * case, all we need to do here is to mark the page as writable and update
3235  * any related book-keeping.
3236  */
3237 static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
3238         __releases(vmf->ptl)
3239 {
3240         struct vm_area_struct *vma = vmf->vma;
3241         pte_t entry;
3242 
3243         VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
3244         VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));
3245 
3246         if (folio) {
3247                 VM_BUG_ON(folio_test_anon(folio) &&
3248                           !PageAnonExclusive(vmf->page));
3249                 /*
3250                  * Clear the folio's cpupid information as the existing
3251                  * information potentially belongs to a now completely
3252                  * unrelated process.
3253                  */
3254                 folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
3255         }
3256 
3257         flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3258         entry = pte_mkyoung(vmf->orig_pte);
3259         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3260         if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
3261                 update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
3262         pte_unmap_unlock(vmf->pte, vmf->ptl);
3263         count_vm_event(PGREUSE);
3264 }
3265                                                  
3266 /*
3267  * We could add a bitflag somewhere, but for now, we know that all
3268  * vm_ops that have a ->map_pages have been audited and do not need
3269  * the mmap_lock to be held.
3270  */
3271 static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
3272 {
3273         struct vm_area_struct *vma = vmf->vma;
3274 
3275         if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
3276                 return 0;
3277         vma_end_read(vma);
3278         return VM_FAULT_RETRY;
3279 }
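
/*
 * [Editorial illustration, not part of mm/memory.c] The check above exists
 * because a ->fault()/->page_mkwrite() implementation that has not been
 * audited for per-VMA locking must only run under the mmap_lock.  A driver
 * in that situation conventionally bails out itself in the same way
 * (hypothetical "example_fault", not a real driver):
 */
static vm_fault_t example_fault(struct vm_fault *vmf)
{
        /* Not audited for per-VMA locks: insist on a retry under mmap_lock. */
        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                vma_end_read(vmf->vma);
                return VM_FAULT_RETRY;
        }

        /* ... the real fault handling, which may rely on the mmap_lock ... */
        return VM_FAULT_SIGBUS;
}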
3280                                                  
3281 /**
3282  * __vmf_anon_prepare - Prepare to handle an anonymous fault.
3283  * @vmf: The vm_fault descriptor passed from the fault handler.
3284  *
3285  * When preparing to insert an anonymous page into a VMA from a
3286  * fault handler, call this function rather than anon_vma_prepare().
3287  * If this vma does not already have an associated anon_vma and we are
3288  * only protected by the per-VMA lock, the caller must retry with the
3289  * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
3290  * determine if this VMA can share its anon_vma, and that's not safe to
3291  * do with only the per-VMA lock held for this VMA.
3292  *
3293  * Return: 0 if fault handling can proceed.  Any other value should be
3294  * returned to the caller.
3295  */
3296 vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
3297 {
3298         struct vm_area_struct *vma = vmf->vma;
3299         vm_fault_t ret = 0;
3300 
3301         if (likely(vma->anon_vma))
3302                 return 0;
3303         if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
3304                 if (!mmap_read_trylock(vma->vm_mm))
3305                         return VM_FAULT_RETRY;
3306         }
3307         if (__anon_vma_prepare(vma))
3308                 ret = VM_FAULT_OOM;
3309         if (vmf->flags & FAULT_FLAG_VMA_LOCK)
3310                 mmap_read_unlock(vma->vm_mm);
3311         return ret;
3312 }
3313                                                  
3314 /*
3315  * Handle the case of a page which we actually need to copy to a new page,
3316  * either due to COW or unsharing.
3317  *
3318  * Called with mmap_lock locked and the old page referenced, but
3319  * without the ptl held.
3320  *
3321  * High level logic flow:
3322  *
3323  * - Allocate a page, copy the content of the old page to the new one.
3324  * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
3325  * - Take the PTL. If the pte changed, bail out and release the allocated page.
3326  * - If the pte is still the way we remember it, update the page table and all
3327  *   relevant references. This includes dropping the reference the page-table
3328  *   held to the old page, as well as updating the rmap.
3329  * - In any case, unlock the PTL and drop the reference we took to the old page.
3330  */
3331 static vm_fault_t wp_page_copy(struct vm_faul    
3332 {                                                
3333         const bool unshare = vmf->flags & FAU    
3334         struct vm_area_struct *vma = vmf->vma    
3335         struct mm_struct *mm = vma->vm_mm;       
3336         struct folio *old_folio = NULL;          
3337         struct folio *new_folio = NULL;          
3338         pte_t entry;                             
3339         int page_copied = 0;                     
3340         struct mmu_notifier_range range;         
3341         vm_fault_t ret;                          
3342         bool pfn_is_zero;                        
3343                                                  
3344         delayacct_wpcopy_start();                
3345                                                  
3346         if (vmf->page)                           
3347                 old_folio = page_folio(vmf->p    
3348         ret = vmf_anon_prepare(vmf);             
3349         if (unlikely(ret))                       
3350                 goto out;                        
3351                                                  
3352         pfn_is_zero = is_zero_pfn(pte_pfn(vmf    
3353         new_folio = folio_prealloc(mm, vma, v    
3354         if (!new_folio)                          
3355                 goto oom;                        
3356                                                  
3357         if (!pfn_is_zero) {                      
3358                 int err;                         
3359                                                  
3360                 err = __wp_page_copy_user(&ne    
3361                 if (err) {                       
3362                         /*                       
3363                          * COW failed, if the    
3364                          * it's fine. If not,    
3365                          * the same address a    
3366                          * from the second at    
3367                          * The -EHWPOISON cas    
3368                          */                      
3369                         folio_put(new_folio);    
3370                         if (old_folio)           
3371                                 folio_put(old    
3372                                                  
3373                         delayacct_wpcopy_end(    
3374                         return err == -EHWPOI    
3375                 }                                
3376                 kmsan_copy_page_meta(&new_fol    
3377         }                                        
3378                                                  
3379         __folio_mark_uptodate(new_folio);        
3380                                                  
3381         mmu_notifier_range_init(&range, MMU_N    
3382                                 vmf->address     
3383                                 (vmf->address    
3384         mmu_notifier_invalidate_range_start(&    
3385                                                  
3386         /*                                       
3387          * Re-check the pte - we dropped the     
3388          */                                      
3389         vmf->pte = pte_offset_map_lock(mm, vm    
3390         if (likely(vmf->pte && pte_same(ptep_    
3391                 if (old_folio) {                 
3392                         if (!folio_test_anon(    
3393                                 dec_mm_counte    
3394                                 inc_mm_counte    
3395                         }                        
3396                 } else {                         
3397                         ksm_might_unmap_zero_    
3398                         inc_mm_counter(mm, MM    
3399                 }                                
3400                 flush_cache_page(vma, vmf->ad    
3401                 entry = mk_pte(&new_folio->pa    
3402                 entry = pte_sw_mkyoung(entry)    
3403                 if (unlikely(unshare)) {         
3404                         if (pte_soft_dirty(vm    
3405                                 entry = pte_m    
3406                         if (pte_uffd_wp(vmf->    
3407                                 entry = pte_m    
3408                 } else {                         
3409                         entry = maybe_mkwrite    
3410                 }                                
3411                                                  
3412                 /*                               
3413                  * Clear the pte entry and fl    
3414                  * pte with the new entry, to    
3415                  * sync. This code used to se    
3416                  * that left a window where t    
3417                  * some TLBs while the old PT    
3418                  */                              
3419                 ptep_clear_flush(vma, vmf->ad    
3420                 folio_add_new_anon_rmap(new_f    
3421                 folio_add_lru_vma(new_folio,     
3422                 BUG_ON(unshare && pte_write(e    
3423                 set_pte_at(mm, vmf->address,     
3424                 update_mmu_cache_range(vmf, v    
3425                 if (old_folio) {                 
3426                         /*                       
3427                          * Only after switchi    
3428                          * we remove the mapc    
3429                          * process may come a    
3430                          * before the pte is     
3431                          * "reuse" the old pa    
3432                          * here still points     
3433                          * threads.              
3434                          *                       
3435                          * The critical issue    
3436                          * folio_remove_rmap_    
3437                          * above. Those store    
3438                          * the barrier presen    
3439                          * in folio_remove_rm    
3440                          *                       
3441                          * Then the TLB flush    
3442                          * no process can acc    
3443                          * decremented mapcou    
3444                          * cannot be reused u    
3445                          * mapcount is visibl    
3446                          * old page will be f    
3447                          */                      
3448                         folio_remove_rmap_pte    
3449                 }                                
3450                                                  
3451                 /* Free the old page.. */        
3452                 new_folio = old_folio;           
3453                 page_copied = 1;                 
3454                 pte_unmap_unlock(vmf->pte, vm    
3455         } else if (vmf->pte) {                   
3456                 update_mmu_tlb(vma, vmf->addr    
3457                 pte_unmap_unlock(vmf->pte, vm    
3458         }                                        
3459                                                  
3460         mmu_notifier_invalidate_range_end(&ra    
3461                                                  
3462         if (new_folio)                           
3463                 folio_put(new_folio);            
3464         if (old_folio) {                         
3465                 if (page_copied)                 
3466                         free_swap_cache(old_f    
3467                 folio_put(old_folio);            
3468         }                                        
3469                                                  
3470         delayacct_wpcopy_end();                  
3471         return 0;                                
3472 oom:                                             
3473         ret = VM_FAULT_OOM;                      
3474 out:                                             
3475         if (old_folio)                           
3476                 folio_put(old_folio);            
3477                                                  
3478         delayacct_wpcopy_end();                  
3479         return ret;                              
3480 }                                                
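
/*
 * [Editorial illustration, not part of mm/memory.c] The notifier bracket and
 * PTL re-check used by wp_page_copy() above is the generic pattern for
 * replacing a mapped pte: announce the invalidation, retake the pte lock,
 * verify the pte did not change, then clear+flush before installing the new
 * value.  A stripped-down sketch (hypothetical "example_replace_pte", no
 * rmap, accounting or error handling):
 */
static void example_replace_pte(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, pte_t orig, pte_t newpte)
{
        struct mmu_notifier_range range;
        spinlock_t *ptl;
        pte_t *pte;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                addr & PAGE_MASK,
                                (addr & PAGE_MASK) + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (pte && pte_same(ptep_get(pte), orig)) {
                ptep_clear_flush(vma, addr, pte);       /* old value gone from all TLBs */
                set_pte_at(vma->vm_mm, addr, pte, newpte);
        }
        if (pte)
                pte_unmap_unlock(pte, ptl);

        mmu_notifier_invalidate_range_end(&range);
}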
3481                                                  
3482 /**
3483  * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
3484  *                        writeable once the page is prepared
3485  *
3486  * @vmf: structure describing the fault
3487  * @folio: the folio of vmf->page
3488  *
3489  * This function handles all that is needed to finish a write page fault in a
3490  * shared mapping due to PTE being read-only once the mapped page is prepared.
3491  * It handles locking of PTE and modifying it.
3492  *
3493  * The function expects the page to be locked or other protection against
3494  * concurrent faults / writeback (such as DAX radix tree locks).
3495  *
3496  * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
3497  * we acquired PTE lock.
3498  */
3499 static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
3500 {
3501         WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3502         vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
3503                                        &vmf->ptl);
3504         if (!vmf->pte)
3505                 return VM_FAULT_NOPAGE;
3506         /*
3507          * We might have raced with another page fault while we released the
3508          * pte_offset_map_lock.
3509          */
3510         if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
3511                 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
3512                 pte_unmap_unlock(vmf->pte, vmf->ptl);
3513                 return VM_FAULT_NOPAGE;
3514         }
3515         wp_page_reuse(vmf, folio);
3516         return 0;
3517 }
3518                                                  
3519 /*
3520  * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
3521  * mapping
3522  */
3523 static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3524 {
3525         struct vm_area_struct *vma = vmf->vma;
3526 
3527         if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3528                 vm_fault_t ret;
3529 
3530                 pte_unmap_unlock(vmf->pte, vmf->ptl);
3531                 ret = vmf_can_call_fault(vmf);
3532                 if (ret)
3533                         return ret;
3534 
3535                 vmf->flags |= FAULT_FLAG_MKWRITE;
3536                 ret = vma->vm_ops->pfn_mkwrite(vmf);
3537                 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3538                         return ret;
3539                 return finish_mkwrite_fault(vmf, NULL);
3540         }
3541         wp_page_reuse(vmf, NULL);
3542         return 0;
3543 }
3544                                                  
3545 static vm_fault_t wp_page_shared(struct vm_fa    
3546         __releases(vmf->ptl)                     
3547 {                                                
3548         struct vm_area_struct *vma = vmf->vma    
3549         vm_fault_t ret = 0;                      
3550                                                  
3551         folio_get(folio);                        
3552                                                  
3553         if (vma->vm_ops && vma->vm_ops->page_    
3554                 vm_fault_t tmp;                  
3555                                                  
3556                 pte_unmap_unlock(vmf->pte, vm    
3557                 tmp = vmf_can_call_fault(vmf)    
3558                 if (tmp) {                       
3559                         folio_put(folio);        
3560                         return tmp;              
3561                 }                                
3562                                                  
3563                 tmp = do_page_mkwrite(vmf, fo    
3564                 if (unlikely(!tmp || (tmp &      
3565                                       (VM_FAU    
3566                         folio_put(folio);        
3567                         return tmp;              
3568                 }                                
3569                 tmp = finish_mkwrite_fault(vm    
3570                 if (unlikely(tmp & (VM_FAULT_    
3571                         folio_unlock(folio);     
3572                         folio_put(folio);        
3573                         return tmp;              
3574                 }                                
3575         } else {                                 
3576                 wp_page_reuse(vmf, folio);       
3577                 folio_lock(folio);               
3578         }                                        
3579         ret |= fault_dirty_shared_page(vmf);     
3580         folio_put(folio);                        
3581                                                  
3582         return ret;                              
3583 }                                                
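
/*
 * [Editorial illustration, not part of mm/memory.c] A userspace view of the
 * wp_page_shared()/fault_dirty_shared_page() path: on filesystems that use
 * ->page_mkwrite() (or dirty accounting), a MAP_SHARED pte is installed
 * read-only first, so the *write* below takes a separate fault that goes
 * through do_wp_page().  Hypothetical standalone program, error handling
 * omitted.
 */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/example-shared", O_RDWR | O_CREAT, 0600);
        volatile char *p;

        ftruncate(fd, 4096);
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        (void)p[0];             /* read fault: page mapped, pte kept read-only */
        p[0] = 'x';             /* write fault: ->page_mkwrite(), pte made writable and dirty */
        msync((void *)p, 4096, MS_SYNC);        /* writeback write-protects the pte again */

        munmap((void *)p, 4096);
        close(fd);
        return 0;
}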
3584                                                  
3585 static bool wp_can_reuse_anon_folio(struct fo    
3586                                     struct vm    
3587 {                                                
3588         /*                                       
3589          * We could currently only reuse a su    
3590          * other subpages of the large folios    
3591          * let's just consistently not reuse     
3592          * reuse in that scenario, and give b    
3593          * sooner.                               
3594          */                                      
3595         if (folio_test_large(folio))             
3596                 return false;                    
3597                                                  
3598         /*                                       
3599          * We have to verify under folio lock    
3600          * just an optimization to avoid lock    
3601          * the swapcache if there is little h    
3602          *                                       
3603          * KSM doesn't necessarily raise the     
3604          */                                      
3605         if (folio_test_ksm(folio) || folio_re    
3606                 return false;                    
3607         if (!folio_test_lru(folio))              
3608                 /*                               
3609                  * We cannot easily detect+ha    
3610                  * remote LRU caches or refer    
3611                  */                              
3612                 lru_add_drain();                 
3613         if (folio_ref_count(folio) > 1 + foli    
3614                 return false;                    
3615         if (!folio_trylock(folio))               
3616                 return false;                    
3617         if (folio_test_swapcache(folio))         
3618                 folio_free_swap(folio);          
3619         if (folio_test_ksm(folio) || folio_re    
3620                 folio_unlock(folio);             
3621                 return false;                    
3622         }                                        
3623         /*                                       
3624          * Ok, we've got the only folio refer    
3625          * and the folio is locked, it's dark    
3626          * sunglasses. Hit it.                   
3627          */                                      
3628         folio_move_anon_rmap(folio, vma);        
3629         folio_unlock(folio);                     
3630         return true;                             
3631 }                                                
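
/*
 * [Editorial illustration, not part of mm/memory.c] What the refcount test in
 * wp_can_reuse_anon_folio() is getting at: for an order-0 anonymous folio
 * mapped exactly once, the only expected references are the one from that
 * mapping plus one more if it still sits in the swapcache.  Anything beyond
 * that (GUP pins, another mapping, a remote LRU batch reference) means the
 * folio cannot be handed back exclusively and the fault must copy instead.
 * Hypothetical helper mirroring that check:
 */
static inline bool example_only_expected_refs(struct folio *folio)
{
        return folio_ref_count(folio) == 1 + folio_test_swapcache(folio);
}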
3632                                                  
3633 /*
3634  * This routine handles present pages, when
3635  * * users try to write to a shared page (FAULT_FLAG_WRITE)
3636  * * GUP wants to take a R/O pin on a possibly shared anonymous page
3637  *   (FAULT_FLAG_UNSHARE)
3638  *
3639  * It is done by copying the page to a new address and decrementing the
3640  * shared-page counter for the old page.
3641  *
3642  * Note that this routine assumes that the protection checks have been
3643  * done by the caller (the low-level page fault routine in most cases).
3644  * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
3645  * done any necessary COW.
3646  *
3647  * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
3648  * though the page will change only once the write actually happens. This
3649  * avoids a few races, and potentially makes it more efficient.
3650  *
3651  * We enter with non-exclusive mmap_lock (to exclude vma changes,
3652  * but allow concurrent faults), with pte both mapped and locked.
3653  * We return with mmap_lock still held, but pte unmapped and unlocked.
3654  */
3655 static vm_fault_t do_wp_page(struct vm_fault     
3656         __releases(vmf->ptl)                     
3657 {                                                
3658         const bool unshare = vmf->flags & FAU    
3659         struct vm_area_struct *vma = vmf->vma    
3660         struct folio *folio = NULL;              
3661         pte_t pte;                               
3662                                                  
3663         if (likely(!unshare)) {                  
3664                 if (userfaultfd_pte_wp(vma, p    
3665                         if (!userfaultfd_wp_a    
3666                                 pte_unmap_unl    
3667                                 return handle    
3668                         }                        
3669                                                  
3670                         /*                       
3671                          * Nothing needed (ca    
3672                          * etc.) because we'r    
3673                          * which is completel    
3674                          */                      
3675                         pte = pte_clear_uffd_    
3676                                                  
3677                         set_pte_at(vma->vm_mm    
3678                         /*                       
3679                          * Update this to be     
3680                          * handling              
3681                          */                      
3682                         vmf->orig_pte = pte;     
3683                 }                                
3684                                                  
3685                 /*                               
3686                  * Userfaultfd write-protect     
3687                  * is flushed in this case be    
3688                  */                              
3689                 if (unlikely(userfaultfd_wp(v    
3690                              mm_tlb_flush_pen    
3691                         flush_tlb_page(vmf->v    
3692         }                                        
3693                                                  
3694         vmf->page = vm_normal_page(vma, vmf->    
3695                                                  
3696         if (vmf->page)                           
3697                 folio = page_folio(vmf->page)    
3698                                                  
3699         /*                                       
3700          * Shared mapping: we are guaranteed     
3701          * FAULT_FLAG_WRITE set at this point    
3702          */                                      
3703         if (vma->vm_flags & (VM_SHARED | VM_M    
3704                 /*                               
3705                  * VM_MIXEDMAP !pfn_valid() c    
3706                  * VM_PFNMAP VMA.                
3707                  *                               
3708                  * We should not cow pages in    
3709                  * Just mark the pages writab    
3710                  */                              
3711                 if (!vmf->page)                  
3712                         return wp_pfn_shared(    
3713                 return wp_page_shared(vmf, fo    
3714         }                                        
3715                                                  
3716         /*                                       
3717          * Private mapping: create an exclusi    
3718          * is impossible. We might miss VM_WR    
3719          *                                       
3720          * If we encounter a page that is mar    
3721          * the page without further checks.      
3722          */                                      
3723         if (folio && folio_test_anon(folio) &    
3724             (PageAnonExclusive(vmf->page) ||     
3725                 if (!PageAnonExclusive(vmf->p    
3726                         SetPageAnonExclusive(    
3727                 if (unlikely(unshare)) {         
3728                         pte_unmap_unlock(vmf-    
3729                         return 0;                
3730                 }                                
3731                 wp_page_reuse(vmf, folio);       
3732                 return 0;                        
3733         }                                        
3734         /*                                       
3735          * Ok, we need to copy. Oh, well..       
3736          */                                      
3737         if (folio)                               
3738                 folio_get(folio);                
3739                                                  
3740         pte_unmap_unlock(vmf->pte, vmf->ptl);    
3741 #ifdef CONFIG_KSM                                
3742         if (folio && folio_test_ksm(folio))      
3743                 count_vm_event(COW_KSM);         
3744 #endif                                           
3745         return wp_page_copy(vmf);                
3746 }                                                
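
/*
 * [Editorial illustration, not part of mm/memory.c] When do_wp_page() runs,
 * seen from userspace: after fork(), both processes map the same anonymous
 * page write-protected.  The child's store below takes a write fault on a
 * present pte, do_wp_page() finds the page is not exclusively reusable, and
 * wp_page_copy() gives the child its own copy; the parent keeps seeing 'A'.
 * Hypothetical standalone program, error handling omitted.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        p[0] = 'A';                     /* first touch: do_anonymous_page() */
        if (fork() == 0) {
                p[0] = 'B';             /* write fault -> do_wp_page() -> wp_page_copy() */
                _exit(0);
        }
        wait(NULL);
        printf("parent sees %c\n", p[0]);       /* still 'A' */
        munmap(p, 4096);
        return 0;
}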
3747                                                  
3748 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
3749                 unsigned long start_addr, unsigned long end_addr,
3750                 struct zap_details *details)
3751 {
3752         zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
3753 }
3754 
3755 static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
3756                                             pgoff_t first_index,
3757                                             pgoff_t last_index,
3758                                             struct zap_details *details)
3759 {
3760         struct vm_area_struct *vma;
3761         pgoff_t vba, vea, zba, zea;
3762 
3763         vma_interval_tree_foreach(vma, root, first_index, last_index) {
3764                 vba = vma->vm_pgoff;
3765                 vea = vba + vma_pages(vma) - 1;
3766                 zba = max(first_index, vba);
3767                 zea = min(last_index, vea);
3768 
3769                 unmap_mapping_range_vma(vma,
3770                         ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
3771                         ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
3772                                 details);
3773         }
3774 }
3775                                                  
3776 /**
3777  * unmap_mapping_folio() - Unmap single folio from processes.
3778  * @folio: The locked folio to be unmapped.
3779  *
3780  * Unmap this folio from any userspace process which still has it mmaped.
3781  * Typically, for efficiency, the range of nearby pages has already been
3782  * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
3783  * truncation or invalidation holds the lock on a folio, it may find that
3784  * the page has been remapped again: and then uses unmap_mapping_folio()
3785  * to unmap it finally.
3786  */
3787 void unmap_mapping_folio(struct folio *folio)
3788 {
3789         struct address_space *mapping = folio->mapping;
3790         struct zap_details details = { };
3791         pgoff_t first_index;
3792         pgoff_t last_index;
3793 
3794         VM_BUG_ON(!folio_test_locked(folio));
3795 
3796         first_index = folio->index;
3797         last_index = folio_next_index(folio) - 1;
3798 
3799         details.even_cows = false;
3800         details.single_folio = folio;
3801         details.zap_flags = ZAP_FLAG_DROP_MARKER;
3802 
3803         i_mmap_lock_read(mapping);
3804         if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3805                 unmap_mapping_range_tree(&mapping->i_mmap, first_index,
3806                                          last_index, &details);
3807         i_mmap_unlock_read(mapping);
3808 }
3809                                                  
3810 /**
3811  * unmap_mapping_pages() - Unmap pages from processes.
3812  * @mapping: The address space containing pages to be unmapped.
3813  * @start: Index of first page to be unmapped.
3814  * @nr: Number of pages to be unmapped.  0 to unmap all pages from @start.
3815  * @even_cows: Whether to unmap even private COWed pages.
3816  *
3817  * Unmap the pages in this address space from any userspace process which
3818  * has them mmaped.  Generally, you want to remove COWed pages as well when
3819  * a file is being truncated, but not when invalidating pages from the page
3820  * cache.
3821  */
3822 void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
3823                 pgoff_t nr, bool even_cows)
3824 {
3825         struct zap_details details = { };
3826         pgoff_t first_index = start;
3827         pgoff_t last_index = start + nr - 1;
3828 
3829         details.even_cows = even_cows;
3830         if (last_index < first_index)
3831                 last_index = ULONG_MAX;
3832 
3833         i_mmap_lock_read(mapping);
3834         if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3835                 unmap_mapping_range_tree(&mapping->i_mmap, first_index,
3836                                          last_index, &details);
3837         i_mmap_unlock_read(mapping);
3838 }
3839 EXPORT_SYMBOL_GPL(unmap_mapping_pages);
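
/*
 * [Editorial illustration, not part of mm/memory.c] Typical even_cows=false
 * use: pagecache invalidation (e.g. before direct I/O, or when a network
 * filesystem revalidates) drops the file's pages from user page tables but
 * leaves private COW copies alone.  Hypothetical wrapper; compare
 * invalidate_inode_pages2_range() in mm/truncate.c for a real caller.
 */
static void example_invalidate_user_mappings(struct address_space *mapping,
                                             pgoff_t start, pgoff_t nr)
{
        unmap_mapping_pages(mapping, start, nr, false);
}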
3840                                                  
3841 /**
3842  * unmap_mapping_range - unmap the portion of all mmaps in the specified
3843  * address_space corresponding to the specified byte range in the underlying
3844  * file.
3845  *
3846  * @mapping: the address space containing mmaps to be unmapped.
3847  * @holebegin: byte in first page to unmap, relative to the start of
3848  * the underlying file.  This will be rounded down to a PAGE_SIZE
3849  * boundary.  Note that this is different from truncate_pagecache(), which
3850  * must keep the partial page.  In contrast, we must get rid of
3851  * partial pages.
3852  * @holelen: size of prospective hole in bytes.  This will be rounded
3853  * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
3854  * end of the file.
3855  * @even_cows: 1 when truncating a file, unmap even private COWed pages;
3856  * but 0 when invalidating pagecache, don't throw away private data.
3857  */
3858 void unmap_mapping_range(struct address_space *mapping,
3859                 loff_t const holebegin, loff_t const holelen, int even_cows)
3860 {
3861         pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
3862         pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
3863 
3864         /* Check for overflow. */
3865         if (sizeof(holelen) > sizeof(hlen)) {
3866                 long long holeend =
3867                         (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3868                 if (holeend & ~(long long)ULONG_MAX)
3869                         hlen = ULONG_MAX - hba + 1;
3870         }
3871 
3872         unmap_mapping_pages(mapping, hba, hlen, even_cows);
3873 }
3874 EXPORT_SYMBOL(unmap_mapping_range);
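
/*
 * [Editorial illustration, not part of mm/memory.c] Typical even_cows=1 use:
 * truncation must shoot the range out of every mapping, including private
 * COW copies, both before and after dropping the pagecache so that
 * concurrent faults cannot re-instantiate stale pages.  Simplified sketch of
 * that pattern (hypothetical helper; the real code is truncate_pagecache()
 * in mm/truncate.c):
 */
static void example_truncate_user_mappings(struct inode *inode, loff_t newsize)
{
        struct address_space *mapping = inode->i_mapping;
        loff_t holebegin = round_up(newsize, PAGE_SIZE);

        unmap_mapping_range(mapping, holebegin, 0, 1);  /* unmap to EOF */
        truncate_inode_pages(mapping, newsize);         /* drop the pagecache */
        unmap_mapping_range(mapping, holebegin, 0, 1);  /* close the re-fault race */
}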
3875                                                  
3876 /*                                               
3877  * Restore a potential device exclusive pte t    
3878  */                                              
3879 static vm_fault_t remove_device_exclusive_ent    
3880 {                                                
3881         struct folio *folio = page_folio(vmf-    
3882         struct vm_area_struct *vma = vmf->vma    
3883         struct mmu_notifier_range range;         
3884         vm_fault_t ret;                          
3885                                                  
3886         /*                                       
3887          * We need a reference to lock the fo    
3888          * the PTL so a racing thread can rem    
3889          * entry and unmap it. If the folio i    
3890          * have been removed already. If it h    
3891          * been re-allocated after being free    
3892          * unlock it.                            
3893          */                                      
3894         if (!folio_try_get(folio))               
3895                 return 0;                        
3896                                                  
3897         ret = folio_lock_or_retry(folio, vmf)    
3898         if (ret) {                               
3899                 folio_put(folio);                
3900                 return ret;                      
3901         }                                        
3902         mmu_notifier_range_init_owner(&range,    
3903                                 vma->vm_mm, v    
3904                                 (vmf->address    
3905         mmu_notifier_invalidate_range_start(&    
3906                                                  
3907         vmf->pte = pte_offset_map_lock(vma->v    
3908                                 &vmf->ptl);      
3909         if (likely(vmf->pte && pte_same(ptep_    
3910                 restore_exclusive_pte(vma, vm    
3911                                                  
3912         if (vmf->pte)                            
3913                 pte_unmap_unlock(vmf->pte, vm    
3914         folio_unlock(folio);                     
3915         folio_put(folio);                        
3916                                                  
3917         mmu_notifier_invalidate_range_end(&ra    
3918         return 0;                                
3919 }                                                
3920                                                  
3921 static inline bool should_try_to_free_swap(st    
3922                                            st    
3923                                            un    
3924 {                                                
3925         if (!folio_test_swapcache(folio))        
3926                 return false;                    
3927         if (mem_cgroup_swap_full(folio) || (v    
3928             folio_test_mlocked(folio))           
3929                 return true;                     
3930         /*                                       
3931          * If we want to map a page that's in    
3932          * have to detect via the refcount if    
3933          * user. Try freeing the swapcache to    
3934          * reference only in case it's likely    
3935          */                                      
3936         return (fault_flags & FAULT_FLAG_WRIT    
3937                 folio_ref_count(folio) == (1     
3938 }                                                
3939                                                  
3940 static vm_fault_t pte_marker_clear(struct vm_    
3941 {                                                
3942         vmf->pte = pte_offset_map_lock(vmf->v    
3943                                        vmf->a    
3944         if (!vmf->pte)                           
3945                 return 0;                        
3946         /*                                       
3947          * Be careful so that we will only re    
3948          * none pte.  Otherwise it means the     
3949          *                                       
3950          * This should also cover the case wh    
3951          * quickly from a PTE_MARKER_UFFD_WP     
3952          * So is_pte_marker() check is not en    
3953          */                                      
3954         if (pte_same(vmf->orig_pte, ptep_get(    
3955                 pte_clear(vmf->vma->vm_mm, vm    
3956         pte_unmap_unlock(vmf->pte, vmf->ptl);    
3957         return 0;                                
3958 }                                                
3959                                                  
3960 static vm_fault_t do_pte_missing(struct vm_fa    
3961 {                                                
3962         if (vma_is_anonymous(vmf->vma))          
3963                 return do_anonymous_page(vmf)    
3964         else                                     
3965                 return do_fault(vmf);            
3966 }                                                
3967                                                  
3968 /*                                               
3969  * This is actually a page-missing access, bu    
3970  * installed.  It means this pte was wr-prote    
3971  */                                              
3972 static vm_fault_t pte_marker_handle_uffd_wp(s    
3973 {                                                
3974         /*                                       
3975          * Just in case there're leftover spe    
3976          * got unregistered - we can simply c    
3977          */                                      
3978         if (unlikely(!userfaultfd_wp(vmf->vma    
3979                 return pte_marker_clear(vmf);    
3980                                                  
3981         return do_pte_missing(vmf);              
3982 }                                                
3983                                                  
3984 static vm_fault_t handle_pte_marker(struct vm    
3985 {                                                
3986         swp_entry_t entry = pte_to_swp_entry(    
3987         unsigned long marker = pte_marker_get    
3988                                                  
3989         /*                                       
3990          * PTE markers should never be empty.    
3991          * the best thing to do is to kill th    
3992          */                                      
3993         if (WARN_ON_ONCE(!marker))               
3994                 return VM_FAULT_SIGBUS;          
3995                                                  
3996         /* Higher priority than uffd-wp when     
3997         if (marker & PTE_MARKER_POISONED)        
3998                 return VM_FAULT_HWPOISON;        
3999                                                  
4000         if (pte_marker_entry_uffd_wp(entry))     
4001                 return pte_marker_handle_uffd    
4002                                                  
4003         /* This is an unknown pte marker */      
4004         return VM_FAULT_SIGBUS;                  
4005 }                                                
4006                                                  
4007 static struct folio *__alloc_swap_folio(struc    
4008 {                                                
4009         struct vm_area_struct *vma = vmf->vma    
4010         struct folio *folio;                     
4011         swp_entry_t entry;                       
4012                                                  
4013         folio = vma_alloc_folio(GFP_HIGHUSER_    
4014                                 vmf->address,    
4015         if (!folio)                              
4016                 return NULL;                     
4017                                                  
4018         entry = pte_to_swp_entry(vmf->orig_pt    
4019         if (mem_cgroup_swapin_charge_folio(fo    
4020                                            GF    
4021                 folio_put(folio);                
4022                 return NULL;                     
4023         }                                        
4024                                                  
4025         return folio;                            
4026 }                                                
4027                                                  
4028 #ifdef CONFIG_TRANSPARENT_HUGEPAGE               
4029 static inline int non_swapcache_batch(swp_ent    
4030 {                                                
4031         struct swap_info_struct *si = swp_swa    
4032         pgoff_t offset = swp_offset(entry);      
4033         int i;                                   
4034                                                  
4035         /*                                       
4036          * While allocating a large folio and    
4037          * the case the being faulted pte doe    
4038          * ensure all PTEs have no cache as w    
4039          * swap devices while the content is     
4040          */                                      
4041         for (i = 0; i < max_nr; i++) {           
4042                 if ((si->swap_map[offset + i]    
4043                         return i;                
4044         }                                        
4045                                                  
4046         return i;                                
4047 }                                                
4048                                                  
4049 /*                                               
4050  * Check if the PTEs within a range are conti    
4051  * and have consistent swapcache, zeromap.       
4052  */                                              
4053 static bool can_swapin_thp(struct vm_fault *v    
4054 {                                                
4055         unsigned long addr;                      
4056         swp_entry_t entry;                       
4057         int idx;                                 
4058         pte_t pte;                               
4059                                                  
4060         addr = ALIGN_DOWN(vmf->address, nr_pa    
4061         idx = (vmf->address - addr) / PAGE_SI    
4062         pte = ptep_get(ptep);                    
4063                                                  
4064         if (!pte_same(pte, pte_move_swp_offse    
4065                 return false;                    
4066         entry = pte_to_swp_entry(pte);           
4067         if (swap_pte_batch(ptep, nr_pages, pt    
4068                 return false;                    
4069                                                  
4070         /*                                       
4071          * swap_read_folio() can't handle the    
4072          * from different backends. And they     
4073          * things might be added once zswap s    
4074          */                                      
4075         if (unlikely(swap_zeromap_batch(entry    
4076                 return false;                    
4077         if (unlikely(non_swapcache_batch(entr    
4078                 return false;                    
4079                                                  
4080         return true;                             
4081 }                                                
4082                                                  
4083 static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
4084                                                       unsigned long addr,
4085                                                       unsigned long orders)
4086 {
4087         int order, nr;
4088 
4089         order = highest_order(orders);
4090 
4091         /*
4092          * To swap in a THP with nr pages, we require that its first swap_offset
4093          * is aligned with that number, as it was when the THP was swapped out.
4094          * This helps filter out most invalid entries.
4095          */
4096         while (orders) {
4097                 nr = 1 << order;
4098                 if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
4099                         break;
4100                 order = next_order(&orders, order);
4101         }
4102 
4103         return orders;
4104 }
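
/*
 * [Editorial illustration, not part of mm/memory.c] The alignment filter
 * above, restated with concrete numbers (hypothetical helper): a fault at
 * page index 0x1234 whose swap entry sits at offset 0x5674 passes the
 * order-4 (16-page) test since both leave remainder 4 modulo 16, i.e. the
 * large folio would occupy the same relative slot it had when it was swapped
 * out; an offset of 0x5678 (remainder 8) would only qualify for smaller
 * orders.
 */
static inline bool example_order_suitable(unsigned long pg_index,
                                          pgoff_t swp_off, unsigned int order)
{
        unsigned int nr = 1U << order;

        return (pg_index % nr) == (swp_off % nr);
}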
4105                                                  
4106 static struct folio *alloc_swap_folio(struct     
4107 {                                                
4108         struct vm_area_struct *vma = vmf->vma    
4109         unsigned long orders;                    
4110         struct folio *folio;                     
4111         unsigned long addr;                      
4112         swp_entry_t entry;                       
4113         spinlock_t *ptl;                         
4114         pte_t *pte;                              
4115         gfp_t gfp;                               
4116         int order;                               
4117                                                  
4118         /*                                       
4119          * If uffd is active for the vma we n    
4120          * maintain the uffd semantics.          
4121          */                                      
4122         if (unlikely(userfaultfd_armed(vma)))    
4123                 goto fallback;                   
4124                                                  
4125         /*                                       
4126          * A large swapped out folio could be    
4127          * lack handling for such cases, so f    
4128          * folio.                                
4129          */                                      
4130         if (!zswap_never_enabled())              
4131                 goto fallback;                   
4132                                                  
4133         entry = pte_to_swp_entry(vmf->orig_pt    
4134         /*                                       
4135          * Get a list of all the (large) orde    
4136          * and suitable for swapping THP.        
4137          */                                      
4138         orders = thp_vma_allowable_orders(vma    
4139                         TVA_IN_PF | TVA_ENFOR    
4140         orders = thp_vma_suitable_orders(vma,    
4141         orders = thp_swap_suitable_orders(swp    
4142                                           vmf    
4143                                                  
4144         if (!orders)                             
4145                 goto fallback;                   
4146                                                  
4147         pte = pte_offset_map_lock(vmf->vma->v    
4148                                   vmf->addres    
4149         if (unlikely(!pte))                      
4150                 goto fallback;                   
4151                                                  
4152         /*                                       
4153          * For do_swap_page, find the highest    
4154          * completely swap entries with conti    
4155          */                                      
4156         order = highest_order(orders);           
4157         while (orders) {                         
4158                 addr = ALIGN_DOWN(vmf->addres    
4159                 if (can_swapin_thp(vmf, pte +    
4160                         break;                   
4161                 order = next_order(&orders, o    
4162         }                                        
4163                                                  
4164         pte_unmap_unlock(pte, ptl);              
4165                                                  
4166         /* Try allocating the highest of the     
4167         gfp = vma_thp_gfp_mask(vma);             
4168         while (orders) {                         
4169                 addr = ALIGN_DOWN(vmf->addres    
4170                 folio = vma_alloc_folio(gfp,     
4171                 if (folio) {                     
4172                         if (!mem_cgroup_swapi    
4173                                                  
4174                                 return folio;    
4175                         folio_put(folio);        
4176                 }                                
4177                 order = next_order(&orders, o    
4178         }                                        
4179                                                  
4180 fallback:                                        
4181         return __alloc_swap_folio(vmf);          
4182 }                                                
4183 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */         
4184 static struct folio *alloc_swap_folio(struct     
4185 {                                                
4186         return __alloc_swap_folio(vmf);          
4187 }                                                
4188 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */         
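
alloc_swap_folio() tries the largest surviving order first and steps down to smaller orders when allocation or the memcg swap-in charge fails, finally falling back to a single page via __alloc_swap_folio(). The sketch below models that highest-order-first fallback; malloc() stands in for the real folio allocator and try_alloc_order() is invented purely to exercise the loop.

/*
 * "Try the largest order first, fall back to smaller" pattern, loosely
 * modelled on alloc_swap_folio()/alloc_anon_folio(). Not kernel API.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

static void *try_alloc_order(int order)
{
        /* Pretend anything above order 3 fails, to exercise the fallback. */
        if (order > 3)
                return NULL;
        return malloc(PAGE_SIZE << order);
}

static void *alloc_highest_order(unsigned long orders, int *picked)
{
        for (int order = 9; order >= 0; order--) {
                if (!(orders & (1UL << order)))
                        continue;
                void *p = try_alloc_order(order);
                if (p) {
                        *picked = order;
                        return p;
                }
        }
        *picked = 0;
        return malloc(PAGE_SIZE);       /* order-0 fallback */
}

int main(void)
{
        int order;
        unsigned long orders = (1UL << 2) | (1UL << 4) | (1UL << 6);
        void *p = alloc_highest_order(orders, &order);

        printf("allocated order %d (%lu bytes)\n", order, PAGE_SIZE << order);
        free(p);
        return 0;
}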
4189                                                  
4190 static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);    
4191                                                  
4192 /*                                               
4193  * We enter with non-exclusive mmap_lock (to     
4194  * but allow concurrent faults), and pte mapp    
4195  * We return with pte unmapped and unlocked.     
4196  *                                               
4197  * We return with the mmap_lock locked or unl    
4198  * as does filemap_fault().                      
4199  */                                              
4200 vm_fault_t do_swap_page(struct vm_fault *vmf)    
4201 {                                                
4202         struct vm_area_struct *vma = vmf->vma    
4203         struct folio *swapcache, *folio = NUL    
4204         DECLARE_WAITQUEUE(wait, current);        
4205         struct page *page;                       
4206         struct swap_info_struct *si = NULL;      
4207         rmap_t rmap_flags = RMAP_NONE;           
4208         bool need_clear_cache = false;           
4209         bool exclusive = false;                  
4210         swp_entry_t entry;                       
4211         pte_t pte;                               
4212         vm_fault_t ret = 0;                      
4213         void *shadow = NULL;                     
4214         int nr_pages;                            
4215         unsigned long page_idx;                  
4216         unsigned long address;                   
4217         pte_t *ptep;                             
4218                                                  
4219         if (!pte_unmap_same(vmf))                
4220                 goto out;                        
4221                                                  
4222         entry = pte_to_swp_entry(vmf->orig_pt    
4223         if (unlikely(non_swap_entry(entry)))     
4224                 if (is_migration_entry(entry)    
4225                         migration_entry_wait(    
4226                                                  
4227                 } else if (is_device_exclusiv    
4228                         vmf->page = pfn_swap_    
4229                         ret = remove_device_e    
4230                 } else if (is_device_private_    
4231                         if (vmf->flags & FAUL    
4232                                 /*               
4233                                  * migrate_to    
4234                                  * under VMA     
4235                                  */              
4236                                 vma_end_read(    
4237                                 ret = VM_FAUL    
4238                                 goto out;        
4239                         }                        
4240                                                  
4241                         vmf->page = pfn_swap_    
4242                         vmf->pte = pte_offset    
4243                                         vmf->    
4244                         if (unlikely(!vmf->pt    
4245                                      !pte_sam    
4246                                                  
4247                                 goto unlock;     
4248                                                  
4249                         /*                       
4250                          * Get a page referen    
4251                          * freed.                
4252                          */                      
4253                         get_page(vmf->page);     
4254                         pte_unmap_unlock(vmf-    
4255                         ret = vmf->page->pgma    
4256                         put_page(vmf->page);     
4257                 } else if (is_hwpoison_entry(    
4258                         ret = VM_FAULT_HWPOIS    
4259                 } else if (is_pte_marker_entr    
4260                         ret = handle_pte_mark    
4261                 } else {                         
4262                         print_bad_pte(vma, vm    
4263                         ret = VM_FAULT_SIGBUS    
4264                 }                                
4265                 goto out;                        
4266         }                                        
4267                                                  
4268         /* Prevent swapoff from happening to     
4269         si = get_swap_device(entry);             
4270         if (unlikely(!si))                       
4271                 goto out;                        
4272                                                  
4273         folio = swap_cache_get_folio(entry, v    
4274         if (folio)                               
4275                 page = folio_file_page(folio,    
4276         swapcache = folio;                       
4277                                                  
4278         if (!folio) {                            
4279                 if (data_race(si->flags & SWP    
4280                     __swap_count(entry) == 1)    
4281                         /* skip swapcache */     
4282                         folio = alloc_swap_fo    
4283                         if (folio) {             
4284                                 __folio_set_l    
4285                                 __folio_set_s    
4286                                                  
4287                                 nr_pages = fo    
4288                                 if (folio_tes    
4289                                         entry    
4290                                 /*               
4291                                  * Prevent pa    
4292                                  * the cache     
4293                                  * may finish    
4294                                  * swapout re    
4295                                  * undetectab    
4296                                  * to entry r    
4297                                  */              
4298                                 if (swapcache    
4299                                         /*       
4300                                          * Re    
4301                                          * re    
4302                                          */      
4303                                         add_w    
4304                                         sched    
4305                                         remov    
4306                                         goto     
4307                                 }                
4308                                 need_clear_ca    
4309                                                  
4310                                 mem_cgroup_sw    
4311                                                  
4312                                 shadow = get_    
4313                                 if (shadow)      
4314                                         worki    
4315                                                  
4316                                 folio_add_lru    
4317                                                  
4318                                 /* To provide    
4319                                 folio->swap =    
4320                                 swap_read_fol    
4321                                 folio->privat    
4322                         }                        
4323                 } else {                         
4324                         folio = swapin_readah    
4325                                                  
4326                         swapcache = folio;       
4327                 }                                
4328                                                  
4329                 if (!folio) {                    
4330                         /*                       
4331                          * Back out if somebo    
4332                          * while we released     
4333                          */                      
4334                         vmf->pte = pte_offset    
4335                                         vmf->    
4336                         if (likely(vmf->pte &    
4337                                    pte_same(p    
4338                                 ret = VM_FAUL    
4339                         goto unlock;             
4340                 }                                
4341                                                  
4342                 /* Had to read the page from     
4343                 ret = VM_FAULT_MAJOR;            
4344                 count_vm_event(PGMAJFAULT);      
4345                 count_memcg_event_mm(vma->vm_    
4346                 page = folio_file_page(folio,    
4347         } else if (PageHWPoison(page)) {         
4348                 /*                               
4349                  * hwpoisoned dirty swapcache    
4350                  * owner processes (which may    
4351                  */                              
4352                 ret = VM_FAULT_HWPOISON;         
4353                 goto out_release;                
4354         }                                        
4355                                                  
4356         ret |= folio_lock_or_retry(folio, vmf    
4357         if (ret & VM_FAULT_RETRY)                
4358                 goto out_release;                
4359                                                  
4360         if (swapcache) {                         
4361                 /*                               
4362                  * Make sure folio_free_swap(    
4363                  * swapcache from under us.      
4364                  * below, are not enough to e    
4365                  * swapcache, we need to chec    
4366                  * changed.                      
4367                  */                              
4368                 if (unlikely(!folio_test_swap    
4369                              page_swap_entry(    
4370                         goto out_page;           
4371                                                  
4372                 /*                               
4373                  * KSM sometimes has to copy     
4374                  * page->index of !PageKSM()     
4375                  * anon VMA -- PageKSM() is l    
4376                  */                              
4377                 folio = ksm_might_need_to_cop    
4378                 if (unlikely(!folio)) {          
4379                         ret = VM_FAULT_OOM;      
4380                         folio = swapcache;       
4381                         goto out_page;           
4382                 } else if (unlikely(folio ==     
4383                         ret = VM_FAULT_HWPOIS    
4384                         folio = swapcache;       
4385                         goto out_page;           
4386                 }                                
4387                 if (folio != swapcache)          
4388                         page = folio_page(fol    
4389                                                  
4390                 /*                               
4391                  * If we want to map a page t    
4392                  * have to detect via the ref    
4393                  * owner. Try removing the ex    
4394                  * caches if required.           
4395                  */                              
4396                 if ((vmf->flags & FAULT_FLAG_    
4397                     !folio_test_ksm(folio) &&    
4398                         lru_add_drain();         
4399         }                                        
4400                                                  
4401         folio_throttle_swaprate(folio, GFP_KE    
4402                                                  
4403         /*                                       
4404          * Back out if somebody else already     
4405          */                                      
4406         vmf->pte = pte_offset_map_lock(vma->v    
4407                         &vmf->ptl);              
4408         if (unlikely(!vmf->pte || !pte_same(p    
4409                 goto out_nomap;                  
4410                                                  
4411         if (unlikely(!folio_test_uptodate(fol    
4412                 ret = VM_FAULT_SIGBUS;           
4413                 goto out_nomap;                  
4414         }                                        
4415                                                  
4416         /* allocated large folios for SWP_SYN    
4417         if (folio_test_large(folio) && !folio    
4418                 unsigned long nr = folio_nr_p    
4419                 unsigned long folio_start = A    
4420                 unsigned long idx = (vmf->add    
4421                 pte_t *folio_ptep = vmf->pte     
4422                 pte_t folio_pte = ptep_get(fo    
4423                                                  
4424                 if (!pte_same(folio_pte, pte_    
4425                     swap_pte_batch(folio_ptep    
4426                         goto out_nomap;          
4427                                                  
4428                 page_idx = idx;                  
4429                 address = folio_start;           
4430                 ptep = folio_ptep;               
4431                 goto check_folio;                
4432         }                                        
4433                                                  
4434         nr_pages = 1;                            
4435         page_idx = 0;                            
4436         address = vmf->address;                  
4437         ptep = vmf->pte;                         
4438         if (folio_test_large(folio) && folio_    
4439                 int nr = folio_nr_pages(folio    
4440                 unsigned long idx = folio_pag    
4441                 unsigned long folio_start = a    
4442                 unsigned long folio_end = fol    
4443                 pte_t *folio_ptep;               
4444                 pte_t folio_pte;                 
4445                                                  
4446                 if (unlikely(folio_start < ma    
4447                         goto check_folio;        
4448                 if (unlikely(folio_end > pmd_    
4449                         goto check_folio;        
4450                                                  
4451                 folio_ptep = vmf->pte - idx;     
4452                 folio_pte = ptep_get(folio_pt    
4453                 if (!pte_same(folio_pte, pte_    
4454                     swap_pte_batch(folio_ptep    
4455                         goto check_folio;        
4456                                                  
4457                 page_idx = idx;                  
4458                 address = folio_start;           
4459                 ptep = folio_ptep;               
4460                 nr_pages = nr;                   
4461                 entry = folio->swap;             
4462                 page = &folio->page;             
4463         }                                        
4464                                                  
4465 check_folio:                                     
4466         /*                                       
4467          * PG_anon_exclusive reuses PG_mapped    
4468          * must never point at an anonymous p    
4469          * PG_anon_exclusive. Sanity check th    
4470          * no filesystem set PG_mappedtodisk     
4471          * check after taking the PT lock and    
4472          * concurrently faulted in this page     
4473          */                                      
4474         BUG_ON(!folio_test_anon(folio) && fol    
4475         BUG_ON(folio_test_anon(folio) && Page    
4476                                                  
4477         /*                                       
4478          * Check under PT lock (to protect ag    
4479          * the swap entry concurrently) for c    
4480          */                                      
4481         if (!folio_test_ksm(folio)) {            
4482                 exclusive = pte_swp_exclusive    
4483                 if (folio != swapcache) {        
4484                         /*                       
4485                          * We have a fresh pa    
4486                          * swapcache -> certa    
4487                          */                      
4488                         exclusive = true;        
4489                 } else if (exclusive && folio    
4490                           data_race(si->flags    
4491                         /*                       
4492                          * This is tricky: no    
4493                          * concurrent page mo    
4494                          *                       
4495                          * So if we stumble o    
4496                          * we must not set th    
4497                          * map it writable wi    
4498                          * while still under     
4499                          *                       
4500                          * For these problema    
4501                          * exclusive marker:     
4502                          * writeback only if     
4503                          * there are no unexp    
4504                          * unmapping succeede    
4505                          * further GUP refere    
4506                          * appear, so droppin    
4507                          * it only R/O is fin    
4508                          */                      
4509                         exclusive = false;       
4510                 }                                
4511         }                                        
4512                                                  
4513         /*                                       
4514          * Some architectures may have to res    
4515          * when reading from swap. This metad    
4516          * so this must be called before swap    
4517          */                                      
4518         arch_swap_restore(folio_swap(entry, f    
4519                                                  
4520         /*                                       
4521          * Remove the swap entry and conditio    
4522          * We're already holding a reference     
4523          * yet.                                  
4524          */                                      
4525         swap_free_nr(entry, nr_pages);           
4526         if (should_try_to_free_swap(folio, vm    
4527                 folio_free_swap(folio);          
4528                                                  
4529         add_mm_counter(vma->vm_mm, MM_ANONPAG    
4530         add_mm_counter(vma->vm_mm, MM_SWAPENT    
4531         pte = mk_pte(page, vma->vm_page_prot)    
4532         if (pte_swp_soft_dirty(vmf->orig_pte)    
4533                 pte = pte_mksoft_dirty(pte);     
4534         if (pte_swp_uffd_wp(vmf->orig_pte))      
4535                 pte = pte_mkuffd_wp(pte);        
4536                                                  
4537         /*                                       
4538          * Same logic as in do_wp_page(); how    
4539          * certainly not shared either becaus    
4540          * exposing them to the swapcache or     
4541          * exclusivity.                          
4542          */                                      
4543         if (!folio_test_ksm(folio) &&            
4544             (exclusive || folio_ref_count(fol    
4545                 if ((vma->vm_flags & VM_WRITE    
4546                     !pte_needs_soft_dirty_wp(    
4547                         pte = pte_mkwrite(pte    
4548                         if (vmf->flags & FAUL    
4549                                 pte = pte_mkd    
4550                                 vmf->flags &=    
4551                         }                        
4552                 }                                
4553                 rmap_flags |= RMAP_EXCLUSIVE;    
4554         }                                        
4555         folio_ref_add(folio, nr_pages - 1);      
4556         flush_icache_pages(vma, page, nr_page    
4557         vmf->orig_pte = pte_advance_pfn(pte,     
4558                                                  
4559         /* ksm created a completely new copy     
4560         if (unlikely(folio != swapcache && sw    
4561                 folio_add_new_anon_rmap(folio    
4562                 folio_add_lru_vma(folio, vma)    
4563         } else if (!folio_test_anon(folio)) {    
4564                 /*                               
4565                  * We currently only expect s    
4566                  * fully exclusive or fully s    
4567                  * folios which are fully exc    
4568                  * folios within swapcache he    
4569                  */                              
4570                 VM_WARN_ON_ONCE(folio_test_la    
4571                 VM_WARN_ON_FOLIO(!folio_test_    
4572                 folio_add_new_anon_rmap(folio    
4573         } else {                                 
4574                 folio_add_anon_rmap_ptes(foli    
4575                                         rmap_    
4576         }                                        
4577                                                  
4578         VM_BUG_ON(!folio_test_anon(folio) ||     
4579                         (pte_write(pte) && !P    
4580         set_ptes(vma->vm_mm, address, ptep, p    
4581         arch_do_swap_page_nr(vma->vm_mm, vma,    
4582                         pte, pte, nr_pages);     
4583                                                  
4584         folio_unlock(folio);                     
4585         if (folio != swapcache && swapcache) {
4586                 /*                               
4587                  * Hold the lock to avoid the    
4588                  * until we take the PT lock     
4589                  * (to avoid false positives     
4590                  * further safety release the    
4591                  * so that the swap count won    
4592                  * parallel locked swapcache.    
4593                  */                              
4594                 folio_unlock(swapcache);         
4595                 folio_put(swapcache);            
4596         }                                        
4597                                                  
4598         if (vmf->flags & FAULT_FLAG_WRITE) {     
4599                 ret |= do_wp_page(vmf);          
4600                 if (ret & VM_FAULT_ERROR)        
4601                         ret &= VM_FAULT_ERROR    
4602                 goto out;                        
4603         }                                        
4604                                                  
4605         /* No need to invalidate - it was non    
4606         update_mmu_cache_range(vmf, vma, addr    
4607 unlock:                                          
4608         if (vmf->pte)                            
4609                 pte_unmap_unlock(vmf->pte, vm    
4610 out:                                             
4611         /* Clear the swap cache pin for direc    
4612         if (need_clear_cache) {                  
4613                 swapcache_clear(si, entry, nr    
4614                 if (waitqueue_active(&swapcac    
4615                         wake_up(&swapcache_wq    
4616         }                                        
4617         if (si)                                  
4618                 put_swap_device(si);             
4619         return ret;                              
4620 out_nomap:                                       
4621         if (vmf->pte)                            
4622                 pte_unmap_unlock(vmf->pte, vm    
4623 out_page:                                        
4624         folio_unlock(folio);                     
4625 out_release:                                     
4626         folio_put(folio);                        
4627         if (folio != swapcache && swapcache) {
4628                 folio_unlock(swapcache);         
4629                 folio_put(swapcache);            
4630         }                                        
4631         if (need_clear_cache) {                  
4632                 swapcache_clear(si, entry, nr    
4633                 if (waitqueue_active(&swapcac    
4634                         wake_up(&swapcache_wq    
4635         }                                        
4636         if (si)                                  
4637                 put_swap_device(si);             
4638         return ret;                              
4639 }                                                
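
Throughout do_swap_page() the PTE is re-read under the page-table lock and compared against vmf->orig_pte with pte_same() before anything is committed, because the handler repeatedly drops the lock to allocate folios, read from swap, or sleep. The user-space analogue below shows the same sample / drop lock / revalidate / back-out shape with a pthread mutex; slot_state and fault_in() are illustrative names only, not a claim about kernel internals.

/* Build with: cc example.c -lpthread */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long slot_state = 42;   /* stands in for the pte */

static bool fault_in(unsigned long expected)
{
        usleep(1000);                   /* slow work done with the lock dropped */

        pthread_mutex_lock(&lock);
        if (slot_state != expected) {   /* someone else handled the fault: back out */
                pthread_mutex_unlock(&lock);
                return false;
        }
        slot_state = 0;                 /* "install" the new mapping */
        pthread_mutex_unlock(&lock);
        return true;
}

int main(void)
{
        pthread_mutex_lock(&lock);
        unsigned long snapshot = slot_state;
        pthread_mutex_unlock(&lock);

        printf("committed: %d\n", fault_in(snapshot));
        return 0;
}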
4640                                                  
4641 static bool pte_range_none(pte_t *pte, int nr    
4642 {                                                
4643         int i;                                   
4644                                                  
4645         for (i = 0; i < nr_pages; i++) {         
4646                 if (!pte_none(ptep_get_lockle    
4647                         return false;            
4648         }                                        
4649                                                  
4650         return true;                             
4651 }                                                
4652                                                  
4653 static struct folio *alloc_anon_folio(struct     
4654 {                                                
4655         struct vm_area_struct *vma = vmf->vma    
4656 #ifdef CONFIG_TRANSPARENT_HUGEPAGE               
4657         unsigned long orders;                    
4658         struct folio *folio;                     
4659         unsigned long addr;                      
4660         pte_t *pte;                              
4661         gfp_t gfp;                               
4662         int order;                               
4663                                                  
4664         /*                                       
4665          * If uffd is active for the vma we n    
4666          * maintain the uffd semantics.          
4667          */                                      
4668         if (unlikely(userfaultfd_armed(vma)))    
4669                 goto fallback;                   
4670                                                  
4671         /*                                       
4672          * Get a list of all the (large) orde    
4673          * for this vma. Then filter out the     
4674          * the faulting address and still be     
4675          */                                      
4676         orders = thp_vma_allowable_orders(vma    
4677                         TVA_IN_PF | TVA_ENFOR    
4678         orders = thp_vma_suitable_orders(vma,    
4679                                                  
4680         if (!orders)                             
4681                 goto fallback;                   
4682                                                  
4683         pte = pte_offset_map(vmf->pmd, vmf->a    
4684         if (!pte)                                
4685                 return ERR_PTR(-EAGAIN);         
4686                                                  
4687         /*                                       
4688          * Find the highest order where the a    
4689          * pte_none(). Note that all remainin    
4690          * pte_none().                           
4691          */                                      
4692         order = highest_order(orders);           
4693         while (orders) {                         
4694                 addr = ALIGN_DOWN(vmf->addres    
4695                 if (pte_range_none(pte + pte_    
4696                         break;                   
4697                 order = next_order(&orders, o    
4698         }                                        
4699                                                  
4700         pte_unmap(pte);                          
4701                                                  
4702         if (!orders)                             
4703                 goto fallback;                   
4704                                                  
4705         /* Try allocating the highest of the     
4706         gfp = vma_thp_gfp_mask(vma);             
4707         while (orders) {                         
4708                 addr = ALIGN_DOWN(vmf->addres    
4709                 folio = vma_alloc_folio(gfp,     
4710                 if (folio) {                     
4711                         if (mem_cgroup_charge    
4712                                 count_mthp_st    
4713                                 folio_put(fol    
4714                                 goto next;       
4715                         }                        
4716                         folio_throttle_swapra    
4717                         folio_zero_user(folio    
4718                         return folio;            
4719                 }                                
4720 next:                                            
4721                 count_mthp_stat(order, MTHP_S    
4722                 order = next_order(&orders, o    
4723         }                                        
4724                                                  
4725 fallback:                                        
4726 #endif                                           
4727         return folio_prealloc(vma->vm_mm, vma    
4728 }                                                
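
alloc_anon_folio() probes candidate orders from high to low and picks the first one whose naturally aligned window of PTEs is still entirely none (compare pte_range_none() above) before attempting the allocation. A self-contained sketch of that probe follows, with an array modelling one PTE page and PTRS_PER_PTE fixed at 512 as on x86-64 with 4K pages; struct and helper names are invented for the example.

#include <stdbool.h>
#include <stdio.h>

#define PTRS_PER_PTE 512

static bool range_none(const unsigned long *slots, int start, int nr)
{
        for (int i = 0; i < nr; i++)
                if (slots[start + i])
                        return false;
        return true;
}

static int pick_order(const unsigned long *slots, int fault_idx, unsigned long orders)
{
        for (int order = 9; order >= 0; order--) {
                if (!(orders & (1UL << order)))
                        continue;
                int nr = 1 << order;
                int start = fault_idx & ~(nr - 1);      /* align window to the order */
                if (range_none(slots, start, nr))
                        return order;
        }
        return -1;      /* caller falls back to a single page */
}

int main(void)
{
        unsigned long slots[PTRS_PER_PTE] = { 0 };

        slots[70] = 1;  /* an already-populated PTE spoils the order-4 window */
        printf("picked order %d\n",
               pick_order(slots, 65, (1UL << 4) | (1UL << 2)));  /* -> 2 */
        return 0;
}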
4729                                                  
4730 /*                                               
4731  * We enter with non-exclusive mmap_lock (to     
4732  * but allow concurrent faults), and pte mapp    
4733  * We return with mmap_lock still held, but p    
4734  */                                              
4735 static vm_fault_t do_anonymous_page(struct vm    
4736 {                                                
4737         struct vm_area_struct *vma = vmf->vma    
4738         unsigned long addr = vmf->address;       
4739         struct folio *folio;                     
4740         vm_fault_t ret = 0;                      
4741         int nr_pages = 1;                        
4742         pte_t entry;                             
4743                                                  
4744         /* File mapping without ->vm_ops ? */    
4745         if (vma->vm_flags & VM_SHARED)           
4746                 return VM_FAULT_SIGBUS;          
4747                                                  
4748         /*                                       
4749          * Use pte_alloc() instead of pte_all    
4750          * be distinguished from a transient     
4751          */                                      
4752         if (pte_alloc(vma->vm_mm, vmf->pmd))     
4753                 return VM_FAULT_OOM;             
4754                                                  
4755         /* Use the zero-page for reads */        
4756         if (!(vmf->flags & FAULT_FLAG_WRITE)     
4757                         !mm_forbids_zeropage(    
4758                 entry = pte_mkspecial(pfn_pte    
4759                                                  
4760                 vmf->pte = pte_offset_map_loc    
4761                                 vmf->address,    
4762                 if (!vmf->pte)                   
4763                         goto unlock;             
4764                 if (vmf_pte_changed(vmf)) {      
4765                         update_mmu_tlb(vma, v    
4766                         goto unlock;             
4767                 }                                
4768                 ret = check_stable_address_sp    
4769                 if (ret)                         
4770                         goto unlock;             
4771                 /* Deliver the page fault to     
4772                 if (userfaultfd_missing(vma))    
4773                         pte_unmap_unlock(vmf-    
4774                         return handle_userfau    
4775                 }                                
4776                 goto setpte;                     
4777         }                                        
4778                                                  
4779         /* Allocate our own private page. */     
4780         ret = vmf_anon_prepare(vmf);             
4781         if (ret)                                 
4782                 return ret;                      
4783         /* Returns NULL on OOM or ERR_PTR(-EA    
4784         folio = alloc_anon_folio(vmf);           
4785         if (IS_ERR(folio))                       
4786                 return 0;                        
4787         if (!folio)                              
4788                 goto oom;                        
4789                                                  
4790         nr_pages = folio_nr_pages(folio);        
4791         addr = ALIGN_DOWN(vmf->address, nr_pa    
4792                                                  
4793         /*                                       
4794          * The memory barrier inside __folio_    
4795          * preceding stores to the page conte    
4796          * the set_pte_at() write.               
4797          */                                      
4798         __folio_mark_uptodate(folio);            
4799                                                  
4800         entry = mk_pte(&folio->page, vma->vm_    
4801         entry = pte_sw_mkyoung(entry);           
4802         if (vma->vm_flags & VM_WRITE)            
4803                 entry = pte_mkwrite(pte_mkdir    
4804                                                  
4805         vmf->pte = pte_offset_map_lock(vma->v    
4806         if (!vmf->pte)                           
4807                 goto release;                    
4808         if (nr_pages == 1 && vmf_pte_changed(    
4809                 update_mmu_tlb(vma, addr, vmf    
4810                 goto release;                    
4811         } else if (nr_pages > 1 && !pte_range    
4812                 update_mmu_tlb_range(vma, add    
4813                 goto release;                    
4814         }                                        
4815                                                  
4816         ret = check_stable_address_space(vma-    
4817         if (ret)                                 
4818                 goto release;                    
4819                                                  
4820         /* Deliver the page fault to userland    
4821         if (userfaultfd_missing(vma)) {          
4822                 pte_unmap_unlock(vmf->pte, vm    
4823                 folio_put(folio);                
4824                 return handle_userfault(vmf,     
4825         }                                        
4826                                                  
4827         folio_ref_add(folio, nr_pages - 1);      
4828         add_mm_counter(vma->vm_mm, MM_ANONPAG    
4829         count_mthp_stat(folio_order(folio), M    
4830         folio_add_new_anon_rmap(folio, vma, a    
4831         folio_add_lru_vma(folio, vma);           
4832 setpte:                                          
4833         if (vmf_orig_pte_uffd_wp(vmf))           
4834                 entry = pte_mkuffd_wp(entry);    
4835         set_ptes(vma->vm_mm, addr, vmf->pte,     
4836                                                  
4837         /* No need to invalidate - it was non    
4838         update_mmu_cache_range(vmf, vma, addr    
4839 unlock:                                          
4840         if (vmf->pte)                            
4841                 pte_unmap_unlock(vmf->pte, vm    
4842         return ret;                              
4843 release:                                         
4844         folio_put(folio);                        
4845         goto unlock;                             
4846 oom:                                             
4847         return VM_FAULT_OOM;                     
4848 }                                                
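
Seen from user space, the behaviour do_anonymous_page() implements is simply that private anonymous memory reads back as zeroes (read faults can be served from the shared zero page) and only a write fault hands the process its own page. The short POSIX program below demonstrates exactly that and uses nothing kernel-internal.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 2 * 1024 * 1024;
        unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* Read faults can all be satisfied by the shared zero page. */
        printf("first byte before any write: %u\n", p[0]);

        /* A write fault allocates a private, writable page. */
        memset(p, 0xab, 4096);
        printf("first byte after the write:  %u\n", p[0]);

        munmap(p, len);
        return 0;
}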
4849                                                  
4850 /*                                               
4851  * The mmap_lock must have been held on entry    
4852  * released depending on flags and vma->vm_op    
4853  * See filemap_fault() and __lock_page_retry(    
4854  */                                              
4855 static vm_fault_t __do_fault(struct vm_fault     
4856 {                                                
4857         struct vm_area_struct *vma = vmf->vma    
4858         struct folio *folio;                     
4859         vm_fault_t ret;                          
4860                                                  
4861         /*                                       
4862          * Preallocate pte before we take pag    
4863          * deadlocks for memcg reclaim which     
4864          *                              lock_    
4865          *                              SetPa    
4866          *                              unloc    
4867          * lock_page(B)                          
4868          *                              lock_    
4869          * pte_alloc_one                         
4870          *   shrink_folio_list                   
4871          *     wait_on_page_writeback(A)         
4872          *                              SetPa    
4873          *                              unloc    
4874          *                              # flu    
4875          */                                      
4876         if (pmd_none(*vmf->pmd) && !vmf->prea    
4877                 vmf->prealloc_pte = pte_alloc    
4878                 if (!vmf->prealloc_pte)          
4879                         return VM_FAULT_OOM;     
4880         }                                        
4881                                                  
4882         ret = vma->vm_ops->fault(vmf);           
4883         if (unlikely(ret & (VM_FAULT_ERROR |     
4884                             VM_FAULT_DONE_COW    
4885                 return ret;                      
4886                                                  
4887         folio = page_folio(vmf->page);           
4888         if (unlikely(PageHWPoison(vmf->page))    
4889                 vm_fault_t poisonret = VM_FAU    
4890                 if (ret & VM_FAULT_LOCKED) {     
4891                         if (page_mapped(vmf->    
4892                                 unmap_mapping    
4893                         /* Retry if a clean f    
4894                         if (mapping_evict_fol    
4895                                 poisonret = V    
4896                         folio_unlock(folio);     
4897                 }                                
4898                 folio_put(folio);                
4899                 vmf->page = NULL;                
4900                 return poisonret;                
4901         }                                        
4902                                                  
4903         if (unlikely(!(ret & VM_FAULT_LOCKED)    
4904                 folio_lock(folio);               
4905         else                                     
4906                 VM_BUG_ON_PAGE(!folio_test_lo    
4907                                                  
4908         return ret;                              
4909 }                                                
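
The comment in __do_fault() explains why the PTE table is preallocated before calling vma->vm_ops->fault(): allocating later could deadlock against memcg reclaim waiting on a page the fault path itself holds locked or under writeback. The generic shape of that pattern, allocate up front, consume under the lock only if still needed, free otherwise, is sketched below in plain pthreads; insert_once() and the list are illustrative only.

/* Build with: cc example.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int val; struct node *next; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static int insert_once(int val)
{
        /* Allocate outside the lock, where blocking is acceptable. */
        struct node *prealloc = malloc(sizeof(*prealloc));
        if (!prealloc)
                return -1;

        pthread_mutex_lock(&list_lock);
        if (!head) {                    /* still needed: consume the preallocation */
                prealloc->val = val;
                prealloc->next = NULL;
                head = prealloc;
                prealloc = NULL;
        }
        pthread_mutex_unlock(&list_lock);

        free(prealloc);                 /* no-op if it was consumed */
        return 0;
}

int main(void)
{
        insert_once(1);
        insert_once(2);                 /* finds head already set and frees */
        printf("head->val = %d\n", head->val);
        return 0;
}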
4910                                                  
4911 #ifdef CONFIG_TRANSPARENT_HUGEPAGE               
4912 static void deposit_prealloc_pte(struct vm_fa    
4913 {                                                
4914         struct vm_area_struct *vma = vmf->vma    
4915                                                  
4916         pgtable_trans_huge_deposit(vma->vm_mm    
4917         /*                                       
4918          * We are going to consume the preall    
4919          * count that as nr_ptes.                
4920          */                                      
4921         mm_inc_nr_ptes(vma->vm_mm);              
4922         vmf->prealloc_pte = NULL;                
4923 }                                                
4924                                                  
4925 vm_fault_t do_set_pmd(struct vm_fault *vmf, s    
4926 {                                                
4927         struct folio *folio = page_folio(page    
4928         struct vm_area_struct *vma = vmf->vma    
4929         bool write = vmf->flags & FAULT_FLAG_    
4930         unsigned long haddr = vmf->address &     
4931         pmd_t entry;                             
4932         vm_fault_t ret = VM_FAULT_FALLBACK;      
4933                                                  
4934         /*                                       
4935          * It is too late to allocate a small    
4936          * folio in the pagecache: especially    
4937          * PMD mappings, but PTE-mapped THP a    
4938          * PMD mappings if THPs are disabled.    
4939          */                                      
4940         if (thp_disabled_by_hw() || vma_thp_d    
4941                 return ret;                      
4942                                                  
4943         if (!thp_vma_suitable_order(vma, hadd    
4944                 return ret;                      
4945                                                  
4946         if (folio_order(folio) != HPAGE_PMD_O    
4947                 return ret;                      
4948         page = &folio->page;                     
4949                                                  
4950         /*                                       
4951          * Just backoff if any subpage of a T    
4952          * the corrupted page may mapped by P    
4953          * check.  This kind of THP just can     
4954          * the corrupted subpage should trigg    
4955          */                                      
4956         if (unlikely(folio_test_has_hwpoisone    
4957                 return ret;                      
4958                                                  
4959         /*                                       
4960          * Archs like ppc64 need additional s    
4961          * related to pte entry. Use the prea    
4962          */                                      
4963         if (arch_needs_pgtable_deposit() && !    
4964                 vmf->prealloc_pte = pte_alloc    
4965                 if (!vmf->prealloc_pte)          
4966                         return VM_FAULT_OOM;     
4967         }                                        
4968                                                  
4969         vmf->ptl = pmd_lock(vma->vm_mm, vmf->    
4970         if (unlikely(!pmd_none(*vmf->pmd)))      
4971                 goto out;                        
4972                                                  
4973         flush_icache_pages(vma, page, HPAGE_P    
4974                                                  
4975         entry = mk_huge_pmd(page, vma->vm_pag    
4976         if (write)                               
4977                 entry = maybe_pmd_mkwrite(pmd    
4978                                                  
4979         add_mm_counter(vma->vm_mm, mm_counter    
4980         folio_add_file_rmap_pmd(folio, page,     
4981                                                  
4982         /*                                       
4983          * deposit and withdraw with pmd lock    
4984          */                                      
4985         if (arch_needs_pgtable_deposit())        
4986                 deposit_prealloc_pte(vmf);       
4987                                                  
4988         set_pmd_at(vma->vm_mm, haddr, vmf->pm    
4989                                                  
4990         update_mmu_cache_pmd(vma, haddr, vmf-    
4991                                                  
4992         /* fault is handled */                   
4993         ret = 0;                                 
4994         count_vm_event(THP_FILE_MAPPED);         
4995 out:                                             
4996         spin_unlock(vmf->ptl);                   
4997         return ret;                              
4998 }                                                
4999 #else                                            
5000 vm_fault_t do_set_pmd(struct vm_fault *vmf, s    
5001 {                                                
5002         return VM_FAULT_FALLBACK;                
5003 }                                                
5004 #endif                                           
5005                                                  
5006 /**                                              
5007  * set_pte_range - Set a range of PTEs to poi    
5008  * @vmf: Fault description.
5009  * @folio: The folio that contains @page.        
5010  * @page: The first page to create a PTE for.    
5011  * @nr: The number of PTEs to create.            
5012  * @addr: The first address to create a PTE f    
5013  */                                              
5014 void set_pte_range(struct vm_fault *vmf, stru    
5015                 struct page *page, unsigned i    
5016 {                                                
5017         struct vm_area_struct *vma = vmf->vma    
5018         bool write = vmf->flags & FAULT_FLAG_    
5019         bool prefault = !in_range(vmf->addres    
5020         pte_t entry;                             
5021                                                  
5022         flush_icache_pages(vma, page, nr);       
5023         entry = mk_pte(page, vma->vm_page_pro    
5024                                                  
5025         if (prefault && arch_wants_old_prefau    
5026                 entry = pte_mkold(entry);        
5027         else                                     
5028                 entry = pte_sw_mkyoung(entry)    
5029                                                  
5030         if (write)                               
5031                 entry = maybe_mkwrite(pte_mkd    
5032         if (unlikely(vmf_orig_pte_uffd_wp(vmf    
5033                 entry = pte_mkuffd_wp(entry);    
5034         /* copy-on-write page */                 
5035         if (write && !(vma->vm_flags & VM_SHA    
5036                 VM_BUG_ON_FOLIO(nr != 1, foli    
5037                 folio_add_new_anon_rmap(folio    
5038                 folio_add_lru_vma(folio, vma)    
5039         } else {                                 
5040                 folio_add_file_rmap_ptes(foli    
5041         }                                        
5042         set_ptes(vma->vm_mm, addr, vmf->pte,     
5043                                                  
5044         /* no need to invalidate: a not-prese    
5045         update_mmu_cache_range(vmf, vma, addr    
5046 }                                                
5047                                                  
5048 static bool vmf_pte_changed(struct vm_fault *    
5049 {                                                
5050         if (vmf->flags & FAULT_FLAG_ORIG_PTE_    
5051                 return !pte_same(ptep_get(vmf    
5052                                                  
5053         return !pte_none(ptep_get(vmf->pte));    
5054 }                                                
5055                                                  
5056 /**                                              
5057  * finish_fault - finish page fault once we h    
5058  *                                               
5059  * @vmf: structure describing the fault          
5060  *                                               
5061  * This function handles all that is needed t    
5062  * page to fault in is prepared. It handles l    
5063  * given page, adds reverse page mapping, han    
5064  * addition.                                     
5065  *                                               
5066  * The function expects the page to be locked    
5067  * reference of a page being mapped (for the     
5068  *                                               
5069  * Return: %0 on success, %VM_FAULT_ code in     
5070  */                                              
5071 vm_fault_t finish_fault(struct vm_fault *vmf)    
5072 {                                                
5073         struct vm_area_struct *vma = vmf->vma    
5074         struct page *page;                       
5075         struct folio *folio;                     
5076         vm_fault_t ret;                          
5077         bool is_cow = (vmf->flags & FAULT_FLA    
5078                       !(vma->vm_flags & VM_SH    
5079         int type, nr_pages;                      
5080         unsigned long addr = vmf->address;       
5081                                                  
5082         /* Did we COW the page? */               
5083         if (is_cow)                              
5084                 page = vmf->cow_page;            
5085         else                                     
5086                 page = vmf->page;                
5087                                                  
5088         /*                                       
5089          * check even for read faults because    
5090          * page                                  
5091          */                                      
5092         if (!(vma->vm_flags & VM_SHARED)) {      
5093                 ret = check_stable_address_sp    
5094                 if (ret)                         
5095                         return ret;              
5096         }                                        
5097                                                  
5098         if (pmd_none(*vmf->pmd)) {               
5099                 if (PageTransCompound(page))     
5100                         ret = do_set_pmd(vmf,    
5101                         if (ret != VM_FAULT_F    
5102                                 return ret;      
5103                 }                                
5104                                                  
5105                 if (vmf->prealloc_pte)           
5106                         pmd_install(vma->vm_m    
5107                 else if (unlikely(pte_alloc(v    
5108                         return VM_FAULT_OOM;     
5109         }                                        
5110                                                  
5111         folio = page_folio(page);                
5112         nr_pages = folio_nr_pages(folio);        
5113                                                  
5114         /*                                       
5115          * Using per-page fault to maintain t    
5116          * approach also applies to non-anony    
5117          * inflating the RSS of the process.     
5118          */                                      
5119         if (!vma_is_anon_shmem(vma) || unlike    
5120                 nr_pages = 1;                    
5121         } else if (nr_pages > 1) {               
5122                 pgoff_t idx = folio_page_idx(    
5123                 /* The page offset of vmf->ad    
5124                 pgoff_t vma_off = vmf->pgoff     
5125                 /* The index of the entry in     
5126                 pgoff_t pte_off = pte_index(v    
5127                                                  
5128                 /*                               
5129                  * Fallback to per-page fault    
5130                  * cache beyond the VMA limit    
5131                  */                              
5132                 if (unlikely(vma_off < idx ||    
5133                             vma_off + (nr_pag    
5134                             pte_off < idx ||     
5135                             pte_off + (nr_pag    
5136                         nr_pages = 1;            
5137                 } else {                         
5138                         /* Now we can set map    
5139                         addr = vmf->address -    
5140                         page = &folio->page;     
5141                 }                                
5142         }                                        
5143                                                  
5144         vmf->pte = pte_offset_map_lock(vma->v    
5145                                        addr,     
5146         if (!vmf->pte)                           
5147                 return VM_FAULT_NOPAGE;          
5148                                                  
5149         /* Re-check under ptl */                 
5150         if (nr_pages == 1 && unlikely(vmf_pte    
5151                 update_mmu_tlb(vma, addr, vmf    
5152                 ret = VM_FAULT_NOPAGE;           
5153                 goto unlock;                     
5154         } else if (nr_pages > 1 && !pte_range    
5155                 update_mmu_tlb_range(vma, add    
5156                 ret = VM_FAULT_NOPAGE;           
5157                 goto unlock;                     
5158         }                                        
5159                                                  
5160         folio_ref_add(folio, nr_pages - 1);      
5161         set_pte_range(vmf, folio, page, nr_pa    
5162         type = is_cow ? MM_ANONPAGES : mm_cou    
5163         add_mm_counter(vma->vm_mm, type, nr_p    
5164         ret = 0;                                 
5165                                                  
5166 unlock:                                          
5167         pte_unmap_unlock(vmf->pte, vmf->ptl);    
5168         return ret;                              
5169 }                                                
5170                                                  
5171 static unsigned long fault_around_pages __rea    
5172         65536 >> PAGE_SHIFT;                     
5173                                                  
5174 #ifdef CONFIG_DEBUG_FS                           
5175 static int fault_around_bytes_get(void *data,    
5176 {                                                
5177         *val = fault_around_pages << PAGE_SHI    
5178         return 0;                                
5179 }                                                
5180                                                  
5181 /*                                               
5182  * fault_around_bytes must be rounded down to    
5183  * what do_fault_around() expects to see.        
5184  */                                              
5185 static int fault_around_bytes_set(void *data,    
5186 {                                                
5187         if (val / PAGE_SIZE > PTRS_PER_PTE)      
5188                 return -EINVAL;                  
5189                                                  
5190         /*                                       
5191          * The minimum value is 1 page, howev    
5192          * at all. See should_fault_around().    
5193          */                                      
5194         val = max(val, PAGE_SIZE);               
5195         fault_around_pages = rounddown_pow_of    
5196                                                  
5197         return 0;                                
5198 }                                                
5199 DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_f    
5200                 fault_around_bytes_get, fault    
5201                                                  
5202 static int __init fault_around_debugfs(void)     
5203 {                                                
5204         debugfs_create_file_unsafe("fault_aro    
5205                                    &fault_aro    
5206         return 0;                                
5207 }                                                
5208 late_initcall(fault_around_debugfs);             
5209 #endif                                           
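
/*
 * A minimal userspace sketch of poking the knob created above: it
 * assumes debugfs is mounted at the conventional /sys/kernel/debug and
 * that the attribute is named "fault_around_bytes", as created in
 * fault_around_debugfs().  Writes usually need root, and the value is
 * rounded as described for fault_around_bytes_set().
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = "/sys/kernel/debug/fault_around_bytes";
	FILE *f = fopen(path, argc > 1 ? "r+" : "r");
	unsigned long bytes;

	if (!f) {
		perror(path);		/* needs CONFIG_DEBUG_FS and privileges */
		return 1;
	}
	if (fscanf(f, "%lu", &bytes) == 1)
		printf("fault_around_bytes: %lu\n", bytes);
	if (argc > 1) {
		rewind(f);
		fprintf(f, "%s\n", argv[1]);	/* e.g. "16384" */
	}
	fclose(f);
	return 0;
}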
5210                                                  
5211 /*                                               
5212  * do_fault_around() tries to map few pages a    
5213  * is that the pages will be needed soon and     
5214  * faults to handle.                             
5215  *                                               
5216  * It uses vm_ops->map_pages() to map the pag    
5217  * not ready to be mapped: not up-to-date, lo    
5218  *                                               
5219  * This function doesn't cross VMA or page ta    
5220  * map_pages() and acquire a PTE lock only on    
5221  *                                               
5222  * fault_around_pages defines how many pages     
5223  * do_fault_around() expects it to be set to     
5224  * to PTRS_PER_PTE.                              
5225  *                                               
5226  * The virtual address of the area that we ma    
5227  * fault_around_pages * PAGE_SIZE rounded dow    
5228  * (and therefore to page order).  This way i    
5229  * that we don't cross page table boundaries.    
5230  */                                              
5231 static vm_fault_t do_fault_around(struct vm_f    
5232 {                                                
5233         pgoff_t nr_pages = READ_ONCE(fault_ar    
5234         pgoff_t pte_off = pte_index(vmf->addr    
5235         /* The page offset of vmf->address wi    
5236         pgoff_t vma_off = vmf->pgoff - vmf->v    
5237         pgoff_t from_pte, to_pte;                
5238         vm_fault_t ret;                          
5239                                                  
5240         /* The PTE offset of the start addres    
5241         from_pte = max(ALIGN_DOWN(pte_off, nr    
5242                        pte_off - min(pte_off,    
5243                                                  
5244         /* The PTE offset of the end address,    
5245         to_pte = min3(from_pte + nr_pages, (p    
5246                       pte_off + vma_pages(vmf    
5247                                                  
5248         if (pmd_none(*vmf->pmd)) {               
5249                 vmf->prealloc_pte = pte_alloc    
5250                 if (!vmf->prealloc_pte)          
5251                         return VM_FAULT_OOM;     
5252         }                                        
5253                                                  
5254         rcu_read_lock();                         
5255         ret = vmf->vma->vm_ops->map_pages(vmf    
5256                         vmf->pgoff + from_pte    
5257                         vmf->pgoff + to_pte -    
5258         rcu_read_unlock();                       
5259                                                  
5260         return ret;                              
5261 }                                                
5262                                                  
5263 /* Return true if we should do read fault-aro    
5264 static inline bool should_fault_around(struct    
5265 {                                                
5266         /* No ->map_pages?  No way to fault a    
5267         if (!vmf->vma->vm_ops->map_pages)        
5268                 return false;                    
5269                                                  
5270         if (uffd_disable_fault_around(vmf->vm    
5271                 return false;                    
5272                                                  
5273         /* A single page implies no faulting     
5274         return fault_around_pages > 1;           
5275 }                                                
5276                                                  
5277 static vm_fault_t do_read_fault(struct vm_fau    
5278 {                                                
5279         vm_fault_t ret = 0;                      
5280         struct folio *folio;                     
5281                                                  
5282         /*                                       
5283          * Let's call ->map_pages() first and    
5284          * if page by the offset is not ready    
5285          * something).                           
5286          */                                      
5287         if (should_fault_around(vmf)) {          
5288                 ret = do_fault_around(vmf);      
5289                 if (ret)                         
5290                         return ret;              
5291         }                                        
5292                                                  
5293         ret = vmf_can_call_fault(vmf);           
5294         if (ret)                                 
5295                 return ret;                      
5296                                                  
5297         ret = __do_fault(vmf);                   
5298         if (unlikely(ret & (VM_FAULT_ERROR |     
5299                 return ret;                      
5300                                                  
5301         ret |= finish_fault(vmf);                
5302         folio = page_folio(vmf->page);           
5303         folio_unlock(folio);                     
5304         if (unlikely(ret & (VM_FAULT_ERROR |     
5305                 folio_put(folio);                
5306         return ret;                              
5307 }                                                
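
/*
 * A minimal userspace sketch of a fault that ends up in
 * do_read_fault(): the first read access to a page of a read-only
 * private file mapping.  With fault-around enabled, neighbouring
 * page-cache pages may be mapped by the same fault.  The file used
 * here is arbitrary.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);
	struct stat st;
	const volatile char *p;

	if (fd < 0 || fstat(fd, &st) || st.st_size == 0)
		return 1;
	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	printf("first byte: %c\n", p[0]);	/* read fault on this page */
	munmap((void *)p, st.st_size);
	close(fd);
	return 0;
}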
5308                                                  
5309 static vm_fault_t do_cow_fault(struct vm_faul    
5310 {                                                
5311         struct vm_area_struct *vma = vmf->vma    
5312         struct folio *folio;                     
5313         vm_fault_t ret;                          
5314                                                  
5315         ret = vmf_can_call_fault(vmf);           
5316         if (!ret)                                
5317                 ret = vmf_anon_prepare(vmf);     
5318         if (ret)                                 
5319                 return ret;                      
5320                                                  
5321         folio = folio_prealloc(vma->vm_mm, vm    
5322         if (!folio)                              
5323                 return VM_FAULT_OOM;             
5324                                                  
5325         vmf->cow_page = &folio->page;            
5326                                                  
5327         ret = __do_fault(vmf);                   
5328         if (unlikely(ret & (VM_FAULT_ERROR |     
5329                 goto uncharge_out;               
5330         if (ret & VM_FAULT_DONE_COW)             
5331                 return ret;                      
5332                                                  
5333         if (copy_mc_user_highpage(vmf->cow_pa    
5334                 ret = VM_FAULT_HWPOISON;         
5335                 goto unlock;                     
5336         }                                        
5337         __folio_mark_uptodate(folio);            
5338                                                  
5339         ret |= finish_fault(vmf);                
5340 unlock:                                          
5341         unlock_page(vmf->page);                  
5342         put_page(vmf->page);                     
5343         if (unlikely(ret & (VM_FAULT_ERROR |     
5344                 goto uncharge_out;               
5345         return ret;                              
5346 uncharge_out:                                    
5347         folio_put(folio);                        
5348         return ret;                              
5349 }                                                
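
/*
 * A minimal userspace sketch of a fault that ends up in do_cow_fault():
 * the first write to a not-yet-mapped page of a MAP_PRIVATE file
 * mapping, which gets a private anonymous copy of the page-cache page.
 * The file itself is never modified; the path used is arbitrary.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);
	struct stat st;
	char *p;

	if (fd < 0 || fstat(fd, &st) || st.st_size == 0)
		return 1;
	/* PROT_WRITE is fine on a MAP_PRIVATE mapping of an O_RDONLY fd */
	p = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = '#';			/* write fault -> private COW copy */
	printf("private copy starts with '%c'\n", p[0]);
	munmap(p, st.st_size);
	close(fd);
	return 0;
}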
5350                                                  
5351 static vm_fault_t do_shared_fault(struct vm_f    
5352 {                                                
5353         struct vm_area_struct *vma = vmf->vma    
5354         vm_fault_t ret, tmp;                     
5355         struct folio *folio;                     
5356                                                  
5357         ret = vmf_can_call_fault(vmf);           
5358         if (ret)                                 
5359                 return ret;                      
5360                                                  
5361         ret = __do_fault(vmf);                   
5362         if (unlikely(ret & (VM_FAULT_ERROR |     
5363                 return ret;                      
5364                                                  
5365         folio = page_folio(vmf->page);           
5366                                                  
5367         /*                                       
5368          * Check if the backing address space    
5369          * about to become writable              
5370          */                                      
5371         if (vma->vm_ops->page_mkwrite) {         
5372                 folio_unlock(folio);             
5373                 tmp = do_page_mkwrite(vmf, fo    
5374                 if (unlikely(!tmp ||             
5375                                 (tmp & (VM_FA    
5376                         folio_put(folio);        
5377                         return tmp;              
5378                 }                                
5379         }                                        
5380                                                  
5381         ret |= finish_fault(vmf);                
5382         if (unlikely(ret & (VM_FAULT_ERROR |     
5383                                         VM_FA    
5384                 folio_unlock(folio);             
5385                 folio_put(folio);                
5386                 return ret;                      
5387         }                                        
5388                                                  
5389         ret |= fault_dirty_shared_page(vmf);     
5390         return ret;                              
5391 }                                                
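
/*
 * A minimal userspace sketch of a fault that ends up in
 * do_shared_fault(): a write to a not-yet-mapped page of a MAP_SHARED
 * file mapping, which lets the filesystem see ->page_mkwrite() and then
 * dirties the folio via fault_dirty_shared_page().  A throwaway
 * temporary file is used so the write really hits the file.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	char tmpl[] = "/tmp/shared-fault-XXXXXX";
	size_t len = 4096;
	int fd = mkstemp(tmpl);
	char *p;

	if (fd < 0 || ftruncate(fd, len))
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	strcpy(p, "hello");		/* write fault -> page_mkwrite + dirty */
	msync(p, len, MS_SYNC);		/* push the dirtied page to the file */
	munmap(p, len);
	close(fd);
	unlink(tmpl);
	return 0;
}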
5392                                                  
5393 /*                                               
5394  * We enter with non-exclusive mmap_lock (to     
5395  * but allow concurrent faults).                 
5396  * The mmap_lock may have been released depen    
5397  * return value.  See filemap_fault() and __f    
5398  * If mmap_lock is released, vma may become i    
5399  * by other thread calling munmap()).            
5400  */                                              
5401 static vm_fault_t do_fault(struct vm_fault *v    
5402 {                                                
5403         struct vm_area_struct *vma = vmf->vma    
5404         struct mm_struct *vm_mm = vma->vm_mm;    
5405         vm_fault_t ret;                          
5406                                                  
5407         /*                                       
5408          * The VMA was not fully populated on    
5409          */                                      
5410         if (!vma->vm_ops->fault) {               
5411                 vmf->pte = pte_offset_map_loc    
5412                                                  
5413                 if (unlikely(!vmf->pte))         
5414                         ret = VM_FAULT_SIGBUS    
5415                 else {                           
5416                         /*                       
5417                          * Make sure this is     
5418                          * by holding ptl and    
5419                          * of pte involves: t    
5420                          * we don't have conc    
5421                          * followed by an upd    
5422                          */                      
5423                         if (unlikely(pte_none    
5424                                 ret = VM_FAUL    
5425                         else                     
5426                                 ret = VM_FAUL    
5427                                                  
5428                         pte_unmap_unlock(vmf-    
5429                 }                                
5430         } else if (!(vmf->flags & FAULT_FLAG_    
5431                 ret = do_read_fault(vmf);        
5432         else if (!(vma->vm_flags & VM_SHARED)    
5433                 ret = do_cow_fault(vmf);         
5434         else                                     
5435                 ret = do_shared_fault(vmf);      
5436                                                  
5437         /* preallocated pagetable is unused:     
5438         if (vmf->prealloc_pte) {                 
5439                 pte_free(vm_mm, vmf->prealloc    
5440                 vmf->prealloc_pte = NULL;        
5441         }                                        
5442         return ret;                              
5443 }                                                
5444                                                  
5445 int numa_migrate_check(struct folio *folio, s    
5446                       unsigned long addr, int    
5447                       bool writable, int *las    
5448 {                                                
5449         struct vm_area_struct *vma = vmf->vma    
5450                                                  
5451         /*                                       
5452          * Avoid grouping on RO pages in gene    
5453          * much anyway since they can be in s    
5454          * the case where a mapping is writab    
5455          * to it but pte_write gets cleared d    
5456          * pte_dirty has unpredictable behavi    
5457          * background writeback, dirty balanc    
5458          */                                      
5459         if (!writable)                           
5460                 *flags |= TNF_NO_GROUP;          
5461                                                  
5462         /*                                       
5463          * Flag if the folio is shared betwee    
5464          * is later used when determining whe    
5465          */                                      
5466         if (folio_likely_mapped_shared(folio)    
5467                 *flags |= TNF_SHARED;            
5468         /*                                       
5469          * For memory tiering mode, cpupid of    
5470          * to record page access time.  So us    
5471          */                                      
5472         if (folio_use_access_time(folio))        
5473                 *last_cpupid = (-1 & LAST_CPU    
5474         else                                     
5475                 *last_cpupid = folio_last_cpu    
5476                                                  
5477         /* Record the current PID accessing VMA */
5478         vma_set_access_pid_bit(vma);             
5479                                                  
5480         count_vm_numa_event(NUMA_HINT_FAULTS)    
5481 #ifdef CONFIG_NUMA_BALANCING                     
5482         count_memcg_folio_events(folio, NUMA_    
5483 #endif                                           
5484         if (folio_nid(folio) == numa_node_id(    
5485                 count_vm_numa_event(NUMA_HINT    
5486                 *flags |= TNF_FAULT_LOCAL;       
5487         }                                        
5488                                                  
5489         return mpol_misplaced(folio, vmf, add    
5490 }                                                
5491                                                  
5492 static void numa_rebuild_single_mapping(struc    
5493                                         unsig    
5494                                         bool     
5495 {                                                
5496         pte_t pte, old_pte;                      
5497                                                  
5498         old_pte = ptep_modify_prot_start(vma,    
5499         pte = pte_modify(old_pte, vma->vm_pag    
5500         pte = pte_mkyoung(pte);                  
5501         if (writable)                            
5502                 pte = pte_mkwrite(pte, vma);     
5503         ptep_modify_prot_commit(vma, fault_ad    
5504         update_mmu_cache_range(vmf, vma, faul    
5505 }                                                
5506                                                  
5507 static void numa_rebuild_large_mapping(struct    
5508                                        struct    
5509                                        bool i    
5510 {                                                
5511         int nr = pte_pfn(fault_pte) - folio_p    
5512         unsigned long start, end, addr = vmf-    
5513         unsigned long addr_start = addr - (nr    
5514         unsigned long pt_start = ALIGN_DOWN(a    
5515         pte_t *start_ptep;                       
5516                                                  
5517         /* Stay within the VMA and within the    
5518         start = max3(addr_start, pt_start, vm    
5519         end = min3(addr_start + folio_size(fo    
5520                    vma->vm_end);                 
5521         start_ptep = vmf->pte - ((addr - star    
5522                                                  
5523         /* Restore all PTEs' mapping of the l    
5524         for (addr = start; addr != end; start    
5525                 pte_t ptent = ptep_get(start_    
5526                 bool writable = false;           
5527                                                  
5528                 if (!pte_present(ptent) || !p    
5529                         continue;                
5530                                                  
5531                 if (pfn_folio(pte_pfn(ptent))    
5532                         continue;                
5533                                                  
5534                 if (!ignore_writable) {          
5535                         ptent = pte_modify(pt    
5536                         writable = pte_write(    
5537                         if (!writable && pte_    
5538                             can_change_pte_wr    
5539                                 writable = tr    
5540                 }                                
5541                                                  
5542                 numa_rebuild_single_mapping(v    
5543         }                                        
5544 }                                                
5545                                                  
5546 static vm_fault_t do_numa_page(struct vm_faul    
5547 {                                                
5548         struct vm_area_struct *vma = vmf->vma    
5549         struct folio *folio = NULL;              
5550         int nid = NUMA_NO_NODE;                  
5551         bool writable = false, ignore_writabl    
5552         bool pte_write_upgrade = vma_wants_ma    
5553         int last_cpupid;                         
5554         int target_nid;                          
5555         pte_t pte, old_pte;                      
5556         int flags = 0, nr_pages;                 
5557                                                  
5558         /*                                       
5559          * The pte cannot be used safely unti    
5560          * table lock, that its contents have    
5561          */                                      
5562         spin_lock(vmf->ptl);                     
5563         /* Read the live PTE from the page ta    
5564         old_pte = ptep_get(vmf->pte);            
5565                                                  
5566         if (unlikely(!pte_same(old_pte, vmf->    
5567                 pte_unmap_unlock(vmf->pte, vm    
5568                 return 0;                        
5569         }                                        
5570                                                  
5571         pte = pte_modify(old_pte, vma->vm_pag    
5572                                                  
5573         /*                                       
5574          * Detect now whether the PTE could b    
5575          * is only valid while holding the PT    
5576          */                                      
5577         writable = pte_write(pte);               
5578         if (!writable && pte_write_upgrade &&    
5579             can_change_pte_writable(vma, vmf-    
5580                 writable = true;                 
5581                                                  
5582         folio = vm_normal_folio(vma, vmf->add    
5583         if (!folio || folio_is_zone_device(fo    
5584                 goto out_map;                    
5585                                                  
5586         nid = folio_nid(folio);                  
5587         nr_pages = folio_nr_pages(folio);        
5588                                                  
5589         target_nid = numa_migrate_check(folio    
5590                                         writa    
5591         if (target_nid == NUMA_NO_NODE)          
5592                 goto out_map;                    
5593         if (migrate_misplaced_folio_prepare(f    
5594                 flags |= TNF_MIGRATE_FAIL;       
5595                 goto out_map;                    
5596         }                                        
5597         /* The folio is isolated and isolatio    
5598         pte_unmap_unlock(vmf->pte, vmf->ptl);    
5599         writable = false;                        
5600         ignore_writable = true;                  
5601                                                  
5602         /* Migrate to the requested node */      
5603         if (!migrate_misplaced_folio(folio, v    
5604                 nid = target_nid;                
5605                 flags |= TNF_MIGRATED;           
5606                 task_numa_fault(last_cpupid,     
5607                 return 0;                        
5608         }                                        
5609                                                  
5610         flags |= TNF_MIGRATE_FAIL;               
5611         vmf->pte = pte_offset_map_lock(vma->v    
5612                                        vmf->a    
5613         if (unlikely(!vmf->pte))                 
5614                 return 0;                        
5615         if (unlikely(!pte_same(ptep_get(vmf->    
5616                 pte_unmap_unlock(vmf->pte, vm    
5617                 return 0;                        
5618         }                                        
5619 out_map:                                         
5620         /*                                       
5621          * Make it present again, depending o    
5622          * non-accessible ptes, some can allo    
5623          */                                      
5624         if (folio && folio_test_large(folio))    
5625                 numa_rebuild_large_mapping(vm    
5626                                            pt    
5627         else                                     
5628                 numa_rebuild_single_mapping(v    
5629                                             w    
5630         pte_unmap_unlock(vmf->pte, vmf->ptl);    
5631                                                  
5632         if (nid != NUMA_NO_NODE)                 
5633                 task_numa_fault(last_cpupid,     
5634         return 0;                                
5635 }                                                
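
/*
 * NUMA hinting faults are triggered by automatic NUMA balancing rather
 * than directly by applications, so a minimal userspace sketch can only
 * observe their effect.  This one assumes the usual /proc/vmstat
 * counters (numa_hint_faults, numa_hint_faults_local,
 * numa_pages_migrated), which are only present with
 * CONFIG_NUMA_BALANCING.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/vmstat", "r");
	char line[128];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "numa_", 5))
			fputs(line, stdout);
	fclose(f);
	return 0;
}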
5636                                                  
5637 static inline vm_fault_t create_huge_pmd(stru    
5638 {                                                
5639         struct vm_area_struct *vma = vmf->vma    
5640         if (vma_is_anonymous(vma))               
5641                 return do_huge_pmd_anonymous_    
5642         if (vma->vm_ops->huge_fault)             
5643                 return vma->vm_ops->huge_faul    
5644         return VM_FAULT_FALLBACK;                
5645 }                                                
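
/*
 * A minimal userspace sketch of reaching create_huge_pmd(): touching an
 * anonymous VMA whose pmd is still empty while THP is allowed for it.
 * MADV_HUGEPAGE is only a hint; whether a huge page is actually used
 * depends on the system THP policy and can be seen in the
 * AnonHugePages field of /proc/self/smaps.
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 8UL << 20;		/* 8 MiB of anonymous memory */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	madvise(p, len, MADV_HUGEPAGE);	/* hint; may be ignored */
	memset(p, 0xaa, len);		/* first-touch faults, possibly huge */
	puts("check AnonHugePages: in /proc/self/smaps");
	munmap(p, len);
	return 0;
}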
5646                                                  
5647 /* `inline' is required to avoid gcc 4.1.2 bu    
5648 static inline vm_fault_t wp_huge_pmd(struct v    
5649 {                                                
5650         struct vm_area_struct *vma = vmf->vma    
5651         const bool unshare = vmf->flags & FAU    
5652         vm_fault_t ret;                          
5653                                                  
5654         if (vma_is_anonymous(vma)) {             
5655                 if (likely(!unshare) &&          
5656                     userfaultfd_huge_pmd_wp(v    
5657                         if (userfaultfd_wp_as    
5658                                 goto split;      
5659                         return handle_userfau    
5660                 }                                
5661                 return do_huge_pmd_wp_page(vm    
5662         }                                        
5663                                                  
5664         if (vma->vm_flags & (VM_SHARED | VM_M    
5665                 if (vma->vm_ops->huge_fault)     
5666                         ret = vma->vm_ops->hu    
5667                         if (!(ret & VM_FAULT_    
5668                                 return ret;      
5669                 }                                
5670         }                                        
5671                                                  
5672 split:                                           
5673         /* COW or write-notify handled on pte    
5674         __split_huge_pmd(vma, vmf->pmd, vmf->    
5675                                                  
5676         return VM_FAULT_FALLBACK;                
5677 }                                                
5678                                                  
5679 static vm_fault_t create_huge_pud(struct vm_f    
5680 {                                                
5681 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&      
5682         defined(CONFIG_HAVE_ARCH_TRANSPARENT_    
5683         struct vm_area_struct *vma = vmf->vma    
5684         /* No support for anonymous transpare    
5685         if (vma_is_anonymous(vma))               
5686                 return VM_FAULT_FALLBACK;        
5687         if (vma->vm_ops->huge_fault)             
5688                 return vma->vm_ops->huge_faul    
5689 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */         
5690         return VM_FAULT_FALLBACK;                
5691 }                                                
5692                                                  
5693 static vm_fault_t wp_huge_pud(struct vm_fault    
5694 {                                                
5695 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&      
5696         defined(CONFIG_HAVE_ARCH_TRANSPARENT_    
5697         struct vm_area_struct *vma = vmf->vma    
5698         vm_fault_t ret;                          
5699                                                  
5700         /* No support for anonymous transpare    
5701         if (vma_is_anonymous(vma))               
5702                 goto split;                      
5703         if (vma->vm_flags & (VM_SHARED | VM_M    
5704                 if (vma->vm_ops->huge_fault)     
5705                         ret = vma->vm_ops->hu    
5706                         if (!(ret & VM_FAULT_    
5707                                 return ret;      
5708                 }                                
5709         }                                        
5710 split:                                           
5711         /* COW or write-notify not handled on    
5712         __split_huge_pud(vma, vmf->pud, vmf->    
5713 #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONF    
5714         return VM_FAULT_FALLBACK;                
5715 }                                                
5716                                                  
5717 /*                                               
5718  * These routines also need to handle stuff l    
5719  * and/or accessed for architectures that don    
5720  * RISC architectures).  The early dirtying i    
5721  *                                               
5722  * There is also a hook called "update_mmu_ca    
5723  * with external mmu caches can use to update    
5724  * PowerPC hashed page tables that act as ext    
5725  *                                               
5726  * We enter with non-exclusive mmap_lock (to     
5727  * concurrent faults).                           
5728  *                                               
5729  * The mmap_lock may have been released depen    
5730  * See filemap_fault() and __folio_lock_or_re    
5731  */                                              
5732 static vm_fault_t handle_pte_fault(struct vm_    
5733 {                                                
5734         pte_t entry;                             
5735                                                  
5736         if (unlikely(pmd_none(*vmf->pmd))) {     
5737                 /*                               
5738                  * Leave __pte_alloc() until     
5739                  * want to allocate huge page    
5740                  * for an instant, it will be    
5741                  * concurrent faults and from    
5742                  */                              
5743                 vmf->pte = NULL;                 
5744                 vmf->flags &= ~FAULT_FLAG_ORI    
5745         } else {                                 
5746                 /*                               
5747                  * A regular pmd is establish    
5748                  * pmd by anon khugepaged, si    
5749                  * mode; but shmem or file co    
5750                  * it into a huge pmd: just r    
5751                  */                              
5752                 vmf->pte = pte_offset_map_nol    
5753                                                  
5754                 if (unlikely(!vmf->pte))         
5755                         return 0;                
5756                 vmf->orig_pte = ptep_get_lock    
5757                 vmf->flags |= FAULT_FLAG_ORIG    
5758                                                  
5759                 if (pte_none(vmf->orig_pte))     
5760                         pte_unmap(vmf->pte);     
5761                         vmf->pte = NULL;         
5762                 }                                
5763         }                                        
5764                                                  
5765         if (!vmf->pte)                           
5766                 return do_pte_missing(vmf);      
5767                                                  
5768         if (!pte_present(vmf->orig_pte))         
5769                 return do_swap_page(vmf);        
5770                                                  
5771         if (pte_protnone(vmf->orig_pte) && vm    
5772                 return do_numa_page(vmf);        
5773                                                  
5774         spin_lock(vmf->ptl);                     
5775         entry = vmf->orig_pte;                   
5776         if (unlikely(!pte_same(ptep_get(vmf->    
5777                 update_mmu_tlb(vmf->vma, vmf-    
5778                 goto unlock;                     
5779         }                                        
5780         if (vmf->flags & (FAULT_FLAG_WRITE|FA    
5781                 if (!pte_write(entry))           
5782                         return do_wp_page(vmf    
5783                 else if (likely(vmf->flags &     
5784                         entry = pte_mkdirty(e    
5785         }                                        
5786         entry = pte_mkyoung(entry);              
5787         if (ptep_set_access_flags(vmf->vma, v    
5788                                 vmf->flags &     
5789                 update_mmu_cache_range(vmf, v    
5790                                 vmf->pte, 1);    
5791         } else {                                 
5792                 /* Skip spurious TLB flush fo    
5793                 if (vmf->flags & FAULT_FLAG_T    
5794                         goto unlock;             
5795                 /*                               
5796                  * This is needed only for pr    
5797                  * is not yet telling us if t    
5798                  * This still avoids useless     
5799                  * with threads.                 
5800                  */                              
5801                 if (vmf->flags & FAULT_FLAG_W    
5802                         flush_tlb_fix_spuriou    
5803                                                  
5804         }                                        
5805 unlock:                                          
5806         pte_unmap_unlock(vmf->pte, vmf->ptl);    
5807         return 0;                                
5808 }                                                
5809                                                  
5810 /*                                               
5811  * On entry, we hold either the VMA lock or t    
5812  * (FAULT_FLAG_VMA_LOCK tells you which).  If    
5813  * the result, the mmap_lock is not held on e    
5814  * and __folio_lock_or_retry().                  
5815  */                                              
5816 static vm_fault_t __handle_mm_fault(struct vm    
5817                 unsigned long address, unsign    
5818 {                                                
5819         struct vm_fault vmf = {                  
5820                 .vma = vma,                      
5821                 .address = address & PAGE_MAS    
5822                 .real_address = address,         
5823                 .flags = flags,                  
5824                 .pgoff = linear_page_index(vm    
5825                 .gfp_mask = __get_fault_gfp_m    
5826         };                                       
5827         struct mm_struct *mm = vma->vm_mm;       
5828         unsigned long vm_flags = vma->vm_flag    
5829         pgd_t *pgd;                              
5830         p4d_t *p4d;                              
5831         vm_fault_t ret;                          
5832                                                  
5833         pgd = pgd_offset(mm, address);           
5834         p4d = p4d_alloc(mm, pgd, address);       
5835         if (!p4d)                                
5836                 return VM_FAULT_OOM;             
5837                                                  
5838         vmf.pud = pud_alloc(mm, p4d, address)    
5839         if (!vmf.pud)                            
5840                 return VM_FAULT_OOM;             
5841 retry_pud:                                       
5842         if (pud_none(*vmf.pud) &&                
5843             thp_vma_allowable_order(vma, vm_f    
5844                                 TVA_IN_PF | T    
5845                 ret = create_huge_pud(&vmf);     
5846                 if (!(ret & VM_FAULT_FALLBACK    
5847                         return ret;              
5848         } else {                                 
5849                 pud_t orig_pud = *vmf.pud;       
5850                                                  
5851                 barrier();                       
5852                 if (pud_trans_huge(orig_pud)     
5853                                                  
5854                         /*                       
5855                          * TODO once we suppo    
5856                          * FAULT_FLAG_UNSHARE    
5857                          */                      
5858                         if ((flags & FAULT_FL    
5859                                 ret = wp_huge    
5860                                 if (!(ret & V    
5861                                         retur    
5862                         } else {                 
5863                                 huge_pud_set_    
5864                                 return 0;        
5865                         }                        
5866                 }                                
5867         }                                        
5868                                                  
5869         vmf.pmd = pmd_alloc(mm, vmf.pud, addr    
5870         if (!vmf.pmd)                            
5871                 return VM_FAULT_OOM;             
5872                                                  
5873         /* Huge pud page fault raced with pmd    
5874         if (pud_trans_unstable(vmf.pud))         
5875                 goto retry_pud;                  
5876                                                  
5877         if (pmd_none(*vmf.pmd) &&                
5878             thp_vma_allowable_order(vma, vm_f    
5879                                 TVA_IN_PF | T    
5880                 ret = create_huge_pmd(&vmf);     
5881                 if (!(ret & VM_FAULT_FALLBACK    
5882                         return ret;              
5883         } else {                                 
5884                 vmf.orig_pmd = pmdp_get_lockl    
5885                                                  
5886                 if (unlikely(is_swap_pmd(vmf.    
5887                         VM_BUG_ON(thp_migrati    
5888                                           !is    
5889                         if (is_pmd_migration_    
5890                                 pmd_migration    
5891                         return 0;                
5892                 }                                
5893                 if (pmd_trans_huge(vmf.orig_p    
5894                         if (pmd_protnone(vmf.    
5895                                 return do_hug    
5896                                                  
5897                         if ((flags & (FAULT_F    
5898                             !pmd_write(vmf.or    
5899                                 ret = wp_huge    
5900                                 if (!(ret & V    
5901                                         retur    
5902                         } else {                 
5903                                 huge_pmd_set_    
5904                                 return 0;        
5905                         }                        
5906                 }                                
5907         }                                        
5908                                                  
5909         return handle_pte_fault(&vmf);           
5910 }                                                
5911                                                  
5912 /**                                              
5913  * mm_account_fault - Do page fault accountin    
5914  * @mm: mm from which memcg should be extract    
5915  * @regs: the pt_regs struct pointer.  When s    
5916  *        of perf event counters, but we'll s    
5917  *        the task who triggered this page fa    
5918  * @address: the faulted address.                
5919  * @flags: the fault flags.                      
5920  * @ret: the fault retcode.                      
5921  *                                               
5922  * This will take care of most of the page fa    
5923  * will also include the PERF_COUNT_SW_PAGE_F    
5924  * updates.  However, note that the handling     
5925  * still be in per-arch page fault handlers a    
5926  */                                              
5927 static inline void mm_account_fault(struct mm    
5928                                     unsigned     
5929                                     vm_fault_    
5930 {                                                
5931         bool major;                              
5932                                                  
5933         /* Incomplete faults will be accounte    
5934         if (ret & VM_FAULT_RETRY)                
5935                 return;                          
5936                                                  
5937         /*                                       
5938          * To preserve the behavior of older     
5939          * both successful and failed faults,    
5940          * which ignore failed cases.            
5941          */                                      
5942         count_vm_event(PGFAULT);                 
5943         count_memcg_event_mm(mm, PGFAULT);       
5944                                                  
5945         /*                                       
5946          * Do not account for unsuccessful fa    
5947          * valid).  That includes arch_vma_ac    
5948          * reaching here. So this is not a "t    
5949          * counter.  We should use the hw pro    
5950          */                                      
5951         if (ret & VM_FAULT_ERROR)                
5952                 return;                          
5953                                                  
5954         /*                                       
5955          * We define the fault as a major fau    
5956          * is VM_FAULT_MAJOR, or if it retrie    
5957          * handle it immediately previously).    
5958          */                                      
5959         major = (ret & VM_FAULT_MAJOR) || (fl    
5960                                                  
5961         if (major)                               
5962                 current->maj_flt++;              
5963         else                                     
5964                 current->min_flt++;              
5965                                                  
5966         /*                                       
5967          * If the fault is done for GUP, regs    
5968          * accounting for the per thread faul    
5969          * fault, and we skip the perf event     
5970          */                                      
5971         if (!regs)                               
5972                 return;                          
5973                                                  
5974         if (major)                               
5975                 perf_sw_event(PERF_COUNT_SW_P    
5976         else                                     
5977                 perf_sw_event(PERF_COUNT_SW_P    
5978 }                                                
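
/*
 * A minimal userspace sketch of watching the counters updated above:
 * the per-task maj_flt/min_flt values are reported by getrusage() as
 * ru_majflt/ru_minflt.  Touching fresh anonymous memory should show up
 * as minor faults; exact numbers depend on the environment.
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
	struct rusage before, after;
	size_t len = 64UL * 4096;
	char *p;

	getrusage(RUSAGE_SELF, &before);
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 1, len);		/* fault in every page */
	getrusage(RUSAGE_SELF, &after);
	printf("minor: +%ld, major: +%ld\n",
	       after.ru_minflt - before.ru_minflt,
	       after.ru_majflt - before.ru_majflt);
	munmap(p, len);
	return 0;
}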
5979                                                  
5980 #ifdef CONFIG_LRU_GEN                            
5981 static void lru_gen_enter_fault(struct vm_are    
5982 {                                                
5983         /* the LRU algorithm only applies to     
5984         current->in_lru_fault = vma_has_recen    
5985 }                                                
5986                                                  
5987 static void lru_gen_exit_fault(void)             
5988 {                                                
5989         current->in_lru_fault = false;           
5990 }                                                
5991 #else                                            
5992 static void lru_gen_enter_fault(struct vm_are    
5993 {                                                
5994 }                                                
5995                                                  
5996 static void lru_gen_exit_fault(void)             
5997 {                                                
5998 }                                                
5999 #endif /* CONFIG_LRU_GEN */                      
6000                                                  
6001 static vm_fault_t sanitize_fault_flags(struct    
6002                                        unsign    
6003 {                                                
6004         if (unlikely(*flags & FAULT_FLAG_UNSH    
6005                 if (WARN_ON_ONCE(*flags & FAU    
6006                         return VM_FAULT_SIGSE    
6007                 /*                               
6008                  * FAULT_FLAG_UNSHARE only ap    
6009                  * just treat it like an ordi    
6010                  */                              
6011                 if (!is_cow_mapping(vma->vm_f    
6012                         *flags &= ~FAULT_FLAG    
6013         } else if (*flags & FAULT_FLAG_WRITE)    
6014                 /* Write faults on read-only     
6015                 if (WARN_ON_ONCE(!(vma->vm_fl    
6016                         return VM_FAULT_SIGSE    
6017                 /* ... and FOLL_FORCE only ap    
6018                 if (WARN_ON_ONCE(!(vma->vm_fl    
6019                                  !is_cow_mapp    
6020                         return VM_FAULT_SIGSE    
6021         }                                        
6022 #ifdef CONFIG_PER_VMA_LOCK                       
6023         /*                                       
6024          * Per-VMA locks can't be used with F    
6025          * the assumption that lock is droppe    
6026          */                                      
6027         if (WARN_ON_ONCE((*flags &               
6028                         (FAULT_FLAG_VMA_LOCK     
6029                         (FAULT_FLAG_VMA_LOCK     
6030                 return VM_FAULT_SIGSEGV;         
6031 #endif                                           
6032                                                  
6033         return 0;                                
6034 }                                                
6035                                                  
6036 /*                                               
6037  * By the time we get here, we already hold t    
6038  *                                               
6039  * The mmap_lock may have been released depen    
6040  * return value.  See filemap_fault() and __f    
6041  */                                              
6042 vm_fault_t handle_mm_fault(struct vm_area_str    
6043                            unsigned int flags    
6044 {                                                
6045         /* If the fault handler drops the mma    
6046         struct mm_struct *mm = vma->vm_mm;       
6047         vm_fault_t ret;                          
6048         bool is_droppable;                       
6049                                                  
6050         __set_current_state(TASK_RUNNING);       
6051                                                  
6052         ret = sanitize_fault_flags(vma, &flag    
6053         if (ret)                                 
6054                 goto out;                        
6055                                                  
6056         if (!arch_vma_access_permitted(vma, f    
6057                                             f    
6058                                             f    
6059                 ret = VM_FAULT_SIGSEGV;          
6060                 goto out;                        
6061         }                                        
6062                                                  
6063         is_droppable = !!(vma->vm_flags & VM_    
6064                                                  
6065         /*                                       
6066          * Enable the memcg OOM handling for     
6067          * space.  Kernel faults are handled     
6068          */                                      
6069         if (flags & FAULT_FLAG_USER)             
6070                 mem_cgroup_enter_user_fault()    
6071                                                  
6072         lru_gen_enter_fault(vma);                
6073                                                  
6074         if (unlikely(is_vm_hugetlb_page(vma))    
6075                 ret = hugetlb_fault(vma->vm_m    
6076         else                                     
6077                 ret = __handle_mm_fault(vma,     
6078                                                  
6079         /*                                       
6080          * Warning: It is no longer safe to d    
6081          * because mmap_lock might have been     
6082          * vma might be destroyed from undern    
6083          */                                      
6084                                                  
6085         lru_gen_exit_fault();                    
6086                                                  
6087         /* If the mapping is droppable, then     
6088         if (is_droppable)                        
6089                 ret &= ~VM_FAULT_OOM;            
6090                                                  
6091         if (flags & FAULT_FLAG_USER) {           
6092                 mem_cgroup_exit_user_fault();    
6093                 /*                               
6094                  * The task may have entered     
6095                  * if the allocation error wa    
6096                  * VM_FAULT_OOM), there is no    
6097                  * Just clean up the OOM stat    
6098                  */                              
6099                 if (task_in_memcg_oom(current    
6100                         mem_cgroup_oom_synchr    
6101         }                                        
6102 out:                                             
6103         mm_account_fault(mm, regs, address, f    
6104                                                  
6105         return ret;                              
6106 }                                                
6107 EXPORT_SYMBOL_GPL(handle_mm_fault);              
6108                                                  
6109 #ifdef CONFIG_LOCK_MM_AND_FIND_VMA               
6110 #include <linux/extable.h>                       
6111                                                  
6112 static inline bool get_mmap_lock_carefully(st    
6113 {                                                
6114         if (likely(mmap_read_trylock(mm)))       
6115                 return true;                     
6116                                                  
6117         if (regs && !user_mode(regs)) {          
6118                 unsigned long ip = exception_    
6119                 if (!search_exception_tables(    
6120                         return false;            
6121         }                                        
6122                                                  
6123         return !mmap_read_lock_killable(mm);     
6124 }                                                
6125                                                  
6126 static inline bool mmap_upgrade_trylock(struc    
6127 {                                                
6128         /*                                       
6129          * We don't have this operation yet.     
6130          *                                       
6131          * It should be easy enough to do: it    
6132          *    atomic_long_try_cmpxchg_acquire    
6133          * from RWSEM_READER_BIAS -> RWSEM_WR    
6134          * it also needs the proper lockdep m    
6135          */                                      
6136         return false;                            
6137 }                                                
6138                                                  
6139 static inline bool upgrade_mmap_lock_carefull    
6140 {                                                
6141         mmap_read_unlock(mm);                    
6142         if (regs && !user_mode(regs)) {          
6143                 unsigned long ip = exception_    
6144                 if (!search_exception_tables(    
6145                         return false;            
6146         }                                        
6147         return !mmap_write_lock_killable(mm);    
6148 }                                                
6149                                                  
6150 /*                                               
6151  * Helper for page fault handling.               
6152  *                                               
6153  * This is kind of equivalent to "mmap_read_lock()" followed
6154  * by "find_extend_vma()", except it's a lot     
6155  * the locking (and will drop the lock on fai    
6156  *                                               
6157  * For example, if we have a kernel bug that     
6158  * fault, we don't want to just use mmap_read    
6159  * the mm lock, because that would deadlock i    
6160  * to happen while we're holding the mm lock     
6161  *                                               
6162  * So this checks the exception tables on ker    
6163  * order to only do this all for instructions    
6164  * expected to fault.                            
6165  *                                               
6166  * We can also actually take the mm lock for     
6167  * need to extend the vma, which helps the VM    
6168  */                                              
6169 struct vm_area_struct *lock_mm_and_find_vma(s    
6170                         unsigned long addr, s    
6171 {                                                
6172         struct vm_area_struct *vma;              
6173                                                  
6174         if (!get_mmap_lock_carefully(mm, regs    
6175                 return NULL;                     
6176                                                  
6177         vma = find_vma(mm, addr);                
6178         if (likely(vma && (vma->vm_start <= a    
6179                 return vma;                      
6180                                                  
6181         /*                                       
6182          * Well, dang. We might still be succ    
6183          * if we can extend a vma to do so.      
6184          */                                      
6185         if (!vma || !(vma->vm_flags & VM_GROW    
6186                 mmap_read_unlock(mm);            
6187                 return NULL;                     
6188         }                                        
6189                                                  
6190         /*                                       
6191          * We can try to upgrade the mmap loc    
6192          * in which case we can continue to u    
6193          * we already looked up.                 
6194          *                                       
6195          * Otherwise we'll have to drop the m    
6196          * re-take it, and also look up the v    
6197          * re-checking it.                       
6198          */                                      
6199         if (!mmap_upgrade_trylock(mm)) {         
6200                 if (!upgrade_mmap_lock_carefu    
6201                         return NULL;             
6202                                                  
6203                 vma = find_vma(mm, addr);        
6204                 if (!vma)                        
6205                         goto fail;               
6206                 if (vma->vm_start <= addr)       
6207                         goto success;            
6208                 if (!(vma->vm_flags & VM_GROW    
6209                         goto fail;               
6210         }                                        
6211                                                  
6212         if (expand_stack_locked(vma, addr))      
6213                 goto fail;                       
6214                                                  
6215 success:                                         
6216         mmap_write_downgrade(mm);                
6217         return vma;                              
6218                                                  
6219 fail:                                            
6220         mmap_write_unlock(mm);                   
6221         return NULL;                             
6222 }                                                
6223 #endif                                           
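
/*
 * Illustrative sketch, not part of mm/memory.c: roughly how an
 * architecture page-fault handler is expected to call
 * lock_mm_and_find_vma().  The function name and error handling below
 * are hypothetical.  On success the helper returns with the mmap lock
 * held for reading, so the caller drops it after handle_mm_fault()
 * unless the fault path already released it on VM_FAULT_RETRY.
 */
static void example_arch_page_fault(struct pt_regs *regs, unsigned long address)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_mm_and_find_vma(mm, address, regs);
	if (unlikely(!vma)) {
		/* No usable VMA, or the lock could not be taken safely. */
		return;		/* a real handler would deliver SIGSEGV here */
	}

	fault = handle_mm_fault(vma, address, FAULT_FLAG_DEFAULT, regs);
	if (!(fault & VM_FAULT_RETRY))
		mmap_read_unlock(mm);
}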
6224                                                  
6225 #ifdef CONFIG_PER_VMA_LOCK                       
6226 /*                                               
6227  * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
6228  * stable and not isolated. If the VMA is not found or is being modified the
6229  * function returns NULL.                        
6230  */                                              
6231 struct vm_area_struct *lock_vma_under_rcu(str    
6232                                           uns    
6233 {                                                
6234         MA_STATE(mas, &mm->mm_mt, address, ad    
6235         struct vm_area_struct *vma;              
6236                                                  
6237         rcu_read_lock();                         
6238 retry:                                           
6239         vma = mas_walk(&mas);                    
6240         if (!vma)                                
6241                 goto inval;                      
6242                                                  
6243         if (!vma_start_read(vma))                
6244                 goto inval;                      
6245                                                  
6246         /* Check if the VMA got isolated afte    
6247         if (vma->detached) {                     
6248                 vma_end_read(vma);               
6249                 count_vm_vma_lock_event(VMA_L    
6250                 /* The area was replaced with    
6251                 goto retry;                      
6252         }                                        
6253         /*                                       
6254          * At this point, we have a stable re    
6255          * locked and we know it hasn't alrea    
6256          * From here on, we can access the VM    
6257          * fields are accessible for RCU read    
6258          */                                      
6259                                                  
6260         /* Check since vm_start/vm_end might     
6261         if (unlikely(address < vma->vm_start     
6262                 goto inval_end_read;             
6263                                                  
6264         rcu_read_unlock();                       
6265         return vma;                              
6266                                                  
6267 inval_end_read:                                  
6268         vma_end_read(vma);                       
6269 inval:                                           
6270         rcu_read_unlock();                       
6271         count_vm_vma_lock_event(VMA_LOCK_ABOR    
6272         return NULL;                             
6273 }                                                
6274 #endif /* CONFIG_PER_VMA_LOCK */                 
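
/*
 * Illustrative sketch, not part of mm/memory.c: the intended calling
 * pattern for lock_vma_under_rcu() in an architecture fault handler
 * before falling back to the mmap_lock path.  The wrapper name is
 * hypothetical, and a real handler would also check vma->vm_flags
 * against the access type first.  handle_mm_fault() drops the per-VMA
 * lock itself when it returns VM_FAULT_RETRY or VM_FAULT_COMPLETED.
 */
static vm_fault_t example_vma_locked_fault(struct mm_struct *mm,
					   unsigned long address,
					   struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		return VM_FAULT_RETRY;	/* take the slow mmap_lock path */

	fault = handle_mm_fault(vma, address,
				FAULT_FLAG_DEFAULT | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	return fault;
}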
6275                                                  
6276 #ifndef __PAGETABLE_P4D_FOLDED                   
6277 /*                                               
6278  * Allocate p4d page table.                      
6279  * We've already handled the fast-path in-lin    
6280  */                                              
6281 int __p4d_alloc(struct mm_struct *mm, pgd_t *    
6282 {                                                
6283         p4d_t *new = p4d_alloc_one(mm, addres    
6284         if (!new)                                
6285                 return -ENOMEM;                  
6286                                                  
6287         spin_lock(&mm->page_table_lock);         
6288         if (pgd_present(*pgd)) {        /* An    
6289                 p4d_free(mm, new);               
6290         } else {                                 
6291                 smp_wmb(); /* See comment in     
6292                 pgd_populate(mm, pgd, new);      
6293         }                                        
6294         spin_unlock(&mm->page_table_lock);       
6295         return 0;                                
6296 }                                                
6297 #endif /* __PAGETABLE_P4D_FOLDED */              
6298                                                  
6299 #ifndef __PAGETABLE_PUD_FOLDED                   
6300 /*                                               
6301  * Allocate page upper directory.                
6302  * We've already handled the fast-path in-lin    
6303  */                                              
6304 int __pud_alloc(struct mm_struct *mm, p4d_t *    
6305 {                                                
6306         pud_t *new = pud_alloc_one(mm, addres    
6307         if (!new)                                
6308                 return -ENOMEM;                  
6309                                                  
6310         spin_lock(&mm->page_table_lock);         
6311         if (!p4d_present(*p4d)) {                
6312                 mm_inc_nr_puds(mm);              
6313                 smp_wmb(); /* See comment in     
6314                 p4d_populate(mm, p4d, new);      
6315         } else  /* Another has populated it *    
6316                 pud_free(mm, new);               
6317         spin_unlock(&mm->page_table_lock);       
6318         return 0;                                
6319 }                                                
6320 #endif /* __PAGETABLE_PUD_FOLDED */              
6321                                                  
6322 #ifndef __PAGETABLE_PMD_FOLDED                   
6323 /*                                               
6324  * Allocate page middle directory.               
6325  * We've already handled the fast-path in-lin    
6326  */                                              
6327 int __pmd_alloc(struct mm_struct *mm, pud_t *    
6328 {                                                
6329         spinlock_t *ptl;                         
6330         pmd_t *new = pmd_alloc_one(mm, addres    
6331         if (!new)                                
6332                 return -ENOMEM;                  
6333                                                  
6334         ptl = pud_lock(mm, pud);                 
6335         if (!pud_present(*pud)) {                
6336                 mm_inc_nr_pmds(mm);              
6337                 smp_wmb(); /* See comment in     
6338                 pud_populate(mm, pud, new);      
6339         } else {        /* Another has popula    
6340                 pmd_free(mm, new);               
6341         }                                        
6342         spin_unlock(ptl);                        
6343         return 0;                                
6344 }                                                
6345 #endif /* __PAGETABLE_PMD_FOLDED */              
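
/*
 * Illustrative sketch, not part of mm/memory.c: the slow-path
 * allocators above are normally reached through the inline fast paths
 * p4d_alloc()/pud_alloc()/pmd_alloc(), which only call __p4d_alloc()
 * etc. when the corresponding upper-level entry is still empty.  A
 * hypothetical walk that materialises page tables down to the PMD
 * level therefore looks roughly like this:
 */
static pmd_t *example_walk_and_alloc_pmd(struct mm_struct *mm,
					 unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d = p4d_alloc(mm, pgd, addr);
	pud_t *pud;

	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	/* NULL here means -ENOMEM from one of the slow paths above. */
	return pmd_alloc(mm, pud, addr);
}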
6346                                                  
6347 static inline void pfnmap_args_setup(struct f    
6348                                      spinlock    
6349                                      pgprot_t    
6350                                      unsigned    
6351                                      bool spe    
6352 {                                                
6353         args->lock = lock;                       
6354         args->ptep = ptep;                       
6355         args->pfn = pfn_base + ((args->addres    
6356         args->pgprot = pgprot;                   
6357         args->writable = writable;               
6358         args->special = special;                 
6359 }                                                
6360                                                  
6361 static inline void pfnmap_lockdep_assert(stru    
6362 {                                                
6363 #ifdef CONFIG_LOCKDEP                            
6364         struct file *file = vma->vm_file;        
6365         struct address_space *mapping = file     
6366                                                  
6367         if (mapping)                             
6368                 lockdep_assert(lockdep_is_hel    
6369                                lockdep_is_hel    
6370         else                                     
6371                 lockdep_assert(lockdep_is_hel    
6372 #endif                                           
6373 }                                                
6374                                                  
6375 /**                                              
6376  * follow_pfnmap_start() - Look up a pfn mapp    
6377  * @args: Pointer to struct @follow_pfnmap_ar    
6378  *                                               
6379  * The caller needs to setup args->vma and ar    
6380  * virtual address as the target of such look    
6381  * the results will be put into other output     
6382  *                                               
6383  * After the caller finished using the fields    
6384  * another follow_pfnmap_end() to proper rele    
6385  * of such look up request.                      
6386  *                                               
6387  * During the start() and end() calls, the re    
6388  * as proper locks will be held.  After the e    
6389  * in @follow_pfnmap_args will be invalid to     
6390  * use of such information after end() may re    
6391  * by the caller with page table updates, oth    
6392  * security bug.                                 
6393  *                                               
6394  * If the PTE maps a refcounted page, callers    
6395  * against invalidation with MMU notifiers; o    
6396  * a later point in time can trigger use-afte    
6397  *                                               
6398  * Only IO mappings and raw PFN mappings are     
6399  * should be taken for read, and the mmap sem    
6400  * before the end() is invoked.                  
6401  *                                               
6402  * This function must not be used to modify P    
6403  *                                               
6404  * Return: zero on success, negative otherwis    
6405  */                                              
6406 int follow_pfnmap_start(struct follow_pfnmap_    
6407 {                                                
6408         struct vm_area_struct *vma = args->vm    
6409         unsigned long address = args->address    
6410         struct mm_struct *mm = vma->vm_mm;       
6411         spinlock_t *lock;                        
6412         pgd_t *pgdp;                             
6413         p4d_t *p4dp, p4d;                        
6414         pud_t *pudp, pud;                        
6415         pmd_t *pmdp, pmd;                        
6416         pte_t *ptep, pte;                        
6417                                                  
6418         pfnmap_lockdep_assert(vma);              
6419                                                  
6420         if (unlikely(address < vma->vm_start     
6421                 goto out;                        
6422                                                  
6423         if (!(vma->vm_flags & (VM_IO | VM_PFN    
6424                 goto out;                        
6425 retry:                                           
6426         pgdp = pgd_offset(mm, address);          
6427         if (pgd_none(*pgdp) || unlikely(pgd_b    
6428                 goto out;                        
6429                                                  
6430         p4dp = p4d_offset(pgdp, address);        
6431         p4d = READ_ONCE(*p4dp);                  
6432         if (p4d_none(p4d) || unlikely(p4d_bad    
6433                 goto out;                        
6434                                                  
6435         pudp = pud_offset(p4dp, address);        
6436         pud = READ_ONCE(*pudp);                  
6437         if (pud_none(pud))                       
6438                 goto out;                        
6439         if (pud_leaf(pud)) {                     
6440                 lock = pud_lock(mm, pudp);       
6441                 if (!unlikely(pud_leaf(pud)))    
6442                         spin_unlock(lock);       
6443                         goto retry;              
6444                 }                                
6445                 pfnmap_args_setup(args, lock,    
6446                                   pud_pfn(pud    
6447                                   pud_special    
6448                 return 0;                        
6449         }                                        
6450                                                  
6451         pmdp = pmd_offset(pudp, address);        
6452         pmd = pmdp_get_lockless(pmdp);           
6453         if (pmd_leaf(pmd)) {                     
6454                 lock = pmd_lock(mm, pmdp);       
6455                 if (!unlikely(pmd_leaf(pmd)))    
6456                         spin_unlock(lock);       
6457                         goto retry;              
6458                 }                                
6459                 pfnmap_args_setup(args, lock,    
6460                                   pmd_pfn(pmd    
6461                                   pmd_special    
6462                 return 0;                        
6463         }                                        
6464                                                  
6465         ptep = pte_offset_map_lock(mm, pmdp,     
6466         if (!ptep)                               
6467                 goto out;                        
6468         pte = ptep_get(ptep);                    
6469         if (!pte_present(pte))                   
6470                 goto unlock;                     
6471         pfnmap_args_setup(args, lock, ptep, p    
6472                           pte_pfn(pte), PAGE_    
6473                           pte_special(pte));     
6474         return 0;                                
6475 unlock:                                          
6476         pte_unmap_unlock(ptep, lock);            
6477 out:                                             
6478         return -EINVAL;                          
6479 }                                                
6480 EXPORT_SYMBOL_GPL(follow_pfnmap_start);          
6481                                                  
6482 /**                                              
6483  * follow_pfnmap_end(): End a follow_pfnmap_s    
6484  * @args: Pointer to struct @follow_pfnmap_ar    
6485  *                                               
6486  * Must be used in pair of follow_pfnmap_star    
6487  * above for more information.                   
6488  */                                              
6489 void follow_pfnmap_end(struct follow_pfnmap_a    
6490 {                                                
6491         if (args->lock)                          
6492                 spin_unlock(args->lock);         
6493         if (args->ptep)                          
6494                 pte_unmap(args->ptep);           
6495 }                                                
6496 EXPORT_SYMBOL_GPL(follow_pfnmap_end);            
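
/*
 * Illustrative sketch, not part of mm/memory.c: the expected
 * start()/end() pairing for a PFN lookup on a VM_IO/VM_PFNMAP vma,
 * with the mmap lock already held for read by the caller.  The wrapper
 * name is hypothetical; the output fields are copied out before
 * follow_pfnmap_end() because they are only stable in between.
 */
static int example_lookup_pfn(struct vm_area_struct *vma, unsigned long addr,
			      unsigned long *pfn, pgprot_t *prot)
{
	struct follow_pfnmap_args args = { .vma = vma, .address = addr };

	if (follow_pfnmap_start(&args))
		return -EINVAL;

	*pfn = args.pfn;
	*prot = args.pgprot;

	follow_pfnmap_end(&args);
	return 0;
}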
6497                                                  
6498 #ifdef CONFIG_HAVE_IOREMAP_PROT                  
6499 /**                                              
6500  * generic_access_phys - generic implementati    
6501  * @vma: the vma to access                       
6502  * @addr: userspace address, not relative off    
6503  * @buf: buffer to read/write                    
6504  * @len: length of transfer                      
6505  * @write: set to FOLL_WRITE when writing, ot    
6506  *                                               
6507  * This is a generic implementation for &vm_o    
6508  * iomem mapping. This callback is used by ac    
6509  * not page based.                               
6510  */                                              
6511 int generic_access_phys(struct vm_area_struct    
6512                         void *buf, int len, i    
6513 {                                                
6514         resource_size_t phys_addr;               
6515         unsigned long prot = 0;                  
6516         void __iomem *maddr;                     
6517         int offset = offset_in_page(addr);       
6518         int ret = -EINVAL;                       
6519         bool writable;                           
6520         struct follow_pfnmap_args args = { .v    
6521                                                  
6522 retry:                                           
6523         if (follow_pfnmap_start(&args))          
6524                 return -EINVAL;                  
6525         prot = pgprot_val(args.pgprot);          
6526         phys_addr = (resource_size_t)args.pfn    
6527         writable = args.writable;                
6528         follow_pfnmap_end(&args);                
6529                                                  
6530         if ((write & FOLL_WRITE) && !writable    
6531                 return -EINVAL;                  
6532                                                  
6533         maddr = ioremap_prot(phys_addr, PAGE_    
6534         if (!maddr)                              
6535                 return -ENOMEM;                  
6536                                                  
6537         if (follow_pfnmap_start(&args))          
6538                 goto out_unmap;                  
6539                                                  
6540         if ((prot != pgprot_val(args.pgprot))    
6541             (phys_addr != (args.pfn << PAGE_S    
6542             (writable != args.writable)) {       
6543                 follow_pfnmap_end(&args);        
6544                 iounmap(maddr);                  
6545                 goto retry;                      
6546         }                                        
6547                                                  
6548         if (write)                               
6549                 memcpy_toio(maddr + offset, b    
6550         else                                     
6551                 memcpy_fromio(buf, maddr + of    
6552         ret = len;                               
6553         follow_pfnmap_end(&args);                
6554 out_unmap:                                       
6555         iounmap(maddr);                          
6556                                                  
6557         return ret;                              
6558 }                                                
6559 EXPORT_SYMBOL_GPL(generic_access_phys);          
6560 #endif                                           
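
/*
 * Illustrative sketch, not part of mm/memory.c: a driver that maps MMIO
 * into userspace (e.g. via remap_pfn_range()) can wire up
 * generic_access_phys() as its vm_operations_struct ->access() hook so
 * that access_process_vm()/ptrace can reach the mapping.  The structure
 * name is hypothetical; /dev/mem follows the same pattern.
 */
#ifdef CONFIG_HAVE_IOREMAP_PROT
static const struct vm_operations_struct example_mmio_vm_ops = {
	.access = generic_access_phys,
};
#endif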
6561                                                  
6562 /*                                               
6563  * Access another process' address space as g    
6564  */                                              
6565 static int __access_remote_vm(struct mm_struc    
6566                               void *buf, int     
6567 {                                                
6568         void *old_buf = buf;                     
6569         int write = gup_flags & FOLL_WRITE;      
6570                                                  
6571         if (mmap_read_lock_killable(mm))         
6572                 return 0;                        
6573                                                  
6574         /* Untag the address before looking u    
6575         addr = untagged_addr_remote(mm, addr)    
6576                                                  
6577         /* Avoid triggering the temporary war    
6578         if (!vma_lookup(mm, addr) && !expand_    
6579                 return 0;                        
6580                                                  
6581         /* ignore errors, just check how much    
6582         while (len) {                            
6583                 int bytes, offset;               
6584                 void *maddr;                     
6585                 struct vm_area_struct *vma =     
6586                 struct page *page = get_user_    
6587                                                  
6588                                                  
6589                 if (IS_ERR(page)) {              
6590                         /* We might need to e    
6591                         vma = vma_lookup(mm,     
6592                         if (!vma) {              
6593                                 vma = expand_    
6594                                                  
6595                                 /* mmap_lock     
6596                                 if (!vma)        
6597                                         retur    
6598                                                  
6599                                 /* Try again     
6600                                 continue;        
6601                         }                        
6602                                                  
6603                         /*                       
6604                          * Check if this is a    
6605                          * we can access usin    
6606                          */                      
6607                         bytes = 0;               
6608 #ifdef CONFIG_HAVE_IOREMAP_PROT                  
6609                         if (vma->vm_ops && vm    
6610                                 bytes = vma->    
6611                                                  
6612 #endif                                           
6613                         if (bytes <= 0)          
6614                                 break;           
6615                 } else {                         
6616                         bytes = len;             
6617                         offset = addr & (PAGE    
6618                         if (bytes > PAGE_SIZE    
6619                                 bytes = PAGE_    
6620                                                  
6621                         maddr = kmap_local_pa    
6622                         if (write) {             
6623                                 copy_to_user_    
6624                                                  
6625                                 set_page_dirt    
6626                         } else {                 
6627                                 copy_from_use    
6628                                                  
6629                         }                        
6630                         unmap_and_put_page(pa    
6631                 }                                
6632                 len -= bytes;                    
6633                 buf += bytes;                    
6634                 addr += bytes;                   
6635         }                                        
6636         mmap_read_unlock(mm);                    
6637                                                  
6638         return buf - old_buf;                    
6639 }                                                
6640                                                  
6641 /**                                              
6642  * access_remote_vm - access another process'    
6643  * @mm:         the mm_struct of the target a    
6644  * @addr:       start address to access          
6645  * @buf:        source or destination buffer     
6646  * @len:        number of bytes to transfer      
6647  * @gup_flags:  flags modifying lookup behavi    
6648  *                                               
6649  * The caller must hold a reference on @mm.      
6650  *                                               
6651  * Return: number of bytes copied from source    
6652  */                                              
6653 int access_remote_vm(struct mm_struct *mm, un    
6654                 void *buf, int len, unsigned     
6655 {                                                
6656         return __access_remote_vm(mm, addr, b    
6657 }                                                
6658                                                  
6659 /*                                               
6660  * Access another process' address space.        
6661  * Source/target buffer must be kernel space,    
6662  * Do not walk the page table directly, use g    
6663  */                                              
6664 int access_process_vm(struct task_struct *tsk    
6665                 void *buf, int len, unsigned     
6666 {                                                
6667         struct mm_struct *mm;                    
6668         int ret;                                 
6669                                                  
6670         mm = get_task_mm(tsk);                   
6671         if (!mm)                                 
6672                 return 0;                        
6673                                                  
6674         ret = __access_remote_vm(mm, addr, bu    
6675                                                  
6676         mmput(mm);                               
6677                                                  
6678         return ret;                              
6679 }                                                
6680 EXPORT_SYMBOL_GPL(access_process_vm);            
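
/*
 * Illustrative sketch, not part of mm/memory.c: reading a word from
 * another task's address space the way ptrace(PTRACE_PEEKDATA) does,
 * through access_process_vm().  The wrapper name is hypothetical; the
 * return value of access_process_vm() is the number of bytes copied.
 */
static int example_read_remote_word(struct task_struct *tsk,
				    unsigned long addr, unsigned long *val)
{
	int copied;

	copied = access_process_vm(tsk, addr, val, sizeof(*val), FOLL_FORCE);
	return copied == sizeof(*val) ? 0 : -EIO;
}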
6681                                                  
6682 /*                                               
6683  * Print the name of a VMA.                      
6684  */                                              
6685 void print_vma_addr(char *prefix, unsigned lo    
6686 {                                                
6687         struct mm_struct *mm = current->mm;      
6688         struct vm_area_struct *vma;              
6689                                                  
6690         /*                                       
6691          * we might be running from an atomic    
6692          */                                      
6693         if (!mmap_read_trylock(mm))              
6694                 return;                          
6695                                                  
6696         vma = vma_lookup(mm, ip);                
6697         if (vma && vma->vm_file) {               
6698                 struct file *f = vma->vm_file    
6699                 ip -= vma->vm_start;             
6700                 ip += vma->vm_pgoff << PAGE_S    
6701                 printk("%s%pD[%lx,%lx+%lx]",     
6702                                 vma->vm_start    
6703                                 vma->vm_end -    
6704         }                                        
6705         mmap_read_unlock(mm);                    
6706 }                                                
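
/*
 * Illustrative sketch, not part of mm/memory.c: how architecture
 * signal/fault code typically uses print_vma_addr() to name the mapping
 * that an instruction pointer falls in (the function is hypothetical).
 */
static void example_report_bad_ip(struct pt_regs *regs)
{
	pr_info("%s[%d]: unhandled fault at ip %lx",
		current->comm, task_pid_nr(current),
		instruction_pointer(regs));
	print_vma_addr(KERN_CONT " in ", instruction_pointer(regs));
	pr_cont("\n");
}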
6707                                                  
6708 #if defined(CONFIG_PROVE_LOCKING) || defined(    
6709 void __might_fault(const char *file, int line    
6710 {                                                
6711         if (pagefault_disabled())                
6712                 return;                          
6713         __might_sleep(file, line);               
6714 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP)           
6715         if (current->mm)                         
6716                 might_lock_read(&current->mm-    
6717 #endif                                           
6718 }                                                
6719 EXPORT_SYMBOL(__might_fault);                    
6720 #endif                                           
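
/*
 * Illustrative sketch, not part of mm/memory.c: the kind of bug
 * __might_fault() exists to catch.  With CONFIG_DEBUG_ATOMIC_SLEEP,
 * calling a faulting uaccess helper under a spinlock warns, because
 * resolving the fault could sleep on mmap_lock.  Hypothetical example:
 */
static int example_buggy_copy(spinlock_t *lock, void *dst,
			      const void __user *src, size_t len)
{
	unsigned long left;

	spin_lock(lock);
	left = copy_from_user(dst, src, len);	/* might_fault() splat here */
	spin_unlock(lock);
	return left ? -EFAULT : 0;
}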
6721                                                  
6722 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || d    
6723 /*                                               
6724  * Process all subpages of the specified huge    
6725  * operation.  The target subpage will be pro    
6726  * cache lines hot.                              
6727  */                                              
6728 static inline int process_huge_page(             
6729         unsigned long addr_hint, unsigned int    
6730         int (*process_subpage)(unsigned long     
6731         void *arg)                               
6732 {                                                
6733         int i, n, base, l, ret;                  
6734         unsigned long addr = addr_hint &         
6735                 ~(((unsigned long)nr_pages <<    
6736                                                  
6737         /* Process target subpage last to kee    
6738         might_sleep();                           
6739         n = (addr_hint - addr) / PAGE_SIZE;      
6740         if (2 * n <= nr_pages) {                 
6741                 /* If target subpage in first    
6742                 base = 0;                        
6743                 l = n;                           
6744                 /* Process subpages at the en    
6745                 for (i = nr_pages - 1; i >= 2    
6746                         cond_resched();          
6747                         ret = process_subpage    
6748                         if (ret)                 
6749                                 return ret;      
6750                 }                                
6751         } else {                                 
6752                 /* If target subpage in secon    
6753                 base = nr_pages - 2 * (nr_pag    
6754                 l = nr_pages - n;                
6755                 /* Process subpages at the be    
6756                 for (i = 0; i < base; i++) {     
6757                         cond_resched();          
6758                         ret = process_subpage    
6759                         if (ret)                 
6760                                 return ret;      
6761                 }                                
6762         }                                        
6763         /*                                       
6764          * Process remaining subpages in left    
6765          * towards the target subpage            
6766          */                                      
6767         for (i = 0; i < l; i++) {                
6768                 int left_idx = base + i;         
6769                 int right_idx = base + 2 * l     
6770                                                  
6771                 cond_resched();                  
6772                 ret = process_subpage(addr +     
6773                 if (ret)                         
6774                         return ret;              
6775                 cond_resched();                  
6776                 ret = process_subpage(addr +     
6777                 if (ret)                         
6778                         return ret;              
6779         }                                        
6780         return 0;                                
6781 }                                                
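
/*
 * Illustrative worked example, not part of mm/memory.c: for an 8-page
 * folio with the faulting (target) subpage at index 2, the loops above
 * touch the subpages in the order
 *
 *	7, 6, 5, 4,  0, 3,  1, 2
 *
 * i.e. the far end first, then alternating inwards from both sides of
 * the first half, so the target subpage is processed last and its
 * cache lines stay hot for the caller.
 */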
6782                                                  
6783 static void clear_gigantic_page(struct folio     
6784                                 unsigned int     
6785 {                                                
6786         int i;                                   
6787                                                  
6788         might_sleep();                           
6789         for (i = 0; i < nr_pages; i++) {         
6790                 cond_resched();                  
6791                 clear_user_highpage(folio_pag    
6792         }                                        
6793 }                                                
6794                                                  
6795 static int clear_subpage(unsigned long addr,     
6796 {                                                
6797         struct folio *folio = arg;               
6798                                                  
6799         clear_user_highpage(folio_page(folio,    
6800         return 0;                                
6801 }                                                
6802                                                  
6803 /**                                              
6804  * folio_zero_user - Zero a folio which will     
6805  * @folio: The folio to zero.                    
6806  * @addr_hint: The address will be accessed o    
6807  */                                              
6808 void folio_zero_user(struct folio *folio, uns    
6809 {                                                
6810         unsigned int nr_pages = folio_nr_page    
6811                                                  
6812         if (unlikely(nr_pages > MAX_ORDER_NR_    
6813                 clear_gigantic_page(folio, ad    
6814         else                                     
6815                 process_huge_page(addr_hint,     
6816 }                                                
6817                                                  
6818 static int copy_user_gigantic_page(struct fol    
6819                                    unsigned l    
6820                                    struct vm_    
6821                                    unsigned i    
6822 {                                                
6823         int i;                                   
6824         struct page *dst_page;                   
6825         struct page *src_page;                   
6826                                                  
6827         for (i = 0; i < nr_pages; i++) {         
6828                 dst_page = folio_page(dst, i)    
6829                 src_page = folio_page(src, i)    
6830                                                  
6831                 cond_resched();                  
6832                 if (copy_mc_user_highpage(dst    
6833                                           add    
6834                         return -EHWPOISON;       
6835         }                                        
6836         return 0;                                
6837 }                                                
6838                                                  
6839 struct copy_subpage_arg {                        
6840         struct folio *dst;                       
6841         struct folio *src;                       
6842         struct vm_area_struct *vma;              
6843 };                                               
6844                                                  
6845 static int copy_subpage(unsigned long addr, i    
6846 {                                                
6847         struct copy_subpage_arg *copy_arg = a    
6848         struct page *dst = folio_page(copy_ar    
6849         struct page *src = folio_page(copy_ar    
6850                                                  
6851         if (copy_mc_user_highpage(dst, src, a    
6852                 return -EHWPOISON;               
6853         return 0;                                
6854 }                                                
6855                                                  
6856 int copy_user_large_folio(struct folio *dst,     
6857                           unsigned long addr_    
6858 {                                                
6859         unsigned int nr_pages = folio_nr_page    
6860         struct copy_subpage_arg arg = {          
6861                 .dst = dst,                      
6862                 .src = src,                      
6863                 .vma = vma,                      
6864         };                                       
6865                                                  
6866         if (unlikely(nr_pages > MAX_ORDER_NR_    
6867                 return copy_user_gigantic_pag    
6868                                                  
6869         return process_huge_page(addr_hint, n    
6870 }                                                
6871                                                  
6872 long copy_folio_from_user(struct folio *dst_f    
6873                            const void __user     
6874                            bool allow_pagefau    
6875 {                                                
6876         void *kaddr;                             
6877         unsigned long i, rc = 0;                 
6878         unsigned int nr_pages = folio_nr_page    
6879         unsigned long ret_val = nr_pages * PA    
6880         struct page *subpage;                    
6881                                                  
6882         for (i = 0; i < nr_pages; i++) {         
6883                 subpage = folio_page(dst_foli    
6884                 kaddr = kmap_local_page(subpa    
6885                 if (!allow_pagefault)            
6886                         pagefault_disable();     
6887                 rc = copy_from_user(kaddr, us    
6888                 if (!allow_pagefault)            
6889                         pagefault_enable();      
6890                 kunmap_local(kaddr);             
6891                                                  
6892                 ret_val -= (PAGE_SIZE - rc);     
6893                 if (rc)                          
6894                         break;                   
6895                                                  
6896                 flush_dcache_page(subpage);      
6897                                                  
6898                 cond_resched();                  
6899         }                                        
6900         return ret_val;                          
6901 }                                                
6902 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONF    
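
/*
 * Illustrative sketch, not part of mm/memory.c: copy_folio_from_user()
 * follows copy_from_user() conventions and returns the number of bytes
 * that could *not* be copied.  A hypothetical caller that does not want
 * to retry with page faults enabled would treat any non-zero return as
 * a failure:
 */
static int example_fill_folio(struct folio *dst, const void __user *usr_src)
{
	long left = copy_folio_from_user(dst, usr_src, false);

	return left ? -EFAULT : 0;
}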
6903                                                  
6904 #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLO    
6905                                                  
6906 static struct kmem_cache *page_ptl_cachep;       
6907                                                  
6908 void __init ptlock_cache_init(void)              
6909 {                                                
6910         page_ptl_cachep = kmem_cache_create("    
6911                         SLAB_PANIC, NULL);       
6912 }                                                
6913                                                  
6914 bool ptlock_alloc(struct ptdesc *ptdesc)         
6915 {                                                
6916         spinlock_t *ptl;                         
6917                                                  
6918         ptl = kmem_cache_alloc(page_ptl_cache    
6919         if (!ptl)                                
6920                 return false;                    
6921         ptdesc->ptl = ptl;                       
6922         return true;                             
6923 }                                                
6924                                                  
6925 void ptlock_free(struct ptdesc *ptdesc)          
6926 {                                                
6927         kmem_cache_free(page_ptl_cachep, ptde    
6928 }                                                
6929 #endif                                           
6930                                                  
6931 void vma_pgtable_walk_begin(struct vm_area_st    
6932 {                                                
6933         if (is_vm_hugetlb_page(vma))             
6934                 hugetlb_vma_lock_read(vma);      
6935 }                                                
6936                                                  
6937 void vma_pgtable_walk_end(struct vm_area_stru    
6938 {                                                
6939         if (is_vm_hugetlb_page(vma))             
6940                 hugetlb_vma_unlock_read(vma);    
6941 }                                                
6942                                                  
