~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~
huge_memory.c

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~
Diff markup

Differences between /mm/huge_memory.c (Version linux-6.12-rc7) and /mm/huge_memory.c (Version policy-sample)

  1 // SPDX-License-Identifier: GPL-2.0-only            1 
  2 /*                                                
  3  *  Copyright (C) 2009  Red Hat, Inc.             
  4  */                                               
  5                                                   
  6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt       
  7                                                   
  8 #include <linux/mm.h>                             
  9 #include <linux/sched.h>                          
 10 #include <linux/sched/mm.h>                       
 11 #include <linux/sched/coredump.h>                 
 12 #include <linux/sched/numa_balancing.h>           
 13 #include <linux/highmem.h>                        
 14 #include <linux/hugetlb.h>                        
 15 #include <linux/mmu_notifier.h>                   
 16 #include <linux/rmap.h>                           
 17 #include <linux/swap.h>                           
 18 #include <linux/shrinker.h>                       
 19 #include <linux/mm_inline.h>                      
 20 #include <linux/swapops.h>                        
 21 #include <linux/backing-dev.h>                    
 22 #include <linux/dax.h>                            
 23 #include <linux/mm_types.h>                       
 24 #include <linux/khugepaged.h>                     
 25 #include <linux/freezer.h>                        
 26 #include <linux/pfn_t.h>                          
 27 #include <linux/mman.h>                           
 28 #include <linux/memremap.h>                       
 29 #include <linux/pagemap.h>                        
 30 #include <linux/debugfs.h>                        
 31 #include <linux/migrate.h>                        
 32 #include <linux/hashtable.h>                      
 33 #include <linux/userfaultfd_k.h>                  
 34 #include <linux/page_idle.h>                      
 35 #include <linux/shmem_fs.h>                       
 36 #include <linux/oom.h>                            
 37 #include <linux/numa.h>                           
 38 #include <linux/page_owner.h>                     
 39 #include <linux/sched/sysctl.h>                   
 40 #include <linux/memory-tiers.h>                   
 41 #include <linux/compat.h>                         
 42 #include <linux/pgalloc_tag.h>                    
 43 #include <linux/pagewalk.h>                       
 44                                                   
 45 #include <asm/tlb.h>                              
 46 #include <asm/pgalloc.h>                          
 47 #include "internal.h"                             
 48 #include "swap.h"                                 
 49                                                   
 50 #define CREATE_TRACE_POINTS                       
 51 #include <trace/events/thp.h>                     
 52                                                   
 53 /*                                                
 54  * By default, transparent hugepage support is    
 55  * risking an increased memory footprint for a    
 56  * guaranteed to benefit from it. When transpa    
 57  * enabled, it is for all mappings, and khugep    
 58  * Defrag is invoked by khugepaged hugepage al    
 59  * for all hugepage allocations.                  
 60  */                                               
 61 unsigned long transparent_hugepage_flags __rea    
 62 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS         
 63         (1<<TRANSPARENT_HUGEPAGE_FLAG)|           
 64 #endif                                            
 65 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE        
 66         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG    
 67 #endif                                            
 68         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MA    
 69         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEP    
 70         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE    
 71                                                   
 72 static struct shrinker *deferred_split_shrinke    
 73 static unsigned long deferred_split_count(stru    
 74                                           stru    
 75 static unsigned long deferred_split_scan(struc    
 76                                          struc    
 77 static bool split_underused_thp = true;           
 78                                                   
 79 static atomic_t huge_zero_refcount;               
 80 struct folio *huge_zero_folio __read_mostly;      
 81 unsigned long huge_zero_pfn __read_mostly = ~0    
 82 unsigned long huge_anon_orders_always __read_m    
 83 unsigned long huge_anon_orders_madvise __read_    
 84 unsigned long huge_anon_orders_inherit __read_    
 85 static bool anon_orders_configured __initdata;    
 86                                                   
 87 unsigned long __thp_vma_allowable_orders(struc    
 88                                          unsig    
 89                                          unsig    
 90                                          unsig    
 91 {                                                 
 92         bool smaps = tva_flags & TVA_SMAPS;       
 93         bool in_pf = tva_flags & TVA_IN_PF;       
 94         bool enforce_sysfs = tva_flags & TVA_E    
 95         unsigned long supported_orders;           
 96                                                   
 97         /* Check the intersection of requested    
 98         if (vma_is_anonymous(vma))                
 99                 supported_orders = THP_ORDERS_    
100         else if (vma_is_special_huge(vma))        
101                 supported_orders = THP_ORDERS_    
102         else                                      
103                 supported_orders = THP_ORDERS_    
104                                                   
105         orders &= supported_orders;               
106         if (!orders)                              
107                 return 0;                         
108                                                   
109         if (!vma->vm_mm)                /* vds    
110                 return 0;                         
111                                                   
112         if (thp_disabled_by_hw() || vma_thp_di    
113                 return 0;                         
114                                                   
115         /* khugepaged doesn't collapse DAX vma    
116         if (vma_is_dax(vma))                      
117                 return in_pf ? orders : 0;        
118                                                   
119         /*                                        
120          * khugepaged special VMA and hugetlb     
121          * Must be checked after dax since som    
122          * VM_MIXEDMAP set.                       
123          */                                       
124         if (!in_pf && !smaps && (vm_flags & VM    
125                 return 0;                         
126                                                   
127         /*                                        
128          * Check alignment for file vma and si    
129          * filtering out the unsuitable orders    
130          *                                        
131          * Skip the check for page fault. Huge    
132          * handlers.                              
133          */                                       
134         if (!in_pf) {                             
135                 int order = highest_order(orde    
136                 unsigned long addr;               
137                                                   
138                 while (orders) {                  
139                         addr = vma->vm_end - (    
140                         if (thp_vma_suitable_o    
141                                 break;            
142                         order = next_order(&or    
143                 }                                 
144                                                   
145                 if (!orders)                      
146                         return 0;                 
147         }                                         
148                                                   
149         /*                                        
150          * Enabled via shmem mount options or     
151          * Must be done before hugepage flags     
152          * own flags.                             
153          */                                       
154         if (!in_pf && shmem_file(vma->vm_file)    
155                 return shmem_allowable_huge_or    
156                                                   
157                                                   
158                                                   
159         if (!vma_is_anonymous(vma)) {             
160                 /*                                
161                  * Enforce sysfs THP requireme    
162                  * were already handled in thp    
163                  */                               
164                 if (enforce_sysfs &&              
165                     (!hugepage_global_enabled(    
166                                                   
167                         return 0;                 
168                                                   
169                 /*                                
170                  * Trust that ->huge_fault() h    
171                  * in fault path.                 
172                  */                               
173                 if (((in_pf || smaps)) && vma-    
174                         return orders;            
175                 /* Only regular file is valid     
176                 if (((!in_pf || smaps)) && fil    
177                         return orders;            
178                 return 0;                         
179         }                                         
180                                                   
181         if (vma_is_temporary_stack(vma))          
182                 return 0;                         
183                                                   
184         /*                                        
185          * THPeligible bit of smaps should sho    
186          * though anon_vma is not initialized     
187          *                                        
188          * Allow page fault since anon_vma may    
189          * the first page fault.                  
190          */                                       
191         if (!vma->anon_vma)                       
192                 return (smaps || in_pf) ? orde    
193                                                   
194         return orders;                            
195 }                                                 
196                                                   
197 static bool get_huge_zero_page(void)              
198 {                                                 
199         struct folio *zero_folio;                 
200 retry:                                            
201         if (likely(atomic_inc_not_zero(&huge_z    
202                 return true;                      
203                                                   
204         zero_folio = folio_alloc((GFP_TRANSHUG    
205                         HPAGE_PMD_ORDER);         
206         if (!zero_folio) {                        
207                 count_vm_event(THP_ZERO_PAGE_A    
208                 return false;                     
209         }                                         
210         /* Ensure zero folio won't have large_    
211         folio_clear_large_rmappable(zero_folio    
212         preempt_disable();                        
213         if (cmpxchg(&huge_zero_folio, NULL, ze    
214                 preempt_enable();                 
215                 folio_put(zero_folio);            
216                 goto retry;                       
217         }                                         
218         WRITE_ONCE(huge_zero_pfn, folio_pfn(ze    
219                                                   
220         /* We take additional reference here.     
221         atomic_set(&huge_zero_refcount, 2);       
222         preempt_enable();                         
223         count_vm_event(THP_ZERO_PAGE_ALLOC);      
224         return true;                              
225 }                                                 
226                                                   
227 static void put_huge_zero_page(void)              
228 {                                                 
229         /*                                        
230          * Counter should never go to zero her    
231          * last reference.                        
232          */                                       
233         BUG_ON(atomic_dec_and_test(&huge_zero_    
234 }                                                 
235                                                   
236 struct folio *mm_get_huge_zero_folio(struct mm    
237 {                                                 
238         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->    
239                 return READ_ONCE(huge_zero_fol    
240                                                   
241         if (!get_huge_zero_page())                
242                 return NULL;                      
243                                                   
244         if (test_and_set_bit(MMF_HUGE_ZERO_PAG    
245                 put_huge_zero_page();             
246                                                   
247         return READ_ONCE(huge_zero_folio);        
248 }                                                 
249                                                   
250 void mm_put_huge_zero_folio(struct mm_struct *    
251 {                                                 
252         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->    
253                 put_huge_zero_page();             
254 }                                                 
255                                                   
256 static unsigned long shrink_huge_zero_page_cou    
257                                         struct    
258 {                                                 
259         /* we can free zero page only if last     
260         return atomic_read(&huge_zero_refcount    
261 }                                                 
262                                                   
263 static unsigned long shrink_huge_zero_page_sca    
264                                        struct     
265 {                                                 
266         if (atomic_cmpxchg(&huge_zero_refcount    
267                 struct folio *zero_folio = xch    
268                 BUG_ON(zero_folio == NULL);       
269                 WRITE_ONCE(huge_zero_pfn, ~0UL    
270                 folio_put(zero_folio);            
271                 return HPAGE_PMD_NR;              
272         }                                         
273                                                   
274         return 0;                                 
275 }                                                 
276                                                   
277 static struct shrinker *huge_zero_page_shrinke    
278                                                   
279 #ifdef CONFIG_SYSFS                               
280 static ssize_t enabled_show(struct kobject *ko    
281                             struct kobj_attrib    
282 {                                                 
283         const char *output;                       
284                                                   
285         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG    
286                 output = "[always] madvise nev    
287         else if (test_bit(TRANSPARENT_HUGEPAGE    
288                           &transparent_hugepag    
289                 output = "always [madvise] nev    
290         else                                      
291                 output = "always madvise [neve    
292                                                   
293         return sysfs_emit(buf, "%s\n", output)    
294 }                                                 
295                                                   
296 static ssize_t enabled_store(struct kobject *k    
297                              struct kobj_attri    
298                              const char *buf,     
299 {                                                 
300         ssize_t ret = count;                      
301                                                   
302         if (sysfs_streq(buf, "always")) {         
303                 clear_bit(TRANSPARENT_HUGEPAGE    
304                 set_bit(TRANSPARENT_HUGEPAGE_F    
305         } else if (sysfs_streq(buf, "madvise")    
306                 clear_bit(TRANSPARENT_HUGEPAGE    
307                 set_bit(TRANSPARENT_HUGEPAGE_R    
308         } else if (sysfs_streq(buf, "never"))     
309                 clear_bit(TRANSPARENT_HUGEPAGE    
310                 clear_bit(TRANSPARENT_HUGEPAGE    
311         } else                                    
312                 ret = -EINVAL;                    
313                                                   
314         if (ret > 0) {                            
315                 int err = start_stop_khugepage    
316                 if (err)                          
317                         ret = err;                
318         }                                         
319         return ret;                               
320 }                                                 
321                                                   
322 static struct kobj_attribute enabled_attr = __    
323                                                   
324 ssize_t single_hugepage_flag_show(struct kobje    
325                                   struct kobj_    
326                                   enum transpa    
327 {                                                 
328         return sysfs_emit(buf, "%d\n",            
329                           !!test_bit(flag, &tr    
330 }                                                 
331                                                   
332 ssize_t single_hugepage_flag_store(struct kobj    
333                                  struct kobj_a    
334                                  const char *b    
335                                  enum transpar    
336 {                                                 
337         unsigned long value;                      
338         int ret;                                  
339                                                   
340         ret = kstrtoul(buf, 10, &value);          
341         if (ret < 0)                              
342                 return ret;                       
343         if (value > 1)                            
344                 return -EINVAL;                   
345                                                   
346         if (value)                                
347                 set_bit(flag, &transparent_hug    
348         else                                      
349                 clear_bit(flag, &transparent_h    
350                                                   
351         return count;                             
352 }                                                 
353                                                   
354 static ssize_t defrag_show(struct kobject *kob    
355                            struct kobj_attribu    
356 {                                                 
357         const char *output;                       
358                                                   
359         if (test_bit(TRANSPARENT_HUGEPAGE_DEFR    
360                      &transparent_hugepage_fla    
361                 output = "[always] defer defer    
362         else if (test_bit(TRANSPARENT_HUGEPAGE    
363                           &transparent_hugepag    
364                 output = "always [defer] defer    
365         else if (test_bit(TRANSPARENT_HUGEPAGE    
366                           &transparent_hugepag    
367                 output = "always defer [defer+    
368         else if (test_bit(TRANSPARENT_HUGEPAGE    
369                           &transparent_hugepag    
370                 output = "always defer defer+m    
371         else                                      
372                 output = "always defer defer+m    
373                                                   
374         return sysfs_emit(buf, "%s\n", output)    
375 }                                                 
376                                                   
377 static ssize_t defrag_store(struct kobject *ko    
378                             struct kobj_attrib    
379                             const char *buf, s    
380 {                                                 
381         if (sysfs_streq(buf, "always")) {         
382                 clear_bit(TRANSPARENT_HUGEPAGE    
383                 clear_bit(TRANSPARENT_HUGEPAGE    
384                 clear_bit(TRANSPARENT_HUGEPAGE    
385                 set_bit(TRANSPARENT_HUGEPAGE_D    
386         } else if (sysfs_streq(buf, "defer+mad    
387                 clear_bit(TRANSPARENT_HUGEPAGE    
388                 clear_bit(TRANSPARENT_HUGEPAGE    
389                 clear_bit(TRANSPARENT_HUGEPAGE    
390                 set_bit(TRANSPARENT_HUGEPAGE_D    
391         } else if (sysfs_streq(buf, "defer"))     
392                 clear_bit(TRANSPARENT_HUGEPAGE    
393                 clear_bit(TRANSPARENT_HUGEPAGE    
394                 clear_bit(TRANSPARENT_HUGEPAGE    
395                 set_bit(TRANSPARENT_HUGEPAGE_D    
396         } else if (sysfs_streq(buf, "madvise")    
397                 clear_bit(TRANSPARENT_HUGEPAGE    
398                 clear_bit(TRANSPARENT_HUGEPAGE    
399                 clear_bit(TRANSPARENT_HUGEPAGE    
400                 set_bit(TRANSPARENT_HUGEPAGE_D    
401         } else if (sysfs_streq(buf, "never"))     
402                 clear_bit(TRANSPARENT_HUGEPAGE    
403                 clear_bit(TRANSPARENT_HUGEPAGE    
404                 clear_bit(TRANSPARENT_HUGEPAGE    
405                 clear_bit(TRANSPARENT_HUGEPAGE    
406         } else                                    
407                 return -EINVAL;                   
408                                                   
409         return count;                             
410 }                                                 
411 static struct kobj_attribute defrag_attr = __A    
412                                                   
413 static ssize_t use_zero_page_show(struct kobje    
414                                   struct kobj_    
415 {                                                 
416         return single_hugepage_flag_show(kobj,    
417                                          TRANS    
418 }                                                 
419 static ssize_t use_zero_page_store(struct kobj    
420                 struct kobj_attribute *attr, c    
421 {                                                 
422         return single_hugepage_flag_store(kobj    
423                                  TRANSPARENT_H    
424 }                                                 
425 static struct kobj_attribute use_zero_page_att    
426                                                   
427 static ssize_t hpage_pmd_size_show(struct kobj    
428                                    struct kobj    
429 {                                                 
430         return sysfs_emit(buf, "%lu\n", HPAGE_    
431 }                                                 
432 static struct kobj_attribute hpage_pmd_size_at    
433         __ATTR_RO(hpage_pmd_size);                
434                                                   
435 static ssize_t split_underused_thp_show(struct    
436                             struct kobj_attrib    
437 {                                                 
438         return sysfs_emit(buf, "%d\n", split_u    
439 }                                                 
440                                                   
441 static ssize_t split_underused_thp_store(struc    
442                              struct kobj_attri    
443                              const char *buf,     
444 {                                                 
445         int err = kstrtobool(buf, &split_under    
446                                                   
447         if (err < 0)                              
448                 return err;                       
449                                                   
450         return count;                             
451 }                                                 
452                                                   
453 static struct kobj_attribute split_underused_t    
454         shrink_underused, 0644, split_underuse    
455                                                   
456 static struct attribute *hugepage_attr[] = {      
457         &enabled_attr.attr,                       
458         &defrag_attr.attr,                        
459         &use_zero_page_attr.attr,                 
460         &hpage_pmd_size_attr.attr,                
461 #ifdef CONFIG_SHMEM                               
462         &shmem_enabled_attr.attr,                 
463 #endif                                            
464         &split_underused_thp_attr.attr,           
465         NULL,                                     
466 };                                                
467                                                   
468 static const struct attribute_group hugepage_a    
469         .attrs = hugepage_attr,                   
470 };                                                
471                                                   
472 static void hugepage_exit_sysfs(struct kobject    
473 static void thpsize_release(struct kobject *ko    
474 static DEFINE_SPINLOCK(huge_anon_orders_lock);    
475 static LIST_HEAD(thpsize_list);                   
476                                                   
477 static ssize_t anon_enabled_show(struct kobjec    
478                                  struct kobj_a    
479 {                                                 
480         int order = to_thpsize(kobj)->order;      
481         const char *output;                       
482                                                   
483         if (test_bit(order, &huge_anon_orders_    
484                 output = "[always] inherit mad    
485         else if (test_bit(order, &huge_anon_or    
486                 output = "always [inherit] mad    
487         else if (test_bit(order, &huge_anon_or    
488                 output = "always inherit [madv    
489         else                                      
490                 output = "always inherit madvi    
491                                                   
492         return sysfs_emit(buf, "%s\n", output)    
493 }                                                 
494                                                   
495 static ssize_t anon_enabled_store(struct kobje    
496                                   struct kobj_    
497                                   const char *    
498 {                                                 
499         int order = to_thpsize(kobj)->order;      
500         ssize_t ret = count;                      
501                                                   
502         if (sysfs_streq(buf, "always")) {         
503                 spin_lock(&huge_anon_orders_lo    
504                 clear_bit(order, &huge_anon_or    
505                 clear_bit(order, &huge_anon_or    
506                 set_bit(order, &huge_anon_orde    
507                 spin_unlock(&huge_anon_orders_    
508         } else if (sysfs_streq(buf, "inherit")    
509                 spin_lock(&huge_anon_orders_lo    
510                 clear_bit(order, &huge_anon_or    
511                 clear_bit(order, &huge_anon_or    
512                 set_bit(order, &huge_anon_orde    
513                 spin_unlock(&huge_anon_orders_    
514         } else if (sysfs_streq(buf, "madvise")    
515                 spin_lock(&huge_anon_orders_lo    
516                 clear_bit(order, &huge_anon_or    
517                 clear_bit(order, &huge_anon_or    
518                 set_bit(order, &huge_anon_orde    
519                 spin_unlock(&huge_anon_orders_    
520         } else if (sysfs_streq(buf, "never"))     
521                 spin_lock(&huge_anon_orders_lo    
522                 clear_bit(order, &huge_anon_or    
523                 clear_bit(order, &huge_anon_or    
524                 clear_bit(order, &huge_anon_or    
525                 spin_unlock(&huge_anon_orders_    
526         } else                                    
527                 ret = -EINVAL;                    
528                                                   
529         if (ret > 0) {                            
530                 int err;                          
531                                                   
532                 err = start_stop_khugepaged();    
533                 if (err)                          
534                         ret = err;                
535         }                                         
536         return ret;                               
537 }                                                 
538                                                   
539 static struct kobj_attribute anon_enabled_attr    
540         __ATTR(enabled, 0644, anon_enabled_sho    
541                                                   
542 static struct attribute *anon_ctrl_attrs[] = {    
543         &anon_enabled_attr.attr,                  
544         NULL,                                     
545 };                                                
546                                                   
547 static const struct attribute_group anon_ctrl_    
548         .attrs = anon_ctrl_attrs,                 
549 };                                                
550                                                   
551 static struct attribute *file_ctrl_attrs[] = {    
552 #ifdef CONFIG_SHMEM                               
553         &thpsize_shmem_enabled_attr.attr,         
554 #endif                                            
555         NULL,                                     
556 };                                                
557                                                   
558 static const struct attribute_group file_ctrl_    
559         .attrs = file_ctrl_attrs,                 
560 };                                                
561                                                   
562 static struct attribute *any_ctrl_attrs[] = {     
563         NULL,                                     
564 };                                                
565                                                   
566 static const struct attribute_group any_ctrl_a    
567         .attrs = any_ctrl_attrs,                  
568 };                                                
569                                                   
570 static const struct kobj_type thpsize_ktype =     
571         .release = &thpsize_release,              
572         .sysfs_ops = &kobj_sysfs_ops,             
573 };                                                
574                                                   
575 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) =    
576                                                   
577 static unsigned long sum_mthp_stat(int order,     
578 {                                                 
579         unsigned long sum = 0;                    
580         int cpu;                                  
581                                                   
582         for_each_possible_cpu(cpu) {              
583                 struct mthp_stat *this = &per_    
584                                                   
585                 sum += this->stats[order][item    
586         }                                         
587                                                   
588         return sum;                               
589 }                                                 
590                                                   
591 #define DEFINE_MTHP_STAT_ATTR(_name, _index)      
592 static ssize_t _name##_show(struct kobject *ko    
593                         struct kobj_attribute     
594 {                                                 
595         int order = to_thpsize(kobj)->order;      
596                                                   
597         return sysfs_emit(buf, "%lu\n", sum_mt    
598 }                                                 
599 static struct kobj_attribute _name##_attr = __    
600                                                   
601 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_S    
602 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTH    
603 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_char    
604 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT    
605 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_ST    
606 #ifdef CONFIG_SHMEM                               
607 DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_S    
608 DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STA    
609 DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, M    
610 #endif                                            
611 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);    
612 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_    
613 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STA    
614 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_AN    
615 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped    
616                                                   
617 static struct attribute *anon_stats_attrs[] =     
618         &anon_fault_alloc_attr.attr,              
619         &anon_fault_fallback_attr.attr,           
620         &anon_fault_fallback_charge_attr.attr,    
621 #ifndef CONFIG_SHMEM                              
622         &swpout_attr.attr,                        
623         &swpout_fallback_attr.attr,               
624 #endif                                            
625         &split_deferred_attr.attr,                
626         &nr_anon_attr.attr,                       
627         &nr_anon_partially_mapped_attr.attr,      
628         NULL,                                     
629 };                                                
630                                                   
631 static struct attribute_group anon_stats_attr_    
632         .name = "stats",                          
633         .attrs = anon_stats_attrs,                
634 };                                                
635                                                   
636 static struct attribute *file_stats_attrs[] =     
637 #ifdef CONFIG_SHMEM                               
638         &shmem_alloc_attr.attr,                   
639         &shmem_fallback_attr.attr,                
640         &shmem_fallback_charge_attr.attr,         
641 #endif                                            
642         NULL,                                     
643 };                                                
644                                                   
645 static struct attribute_group file_stats_attr_    
646         .name = "stats",                          
647         .attrs = file_stats_attrs,                
648 };                                                
649                                                   
650 static struct attribute *any_stats_attrs[] = {    
651 #ifdef CONFIG_SHMEM                               
652         &swpout_attr.attr,                        
653         &swpout_fallback_attr.attr,               
654 #endif                                            
655         &split_attr.attr,                         
656         &split_failed_attr.attr,                  
657         NULL,                                     
658 };                                                
659                                                   
660 static struct attribute_group any_stats_attr_g    
661         .name = "stats",                          
662         .attrs = any_stats_attrs,                 
663 };                                                
664                                                   
665 static int sysfs_add_group(struct kobject *kob    
666                            const struct attrib    
667 {                                                 
668         int ret = -ENOENT;                        
669                                                   
670         /*                                        
671          * If the group is named, try to merge    
672          * was already created. This avoids th    
673          * sysfs_create_group() if the directo    
674          */                                       
675         if (grp->name)                            
676                 ret = sysfs_merge_group(kobj,     
677         if (ret)                                  
678                 ret = sysfs_create_group(kobj,    
679                                                   
680         return ret;                               
681 }                                                 
682                                                   
683 static struct thpsize *thpsize_create(int orde    
684 {                                                 
685         unsigned long size = (PAGE_SIZE << ord    
686         struct thpsize *thpsize;                  
687         int ret = -ENOMEM;                        
688                                                   
689         thpsize = kzalloc(sizeof(*thpsize), GF    
690         if (!thpsize)                             
691                 goto err;                         
692                                                   
693         thpsize->order = order;                   
694                                                   
695         ret = kobject_init_and_add(&thpsize->k    
696                                    "hugepages-    
697         if (ret) {                                
698                 kfree(thpsize);                   
699                 goto err;                         
700         }                                         
701                                                   
702                                                   
703         ret = sysfs_add_group(&thpsize->kobj,     
704         if (ret)                                  
705                 goto err_put;                     
706                                                   
707         ret = sysfs_add_group(&thpsize->kobj,     
708         if (ret)                                  
709                 goto err_put;                     
710                                                   
711         if (BIT(order) & THP_ORDERS_ALL_ANON)     
712                 ret = sysfs_add_group(&thpsize    
713                 if (ret)                          
714                         goto err_put;             
715                                                   
716                 ret = sysfs_add_group(&thpsize    
717                 if (ret)                          
718                         goto err_put;             
719         }                                         
720                                                   
721         if (BIT(order) & THP_ORDERS_ALL_FILE_D    
722                 ret = sysfs_add_group(&thpsize    
723                 if (ret)                          
724                         goto err_put;             
725                                                   
726                 ret = sysfs_add_group(&thpsize    
727                 if (ret)                          
728                         goto err_put;             
729         }                                         
730                                                   
731         return thpsize;                           
732 err_put:                                          
733         kobject_put(&thpsize->kobj);              
734 err:                                              
735         return ERR_PTR(ret);                      
736 }                                                 
737                                                   
738 static void thpsize_release(struct kobject *ko    
739 {                                                 
740         kfree(to_thpsize(kobj));                  
741 }                                                 
742                                                   
743 static int __init hugepage_init_sysfs(struct k    
744 {                                                 
745         int err;                                  
746         struct thpsize *thpsize;                  
747         unsigned long orders;                     
748         int order;                                
749                                                   
750         /*                                        
751          * Default to setting PMD-sized THP to    
752          * disable all other sizes. powerpc's     
753          * constant so we have to do this here    
754          */                                       
755         if (!anon_orders_configured)              
756                 huge_anon_orders_inherit = BIT    
757                                                   
758         *hugepage_kobj = kobject_create_and_ad    
759         if (unlikely(!*hugepage_kobj)) {          
760                 pr_err("failed to create trans    
761                 return -ENOMEM;                   
762         }                                         
763                                                   
764         err = sysfs_create_group(*hugepage_kob    
765         if (err) {                                
766                 pr_err("failed to register tra    
767                 goto delete_obj;                  
768         }                                         
769                                                   
770         err = sysfs_create_group(*hugepage_kob    
771         if (err) {                                
772                 pr_err("failed to register tra    
773                 goto remove_hp_group;             
774         }                                         
775                                                   
776         orders = THP_ORDERS_ALL_ANON | THP_ORD    
777         order = highest_order(orders);            
778         while (orders) {                          
779                 thpsize = thpsize_create(order    
780                 if (IS_ERR(thpsize)) {            
781                         pr_err("failed to crea    
782                         err = PTR_ERR(thpsize)    
783                         goto remove_all;          
784                 }                                 
785                 list_add(&thpsize->node, &thps    
786                 order = next_order(&orders, or    
787         }                                         
788                                                   
789         return 0;                                 
790                                                   
791 remove_all:                                       
792         hugepage_exit_sysfs(*hugepage_kobj);      
793         return err;                               
794 remove_hp_group:                                  
795         sysfs_remove_group(*hugepage_kobj, &hu    
796 delete_obj:                                       
797         kobject_put(*hugepage_kobj);              
798         return err;                               
799 }                                                 
800                                                   
801 static void __init hugepage_exit_sysfs(struct     
802 {                                                 
803         struct thpsize *thpsize, *tmp;            
804                                                   
805         list_for_each_entry_safe(thpsize, tmp,    
806                 list_del(&thpsize->node);         
807                 kobject_put(&thpsize->kobj);      
808         }                                         
809                                                   
810         sysfs_remove_group(hugepage_kobj, &khu    
811         sysfs_remove_group(hugepage_kobj, &hug    
812         kobject_put(hugepage_kobj);               
813 }                                                 
814 #else                                             
815 static inline int hugepage_init_sysfs(struct k    
816 {                                                 
817         return 0;                                 
818 }                                                 
819                                                   
820 static inline void hugepage_exit_sysfs(struct     
821 {                                                 
822 }                                                 
823 #endif /* CONFIG_SYSFS */                         
824                                                   
825 static int __init thp_shrinker_init(void)         
826 {                                                 
827         huge_zero_page_shrinker = shrinker_all    
828         if (!huge_zero_page_shrinker)             
829                 return -ENOMEM;                   
830                                                   
831         deferred_split_shrinker = shrinker_all    
832                                                   
833                                                   
834                                                   
835         if (!deferred_split_shrinker) {           
836                 shrinker_free(huge_zero_page_s    
837                 return -ENOMEM;                   
838         }                                         
839                                                   
840         huge_zero_page_shrinker->count_objects    
841         huge_zero_page_shrinker->scan_objects     
842         shrinker_register(huge_zero_page_shrin    
843                                                   
844         deferred_split_shrinker->count_objects    
845         deferred_split_shrinker->scan_objects     
846         shrinker_register(deferred_split_shrin    
847                                                   
848         return 0;                                 
849 }                                                 
850                                                   
851 static void __init thp_shrinker_exit(void)        
852 {                                                 
853         shrinker_free(huge_zero_page_shrinker)    
854         shrinker_free(deferred_split_shrinker)    
855 }                                                 
856                                                   
857 static int __init hugepage_init(void)             
858 {                                                 
859         int err;                                  
860         struct kobject *hugepage_kobj;            
861                                                   
862         if (!has_transparent_hugepage()) {        
863                 transparent_hugepage_flags = 1    
864                 return -EINVAL;                   
865         }                                         
866                                                   
867         /*                                        
868          * hugepages can't be allocated by the    
869          */                                       
870         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > M    
871                                                   
872         err = hugepage_init_sysfs(&hugepage_ko    
873         if (err)                                  
874                 goto err_sysfs;                   
875                                                   
876         err = khugepaged_init();                  
877         if (err)                                  
878                 goto err_slab;                    
879                                                   
880         err = thp_shrinker_init();                
881         if (err)                                  
882                 goto err_shrinker;                
883                                                   
884         /*                                        
885          * By default disable transparent huge    
886          * where the extra memory used could h    
887          * is likely to save.  The admin can s    
888          */                                       
889         if (totalram_pages() < (512 << (20 - P    
890                 transparent_hugepage_flags = 0    
891                 return 0;                         
892         }                                         
893                                                   
894         err = start_stop_khugepaged();            
895         if (err)                                  
896                 goto err_khugepaged;              
897                                                   
898         return 0;                                 
899 err_khugepaged:                                   
900         thp_shrinker_exit();                      
901 err_shrinker:                                     
902         khugepaged_destroy();                     
903 err_slab:                                         
904         hugepage_exit_sysfs(hugepage_kobj);       
905 err_sysfs:                                        
906         return err;                               
907 }                                                 
908 subsys_initcall(hugepage_init);                   
909                                                   
910 static int __init setup_transparent_hugepage(c    
911 {                                                 
912         int ret = 0;                              
913         if (!str)                                 
914                 goto out;                         
915         if (!strcmp(str, "always")) {             
916                 set_bit(TRANSPARENT_HUGEPAGE_F    
917                         &transparent_hugepage_    
918                 clear_bit(TRANSPARENT_HUGEPAGE    
919                           &transparent_hugepag    
920                 ret = 1;                          
921         } else if (!strcmp(str, "madvise")) {     
922                 clear_bit(TRANSPARENT_HUGEPAGE    
923                           &transparent_hugepag    
924                 set_bit(TRANSPARENT_HUGEPAGE_R    
925                         &transparent_hugepage_    
926                 ret = 1;                          
927         } else if (!strcmp(str, "never")) {       
928                 clear_bit(TRANSPARENT_HUGEPAGE    
929                           &transparent_hugepag    
930                 clear_bit(TRANSPARENT_HUGEPAGE    
931                           &transparent_hugepag    
932                 ret = 1;                          
933         }                                         
934 out:                                              
935         if (!ret)                                 
936                 pr_warn("transparent_hugepage=    
937         return ret;                               
938 }                                                 
939 __setup("transparent_hugepage=", setup_transpa    
940                                                   
941 static inline int get_order_from_str(const cha    
942 {                                                 
943         unsigned long size;                       
944         char *endptr;                             
945         int order;                                
946                                                   
947         size = memparse(size_str, &endptr);       
948                                                   
949         if (!is_power_of_2(size))                 
950                 goto err;                         
951         order = get_order(size);                  
952         if (BIT(order) & ~THP_ORDERS_ALL_ANON)    
953                 goto err;                         
954                                                   
955         return order;                             
956 err:                                              
957         pr_err("invalid size %s in thp_anon bo    
958         return -EINVAL;                           
959 }                                                 
960                                                   
961 static char str_dup[PAGE_SIZE] __initdata;        
962 static int __init setup_thp_anon(char *str)       
963 {                                                 
964         char *token, *range, *policy, *subtoke    
965         unsigned long always, inherit, madvise    
966         char *start_size, *end_size;              
967         int start, end, nr;                       
968         char *p;                                  
969                                                   
970         if (!str || strlen(str) + 1 > PAGE_SIZ    
971                 goto err;                         
972         strcpy(str_dup, str);                     
973                                                   
974         always = huge_anon_orders_always;         
975         madvise = huge_anon_orders_madvise;       
976         inherit = huge_anon_orders_inherit;       
977         p = str_dup;                              
978         while ((token = strsep(&p, ";")) != NU    
979                 range = strsep(&token, ":");      
980                 policy = token;                   
981                                                   
982                 if (!policy)                      
983                         goto err;                 
984                                                   
985                 while ((subtoken = strsep(&ran    
986                         if (strchr(subtoken, '    
987                                 start_size = s    
988                                 end_size = sub    
989                                                   
990                                 start = get_or    
991                                 end = get_orde    
992                         } else {                  
993                                 start = end =     
994                         }                         
995                                                   
996                         if (start < 0 || end <    
997                                 goto err;         
998                                                   
999                         nr = end - start + 1;     
1000                         if (!strcmp(policy, "    
1001                                 bitmap_set(&a    
1002                                 bitmap_clear(    
1003                                 bitmap_clear(    
1004                         } else if (!strcmp(po    
1005                                 bitmap_set(&m    
1006                                 bitmap_clear(    
1007                                 bitmap_clear(    
1008                         } else if (!strcmp(po    
1009                                 bitmap_set(&i    
1010                                 bitmap_clear(    
1011                                 bitmap_clear(    
1012                         } else if (!strcmp(po    
1013                                 bitmap_clear(    
1014                                 bitmap_clear(    
1015                                 bitmap_clear(    
1016                         } else {                 
1017                                 pr_err("inval    
1018                                 goto err;        
1019                         }                        
1020                 }                                
1021         }                                        
1022                                                  
1023         huge_anon_orders_always = always;        
1024         huge_anon_orders_madvise = madvise;      
1025         huge_anon_orders_inherit = inherit;      
1026         anon_orders_configured = true;           
1027         return 1;                                
1028                                                  
1029 err:                                             
1030         pr_warn("thp_anon=%s: error parsing s    
1031         return 0;                                
1032 }                                                
1033 __setup("thp_anon=", setup_thp_anon);            
1034                                                  
1035 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_    
1036 {                                                
1037         if (likely(vma->vm_flags & VM_WRITE))    
1038                 pmd = pmd_mkwrite(pmd, vma);     
1039         return pmd;                              
1040 }                                                
1041                                                  
1042 #ifdef CONFIG_MEMCG                              
1043 static inline                                    
1044 struct deferred_split *get_deferred_split_que    
1045 {                                                
1046         struct mem_cgroup *memcg = folio_memc    
1047         struct pglist_data *pgdat = NODE_DATA    
1048                                                  
1049         if (memcg)                               
1050                 return &memcg->deferred_split    
1051         else                                     
1052                 return &pgdat->deferred_split    
1053 }                                                
1054 #else                                            
1055 static inline                                    
1056 struct deferred_split *get_deferred_split_que    
1057 {                                                
1058         struct pglist_data *pgdat = NODE_DATA    
1059                                                  
1060         return &pgdat->deferred_split_queue;     
1061 }                                                
1062 #endif                                           
1063                                                  
1064 static inline bool is_transparent_hugepage(co    
1065 {                                                
1066         if (!folio_test_large(folio))            
1067                 return false;                    
1068                                                  
1069         return is_huge_zero_folio(folio) ||      
1070                 folio_test_large_rmappable(fo    
1071 }                                                
1072                                                  
1073 static unsigned long __thp_get_unmapped_area(    
1074                 unsigned long addr, unsigned     
1075                 loff_t off, unsigned long fla    
1076                 vm_flags_t vm_flags)             
1077 {                                                
1078         loff_t off_end = off + len;              
1079         loff_t off_align = round_up(off, size    
1080         unsigned long len_pad, ret, off_sub;     
1081                                                  
1082         if (!IS_ENABLED(CONFIG_64BIT) || in_c    
1083                 return 0;                        
1084                                                  
1085         if (off_end <= off_align || (off_end     
1086                 return 0;                        
1087                                                  
1088         len_pad = len + size;                    
1089         if (len_pad < len || (off + len_pad)     
1090                 return 0;                        
1091                                                  
1092         ret = mm_get_unmapped_area_vmflags(cu    
1093                                            of    
1094                                                  
1095         /*                                       
1096          * The failure might be due to length    
1097          * without the padding.                  
1098          */                                      
1099         if (IS_ERR_VALUE(ret))                   
1100                 return 0;                        
1101                                                  
1102         /*                                       
1103          * Do not try to align to THP boundar    
1104          * hint succeeds.                        
1105          */                                      
1106         if (ret == addr)                         
1107                 return addr;                     
1108                                                  
1109         off_sub = (off - ret) & (size - 1);      
1110                                                  
1111         if (test_bit(MMF_TOPDOWN, &current->m    
1112                 return ret + size;               
1113                                                  
1114         ret += off_sub;                          
1115         return ret;                              
1116 }                                                
1117                                                  
1118 unsigned long thp_get_unmapped_area_vmflags(s    
1119                 unsigned long len, unsigned l    
1120                 vm_flags_t vm_flags)             
1121 {                                                
1122         unsigned long ret;                       
1123         loff_t off = (loff_t)pgoff << PAGE_SH    
1124                                                  
1125         ret = __thp_get_unmapped_area(filp, a    
1126         if (ret)                                 
1127                 return ret;                      
1128                                                  
1129         return mm_get_unmapped_area_vmflags(c    
1130                                             v    
1131 }                                                
1132                                                  
1133 unsigned long thp_get_unmapped_area(struct fi    
1134                 unsigned long len, unsigned l    
1135 {                                                
1136         return thp_get_unmapped_area_vmflags(    
1137 }                                                
1138 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);        
1139                                                  
1140 static vm_fault_t __do_huge_pmd_anonymous_pag    
1141                         struct page *page, gf    
1142 {                                                
1143         struct vm_area_struct *vma = vmf->vma    
1144         struct folio *folio = page_folio(page    
1145         pgtable_t pgtable;                       
1146         unsigned long haddr = vmf->address &     
1147         vm_fault_t ret = 0;                      
1148                                                  
1149         VM_BUG_ON_FOLIO(!folio_test_large(fol    
1150                                                  
1151         if (mem_cgroup_charge(folio, vma->vm_    
1152                 folio_put(folio);                
1153                 count_vm_event(THP_FAULT_FALL    
1154                 count_vm_event(THP_FAULT_FALL    
1155                 count_mthp_stat(HPAGE_PMD_ORD    
1156                 count_mthp_stat(HPAGE_PMD_ORD    
1157                 return VM_FAULT_FALLBACK;        
1158         }                                        
1159         folio_throttle_swaprate(folio, gfp);     
1160                                                  
1161         pgtable = pte_alloc_one(vma->vm_mm);     
1162         if (unlikely(!pgtable)) {                
1163                 ret = VM_FAULT_OOM;              
1164                 goto release;                    
1165         }                                        
1166                                                  
1167         folio_zero_user(folio, vmf->address);    
1168         /*                                       
1169          * The memory barrier inside __folio_    
1170          * folio_zero_user writes become visi    
1171          * write.                                
1172          */                                      
1173         __folio_mark_uptodate(folio);            
1174                                                  
1175         vmf->ptl = pmd_lock(vma->vm_mm, vmf->    
1176         if (unlikely(!pmd_none(*vmf->pmd))) {    
1177                 goto unlock_release;             
1178         } else {                                 
1179                 pmd_t entry;                     
1180                                                  
1181                 ret = check_stable_address_sp    
1182                 if (ret)                         
1183                         goto unlock_release;     
1184                                                  
1185                 /* Deliver the page fault to     
1186                 if (userfaultfd_missing(vma))    
1187                         spin_unlock(vmf->ptl)    
1188                         folio_put(folio);        
1189                         pte_free(vma->vm_mm,     
1190                         ret = handle_userfaul    
1191                         VM_BUG_ON(ret & VM_FA    
1192                         return ret;              
1193                 }                                
1194                                                  
1195                 entry = mk_huge_pmd(page, vma    
1196                 entry = maybe_pmd_mkwrite(pmd    
1197                 folio_add_new_anon_rmap(folio    
1198                 folio_add_lru_vma(folio, vma)    
1199                 pgtable_trans_huge_deposit(vm    
1200                 set_pmd_at(vma->vm_mm, haddr,    
1201                 update_mmu_cache_pmd(vma, vmf    
1202                 add_mm_counter(vma->vm_mm, MM    
1203                 mm_inc_nr_ptes(vma->vm_mm);      
1204                 deferred_split_folio(folio, f    
1205                 spin_unlock(vmf->ptl);           
1206                 count_vm_event(THP_FAULT_ALLO    
1207                 count_mthp_stat(HPAGE_PMD_ORD    
1208                 count_memcg_event_mm(vma->vm_    
1209         }                                        
1210                                                  
1211         return 0;                                
1212 unlock_release:                                  
1213         spin_unlock(vmf->ptl);                   
1214 release:                                         
1215         if (pgtable)                             
1216                 pte_free(vma->vm_mm, pgtable)    
1217         folio_put(folio);                        
1218         return ret;                              
1219                                                  
1220 }                                                
1221                                                  
1222 /*                                               
1223  * always: directly stall for all thp allocat    
1224  * defer: wake kswapd and fail if not immedia    
1225  * defer+madvise: wake kswapd and directly st    
1226  *                fail if not immediately ava    
1227  * madvise: directly stall for MADV_HUGEPAGE,    
1228  *          available                            
1229  * never: never stall for any thp allocation     
1230  */                                              
1231 gfp_t vma_thp_gfp_mask(struct vm_area_struct     
1232 {                                                
1233         const bool vma_madvised = vma && (vma    
1234                                                  
1235         /* Always do synchronous compaction *    
1236         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    
1237                 return GFP_TRANSHUGE | (vma_m    
1238                                                  
1239         /* Kick kcompactd and fail quickly */    
1240         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    
1241                 return GFP_TRANSHUGE_LIGHT |     
1242                                                  
1243         /* Synchronous compaction if madvised    
1244         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    
1245                 return GFP_TRANSHUGE_LIGHT |     
1246                         (vma_madvised ? __GFP    
1247                                         __GFP    
1248                                                  
1249         /* Only do synchronous compaction if     
1250         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    
1251                 return GFP_TRANSHUGE_LIGHT |     
1252                        (vma_madvised ? __GFP_    
1253                                                  
1254         return GFP_TRANSHUGE_LIGHT;              
1255 }                                                
1256                                                  
1257 /* Caller must hold page table lock. */          
1258 static void set_huge_zero_folio(pgtable_t pgt    
1259                 struct vm_area_struct *vma, u    
1260                 struct folio *zero_folio)        
1261 {                                                
1262         pmd_t entry;                             
1263         if (!pmd_none(*pmd))                     
1264                 return;                          
1265         entry = mk_pmd(&zero_folio->page, vma    
1266         entry = pmd_mkhuge(entry);               
1267         pgtable_trans_huge_deposit(mm, pmd, p    
1268         set_pmd_at(mm, haddr, pmd, entry);       
1269         mm_inc_nr_ptes(mm);                      
1270 }                                                
1271                                                  
1272 vm_fault_t do_huge_pmd_anonymous_page(struct     
1273 {                                                
1274         struct vm_area_struct *vma = vmf->vma    
1275         gfp_t gfp;                               
1276         struct folio *folio;                     
1277         unsigned long haddr = vmf->address &     
1278         vm_fault_t ret;                          
1279                                                  
1280         if (!thp_vma_suitable_order(vma, hadd    
1281                 return VM_FAULT_FALLBACK;        
1282         ret = vmf_anon_prepare(vmf);             
1283         if (ret)                                 
1284                 return ret;                      
1285         khugepaged_enter_vma(vma, vma->vm_fla    
1286                                                  
1287         if (!(vmf->flags & FAULT_FLAG_WRITE)     
1288                         !mm_forbids_zeropage(    
1289                         transparent_hugepage_    
1290                 pgtable_t pgtable;               
1291                 struct folio *zero_folio;        
1292                 vm_fault_t ret;                  
1293                                                  
1294                 pgtable = pte_alloc_one(vma->    
1295                 if (unlikely(!pgtable))          
1296                         return VM_FAULT_OOM;     
1297                 zero_folio = mm_get_huge_zero    
1298                 if (unlikely(!zero_folio)) {     
1299                         pte_free(vma->vm_mm,     
1300                         count_vm_event(THP_FA    
1301                         return VM_FAULT_FALLB    
1302                 }                                
1303                 vmf->ptl = pmd_lock(vma->vm_m    
1304                 ret = 0;                         
1305                 if (pmd_none(*vmf->pmd)) {       
1306                         ret = check_stable_ad    
1307                         if (ret) {               
1308                                 spin_unlock(v    
1309                                 pte_free(vma-    
1310                         } else if (userfaultf    
1311                                 spin_unlock(v    
1312                                 pte_free(vma-    
1313                                 ret = handle_    
1314                                 VM_BUG_ON(ret    
1315                         } else {                 
1316                                 set_huge_zero    
1317                                                  
1318                                 update_mmu_ca    
1319                                 spin_unlock(v    
1320                         }                        
1321                 } else {                         
1322                         spin_unlock(vmf->ptl)    
1323                         pte_free(vma->vm_mm,     
1324                 }                                
1325                 return ret;                      
1326         }                                        
1327         gfp = vma_thp_gfp_mask(vma);             
1328         folio = vma_alloc_folio(gfp, HPAGE_PM    
1329         if (unlikely(!folio)) {                  
1330                 count_vm_event(THP_FAULT_FALL    
1331                 count_mthp_stat(HPAGE_PMD_ORD    
1332                 return VM_FAULT_FALLBACK;        
1333         }                                        
1334         return __do_huge_pmd_anonymous_page(v    
1335 }                                                
1336                                                  
1337 static void insert_pfn_pmd(struct vm_area_str    
1338                 pmd_t *pmd, pfn_t pfn, pgprot    
1339                 pgtable_t pgtable)               
1340 {                                                
1341         struct mm_struct *mm = vma->vm_mm;       
1342         pmd_t entry;                             
1343         spinlock_t *ptl;                         
1344                                                  
1345         ptl = pmd_lock(mm, pmd);                 
1346         if (!pmd_none(*pmd)) {                   
1347                 if (write) {                     
1348                         if (pmd_pfn(*pmd) !=     
1349                                 WARN_ON_ONCE(    
1350                                 goto out_unlo    
1351                         }                        
1352                         entry = pmd_mkyoung(*    
1353                         entry = maybe_pmd_mkw    
1354                         if (pmdp_set_access_f    
1355                                 update_mmu_ca    
1356                 }                                
1357                                                  
1358                 goto out_unlock;                 
1359         }                                        
1360                                                  
1361         entry = pmd_mkhuge(pfn_t_pmd(pfn, pro    
1362         if (pfn_t_devmap(pfn))                   
1363                 entry = pmd_mkdevmap(entry);     
1364         else                                     
1365                 entry = pmd_mkspecial(entry);    
1366         if (write) {                             
1367                 entry = pmd_mkyoung(pmd_mkdir    
1368                 entry = maybe_pmd_mkwrite(ent    
1369         }                                        
1370                                                  
1371         if (pgtable) {                           
1372                 pgtable_trans_huge_deposit(mm    
1373                 mm_inc_nr_ptes(mm);              
1374                 pgtable = NULL;                  
1375         }                                        
1376                                                  
1377         set_pmd_at(mm, addr, pmd, entry);        
1378         update_mmu_cache_pmd(vma, addr, pmd);    
1379                                                  
1380 out_unlock:                                      
1381         spin_unlock(ptl);                        
1382         if (pgtable)                             
1383                 pte_free(mm, pgtable);           
1384 }                                                
1385                                                  
1386 /**                                              
1387  * vmf_insert_pfn_pmd - insert a pmd size pfn    
1388  * @vmf: Structure describing the fault          
1389  * @pfn: pfn to insert                           
1390  * @write: whether it's a write fault            
1391  *                                               
1392  * Insert a pmd size pfn. See vmf_insert_pfn(    
1393  *                                               
1394  * Return: vm_fault_t value.                     
1395  */                                              
1396 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault    
1397 {                                                
1398         unsigned long addr = vmf->address & P    
1399         struct vm_area_struct *vma = vmf->vma    
1400         pgprot_t pgprot = vma->vm_page_prot;     
1401         pgtable_t pgtable = NULL;                
1402                                                  
1403         /*                                       
1404          * If we had pmd_special, we could av    
1405          * but we need to be consistent with     
1406          * can't support a 'special' bit.        
1407          */                                      
1408         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V    
1409                         !pfn_t_devmap(pfn));     
1410         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM    
1411                                                  
1412         BUG_ON((vma->vm_flags & VM_PFNMAP) &&    
1413                                                  
1414         if (addr < vma->vm_start || addr >= v    
1415                 return VM_FAULT_SIGBUS;          
1416                                                  
1417         if (arch_needs_pgtable_deposit()) {      
1418                 pgtable = pte_alloc_one(vma->    
1419                 if (!pgtable)                    
1420                         return VM_FAULT_OOM;     
1421         }                                        
1422                                                  
1423         track_pfn_insert(vma, &pgprot, pfn);     
1424                                                  
1425         insert_pfn_pmd(vma, addr, vmf->pmd, p    
1426         return VM_FAULT_NOPAGE;                  
1427 }                                                
1428 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);           
1429                                                  
1430 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    
1431 static pud_t maybe_pud_mkwrite(pud_t pud, str    
1432 {                                                
1433         if (likely(vma->vm_flags & VM_WRITE))    
1434                 pud = pud_mkwrite(pud);          
1435         return pud;                              
1436 }                                                
1437                                                  
1438 static void insert_pfn_pud(struct vm_area_str    
1439                 pud_t *pud, pfn_t pfn, bool w    
1440 {                                                
1441         struct mm_struct *mm = vma->vm_mm;       
1442         pgprot_t prot = vma->vm_page_prot;       
1443         pud_t entry;                             
1444         spinlock_t *ptl;                         
1445                                                  
1446         ptl = pud_lock(mm, pud);                 
1447         if (!pud_none(*pud)) {                   
1448                 if (write) {                     
1449                         if (WARN_ON_ONCE(pud_    
1450                                 goto out_unlo    
1451                         entry = pud_mkyoung(*    
1452                         entry = maybe_pud_mkw    
1453                         if (pudp_set_access_f    
1454                                 update_mmu_ca    
1455                 }                                
1456                 goto out_unlock;                 
1457         }                                        
1458                                                  
1459         entry = pud_mkhuge(pfn_t_pud(pfn, pro    
1460         if (pfn_t_devmap(pfn))                   
1461                 entry = pud_mkdevmap(entry);     
1462         else                                     
1463                 entry = pud_mkspecial(entry);    
1464         if (write) {                             
1465                 entry = pud_mkyoung(pud_mkdir    
1466                 entry = maybe_pud_mkwrite(ent    
1467         }                                        
1468         set_pud_at(mm, addr, pud, entry);        
1469         update_mmu_cache_pud(vma, addr, pud);    
1470                                                  
1471 out_unlock:                                      
1472         spin_unlock(ptl);                        
1473 }                                                
1474                                                  
1475 /**                                              
1476  * vmf_insert_pfn_pud - insert a pud size pfn    
1477  * @vmf: Structure describing the fault          
1478  * @pfn: pfn to insert                           
1479  * @write: whether it's a write fault            
1480  *                                               
1481  * Insert a pud size pfn. See vmf_insert_pfn(    
1482  *                                               
1483  * Return: vm_fault_t value.                     
1484  */                                              
1485 vm_fault_t vmf_insert_pfn_pud(struct vm_fault    
1486 {                                                
1487         unsigned long addr = vmf->address & P    
1488         struct vm_area_struct *vma = vmf->vma    
1489         pgprot_t pgprot = vma->vm_page_prot;     
1490                                                  
1491         /*                                       
1492          * If we had pud_special, we could av    
1493          * but we need to be consistent with     
1494          * can't support a 'special' bit.        
1495          */                                      
1496         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V    
1497                         !pfn_t_devmap(pfn));     
1498         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM    
1499                                                  
1500         BUG_ON((vma->vm_flags & VM_PFNMAP) &&    
1501                                                  
1502         if (addr < vma->vm_start || addr >= v    
1503                 return VM_FAULT_SIGBUS;          
1504                                                  
1505         track_pfn_insert(vma, &pgprot, pfn);     
1506                                                  
1507         insert_pfn_pud(vma, addr, vmf->pud, p    
1508         return VM_FAULT_NOPAGE;                  
1509 }                                                
1510 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);           
1511 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    
1512                                                  
1513 void touch_pmd(struct vm_area_struct *vma, un    
1514                pmd_t *pmd, bool write)           
1515 {                                                
1516         pmd_t _pmd;                              
1517                                                  
1518         _pmd = pmd_mkyoung(*pmd);                
1519         if (write)                               
1520                 _pmd = pmd_mkdirty(_pmd);        
1521         if (pmdp_set_access_flags(vma, addr &    
1522                                   pmd, _pmd,     
1523                 update_mmu_cache_pmd(vma, add    
1524 }                                                
1525                                                  
1526 struct page *follow_devmap_pmd(struct vm_area    
1527                 pmd_t *pmd, int flags, struct    
1528 {                                                
1529         unsigned long pfn = pmd_pfn(*pmd);       
1530         struct mm_struct *mm = vma->vm_mm;       
1531         struct page *page;                       
1532         int ret;                                 
1533                                                  
1534         assert_spin_locked(pmd_lockptr(mm, pm    
1535                                                  
1536         if (flags & FOLL_WRITE && !pmd_write(    
1537                 return NULL;                     
1538                                                  
1539         if (pmd_present(*pmd) && pmd_devmap(*    
1540                 /* pass */;                      
1541         else                                     
1542                 return NULL;                     
1543                                                  
1544         if (flags & FOLL_TOUCH)                  
1545                 touch_pmd(vma, addr, pmd, fla    
1546                                                  
1547         /*                                       
1548          * device mapped pages can only be re    
1549          * caller will manage the page refere    
1550          */                                      
1551         if (!(flags & (FOLL_GET | FOLL_PIN)))    
1552                 return ERR_PTR(-EEXIST);         
1553                                                  
1554         pfn += (addr & ~PMD_MASK) >> PAGE_SHI    
1555         *pgmap = get_dev_pagemap(pfn, *pgmap)    
1556         if (!*pgmap)                             
1557                 return ERR_PTR(-EFAULT);         
1558         page = pfn_to_page(pfn);                 
1559         ret = try_grab_folio(page_folio(page)    
1560         if (ret)                                 
1561                 page = ERR_PTR(ret);             
1562                                                  
1563         return page;                             
1564 }                                                
1565                                                  
1566 int copy_huge_pmd(struct mm_struct *dst_mm, s    
1567                   pmd_t *dst_pmd, pmd_t *src_    
1568                   struct vm_area_struct *dst_    
1569 {                                                
1570         spinlock_t *dst_ptl, *src_ptl;           
1571         struct page *src_page;                   
1572         struct folio *src_folio;                 
1573         pmd_t pmd;                               
1574         pgtable_t pgtable = NULL;                
1575         int ret = -ENOMEM;                       
1576                                                  
1577         pmd = pmdp_get_lockless(src_pmd);        
1578         if (unlikely(pmd_present(pmd) && pmd_    
1579                 dst_ptl = pmd_lock(dst_mm, ds    
1580                 src_ptl = pmd_lockptr(src_mm,    
1581                 spin_lock_nested(src_ptl, SIN    
1582                 /*                               
1583                  * No need to recheck the pmd    
1584                  * mmap lock held here.          
1585                  *                               
1586                  * Meanwhile, making sure it'    
1587                  * mapping, otherwise it mean    
1588                  * applied special bit, or we    
1589                  * able to wrongly write to t    
1590                  */                              
1591                 VM_WARN_ON_ONCE(is_cow_mappin    
1592                 goto set_pmd;                    
1593         }                                        
1594                                                  
1595         /* Skip if can be re-fill on fault */    
1596         if (!vma_is_anonymous(dst_vma))          
1597                 return 0;                        
1598                                                  
1599         pgtable = pte_alloc_one(dst_mm);         
1600         if (unlikely(!pgtable))                  
1601                 goto out;                        
1602                                                  
1603         dst_ptl = pmd_lock(dst_mm, dst_pmd);     
1604         src_ptl = pmd_lockptr(src_mm, src_pmd    
1605         spin_lock_nested(src_ptl, SINGLE_DEPT    
1606                                                  
1607         ret = -EAGAIN;                           
1608         pmd = *src_pmd;                          
1609                                                  
1610 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          
1611         if (unlikely(is_swap_pmd(pmd))) {        
1612                 swp_entry_t entry = pmd_to_sw    
1613                                                  
1614                 VM_BUG_ON(!is_pmd_migration_e    
1615                 if (!is_readable_migration_en    
1616                         entry = make_readable    
1617                                                  
1618                         pmd = swp_entry_to_pm    
1619                         if (pmd_swp_soft_dirt    
1620                                 pmd = pmd_swp    
1621                         if (pmd_swp_uffd_wp(*    
1622                                 pmd = pmd_swp    
1623                         set_pmd_at(src_mm, ad    
1624                 }                                
1625                 add_mm_counter(dst_mm, MM_ANO    
1626                 mm_inc_nr_ptes(dst_mm);          
1627                 pgtable_trans_huge_deposit(ds    
1628                 if (!userfaultfd_wp(dst_vma))    
1629                         pmd = pmd_swp_clear_u    
1630                 set_pmd_at(dst_mm, addr, dst_    
1631                 ret = 0;                         
1632                 goto out_unlock;                 
1633         }                                        
1634 #endif                                           
1635                                                  
1636         if (unlikely(!pmd_trans_huge(pmd))) {    
1637                 pte_free(dst_mm, pgtable);       
1638                 goto out_unlock;                 
1639         }                                        
1640         /*                                       
1641          * When page table lock is held, the     
1642          * under splitting since we don't spl    
1643          * a page table.                         
1644          */                                      
1645         if (is_huge_zero_pmd(pmd)) {             
1646                 /*                               
1647                  * mm_get_huge_zero_folio() w    
1648                  * folio here, since we alrea    
1649                  * copy. It just takes a refe    
1650                  */                              
1651                 mm_get_huge_zero_folio(dst_mm    
1652                 goto out_zero_page;              
1653         }                                        
1654                                                  
1655         src_page = pmd_page(pmd);                
1656         VM_BUG_ON_PAGE(!PageHead(src_page), s    
1657         src_folio = page_folio(src_page);        
1658                                                  
1659         folio_get(src_folio);                    
1660         if (unlikely(folio_try_dup_anon_rmap_    
1661                 /* Page maybe pinned: split a    
1662                 folio_put(src_folio);            
1663                 pte_free(dst_mm, pgtable);       
1664                 spin_unlock(src_ptl);            
1665                 spin_unlock(dst_ptl);            
1666                 __split_huge_pmd(src_vma, src    
1667                 return -EAGAIN;                  
1668         }                                        
1669         add_mm_counter(dst_mm, MM_ANONPAGES,     
1670 out_zero_page:                                   
1671         mm_inc_nr_ptes(dst_mm);                  
1672         pgtable_trans_huge_deposit(dst_mm, ds    
1673         pmdp_set_wrprotect(src_mm, addr, src_    
1674         if (!userfaultfd_wp(dst_vma))            
1675                 pmd = pmd_clear_uffd_wp(pmd);    
1676         pmd = pmd_wrprotect(pmd);                
1677 set_pmd:                                         
1678         pmd = pmd_mkold(pmd);                    
1679         set_pmd_at(dst_mm, addr, dst_pmd, pmd    
1680                                                  
1681         ret = 0;                                 
1682 out_unlock:                                      
1683         spin_unlock(src_ptl);                    
1684         spin_unlock(dst_ptl);                    
1685 out:                                             
1686         return ret;                              
1687 }                                                
1688                                                  
1689 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    
1690 void touch_pud(struct vm_area_struct *vma, un    
1691                pud_t *pud, bool write)           
1692 {                                                
1693         pud_t _pud;                              
1694                                                  
1695         _pud = pud_mkyoung(*pud);                
1696         if (write)                               
1697                 _pud = pud_mkdirty(_pud);        
1698         if (pudp_set_access_flags(vma, addr &    
1699                                   pud, _pud,     
1700                 update_mmu_cache_pud(vma, add    
1701 }                                                
1702                                                  
1703 int copy_huge_pud(struct mm_struct *dst_mm, s    
1704                   pud_t *dst_pud, pud_t *src_    
1705                   struct vm_area_struct *vma)    
1706 {                                                
1707         spinlock_t *dst_ptl, *src_ptl;           
1708         pud_t pud;                               
1709         int ret;                                 
1710                                                  
1711         dst_ptl = pud_lock(dst_mm, dst_pud);     
1712         src_ptl = pud_lockptr(src_mm, src_pud    
1713         spin_lock_nested(src_ptl, SINGLE_DEPT    
1714                                                  
1715         ret = -EAGAIN;                           
1716         pud = *src_pud;                          
1717         if (unlikely(!pud_trans_huge(pud) &&     
1718                 goto out_unlock;                 
1719                                                  
1720         /*                                       
1721          * TODO: once we support anonymous pa    
1722          * folio_try_dup_anon_rmap_*() and sp    
1723          */                                      
1724         if (is_cow_mapping(vma->vm_flags) &&     
1725                 pudp_set_wrprotect(src_mm, ad    
1726                 pud = pud_wrprotect(pud);        
1727         }                                        
1728         pud = pud_mkold(pud);                    
1729         set_pud_at(dst_mm, addr, dst_pud, pud    
1730                                                  
1731         ret = 0;                                 
1732 out_unlock:                                      
1733         spin_unlock(src_ptl);                    
1734         spin_unlock(dst_ptl);                    
1735         return ret;                              
1736 }                                                
1737                                                  
1738 void huge_pud_set_accessed(struct vm_fault *v    
1739 {                                                
1740         bool write = vmf->flags & FAULT_FLAG_    
1741                                                  
1742         vmf->ptl = pud_lock(vmf->vma->vm_mm,     
1743         if (unlikely(!pud_same(*vmf->pud, ori    
1744                 goto unlock;                     
1745                                                  
1746         touch_pud(vmf->vma, vmf->address, vmf    
1747 unlock:                                          
1748         spin_unlock(vmf->ptl);                   
1749 }                                                
1750 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    
1751                                                  
1752 void huge_pmd_set_accessed(struct vm_fault *v    
1753 {                                                
1754         bool write = vmf->flags & FAULT_FLAG_    
1755                                                  
1756         vmf->ptl = pmd_lock(vmf->vma->vm_mm,     
1757         if (unlikely(!pmd_same(*vmf->pmd, vmf    
1758                 goto unlock;                     
1759                                                  
1760         touch_pmd(vmf->vma, vmf->address, vmf    
1761                                                  
1762 unlock:                                          
1763         spin_unlock(vmf->ptl);                   
1764 }                                                
1765                                                  
1766 vm_fault_t do_huge_pmd_wp_page(struct vm_faul    
1767 {                                                
1768         const bool unshare = vmf->flags & FAU    
1769         struct vm_area_struct *vma = vmf->vma    
1770         struct folio *folio;                     
1771         struct page *page;                       
1772         unsigned long haddr = vmf->address &     
1773         pmd_t orig_pmd = vmf->orig_pmd;          
1774                                                  
1775         vmf->ptl = pmd_lockptr(vma->vm_mm, vm    
1776         VM_BUG_ON_VMA(!vma->anon_vma, vma);      
1777                                                  
1778         if (is_huge_zero_pmd(orig_pmd))          
1779                 goto fallback;                   
1780                                                  
1781         spin_lock(vmf->ptl);                     
1782                                                  
1783         if (unlikely(!pmd_same(*vmf->pmd, ori    
1784                 spin_unlock(vmf->ptl);           
1785                 return 0;                        
1786         }                                        
1787                                                  
1788         page = pmd_page(orig_pmd);               
1789         folio = page_folio(page);                
1790         VM_BUG_ON_PAGE(!PageHead(page), page)    
1791                                                  
1792         /* Early check when only holding the     
1793         if (PageAnonExclusive(page))             
1794                 goto reuse;                      
1795                                                  
1796         if (!folio_trylock(folio)) {             
1797                 folio_get(folio);                
1798                 spin_unlock(vmf->ptl);           
1799                 folio_lock(folio);               
1800                 spin_lock(vmf->ptl);             
1801                 if (unlikely(!pmd_same(*vmf->    
1802                         spin_unlock(vmf->ptl)    
1803                         folio_unlock(folio);     
1804                         folio_put(folio);        
1805                         return 0;                
1806                 }                                
1807                 folio_put(folio);                
1808         }                                        
1809                                                  
1810         /* Recheck after temporarily dropping    
1811         if (PageAnonExclusive(page)) {           
1812                 folio_unlock(folio);             
1813                 goto reuse;                      
1814         }                                        
1815                                                  
1816         /*                                       
1817          * See do_wp_page(): we can only reus    
1818          * there are no additional references    
1819          * the LRU cache immediately after ad    
1820          */                                      
1821         if (folio_ref_count(folio) >             
1822                         1 + folio_test_swapca    
1823                 goto unlock_fallback;            
1824         if (folio_test_swapcache(folio))         
1825                 folio_free_swap(folio);          
1826         if (folio_ref_count(folio) == 1) {       
1827                 pmd_t entry;                     
1828                                                  
1829                 folio_move_anon_rmap(folio, v    
1830                 SetPageAnonExclusive(page);      
1831                 folio_unlock(folio);             
1832 reuse:                                           
1833                 if (unlikely(unshare)) {         
1834                         spin_unlock(vmf->ptl)    
1835                         return 0;                
1836                 }                                
1837                 entry = pmd_mkyoung(orig_pmd)    
1838                 entry = maybe_pmd_mkwrite(pmd    
1839                 if (pmdp_set_access_flags(vma    
1840                         update_mmu_cache_pmd(    
1841                 spin_unlock(vmf->ptl);           
1842                 return 0;                        
1843         }                                        
1844                                                  
1845 unlock_fallback:                                 
1846         folio_unlock(folio);                     
1847         spin_unlock(vmf->ptl);                   
1848 fallback:                                        
1849         __split_huge_pmd(vma, vmf->pmd, vmf->    
1850         return VM_FAULT_FALLBACK;                
1851 }                                                
1852                                                  
1853 static inline bool can_change_pmd_writable(st    
1854                                            un    
1855 {                                                
1856         struct page *page;                       
1857                                                  
1858         if (WARN_ON_ONCE(!(vma->vm_flags & VM    
1859                 return false;                    
1860                                                  
1861         /* Don't touch entries that are not e    
1862         if (pmd_protnone(pmd))                   
1863                 return false;                    
1864                                                  
1865         /* Do we need write faults for softdi    
1866         if (pmd_needs_soft_dirty_wp(vma, pmd)    
1867                 return false;                    
1868                                                  
1869         /* Do we need write faults for uffd-w    
1870         if (userfaultfd_huge_pmd_wp(vma, pmd)    
1871                 return false;                    
1872                                                  
1873         if (!(vma->vm_flags & VM_SHARED)) {      
1874                 /* See can_change_pte_writabl    
1875                 page = vm_normal_page_pmd(vma    
1876                 return page && PageAnon(page)    
1877         }                                        
1878                                                  
1879         /* See can_change_pte_writable(). */     
1880         return pmd_dirty(pmd);                   
1881 }                                                
1882                                                  
1883 /* NUMA hinting page fault entry point for tr    
1884 vm_fault_t do_huge_pmd_numa_page(struct vm_fa    
1885 {                                                
1886         struct vm_area_struct *vma = vmf->vma    
1887         struct folio *folio;                     
1888         unsigned long haddr = vmf->address &     
1889         int nid = NUMA_NO_NODE;                  
1890         int target_nid, last_cpupid;             
1891         pmd_t pmd, old_pmd;                      
1892         bool writable = false;                   
1893         int flags = 0;                           
1894                                                  
1895         vmf->ptl = pmd_lock(vma->vm_mm, vmf->    
1896         old_pmd = pmdp_get(vmf->pmd);            
1897                                                  
1898         if (unlikely(!pmd_same(old_pmd, vmf->    
1899                 spin_unlock(vmf->ptl);           
1900                 return 0;                        
1901         }                                        
1902                                                  
1903         pmd = pmd_modify(old_pmd, vma->vm_pag    
1904                                                  
1905         /*                                       
1906          * Detect now whether the PMD could b    
1907          * is only valid while holding the PT    
1908          */                                      
1909         writable = pmd_write(pmd);               
1910         if (!writable && vma_wants_manual_pte    
1911             can_change_pmd_writable(vma, vmf-    
1912                 writable = true;                 
1913                                                  
1914         folio = vm_normal_folio_pmd(vma, hadd    
1915         if (!folio)                              
1916                 goto out_map;                    
1917                                                  
1918         nid = folio_nid(folio);                  
1919                                                  
1920         target_nid = numa_migrate_check(folio    
1921                                         &last    
1922         if (target_nid == NUMA_NO_NODE)          
1923                 goto out_map;                    
1924         if (migrate_misplaced_folio_prepare(f    
1925                 flags |= TNF_MIGRATE_FAIL;       
1926                 goto out_map;                    
1927         }                                        
1928         /* The folio is isolated and isolatio    
1929         spin_unlock(vmf->ptl);                   
1930         writable = false;                        
1931                                                  
1932         if (!migrate_misplaced_folio(folio, v    
1933                 flags |= TNF_MIGRATED;           
1934                 nid = target_nid;                
1935                 task_numa_fault(last_cpupid,     
1936                 return 0;                        
1937         }                                        
1938                                                  
1939         flags |= TNF_MIGRATE_FAIL;               
1940         vmf->ptl = pmd_lock(vma->vm_mm, vmf->    
1941         if (unlikely(!pmd_same(pmdp_get(vmf->    
1942                 spin_unlock(vmf->ptl);           
1943                 return 0;                        
1944         }                                        
1945 out_map:                                         
1946         /* Restore the PMD */                    
1947         pmd = pmd_modify(pmdp_get(vmf->pmd),     
1948         pmd = pmd_mkyoung(pmd);                  
1949         if (writable)                            
1950                 pmd = pmd_mkwrite(pmd, vma);     
1951         set_pmd_at(vma->vm_mm, haddr, vmf->pm    
1952         update_mmu_cache_pmd(vma, vmf->addres    
1953         spin_unlock(vmf->ptl);                   
1954                                                  
1955         if (nid != NUMA_NO_NODE)                 
1956                 task_numa_fault(last_cpupid,     
1957         return 0;                                
1958 }                                                
1959                                                  
1960 /*                                               
1961  * Return true if we do MADV_FREE successfull    
1962  * Otherwise, return false.                      
1963  */                                              
1964 bool madvise_free_huge_pmd(struct mmu_gather     
1965                 pmd_t *pmd, unsigned long add    
1966 {                                                
1967         spinlock_t *ptl;                         
1968         pmd_t orig_pmd;                          
1969         struct folio *folio;                     
1970         struct mm_struct *mm = tlb->mm;          
1971         bool ret = false;                        
1972                                                  
1973         tlb_change_page_size(tlb, HPAGE_PMD_S    
1974                                                  
1975         ptl = pmd_trans_huge_lock(pmd, vma);     
1976         if (!ptl)                                
1977                 goto out_unlocked;               
1978                                                  
1979         orig_pmd = *pmd;                         
1980         if (is_huge_zero_pmd(orig_pmd))          
1981                 goto out;                        
1982                                                  
1983         if (unlikely(!pmd_present(orig_pmd)))    
1984                 VM_BUG_ON(thp_migration_suppo    
1985                                   !is_pmd_mig    
1986                 goto out;                        
1987         }                                        
1988                                                  
1989         folio = pmd_folio(orig_pmd);             
1990         /*                                       
1991          * If other processes are mapping thi    
1992          * the folio unless they all do MADV_    
1993          */                                      
1994         if (folio_likely_mapped_shared(folio)    
1995                 goto out;                        
1996                                                  
1997         if (!folio_trylock(folio))               
1998                 goto out;                        
1999                                                  
2000         /*                                       
2001          * If user want to discard part-pages    
2002          * will deactivate only them.            
2003          */                                      
2004         if (next - addr != HPAGE_PMD_SIZE) {     
2005                 folio_get(folio);                
2006                 spin_unlock(ptl);                
2007                 split_folio(folio);              
2008                 folio_unlock(folio);             
2009                 folio_put(folio);                
2010                 goto out_unlocked;               
2011         }                                        
2012                                                  
2013         if (folio_test_dirty(folio))             
2014                 folio_clear_dirty(folio);        
2015         folio_unlock(folio);                     
2016                                                  
2017         if (pmd_young(orig_pmd) || pmd_dirty(    
2018                 pmdp_invalidate(vma, addr, pm    
2019                 orig_pmd = pmd_mkold(orig_pmd    
2020                 orig_pmd = pmd_mkclean(orig_p    
2021                                                  
2022                 set_pmd_at(mm, addr, pmd, ori    
2023                 tlb_remove_pmd_tlb_entry(tlb,    
2024         }                                        
2025                                                  
2026         folio_mark_lazyfree(folio);              
2027         ret = true;                              
2028 out:                                             
2029         spin_unlock(ptl);                        
2030 out_unlocked:                                    
2031         return ret;                              
2032 }                                                
2033                                                  
2034 static inline void zap_deposited_table(struct    
2035 {                                                
2036         pgtable_t pgtable;                       
2037                                                  
2038         pgtable = pgtable_trans_huge_withdraw    
2039         pte_free(mm, pgtable);                   
2040         mm_dec_nr_ptes(mm);                      
2041 }                                                
2042                                                  
2043 int zap_huge_pmd(struct mmu_gather *tlb, stru    
2044                  pmd_t *pmd, unsigned long ad    
2045 {                                                
2046         pmd_t orig_pmd;                          
2047         spinlock_t *ptl;                         
2048                                                  
2049         tlb_change_page_size(tlb, HPAGE_PMD_S    
2050                                                  
2051         ptl = __pmd_trans_huge_lock(pmd, vma)    
2052         if (!ptl)                                
2053                 return 0;                        
2054         /*                                       
2055          * For architectures like ppc64 we lo    
2056          * when calling pmdp_huge_get_and_cle    
2057          * pgtable_trans_huge_withdraw after     
2058          * operations.                           
2059          */                                      
2060         orig_pmd = pmdp_huge_get_and_clear_fu    
2061                                                  
2062         arch_check_zapped_pmd(vma, orig_pmd);    
2063         tlb_remove_pmd_tlb_entry(tlb, pmd, ad    
2064         if (vma_is_special_huge(vma)) {          
2065                 if (arch_needs_pgtable_deposi    
2066                         zap_deposited_table(t    
2067                 spin_unlock(ptl);                
2068         } else if (is_huge_zero_pmd(orig_pmd)    
2069                 zap_deposited_table(tlb->mm,     
2070                 spin_unlock(ptl);                
2071         } else {                                 
2072                 struct folio *folio = NULL;      
2073                 int flush_needed = 1;            
2074                                                  
2075                 if (pmd_present(orig_pmd)) {     
2076                         struct page *page = p    
2077                                                  
2078                         folio = page_folio(pa    
2079                         folio_remove_rmap_pmd    
2080                         WARN_ON_ONCE(folio_ma    
2081                         VM_BUG_ON_PAGE(!PageH    
2082                 } else if (thp_migration_supp    
2083                         swp_entry_t entry;       
2084                                                  
2085                         VM_BUG_ON(!is_pmd_mig    
2086                         entry = pmd_to_swp_en    
2087                         folio = pfn_swap_entr    
2088                         flush_needed = 0;        
2089                 } else                           
2090                         WARN_ONCE(1, "Non pre    
2091                                                  
2092                 if (folio_test_anon(folio)) {    
2093                         zap_deposited_table(t    
2094                         add_mm_counter(tlb->m    
2095                 } else {                         
2096                         if (arch_needs_pgtabl    
2097                                 zap_deposited    
2098                         add_mm_counter(tlb->m    
2099                                        -HPAGE    
2100                 }                                
2101                                                  
2102                 spin_unlock(ptl);                
2103                 if (flush_needed)                
2104                         tlb_remove_page_size(    
2105         }                                        
2106         return 1;                                
2107 }                                                
2108                                                  
2109 #ifndef pmd_move_must_withdraw                   
2110 static inline int pmd_move_must_withdraw(spin    
2111                                          spin    
2112                                          stru    
2113 {                                                
2114         /*                                       
2115          * With split pmd lock we also need t    
2116          * PTE page table if new_pmd is on di    
2117          *                                       
2118          * We also don't deposit and withdraw    
2119          */                                      
2120         return (new_pmd_ptl != old_pmd_ptl) &    
2121 }                                                
2122 #endif                                           
2123                                                  
2124 static pmd_t move_soft_dirty_pmd(pmd_t pmd)      
2125 {                                                
2126 #ifdef CONFIG_MEM_SOFT_DIRTY                     
2127         if (unlikely(is_pmd_migration_entry(p    
2128                 pmd = pmd_swp_mksoft_dirty(pm    
2129         else if (pmd_present(pmd))               
2130                 pmd = pmd_mksoft_dirty(pmd);     
2131 #endif                                           
2132         return pmd;                              
2133 }                                                
2134                                                  
2135 bool move_huge_pmd(struct vm_area_struct *vma    
2136                   unsigned long new_addr, pmd    
2137 {                                                
2138         spinlock_t *old_ptl, *new_ptl;           
2139         pmd_t pmd;                               
2140         struct mm_struct *mm = vma->vm_mm;       
2141         bool force_flush = false;                
2142                                                  
2143         /*                                       
2144          * The destination pmd shouldn't be e    
2145          * should have released it; but move_    
2146          * inserted a page table, if racing a    
2147          */                                      
2148         if (!pmd_none(*new_pmd)) {               
2149                 VM_BUG_ON(pmd_trans_huge(*new    
2150                 return false;                    
2151         }                                        
2152                                                  
2153         /*                                       
2154          * We don't have to worry about the o    
2155          * ptlocks because exclusive mmap_loc    
2156          */                                      
2157         old_ptl = __pmd_trans_huge_lock(old_p    
2158         if (old_ptl) {                           
2159                 new_ptl = pmd_lockptr(mm, new    
2160                 if (new_ptl != old_ptl)          
2161                         spin_lock_nested(new_    
2162                 pmd = pmdp_huge_get_and_clear    
2163                 if (pmd_present(pmd))            
2164                         force_flush = true;      
2165                 VM_BUG_ON(!pmd_none(*new_pmd)    
2166                                                  
2167                 if (pmd_move_must_withdraw(ne    
2168                         pgtable_t pgtable;       
2169                         pgtable = pgtable_tra    
2170                         pgtable_trans_huge_de    
2171                 }                                
2172                 pmd = move_soft_dirty_pmd(pmd    
2173                 set_pmd_at(mm, new_addr, new_    
2174                 if (force_flush)                 
2175                         flush_pmd_tlb_range(v    
2176                 if (new_ptl != old_ptl)          
2177                         spin_unlock(new_ptl);    
2178                 spin_unlock(old_ptl);            
2179                 return true;                     
2180         }                                        
2181         return false;                            
2182 }                                                
2183                                                  
2184 /*                                               
2185  * Returns                                       
2186  *  - 0 if PMD could not be locked               
2187  *  - 1 if PMD was locked but protections unc    
2188  *      or if prot_numa but THP migration is     
2189  *  - HPAGE_PMD_NR if protections changed and    
2190  */                                              
2191 int change_huge_pmd(struct mmu_gather *tlb, s    
2192                     pmd_t *pmd, unsigned long    
2193                     unsigned long cp_flags)      
2194 {                                                
2195         struct mm_struct *mm = vma->vm_mm;       
2196         spinlock_t *ptl;                         
2197         pmd_t oldpmd, entry;                     
2198         bool prot_numa = cp_flags & MM_CP_PRO    
2199         bool uffd_wp = cp_flags & MM_CP_UFFD_    
2200         bool uffd_wp_resolve = cp_flags & MM_    
2201         int ret = 1;                             
2202                                                  
2203         tlb_change_page_size(tlb, HPAGE_PMD_S    
2204                                                  
2205         if (prot_numa && !thp_migration_suppo    
2206                 return 1;                        
2207                                                  
2208         ptl = __pmd_trans_huge_lock(pmd, vma)    
2209         if (!ptl)                                
2210                 return 0;                        
2211                                                  
2212 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          
2213         if (is_swap_pmd(*pmd)) {                 
2214                 swp_entry_t entry = pmd_to_sw    
2215                 struct folio *folio = pfn_swa    
2216                 pmd_t newpmd;                    
2217                                                  
2218                 VM_BUG_ON(!is_pmd_migration_e    
2219                 if (is_writable_migration_ent    
2220                         /*                       
2221                          * A protection check    
2222                          * just be safe and d    
2223                          */                      
2224                         if (folio_test_anon(f    
2225                                 entry = make_    
2226                         else                     
2227                                 entry = make_    
2228                         newpmd = swp_entry_to    
2229                         if (pmd_swp_soft_dirt    
2230                                 newpmd = pmd_    
2231                 } else {                         
2232                         newpmd = *pmd;           
2233                 }                                
2234                                                  
2235                 if (uffd_wp)                     
2236                         newpmd = pmd_swp_mkuf    
2237                 else if (uffd_wp_resolve)        
2238                         newpmd = pmd_swp_clea    
2239                 if (!pmd_same(*pmd, newpmd))     
2240                         set_pmd_at(mm, addr,     
2241                 goto unlock;                     
2242         }                                        
2243 #endif                                           
2244                                                  
2245         if (prot_numa) {                         
2246                 struct folio *folio;             
2247                 bool toptier;                    
2248                 /*                               
2249                  * Avoid trapping faults agai    
2250                  * data is likely to be read-    
2251                  * local/remote hits to the z    
2252                  */                              
2253                 if (is_huge_zero_pmd(*pmd))      
2254                         goto unlock;             
2255                                                  
2256                 if (pmd_protnone(*pmd))          
2257                         goto unlock;             
2258                                                  
2259                 folio = pmd_folio(*pmd);         
2260                 toptier = node_is_toptier(fol    
2261                 /*                               
2262                  * Skip scanning top tier nod    
2263                  * balancing is disabled         
2264                  */                              
2265                 if (!(sysctl_numa_balancing_m    
2266                     toptier)                     
2267                         goto unlock;             
2268                                                  
2269                 if (folio_use_access_time(fol    
2270                         folio_xchg_access_tim    
2271                                                  
2272         }                                        
2273         /*                                       
2274          * In case prot_numa, we are under mm    
2275          * to not clear pmd intermittently to    
2276          * which is also under mmap_read_lock    
2277          *                                       
2278          *      CPU0:                            
2279          *                              chang    
2280          *                               pmdp    
2281          * madvise_dontneed()                    
2282          *  zap_pmd_range()                      
2283          *   pmd_trans_huge(*pmd) == 0 (witho    
2284          *   // skip the pmd                     
2285          *                               set_    
2286          *                               // p    
2287          *                                       
2288          * The race makes MADV_DONTNEED miss     
2289          * which may break userspace.            
2290          *                                       
2291          * pmdp_invalidate_ad() is required t    
2292          * dirty/young flags set by hardware.    
2293          */                                      
2294         oldpmd = pmdp_invalidate_ad(vma, addr    
2295                                                  
2296         entry = pmd_modify(oldpmd, newprot);     
2297         if (uffd_wp)                             
2298                 entry = pmd_mkuffd_wp(entry);    
2299         else if (uffd_wp_resolve)                
2300                 /*                               
2301                  * Leave the write bit to be     
2302                  * handler, then things like     
2303                  * handled.                      
2304                  */                              
2305                 entry = pmd_clear_uffd_wp(ent    
2306                                                  
2307         /* See change_pte_range(). */            
2308         if ((cp_flags & MM_CP_TRY_CHANGE_WRIT    
2309             can_change_pmd_writable(vma, addr    
2310                 entry = pmd_mkwrite(entry, vm    
2311                                                  
2312         ret = HPAGE_PMD_NR;                      
2313         set_pmd_at(mm, addr, pmd, entry);        
2314                                                  
2315         if (huge_pmd_needs_flush(oldpmd, entr    
2316                 tlb_flush_pmd_range(tlb, addr    
2317 unlock:                                          
2318         spin_unlock(ptl);                        
2319         return ret;                              
2320 }                                                
2321                                                  
2322 /*                                               
2323  * Returns:                                      
2324  *                                               
2325  * - 0: if pud leaf changed from under us        
2326  * - 1: if pud can be skipped                    
2327  * - HPAGE_PUD_NR: if pud was successfully pr    
2328  */                                              
2329 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    
2330 int change_huge_pud(struct mmu_gather *tlb, s    
2331                     pud_t *pudp, unsigned lon    
2332                     unsigned long cp_flags)      
2333 {                                                
2334         struct mm_struct *mm = vma->vm_mm;       
2335         pud_t oldpud, entry;                     
2336         spinlock_t *ptl;                         
2337                                                  
2338         tlb_change_page_size(tlb, HPAGE_PUD_S    
2339                                                  
2340         /* NUMA balancing doesn't apply to da    
2341         if (cp_flags & MM_CP_PROT_NUMA)          
2342                 return 1;                        
2343                                                  
2344         /*                                       
2345          * Huge entries on userfault-wp only     
2346          * don't have anonymous PUDs yet.        
2347          */                                      
2348         if (WARN_ON_ONCE(cp_flags & MM_CP_UFF    
2349                 return 1;                        
2350                                                  
2351         ptl = __pud_trans_huge_lock(pudp, vma    
2352         if (!ptl)                                
2353                 return 0;                        
2354                                                  
2355         /*                                       
2356          * Can't clear PUD or it can race wit    
2357          * change_huge_pmd().                    
2358          */                                      
2359         oldpud = pudp_invalidate(vma, addr, p    
2360         entry = pud_modify(oldpud, newprot);     
2361         set_pud_at(mm, addr, pudp, entry);       
2362         tlb_flush_pud_range(tlb, addr, HPAGE_    
2363                                                  
2364         spin_unlock(ptl);                        
2365         return HPAGE_PUD_NR;                     
2366 }                                                
2367 #endif                                           
2368                                                  
2369 #ifdef CONFIG_USERFAULTFD                        
2370 /*                                               
2371  * The PT lock for src_pmd and dst_vma/src_vm    
2372  * the caller, but it must return after relea    
2373  * Just move the page from src_pmd to dst_pmd    
2374  * Return zero if succeeded in moving the pag    
2375  * repeated by the caller, or other errors in    
2376  */                                              
2377 int move_pages_huge_pmd(struct mm_struct *mm,    
2378                         struct vm_area_struct    
2379                         unsigned long dst_add    
2380 {                                                
2381         pmd_t _dst_pmd, src_pmdval;              
2382         struct page *src_page;                   
2383         struct folio *src_folio;                 
2384         struct anon_vma *src_anon_vma;           
2385         spinlock_t *src_ptl, *dst_ptl;           
2386         pgtable_t src_pgtable;                   
2387         struct mmu_notifier_range range;         
2388         int err = 0;                             
2389                                                  
2390         src_pmdval = *src_pmd;                   
2391         src_ptl = pmd_lockptr(mm, src_pmd);      
2392                                                  
2393         lockdep_assert_held(src_ptl);            
2394         vma_assert_locked(src_vma);              
2395         vma_assert_locked(dst_vma);              
2396                                                  
2397         /* Sanity checks before the operation    
2398         if (WARN_ON_ONCE(!pmd_none(dst_pmdval    
2399             WARN_ON_ONCE(dst_addr & ~HPAGE_PM    
2400                 spin_unlock(src_ptl);            
2401                 return -EINVAL;                  
2402         }                                        
2403                                                  
2404         if (!pmd_trans_huge(src_pmdval)) {       
2405                 spin_unlock(src_ptl);            
2406                 if (is_pmd_migration_entry(sr    
2407                         pmd_migration_entry_w    
2408                         return -EAGAIN;          
2409                 }                                
2410                 return -ENOENT;                  
2411         }                                        
2412                                                  
2413         src_page = pmd_page(src_pmdval);         
2414                                                  
2415         if (!is_huge_zero_pmd(src_pmdval)) {     
2416                 if (unlikely(!PageAnonExclusi    
2417                         spin_unlock(src_ptl);    
2418                         return -EBUSY;           
2419                 }                                
2420                                                  
2421                 src_folio = page_folio(src_pa    
2422                 folio_get(src_folio);            
2423         } else                                   
2424                 src_folio = NULL;                
2425                                                  
2426         spin_unlock(src_ptl);                    
2427                                                  
2428         flush_cache_range(src_vma, src_addr,     
2429         mmu_notifier_range_init(&range, MMU_N    
2430                                 src_addr + HP    
2431         mmu_notifier_invalidate_range_start(&    
2432                                                  
2433         if (src_folio) {                         
2434                 folio_lock(src_folio);           
2435                                                  
2436                 /*                               
2437                  * split_huge_page walks the     
2438                  * lock. Serialize against it    
2439                  * lock is not enough.           
2440                  */                              
2441                 src_anon_vma = folio_get_anon    
2442                 if (!src_anon_vma) {             
2443                         err = -EAGAIN;           
2444                         goto unlock_folio;       
2445                 }                                
2446                 anon_vma_lock_write(src_anon_    
2447         } else                                   
2448                 src_anon_vma = NULL;             
2449                                                  
2450         dst_ptl = pmd_lockptr(mm, dst_pmd);      
2451         double_pt_lock(src_ptl, dst_ptl);        
2452         if (unlikely(!pmd_same(*src_pmd, src_    
2453                      !pmd_same(*dst_pmd, dst_    
2454                 err = -EAGAIN;                   
2455                 goto unlock_ptls;                
2456         }                                        
2457         if (src_folio) {                         
2458                 if (folio_maybe_dma_pinned(sr    
2459                     !PageAnonExclusive(&src_f    
2460                         err = -EBUSY;            
2461                         goto unlock_ptls;        
2462                 }                                
2463                                                  
2464                 if (WARN_ON_ONCE(!folio_test_    
2465                     WARN_ON_ONCE(!folio_test_    
2466                         err = -EBUSY;            
2467                         goto unlock_ptls;        
2468                 }                                
2469                                                  
2470                 src_pmdval = pmdp_huge_clear_    
2471                 /* Folio got pinned from unde    
2472                 if (folio_maybe_dma_pinned(sr    
2473                         set_pmd_at(mm, src_ad    
2474                         err = -EBUSY;            
2475                         goto unlock_ptls;        
2476                 }                                
2477                                                  
2478                 folio_move_anon_rmap(src_foli    
2479                 src_folio->index = linear_pag    
2480                                                  
2481                 _dst_pmd = mk_huge_pmd(&src_f    
2482                 /* Follow mremap() behavior a    
2483                 _dst_pmd = pmd_mkwrite(pmd_mk    
2484         } else {                                 
2485                 src_pmdval = pmdp_huge_clear_    
2486                 _dst_pmd = mk_huge_pmd(src_pa    
2487         }                                        
2488         set_pmd_at(mm, dst_addr, dst_pmd, _ds    
2489                                                  
2490         src_pgtable = pgtable_trans_huge_with    
2491         pgtable_trans_huge_deposit(mm, dst_pm    
2492 unlock_ptls:                                     
2493         double_pt_unlock(src_ptl, dst_ptl);      
2494         if (src_anon_vma) {                      
2495                 anon_vma_unlock_write(src_ano    
2496                 put_anon_vma(src_anon_vma);      
2497         }                                        
2498 unlock_folio:                                    
2499         /* unblock rmap walks */                 
2500         if (src_folio)                           
2501                 folio_unlock(src_folio);         
2502         mmu_notifier_invalidate_range_end(&ra    
2503         if (src_folio)                           
2504                 folio_put(src_folio);            
2505         return err;                              
2506 }                                                
2507 #endif /* CONFIG_USERFAULTFD */                  
2508                                                  
2509 /*                                               
2510  * Returns page table lock pointer if a given    
2511  *                                               
2512  * Note that if it returns page table lock po    
2513  * unlocking page table lock. So callers must    
2514  */                                              
2515 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,    
2516 {                                                
2517         spinlock_t *ptl;                         
2518         ptl = pmd_lock(vma->vm_mm, pmd);         
2519         if (likely(is_swap_pmd(*pmd) || pmd_t    
2520                         pmd_devmap(*pmd)))       
2521                 return ptl;                      
2522         spin_unlock(ptl);                        
2523         return NULL;                             
2524 }                                                
2525                                                  
2526 /*                                               
2527  * Returns page table lock pointer if a given    
2528  *                                               
2529  * Note that if it returns page table lock po    
2530  * unlocking page table lock. So callers must    
2531  */                                              
2532 spinlock_t *__pud_trans_huge_lock(pud_t *pud,    
2533 {                                                
2534         spinlock_t *ptl;                         
2535                                                  
2536         ptl = pud_lock(vma->vm_mm, pud);         
2537         if (likely(pud_trans_huge(*pud) || pu    
2538                 return ptl;                      
2539         spin_unlock(ptl);                        
2540         return NULL;                             
2541 }                                                
2542                                                  
2543 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    
2544 int zap_huge_pud(struct mmu_gather *tlb, stru    
2545                  pud_t *pud, unsigned long ad    
2546 {                                                
2547         spinlock_t *ptl;                         
2548         pud_t orig_pud;                          
2549                                                  
2550         ptl = __pud_trans_huge_lock(pud, vma)    
2551         if (!ptl)                                
2552                 return 0;                        
2553                                                  
2554         orig_pud = pudp_huge_get_and_clear_fu    
2555         arch_check_zapped_pud(vma, orig_pud);    
2556         tlb_remove_pud_tlb_entry(tlb, pud, ad    
2557         if (vma_is_special_huge(vma)) {          
2558                 spin_unlock(ptl);                
2559                 /* No zero page support yet *    
2560         } else {                                 
2561                 /* No support for anonymous P    
2562                 BUG();                           
2563         }                                        
2564         return 1;                                
2565 }                                                
2566                                                  
2567 static void __split_huge_pud_locked(struct vm    
2568                 unsigned long haddr)             
2569 {                                                
2570         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);      
2571         VM_BUG_ON_VMA(vma->vm_start > haddr,     
2572         VM_BUG_ON_VMA(vma->vm_end < haddr + H    
2573         VM_BUG_ON(!pud_trans_huge(*pud) && !p    
2574                                                  
2575         count_vm_event(THP_SPLIT_PUD);           
2576                                                  
2577         pudp_huge_clear_flush(vma, haddr, pud    
2578 }                                                
2579                                                  
2580 void __split_huge_pud(struct vm_area_struct *    
2581                 unsigned long address)           
2582 {                                                
2583         spinlock_t *ptl;                         
2584         struct mmu_notifier_range range;         
2585                                                  
2586         mmu_notifier_range_init(&range, MMU_N    
2587                                 address & HPA    
2588                                 (address & HP    
2589         mmu_notifier_invalidate_range_start(&    
2590         ptl = pud_lock(vma->vm_mm, pud);         
2591         if (unlikely(!pud_trans_huge(*pud) &&    
2592                 goto out;                        
2593         __split_huge_pud_locked(vma, pud, ran    
2594                                                  
2595 out:                                             
2596         spin_unlock(ptl);                        
2597         mmu_notifier_invalidate_range_end(&ra    
2598 }                                                
2599 #else                                            
2600 void __split_huge_pud(struct vm_area_struct *    
2601                 unsigned long address)           
2602 {                                                
2603 }                                                
2604 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    
2605                                                  
2606 static void __split_huge_zero_page_pmd(struct    
2607                 unsigned long haddr, pmd_t *p    
2608 {                                                
2609         struct mm_struct *mm = vma->vm_mm;       
2610         pgtable_t pgtable;                       
2611         pmd_t _pmd, old_pmd;                     
2612         unsigned long addr;                      
2613         pte_t *pte;                              
2614         int i;                                   
2615                                                  
2616         /*                                       
2617          * Leave pmd empty until pte is fille    
2618          * notification until mmu_notifier_in    
2619          * replacing a zero pmd write protect    
2620          * protected page.                       
2621          *                                       
2622          * See Documentation/mm/mmu_notifier.    
2623          */                                      
2624         old_pmd = pmdp_huge_clear_flush(vma,     
2625                                                  
2626         pgtable = pgtable_trans_huge_withdraw    
2627         pmd_populate(mm, &_pmd, pgtable);        
2628                                                  
2629         pte = pte_offset_map(&_pmd, haddr);      
2630         VM_BUG_ON(!pte);                         
2631         for (i = 0, addr = haddr; i < HPAGE_P    
2632                 pte_t entry;                     
2633                                                  
2634                 entry = pfn_pte(my_zero_pfn(a    
2635                 entry = pte_mkspecial(entry);    
2636                 if (pmd_uffd_wp(old_pmd))        
2637                         entry = pte_mkuffd_wp    
2638                 VM_BUG_ON(!pte_none(ptep_get(    
2639                 set_pte_at(mm, addr, pte, ent    
2640                 pte++;                           
2641         }                                        
2642         pte_unmap(pte - 1);                      
2643         smp_wmb(); /* make pte visible before    
2644         pmd_populate(mm, pmd, pgtable);          
2645 }                                                
2646                                                  
2647 static void __split_huge_pmd_locked(struct vm    
2648                 unsigned long haddr, bool fre    
2649 {                                                
2650         struct mm_struct *mm = vma->vm_mm;       
2651         struct folio *folio;                     
2652         struct page *page;                       
2653         pgtable_t pgtable;                       
2654         pmd_t old_pmd, _pmd;                     
2655         bool young, write, soft_dirty, pmd_mi    
2656         bool anon_exclusive = false, dirty =     
2657         unsigned long addr;                      
2658         pte_t *pte;                              
2659         int i;                                   
2660                                                  
2661         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);      
2662         VM_BUG_ON_VMA(vma->vm_start > haddr,     
2663         VM_BUG_ON_VMA(vma->vm_end < haddr + H    
2664         VM_BUG_ON(!is_pmd_migration_entry(*pm    
2665                                 && !pmd_devma    
2666                                                  
2667         count_vm_event(THP_SPLIT_PMD);           
2668                                                  
2669         if (!vma_is_anonymous(vma)) {            
2670                 old_pmd = pmdp_huge_clear_flu    
2671                 /*                               
2672                  * We are going to unmap this    
2673                  * just go ahead and zap it      
2674                  */                              
2675                 if (arch_needs_pgtable_deposi    
2676                         zap_deposited_table(m    
2677                 if (vma_is_special_huge(vma))    
2678                         return;                  
2679                 if (unlikely(is_pmd_migration    
2680                         swp_entry_t entry;       
2681                                                  
2682                         entry = pmd_to_swp_en    
2683                         folio = pfn_swap_entr    
2684                 } else {                         
2685                         page = pmd_page(old_p    
2686                         folio = page_folio(pa    
2687                         if (!folio_test_dirty    
2688                                 folio_mark_di    
2689                         if (!folio_test_refer    
2690                                 folio_set_ref    
2691                         folio_remove_rmap_pmd    
2692                         folio_put(folio);        
2693                 }                                
2694                 add_mm_counter(mm, mm_counter    
2695                 return;                          
2696         }                                        
2697                                                  
2698         if (is_huge_zero_pmd(*pmd)) {            
2699                 /*                               
2700                  * FIXME: Do we want to inval    
2701                  * mmu_notifier_arch_invalida    
2702                  * inside __split_huge_pmd()     
2703                  *                               
2704                  * We are going from a zero h    
2705                  * small page also write prot    
2706                  * to invalidate secondary mm    
2707                  */                              
2708                 return __split_huge_zero_page    
2709         }                                        
2710                                                  
2711         pmd_migration = is_pmd_migration_entr    
2712         if (unlikely(pmd_migration)) {           
2713                 swp_entry_t entry;               
2714                                                  
2715                 old_pmd = *pmd;                  
2716                 entry = pmd_to_swp_entry(old_    
2717                 page = pfn_swap_entry_to_page    
2718                 write = is_writable_migration    
2719                 if (PageAnon(page))              
2720                         anon_exclusive = is_r    
2721                 young = is_migration_entry_yo    
2722                 dirty = is_migration_entry_di    
2723                 soft_dirty = pmd_swp_soft_dir    
2724                 uffd_wp = pmd_swp_uffd_wp(old    
2725         } else {                                 
2726                 /*                               
2727                  * Up to this point the pmd i    
2728                  * the whole access to the hu    
2729                  * happens in place). If we o    
2730                  * version pointing to the pt    
2731                  * all CPUs were bug free), u    
2732                  * size TLB miss on the small    
2733                  * entry is still established    
2734                  * like that. See                
2735                  * http://support.amd.com/Tec    
2736                  * 383 on page 105. Intel sho    
2737                  * it's only safe if the perm    
2738                  * two entries loaded in the     
2739                  * be the case here). But it     
2740                  * small and huge TLB entries    
2741                  * loaded simultaneously. So     
2742                  * flush_pmd_tlb_range();" we    
2743                  * notpresent (atomically bec    
2744                  * remain set at all times on    
2745                  * complete for this pmd), th    
2746                  * we write the non-huge vers    
2747                  * pmd_populate.                 
2748                  */                              
2749                 old_pmd = pmdp_invalidate(vma    
2750                 page = pmd_page(old_pmd);        
2751                 folio = page_folio(page);        
2752                 if (pmd_dirty(old_pmd)) {        
2753                         dirty = true;            
2754                         folio_set_dirty(folio    
2755                 }                                
2756                 write = pmd_write(old_pmd);      
2757                 young = pmd_young(old_pmd);      
2758                 soft_dirty = pmd_soft_dirty(o    
2759                 uffd_wp = pmd_uffd_wp(old_pmd    
2760                                                  
2761                 VM_WARN_ON_FOLIO(!folio_ref_c    
2762                 VM_WARN_ON_FOLIO(!folio_test_    
2763                                                  
2764                 /*                               
2765                  * Without "freeze", we'll si    
2766                  * PageAnonExclusive() flag f    
2767                  * each subpage -- no need to    
2768                  *                               
2769                  * With "freeze" we want to r    
2770                  * migration entries right aw    
2771                  * managed to clear PageAnonE    
2772                  * set_pmd_migration_entry().    
2773                  *                               
2774                  * In case we cannot clear Pa    
2775                  * only and let try_to_migrat    
2776                  *                               
2777                  * See folio_try_share_anon_r    
2778                  */                              
2779                 anon_exclusive = PageAnonExcl    
2780                 if (freeze && anon_exclusive     
2781                     folio_try_share_anon_rmap    
2782                         freeze = false;          
2783                 if (!freeze) {                   
2784                         rmap_t rmap_flags = R    
2785                                                  
2786                         folio_ref_add(folio,     
2787                         if (anon_exclusive)      
2788                                 rmap_flags |=    
2789                         folio_add_anon_rmap_p    
2790                                                  
2791                 }                                
2792         }                                        
2793                                                  
2794         /*                                       
2795          * Withdraw the table only after we m    
2796          * This's critical for some architect    
2797          */                                      
2798         pgtable = pgtable_trans_huge_withdraw    
2799         pmd_populate(mm, &_pmd, pgtable);        
2800                                                  
2801         pte = pte_offset_map(&_pmd, haddr);      
2802         VM_BUG_ON(!pte);                         
2803                                                  
2804         /*                                       
2805          * Note that NUMA hinting access rest    
2806          * avoid any possibility of altering     
2807          */                                      
2808         if (freeze || pmd_migration) {           
2809                 for (i = 0, addr = haddr; i <    
2810                         pte_t entry;             
2811                         swp_entry_t swp_entry    
2812                                                  
2813                         if (write)               
2814                                 swp_entry = m    
2815                                                  
2816                         else if (anon_exclusi    
2817                                 swp_entry = m    
2818                                                  
2819                         else                     
2820                                 swp_entry = m    
2821                                                  
2822                         if (young)               
2823                                 swp_entry = m    
2824                         if (dirty)               
2825                                 swp_entry = m    
2826                         entry = swp_entry_to_    
2827                         if (soft_dirty)          
2828                                 entry = pte_s    
2829                         if (uffd_wp)             
2830                                 entry = pte_s    
2831                                                  
2832                         VM_WARN_ON(!pte_none(    
2833                         set_pte_at(mm, addr,     
2834                 }                                
2835         } else {                                 
2836                 pte_t entry;                     
2837                                                  
2838                 entry = mk_pte(page, READ_ONC    
2839                 if (write)                       
2840                         entry = pte_mkwrite(e    
2841                 if (!young)                      
2842                         entry = pte_mkold(ent    
2843                 /* NOTE: this may set soft-di    
2844                 if (dirty)                       
2845                         entry = pte_mkdirty(e    
2846                 if (soft_dirty)                  
2847                         entry = pte_mksoft_di    
2848                 if (uffd_wp)                     
2849                         entry = pte_mkuffd_wp    
2850                                                  
2851                 for (i = 0; i < HPAGE_PMD_NR;    
2852                         VM_WARN_ON(!pte_none(    
2853                                                  
2854                 set_ptes(mm, haddr, pte, entr    
2855         }                                        
2856         pte_unmap(pte);                          
2857                                                  
2858         if (!pmd_migration)                      
2859                 folio_remove_rmap_pmd(folio,     
2860         if (freeze)                              
2861                 put_page(page);                  
2862                                                  
2863         smp_wmb(); /* make pte visible before    
2864         pmd_populate(mm, pmd, pgtable);          
2865 }                                                
2866                                                  
2867 void split_huge_pmd_locked(struct vm_area_str    
2868                            pmd_t *pmd, bool f    
2869 {                                                
2870         VM_WARN_ON_ONCE(folio && !folio_test_    
2871         VM_WARN_ON_ONCE(!IS_ALIGNED(address,     
2872         VM_WARN_ON_ONCE(folio && !folio_test_    
2873         VM_BUG_ON(freeze && !folio);             
2874                                                  
2875         /*                                       
2876          * When the caller requests to set up    
2877          * require a folio to check the PMD a    
2878          * is a risk of replacing the wrong f    
2879          */                                      
2880         if (pmd_trans_huge(*pmd) || pmd_devma    
2881             is_pmd_migration_entry(*pmd)) {      
2882                 if (folio && folio != pmd_fol    
2883                         return;                  
2884                 __split_huge_pmd_locked(vma,     
2885         }                                        
2886 }                                                
2887                                                  
2888 void __split_huge_pmd(struct vm_area_struct *    
2889                 unsigned long address, bool f    
2890 {                                                
2891         spinlock_t *ptl;                         
2892         struct mmu_notifier_range range;         
2893                                                  
2894         mmu_notifier_range_init(&range, MMU_N    
2895                                 address & HPA    
2896                                 (address & HP    
2897         mmu_notifier_invalidate_range_start(&    
2898         ptl = pmd_lock(vma->vm_mm, pmd);         
2899         split_huge_pmd_locked(vma, range.star    
2900         spin_unlock(ptl);                        
2901         mmu_notifier_invalidate_range_end(&ra    
2902 }                                                
2903                                                  
2904 void split_huge_pmd_address(struct vm_area_st    
2905                 bool freeze, struct folio *fo    
2906 {                                                
2907         pmd_t *pmd = mm_find_pmd(vma->vm_mm,     
2908                                                  
2909         if (!pmd)                                
2910                 return;                          
2911                                                  
2912         __split_huge_pmd(vma, pmd, address, f    
2913 }                                                
2914                                                  
2915 static inline void split_huge_pmd_if_needed(s    
2916 {                                                
2917         /*                                       
2918          * If the new address isn't hpage ali    
2919          * contain an hugepage: check if we n    
2920          */                                      
2921         if (!IS_ALIGNED(address, HPAGE_PMD_SI    
2922             range_in_vma(vma, ALIGN_DOWN(addr    
2923                          ALIGN(address, HPAGE    
2924                 split_huge_pmd_address(vma, a    
2925 }                                                
2926                                                  
2927 void vma_adjust_trans_huge(struct vm_area_str    
2928                              unsigned long st    
2929                              unsigned long en    
2930                              long adjust_next    
2931 {                                                
2932         /* Check if we need to split start fi    
2933         split_huge_pmd_if_needed(vma, start);    
2934                                                  
2935         /* Check if we need to split end next    
2936         split_huge_pmd_if_needed(vma, end);      
2937                                                  
2938         /*                                       
2939          * If we're also updating the next vm    
2940          * check if we need to split it.         
2941          */                                      
2942         if (adjust_next > 0) {                   
2943                 struct vm_area_struct *next =    
2944                 unsigned long nstart = next->    
2945                 nstart += adjust_next;           
2946                 split_huge_pmd_if_needed(next    
2947         }                                        
2948 }                                                
2949                                                  
2950 static void unmap_folio(struct folio *folio)     
2951 {                                                
2952         enum ttu_flags ttu_flags = TTU_RMAP_L    
2953                 TTU_BATCH_FLUSH;                 
2954                                                  
2955         VM_BUG_ON_FOLIO(!folio_test_large(fol    
2956                                                  
2957         if (folio_test_pmd_mappable(folio))      
2958                 ttu_flags |= TTU_SPLIT_HUGE_P    
2959                                                  
2960         /*                                       
2961          * Anon pages need migration entries     
2962          * pages can simply be left unmapped,    
2963          * If that is ever changed (perhaps f    
2964          */                                      
2965         if (folio_test_anon(folio))              
2966                 try_to_migrate(folio, ttu_fla    
2967         else                                     
2968                 try_to_unmap(folio, ttu_flags    
2969                                                  
2970         try_to_unmap_flush();                    
2971 }                                                
2972                                                  
2973 static bool __discard_anon_folio_pmd_locked(s    
2974                                             u    
2975                                             s    
2976 {                                                
2977         struct mm_struct *mm = vma->vm_mm;       
2978         int ref_count, map_count;                
2979         pmd_t orig_pmd = *pmdp;                  
2980                                                  
2981         if (folio_test_dirty(folio) || pmd_di    
2982                 return false;                    
2983                                                  
2984         orig_pmd = pmdp_huge_clear_flush(vma,    
2985                                                  
2986         /*                                       
2987          * Syncing against concurrent GUP-fas    
2988          * - clear PMD; barrier; read refcoun    
2989          * - inc refcount; barrier; read PMD     
2990          */                                      
2991         smp_mb();                                
2992                                                  
2993         ref_count = folio_ref_count(folio);      
2994         map_count = folio_mapcount(folio);       
2995                                                  
2996         /*                                       
2997          * Order reads for folio refcount and    
2998          * (see comments in __remove_mapping(    
2999          */                                      
3000         smp_rmb();                               
3001                                                  
3002         /*                                       
3003          * If the folio or its PMD is redirti    
3004          * are unexpected references, we will    
3005          * and remap it.                         
3006          *                                       
3007          * The only folio refs must be one fr    
3008          */                                      
3009         if (folio_test_dirty(folio) || pmd_di    
3010             ref_count != map_count + 1) {        
3011                 set_pmd_at(mm, addr, pmdp, or    
3012                 return false;                    
3013         }                                        
3014                                                  
3015         folio_remove_rmap_pmd(folio, pmd_page    
3016         zap_deposited_table(mm, pmdp);           
3017         add_mm_counter(mm, MM_ANONPAGES, -HPA    
3018         if (vma->vm_flags & VM_LOCKED)           
3019                 mlock_drain_local();             
3020         folio_put(folio);                        
3021                                                  
3022         return true;                             
3023 }                                                
3024                                                  
3025 bool unmap_huge_pmd_locked(struct vm_area_str    
3026                            pmd_t *pmdp, struc    
3027 {                                                
3028         VM_WARN_ON_FOLIO(!folio_test_pmd_mapp    
3029         VM_WARN_ON_FOLIO(!folio_test_locked(f    
3030         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPA    
3031                                                  
3032         if (folio_test_anon(folio) && !folio_    
3033                 return __discard_anon_folio_p    
3034                                                  
3035         return false;                            
3036 }                                                
3037                                                  
3038 static void remap_page(struct folio *folio, u    
3039 {                                                
3040         int i = 0;                               
3041                                                  
3042         /* If unmap_folio() uses try_to_migra    
3043         if (!folio_test_anon(folio))             
3044                 return;                          
3045         for (;;) {                               
3046                 remove_migration_ptes(folio,     
3047                 i += folio_nr_pages(folio);      
3048                 if (i >= nr)                     
3049                         break;                   
3050                 folio = folio_next(folio);       
3051         }                                        
3052 }                                                
3053                                                  
3054 static void lru_add_page_tail(struct folio *f    
3055                 struct lruvec *lruvec, struct    
3056 {                                                
3057         VM_BUG_ON_FOLIO(!folio_test_large(fol    
3058         VM_BUG_ON_FOLIO(PageLRU(tail), folio)    
3059         lockdep_assert_held(&lruvec->lru_lock    
3060                                                  
3061         if (list) {                              
3062                 /* page reclaim is reclaiming    
3063                 VM_WARN_ON(folio_test_lru(fol    
3064                 get_page(tail);                  
3065                 list_add_tail(&tail->lru, lis    
3066         } else {                                 
3067                 /* head is still on lru (and     
3068                 VM_WARN_ON(!folio_test_lru(fo    
3069                 if (folio_test_unevictable(fo    
3070                         tail->mlock_count = 0    
3071                 else                             
3072                         list_add_tail(&tail->    
3073                 SetPageLRU(tail);                
3074         }                                        
3075 }                                                
3076                                                  
3077 static void __split_huge_page_tail(struct fol    
3078                 struct lruvec *lruvec, struct    
3079                 unsigned int new_order)          
3080 {                                                
3081         struct page *head = &folio->page;        
3082         struct page *page_tail = head + tail;    
3083         /*                                       
3084          * Careful: new_folio is not a "real"    
3085          * Don't pass it around before clear_    
3086          */                                      
3087         struct folio *new_folio = (struct fol    
3088                                                  
3089         VM_BUG_ON_PAGE(atomic_read(&page_tail    
3090                                                  
3091         /*                                       
3092          * Clone page flags before unfreezing    
3093          *                                       
3094          * After successful get_page_unless_z    
3095          * for example lock_page() which set     
3096          *                                       
3097          * Note that for mapped sub-pages of     
3098          * PG_anon_exclusive has been cleared    
3099          * the migration entry instead from w    
3100          * We can still have PG_anon_exclusiv    
3101          * unreferenced sub-pages of an anony    
3102          * PG_anon_exclusive (-> PG_mappedtod    
3103          */                                      
3104         page_tail->flags &= ~PAGE_FLAGS_CHECK    
3105         page_tail->flags |= (head->flags &       
3106                         ((1L << PG_referenced    
3107                          (1L << PG_swapbacked    
3108                          (1L << PG_swapcache)    
3109                          (1L << PG_mlocked) |    
3110                          (1L << PG_uptodate)     
3111                          (1L << PG_active) |     
3112                          (1L << PG_workingset    
3113                          (1L << PG_locked) |     
3114                          (1L << PG_unevictabl    
3115 #ifdef CONFIG_ARCH_USES_PG_ARCH_2                
3116                          (1L << PG_arch_2) |     
3117 #endif                                           
3118 #ifdef CONFIG_ARCH_USES_PG_ARCH_3                
3119                          (1L << PG_arch_3) |     
3120 #endif                                           
3121                          (1L << PG_dirty) |      
3122                          LRU_GEN_MASK | LRU_R    
3123                                                  
3124         /* ->mapping in first and second tail    
3125         VM_BUG_ON_PAGE(tail > 2 && page_tail-    
3126                         page_tail);              
3127         page_tail->mapping = head->mapping;      
3128         page_tail->index = head->index + tail    
3129                                                  
3130         /*                                       
3131          * page->private should not be set in    
3132          * if private is unexpectedly set.       
3133          */                                      
3134         if (unlikely(page_tail->private)) {      
3135                 VM_WARN_ON_ONCE_PAGE(true, pa    
3136                 page_tail->private = 0;          
3137         }                                        
3138         if (folio_test_swapcache(folio))         
3139                 new_folio->swap.val = folio->    
3140                                                  
3141         /* Page flags must be visible before     
3142         smp_wmb();                               
3143                                                  
3144         /*                                       
3145          * Clear PageTail before unfreezing p    
3146          *                                       
3147          * After successful get_page_unless_z    
3148          * which needs correct compound_head(    
3149          */                                      
3150         clear_compound_head(page_tail);          
3151         if (new_order) {                         
3152                 prep_compound_page(page_tail,    
3153                 folio_set_large_rmappable(new    
3154         }                                        
3155                                                  
3156         /* Finally unfreeze refcount. Additio    
3157         page_ref_unfreeze(page_tail,             
3158                 1 + ((!folio_test_anon(folio)    
3159                              folio_nr_pages(n    
3160                                                  
3161         if (folio_test_young(folio))             
3162                 folio_set_young(new_folio);      
3163         if (folio_test_idle(folio))              
3164                 folio_set_idle(new_folio);       
3165                                                  
3166         folio_xchg_last_cpupid(new_folio, fol    
3167                                                  
3168         /*                                       
3169          * always add to the tail because som    
3170          * pages to show after the currently     
3171          * migrate_pages                         
3172          */                                      
3173         lru_add_page_tail(folio, page_tail, l    
3174 }                                                
3175                                                  
3176 static void __split_huge_page(struct page *pa    
3177                 pgoff_t end, unsigned int new    
3178 {                                                
3179         struct folio *folio = page_folio(page    
3180         struct page *head = &folio->page;        
3181         struct lruvec *lruvec;                   
3182         struct address_space *swap_cache = NU    
3183         unsigned long offset = 0;                
3184         int i, nr_dropped = 0;                   
3185         unsigned int new_nr = 1 << new_order;    
3186         int order = folio_order(folio);          
3187         unsigned int nr = 1 << order;            
3188                                                  
3189         /* complete memcg works before add pa    
3190         split_page_memcg(head, order, new_ord    
3191                                                  
3192         if (folio_test_anon(folio) && folio_t    
3193                 offset = swap_cache_index(fol    
3194                 swap_cache = swap_address_spa    
3195                 xa_lock(&swap_cache->i_pages)    
3196         }                                        
3197                                                  
3198         /* lock lru list/PageCompound, ref fr    
3199         lruvec = folio_lruvec_lock(folio);       
3200                                                  
3201         ClearPageHasHWPoisoned(head);            
3202                                                  
3203         for (i = nr - new_nr; i >= new_nr; i     
3204                 __split_huge_page_tail(folio,    
3205                 /* Some pages can be beyond E    
3206                 if (head[i].index >= end) {      
3207                         struct folio *tail =     
3208                                                  
3209                         if (shmem_mapping(fol    
3210                                 nr_dropped++;    
3211                         else if (folio_test_c    
3212                                 folio_account    
3213                                         inode    
3214                         __filemap_remove_foli    
3215                         folio_put(tail);         
3216                 } else if (!PageAnon(page)) {    
3217                         __xa_store(&folio->ma    
3218                                         head     
3219                 } else if (swap_cache) {         
3220                         __xa_store(&swap_cach    
3221                                         head     
3222                 }                                
3223         }                                        
3224                                                  
3225         if (!new_order)                          
3226                 ClearPageCompound(head);         
3227         else {                                   
3228                 struct folio *new_folio = (st    
3229                                                  
3230                 folio_set_order(new_folio, ne    
3231         }                                        
3232         unlock_page_lruvec(lruvec);              
3233         /* Caller disabled irqs, so they are     
3234                                                  
3235         split_page_owner(head, order, new_ord    
3236         pgalloc_tag_split(folio, order, new_o    
3237                                                  
3238         /* See comment in __split_huge_page_t    
3239         if (folio_test_anon(folio)) {            
3240                 /* Additional pin to swap cac    
3241                 if (folio_test_swapcache(foli    
3242                         folio_ref_add(folio,     
3243                         xa_unlock(&swap_cache    
3244                 } else {                         
3245                         folio_ref_inc(folio);    
3246                 }                                
3247         } else {                                 
3248                 /* Additional pin to page cac    
3249                 folio_ref_add(folio, 1 + new_    
3250                 xa_unlock(&folio->mapping->i_    
3251         }                                        
3252         local_irq_enable();                      
3253                                                  
3254         if (nr_dropped)                          
3255                 shmem_uncharge(folio->mapping    
3256         remap_page(folio, nr, PageAnon(head)     
3257                                                  
3258         /*                                       
3259          * set page to its compound_head when    
3260          * we can skip unlocking it below, si    
3261          * the compound_head of the page and     
3262          */                                      
3263         if (new_order)                           
3264                 page = compound_head(page);      
3265                                                  
3266         for (i = 0; i < nr; i += new_nr) {       
3267                 struct page *subpage = head +    
3268                 struct folio *new_folio = pag    
3269                 if (subpage == page)             
3270                         continue;                
3271                 folio_unlock(new_folio);         
3272                                                  
3273                 /*                               
3274                  * Subpages may be freed if t    
3275                  * like if add_to_swap() is r    
3276                  * had its mapping zapped. An    
3277                  * requires taking the lru_lo    
3278                  * of the tail pages after th    
3279                  */                              
3280                 free_page_and_swap_cache(subp    
3281         }                                        
3282 }                                                
3283                                                  
3284 /* Racy check whether the huge page can be sp    
3285 bool can_split_folio(struct folio *folio, int    
3286 {                                                
3287         int extra_pins;                          
3288                                                  
3289         /* Additional pins from page cache */    
3290         if (folio_test_anon(folio))              
3291                 extra_pins = folio_test_swapc    
3292                                 folio_nr_page    
3293         else                                     
3294                 extra_pins = folio_nr_pages(f    
3295         if (pextra_pins)                         
3296                 *pextra_pins = extra_pins;       
3297         return folio_mapcount(folio) == folio    
3298                                         calle    
3299 }                                                
3300                                                  
3301 /*                                               
3302  * This function splits a large folio into sm    
3303  * @page can point to any page of the large f    
3304  * does not change the position of @page.        
3305  *                                               
3306  * Prerequisites:                                
3307  *                                               
3308  * 1) The caller must hold a reference on the    
3309  *    as the large folio.                        
3310  *                                               
3311  * 2) The large folio must be locked.            
3312  *                                               
3313  * 3) The folio must not be pinned. Any unexp    
3314  *    GUP pins, will result in the folio not     
3315  *    will receive an -EAGAIN.                   
3316  *                                               
3317  * 4) @new_order > 1, usually. Splitting to o    
3318  *    supported for non-file-backed folios, b    
3319  *    is used by partially mapped folios, is     
3320  *    folio only has subpages 0 and 1. File-b    
3321  *    since they do not use _deferred_list.      
3322  *                                               
3323  * After splitting, the caller's folio refere    
3324  * resulting in a raised refcount of @page af    
3325  * be freed if they are not mapped.              
3326  *                                               
3327  * If @list is null, tail pages will be added    
3328  *                                               
3329  * Pages in @new_order will inherit the mappi    
3330  * huge page.                                    
3331  *                                               
3332  * Returns 0 if the huge page was split succe    
3333  *                                               
3334  * Returns -EAGAIN if the folio has unexpecte    
3335  * the folio was concurrently removed from th    
3336  *                                               
3337  * Returns -EBUSY when trying to split the hu    
3338  * under writeback, if fs-specific folio meta    
3339  * released, or if some unexpected race happe    
3340  * truncation).                                  
3341  *                                               
3342  * Callers should ensure that the order respe    
3343  * min-order if one is set for non-anonymous     
3344  *                                               
3345  * Returns -EINVAL when trying to split to an    
3346  * with the folio. Splitting to order 0 is co    
3347  */                                              
3348 int split_huge_page_to_list_to_order(struct p    
3349                                      unsigned    
3350 {                                                
3351         struct folio *folio = page_folio(page    
3352         struct deferred_split *ds_queue = get    
3353         /* reset xarray order to new order af    
3354         XA_STATE_ORDER(xas, &folio->mapping->    
3355         bool is_anon = folio_test_anon(folio)    
3356         struct address_space *mapping = NULL;    
3357         struct anon_vma *anon_vma = NULL;        
3358         int order = folio_order(folio);          
3359         int extra_pins, ret;                     
3360         pgoff_t end;                             
3361         bool is_hzp;                             
3362                                                  
3363         VM_BUG_ON_FOLIO(!folio_test_locked(fo    
3364         VM_BUG_ON_FOLIO(!folio_test_large(fol    
3365                                                  
3366         if (new_order >= folio_order(folio))     
3367                 return -EINVAL;                  
3368                                                  
3369         if (is_anon) {                           
3370                 /* order-1 is not supported f    
3371                 if (new_order == 1) {            
3372                         VM_WARN_ONCE(1, "Cann    
3373                         return -EINVAL;          
3374                 }                                
3375         } else if (new_order) {                  
3376                 /* Split shmem folio to non-z    
3377                 if (shmem_mapping(folio->mapp    
3378                         VM_WARN_ONCE(1,          
3379                                 "Cannot split    
3380                         return -EINVAL;          
3381                 }                                
3382                 /*                               
3383                  * No split if the file syste    
3384                  * Note that we might still h    
3385                  * CONFIG_READ_ONLY_THP_FOR_F    
3386                  * does not actually support     
3387                  */                              
3388                 if (IS_ENABLED(CONFIG_READ_ON    
3389                     !mapping_large_folio_supp    
3390                         VM_WARN_ONCE(1,          
3391                                 "Cannot split    
3392                         return -EINVAL;          
3393                 }                                
3394         }                                        
3395                                                  
3396         /* Only swapping a whole PMD-mapped f    
3397         if (folio_test_swapcache(folio) && ne    
3398                 return -EINVAL;                  
3399                                                  
3400         is_hzp = is_huge_zero_folio(folio);      
3401         if (is_hzp) {                            
3402                 pr_warn_ratelimited("Called s    
3403                 return -EBUSY;                   
3404         }                                        
3405                                                  
3406         if (folio_test_writeback(folio))         
3407                 return -EBUSY;                   
3408                                                  
3409         if (is_anon) {                           
3410                 /*                               
3411                  * The caller does not necess    
3412                  * prevent the anon_vma disap    
3413                  * reference to it and then l    
3414                  * is similar to folio_lock_a    
3415                  * is taken to serialise agai    
3416                  * operations.                   
3417                  */                              
3418                 anon_vma = folio_get_anon_vma    
3419                 if (!anon_vma) {                 
3420                         ret = -EBUSY;            
3421                         goto out;                
3422                 }                                
3423                 end = -1;                        
3424                 mapping = NULL;                  
3425                 anon_vma_lock_write(anon_vma)    
3426         } else {                                 
3427                 unsigned int min_order;          
3428                 gfp_t gfp;                       
3429                                                  
3430                 mapping = folio->mapping;        
3431                                                  
3432                 /* Truncated ? */                
3433                 if (!mapping) {                  
3434                         ret = -EBUSY;            
3435                         goto out;                
3436                 }                                
3437                                                  
3438                 min_order = mapping_min_folio    
3439                 if (new_order < min_order) {     
3440                         VM_WARN_ONCE(1, "Cann    
3441                                      min_orde    
3442                         ret = -EINVAL;           
3443                         goto out;                
3444                 }                                
3445                                                  
3446                 gfp = current_gfp_context(map    
3447                                                  
3448                                                  
3449                 if (!filemap_release_folio(fo    
3450                         ret = -EBUSY;            
3451                         goto out;                
3452                 }                                
3453                                                  
3454                 xas_split_alloc(&xas, folio,     
3455                 if (xas_error(&xas)) {           
3456                         ret = xas_error(&xas)    
3457                         goto out;                
3458                 }                                
3459                                                  
3460                 anon_vma = NULL;                 
3461                 i_mmap_lock_read(mapping);       
3462                                                  
3463                 /*                               
3464                  *__split_huge_page() may nee    
3465                  * but on 32-bit, i_size_read    
3466                  * which cannot be nested ins    
3467                  * end now: i_size itself may    
3468                  * folio lock is good enough     
3469                  */                              
3470                 end = DIV_ROUND_UP(i_size_rea    
3471                 if (shmem_mapping(mapping))      
3472                         end = shmem_fallocend    
3473         }                                        
3474                                                  
3475         /*                                       
3476          * Racy check if we can split the pag    
3477          * split PMDs                            
3478          */                                      
3479         if (!can_split_folio(folio, 1, &extra    
3480                 ret = -EAGAIN;                   
3481                 goto out_unlock;                 
3482         }                                        
3483                                                  
3484         unmap_folio(folio);                      
3485                                                  
3486         /* block interrupt reentry in xa_lock    
3487         local_irq_disable();                     
3488         if (mapping) {                           
3489                 /*                               
3490                  * Check if the folio is pres    
3491                  * We assume all tail are pre    
3492                  */                              
3493                 xas_lock(&xas);                  
3494                 xas_reset(&xas);                 
3495                 if (xas_load(&xas) != folio)     
3496                         goto fail;               
3497         }                                        
3498                                                  
3499         /* Prevent deferred_split_scan() touc    
3500         spin_lock(&ds_queue->split_queue_lock    
3501         if (folio_ref_freeze(folio, 1 + extra    
3502                 if (folio_order(folio) > 1 &&    
3503                     !list_empty(&folio->_defe    
3504                         ds_queue->split_queue    
3505                         if (folio_test_partia    
3506                                 __folio_clear    
3507                                 mod_mthp_stat    
3508                                                  
3509                         }                        
3510                         /*                       
3511                          * Reinitialize page_    
3512                          * page from the spli    
3513                          * split will see lis    
3514                          * page_deferred_list    
3515                          */                      
3516                         list_del_init(&folio-    
3517                 }                                
3518                 spin_unlock(&ds_queue->split_    
3519                 if (mapping) {                   
3520                         int nr = folio_nr_pag    
3521                                                  
3522                         xas_split(&xas, folio    
3523                         if (folio_test_pmd_ma    
3524                             new_order < HPAGE    
3525                                 if (folio_tes    
3526                                         __lru    
3527                                                  
3528                                 } else {         
3529                                         __lru    
3530                                                  
3531                                         filem    
3532                                 }                
3533                         }                        
3534                 }                                
3535                                                  
3536                 if (is_anon) {                   
3537                         mod_mthp_stat(order,     
3538                         mod_mthp_stat(new_ord    
3539                 }                                
3540                 __split_huge_page(page, list,    
3541                 ret = 0;                         
3542         } else {                                 
3543                 spin_unlock(&ds_queue->split_    
3544 fail:                                            
3545                 if (mapping)                     
3546                         xas_unlock(&xas);        
3547                 local_irq_enable();              
3548                 remap_page(folio, folio_nr_pa    
3549                 ret = -EAGAIN;                   
3550         }                                        
3551                                                  
3552 out_unlock:                                      
3553         if (anon_vma) {                          
3554                 anon_vma_unlock_write(anon_vm    
3555                 put_anon_vma(anon_vma);          
3556         }                                        
3557         if (mapping)                             
3558                 i_mmap_unlock_read(mapping);     
3559 out:                                             
3560         xas_destroy(&xas);                       
3561         if (order == HPAGE_PMD_ORDER)            
3562                 count_vm_event(!ret ? THP_SPL    
3563         count_mthp_stat(order, !ret ? MTHP_ST    
3564         return ret;                              
3565 }                                                
3566                                                  
3567 int min_order_for_split(struct folio *folio)     
3568 {                                                
3569         if (folio_test_anon(folio))              
3570                 return 0;                        
3571                                                  
3572         if (!folio->mapping) {                   
3573                 if (folio_test_pmd_mappable(f    
3574                         count_vm_event(THP_SP    
3575                 return -EBUSY;                   
3576         }                                        
3577                                                  
3578         return mapping_min_folio_order(folio-    
3579 }                                                
3580                                                  
3581 int split_folio_to_list(struct folio *folio,     
3582 {                                                
3583         int ret = min_order_for_split(folio);    
3584                                                  
3585         if (ret < 0)                             
3586                 return ret;                      
3587                                                  
3588         return split_huge_page_to_list_to_ord    
3589 }                                                
3590                                                  
3591 /*                                               
3592  * __folio_unqueue_deferred_split() is not to    
3593  * the folio_unqueue_deferred_split() inline     
3594  * limits its calls to those folios which may    
3595  * queueing THP splits, and that list is (rac    
3596  *                                               
3597  * It is unsafe to call folio_unqueue_deferre    
3598  * zero: because even when split_queue_lock i    
3599  * might be in use on deferred_split_scan()'s    
3600  *                                               
3601  * If memory cgroups are enabled, split_queue    
3602  * therefore important to unqueue deferred sp    
3603  */                                              
3604 bool __folio_unqueue_deferred_split(struct fo    
3605 {                                                
3606         struct deferred_split *ds_queue;         
3607         unsigned long flags;                     
3608         bool unqueued = false;                   
3609                                                  
3610         WARN_ON_ONCE(folio_ref_count(folio));    
3611         WARN_ON_ONCE(!mem_cgroup_disabled() &    
3612                                                  
3613         ds_queue = get_deferred_split_queue(f    
3614         spin_lock_irqsave(&ds_queue->split_qu    
3615         if (!list_empty(&folio->_deferred_lis    
3616                 ds_queue->split_queue_len--;     
3617                 if (folio_test_partially_mapp    
3618                         __folio_clear_partial    
3619                         mod_mthp_stat(folio_o    
3620                                       MTHP_ST    
3621                 }                                
3622                 list_del_init(&folio->_deferr    
3623                 unqueued = true;                 
3624         }                                        
3625         spin_unlock_irqrestore(&ds_queue->spl    
3626                                                  
3627         return unqueued;        /* useful for    
3628 }                                                
3629                                                  
3630 /* partially_mapped=false won't clear PG_part    
3631 void deferred_split_folio(struct folio *folio    
3632 {                                                
3633         struct deferred_split *ds_queue = get    
3634 #ifdef CONFIG_MEMCG                              
3635         struct mem_cgroup *memcg = folio_memc    
3636 #endif                                           
3637         unsigned long flags;                     
3638                                                  
3639         /*                                       
3640          * Order 1 folios have no space for a    
3641          * won't waste much memory by not add    
3642          */                                      
3643         if (folio_order(folio) <= 1)             
3644                 return;                          
3645                                                  
3646         if (!partially_mapped && !split_under    
3647                 return;                          
3648                                                  
3649         /*                                       
3650          * Exclude swapcache: originally to a    
3651          * queue. Nowadays that is fully prev    
3652          * but if page reclaim is already han    
3653          * unnecessary to handle it again in     
3654          * swapcache here may still be a usef    
3655          */                                      
3656         if (folio_test_swapcache(folio))         
3657                 return;                          
3658                                                  
3659         spin_lock_irqsave(&ds_queue->split_qu    
3660         if (partially_mapped) {                  
3661                 if (!folio_test_partially_map    
3662                         __folio_set_partially    
3663                         if (folio_test_pmd_ma    
3664                                 count_vm_even    
3665                         count_mthp_stat(folio    
3666                         mod_mthp_stat(folio_o    
3667                                                  
3668                 }                                
3669         } else {                                 
3670                 /* partially mapped folios ca    
3671                 VM_WARN_ON_FOLIO(folio_test_p    
3672         }                                        
3673         if (list_empty(&folio->_deferred_list    
3674                 list_add_tail(&folio->_deferr    
3675                 ds_queue->split_queue_len++;     
3676 #ifdef CONFIG_MEMCG                              
3677                 if (memcg)                       
3678                         set_shrinker_bit(memc    
3679                                          defe    
3680 #endif                                           
3681         }                                        
3682         spin_unlock_irqrestore(&ds_queue->spl    
3683 }                                                
3684                                                  
3685 static unsigned long deferred_split_count(str    
3686                 struct shrink_control *sc)       
3687 {                                                
3688         struct pglist_data *pgdata = NODE_DAT    
3689         struct deferred_split *ds_queue = &pg    
3690                                                  
3691 #ifdef CONFIG_MEMCG                              
3692         if (sc->memcg)                           
3693                 ds_queue = &sc->memcg->deferr    
3694 #endif                                           
3695         return READ_ONCE(ds_queue->split_queu    
3696 }                                                
3697                                                  
3698 static bool thp_underused(struct folio *folio    
3699 {                                                
3700         int num_zero_pages = 0, num_filled_pa    
3701         void *kaddr;                             
3702         int i;                                   
3703                                                  
3704         if (khugepaged_max_ptes_none == HPAGE    
3705                 return false;                    
3706                                                  
3707         for (i = 0; i < folio_nr_pages(folio)    
3708                 kaddr = kmap_local_folio(foli    
3709                 if (!memchr_inv(kaddr, 0, PAG    
3710                         num_zero_pages++;        
3711                         if (num_zero_pages >     
3712                                 kunmap_local(    
3713                                 return true;     
3714                         }                        
3715                 } else {                         
3716                         /*                       
3717                          * Another path for e    
3718                          * of non-zero filled    
3719                          */                      
3720                         num_filled_pages++;      
3721                         if (num_filled_pages     
3722                                 kunmap_local(    
3723                                 return false;    
3724                         }                        
3725                 }                                
3726                 kunmap_local(kaddr);             
3727         }                                        
3728         return false;                            
3729 }                                                
3730                                                  
3731 static unsigned long deferred_split_scan(stru    
3732                 struct shrink_control *sc)       
3733 {                                                
3734         struct pglist_data *pgdata = NODE_DAT    
3735         struct deferred_split *ds_queue = &pg    
3736         unsigned long flags;                     
3737         LIST_HEAD(list);                         
3738         struct folio *folio, *next, *prev = N    
3739         int split = 0, removed = 0;              
3740                                                  
3741 #ifdef CONFIG_MEMCG                              
3742         if (sc->memcg)                           
3743                 ds_queue = &sc->memcg->deferr    
3744 #endif                                           
3745                                                  
3746         spin_lock_irqsave(&ds_queue->split_qu    
3747         /* Take pin on all head pages to avoi    
3748         list_for_each_entry_safe(folio, next,    
3749                                                  
3750                 if (folio_try_get(folio)) {      
3751                         list_move(&folio->_de    
3752                 } else {                         
3753                         /* We lost race with     
3754                         if (folio_test_partia    
3755                                 __folio_clear    
3756                                 mod_mthp_stat    
3757                                                  
3758                         }                        
3759                         list_del_init(&folio-    
3760                         ds_queue->split_queue    
3761                 }                                
3762                 if (!--sc->nr_to_scan)           
3763                         break;                   
3764         }                                        
3765         spin_unlock_irqrestore(&ds_queue->spl    
3766                                                  
3767         list_for_each_entry_safe(folio, next,    
3768                 bool did_split = false;          
3769                 bool underused = false;          
3770                                                  
3771                 if (!folio_test_partially_map    
3772                         underused = thp_under    
3773                         if (!underused)          
3774                                 goto next;       
3775                 }                                
3776                 if (!folio_trylock(folio))       
3777                         goto next;               
3778                 if (!split_folio(folio)) {       
3779                         did_split = true;        
3780                         if (underused)           
3781                                 count_vm_even    
3782                         split++;                 
3783                 }                                
3784                 folio_unlock(folio);             
3785 next:                                            
3786                 /*                               
3787                  * split_folio() removes foli    
3788                  * Only add back to the queue    
3789                  * If thp_underused returns f    
3790                  * in the case it was underus    
3791                  * don't add it back to split    
3792                  */                              
3793                 if (!did_split && !folio_test    
3794                         list_del_init(&folio-    
3795                         removed++;               
3796                 } else {                         
3797                         /*                       
3798                          * That unlocked list    
3799                          * unless its folio i    
3800                          * left on the list (    
3801                          * by one safe folio     
3802                          */                      
3803                         swap(folio, prev);       
3804                 }                                
3805                 if (folio)                       
3806                         folio_put(folio);        
3807         }                                        
3808                                                  
3809         spin_lock_irqsave(&ds_queue->split_qu    
3810         list_splice_tail(&list, &ds_queue->sp    
3811         ds_queue->split_queue_len -= removed;    
3812         spin_unlock_irqrestore(&ds_queue->spl    
3813                                                  
3814         if (prev)                                
3815                 folio_put(prev);                 
3816                                                  
3817         /*                                       
3818          * Stop shrinker if we didn't split a    
3819          * This can happen if pages were free    
3820          */                                      
3821         if (!split && list_empty(&ds_queue->s    
3822                 return SHRINK_STOP;              
3823         return split;                            
3824 }                                                
3825                                                  
3826 #ifdef CONFIG_DEBUG_FS                           
3827 static void split_huge_pages_all(void)           
3828 {                                                
3829         struct zone *zone;                       
3830         struct page *page;                       
3831         struct folio *folio;                     
3832         unsigned long pfn, max_zone_pfn;         
3833         unsigned long total = 0, split = 0;      
3834                                                  
3835         pr_debug("Split all THPs\n");            
3836         for_each_zone(zone) {                    
3837                 if (!managed_zone(zone))         
3838                         continue;                
3839                 max_zone_pfn = zone_end_pfn(z    
3840                 for (pfn = zone->zone_start_p    
3841                         int nr_pages;            
3842                                                  
3843                         page = pfn_to_online_    
3844                         if (!page || PageTail    
3845                                 continue;        
3846                         folio = page_folio(pa    
3847                         if (!folio_try_get(fo    
3848                                 continue;        
3849                                                  
3850                         if (unlikely(page_fol    
3851                                 goto next;       
3852                                                  
3853                         if (zone != folio_zon    
3854                                 goto next;       
3855                                                  
3856                         if (!folio_test_large    
3857                                 || folio_test    
3858                                 || !folio_tes    
3859                                 goto next;       
3860                                                  
3861                         total++;                 
3862                         folio_lock(folio);       
3863                         nr_pages = folio_nr_p    
3864                         if (!split_folio(foli    
3865                                 split++;         
3866                         pfn += nr_pages - 1;     
3867                         folio_unlock(folio);     
3868 next:                                            
3869                         folio_put(folio);        
3870                         cond_resched();          
3871                 }                                
3872         }                                        
3873                                                  
3874         pr_debug("%lu of %lu THP split\n", sp    
3875 }                                                
3876                                                  
3877 static inline bool vma_not_suitable_for_thp_s    
3878 {                                                
3879         return vma_is_special_huge(vma) || (v    
3880                     is_vm_hugetlb_page(vma);     
3881 }                                                
3882                                                  
3883 static int split_huge_pages_pid(int pid, unsi    
3884                                 unsigned long    
3885 {                                                
3886         int ret = 0;                             
3887         struct task_struct *task;                
3888         struct mm_struct *mm;                    
3889         unsigned long total = 0, split = 0;      
3890         unsigned long addr;                      
3891                                                  
3892         vaddr_start &= PAGE_MASK;                
3893         vaddr_end &= PAGE_MASK;                  
3894                                                  
3895         task = find_get_task_by_vpid(pid);       
3896         if (!task) {                             
3897                 ret = -ESRCH;                    
3898                 goto out;                        
3899         }                                        
3900                                                  
3901         /* Find the mm_struct */                 
3902         mm = get_task_mm(task);                  
3903         put_task_struct(task);                   
3904                                                  
3905         if (!mm) {                               
3906                 ret = -EINVAL;                   
3907                 goto out;                        
3908         }                                        
3909                                                  
3910         pr_debug("Split huge pages in pid: %d    
3911                  pid, vaddr_start, vaddr_end)    
3912                                                  
3913         mmap_read_lock(mm);                      
3914         /*                                       
3915          * always increase addr by PAGE_SIZE,    
3916          * table filled with PTE-mapped THPs,    
3917          */                                      
3918         for (addr = vaddr_start; addr < vaddr    
3919                 struct vm_area_struct *vma =     
3920                 struct folio_walk fw;            
3921                 struct folio *folio;             
3922                 struct address_space *mapping    
3923                 unsigned int target_order = n    
3924                                                  
3925                 if (!vma)                        
3926                         break;                   
3927                                                  
3928                 /* skip special VMA and huget    
3929                 if (vma_not_suitable_for_thp_    
3930                         addr = vma->vm_end;      
3931                         continue;                
3932                 }                                
3933                                                  
3934                 folio = folio_walk_start(&fw,    
3935                 if (!folio)                      
3936                         continue;                
3937                                                  
3938                 if (!is_transparent_hugepage(    
3939                         goto next;               
3940                                                  
3941                 if (!folio_test_anon(folio))     
3942                         mapping = folio->mapp    
3943                         target_order = max(ne    
3944                                            ma    
3945                 }                                
3946                                                  
3947                 if (target_order >= folio_ord    
3948                         goto next;               
3949                                                  
3950                 total++;                         
3951                 /*                               
3952                  * For folios with private, s    
3953                  * will try to drop it before    
3954                  * can be split or not. So sk    
3955                  */                              
3956                 if (!folio_test_private(folio    
3957                     !can_split_folio(folio, 0    
3958                         goto next;               
3959                                                  
3960                 if (!folio_trylock(folio))       
3961                         goto next;               
3962                 folio_get(folio);                
3963                 folio_walk_end(&fw, vma);        
3964                                                  
3965                 if (!folio_test_anon(folio) &    
3966                         goto unlock;             
3967                                                  
3968                 if (!split_folio_to_order(fol    
3969                         split++;                 
3970                                                  
3971 unlock:                                          
3972                                                  
3973                 folio_unlock(folio);             
3974                 folio_put(folio);                
3975                                                  
3976                 cond_resched();                  
3977                 continue;                        
3978 next:                                            
3979                 folio_walk_end(&fw, vma);        
3980                 cond_resched();                  
3981         }                                        
3982         mmap_read_unlock(mm);                    
3983         mmput(mm);                               
3984                                                  
3985         pr_debug("%lu of %lu THP split\n", sp    
3986                                                  
3987 out:                                             
3988         return ret;                              
3989 }                                                
3990                                                  
3991 static int split_huge_pages_in_file(const cha    
3992                                 pgoff_t off_e    
3993 {                                                
3994         struct filename *file;                   
3995         struct file *candidate;                  
3996         struct address_space *mapping;           
3997         int ret = -EINVAL;                       
3998         pgoff_t index;                           
3999         int nr_pages = 1;                        
4000         unsigned long total = 0, split = 0;      
4001         unsigned int min_order;                  
4002         unsigned int target_order;               
4003                                                  
4004         file = getname_kernel(file_path);        
4005         if (IS_ERR(file))                        
4006                 return ret;                      
4007                                                  
4008         candidate = file_open_name(file, O_RD    
4009         if (IS_ERR(candidate))                   
4010                 goto out;                        
4011                                                  
4012         pr_debug("split file-backed THPs in f    
4013                  file_path, off_start, off_en    
4014                                                  
4015         mapping = candidate->f_mapping;          
4016         min_order = mapping_min_folio_order(m    
4017         target_order = max(new_order, min_ord    
4018                                                  
4019         for (index = off_start; index < off_e    
4020                 struct folio *folio = filemap    
4021                                                  
4022                 nr_pages = 1;                    
4023                 if (IS_ERR(folio))               
4024                         continue;                
4025                                                  
4026                 if (!folio_test_large(folio))    
4027                         goto next;               
4028                                                  
4029                 total++;                         
4030                 nr_pages = folio_nr_pages(fol    
4031                                                  
4032                 if (target_order >= folio_ord    
4033                         goto next;               
4034                                                  
4035                 if (!folio_trylock(folio))       
4036                         goto next;               
4037                                                  
4038                 if (folio->mapping != mapping    
4039                         goto unlock;             
4040                                                  
4041                 if (!split_folio_to_order(fol    
4042                         split++;                 
4043                                                  
4044 unlock:                                          
4045                 folio_unlock(folio);             
4046 next:                                            
4047                 folio_put(folio);                
4048                 cond_resched();                  
4049         }                                        
4050                                                  
4051         filp_close(candidate, NULL);             
4052         ret = 0;                                 
4053                                                  
4054         pr_debug("%lu of %lu file-backed THP     
4055 out:                                             
4056         putname(file);                           
4057         return ret;                              
4058 }                                                
4059                                                  
4060 #define MAX_INPUT_BUF_SZ 255                     
4061                                                  
4062 static ssize_t split_huge_pages_write(struct     
4063                                 size_t count,    
4064 {                                                
4065         static DEFINE_MUTEX(split_debug_mutex    
4066         ssize_t ret;                             
4067         /*                                       
4068          * hold pid, start_vaddr, end_vaddr,     
4069          * file_path, off_start, off_end, new    
4070          */                                      
4071         char input_buf[MAX_INPUT_BUF_SZ];        
4072         int pid;                                 
4073         unsigned long vaddr_start, vaddr_end;    
4074         unsigned int new_order = 0;              
4075                                                  
4076         ret = mutex_lock_interruptible(&split    
4077         if (ret)                                 
4078                 return ret;                      
4079                                                  
4080         ret = -EFAULT;                           
4081                                                  
4082         memset(input_buf, 0, MAX_INPUT_BUF_SZ    
4083         if (copy_from_user(input_buf, buf, mi    
4084                 goto out;                        
4085                                                  
4086         input_buf[MAX_INPUT_BUF_SZ - 1] = '\0    
4087                                                  
4088         if (input_buf[0] == '/') {               
4089                 char *tok;                       
4090                 char *buf = input_buf;           
4091                 char file_path[MAX_INPUT_BUF_    
4092                 pgoff_t off_start = 0, off_en    
4093                 size_t input_len = strlen(inp    
4094                                                  
4095                 tok = strsep(&buf, ",");         
4096                 if (tok) {                       
4097                         strcpy(file_path, tok    
4098                 } else {                         
4099                         ret = -EINVAL;           
4100                         goto out;                
4101                 }                                
4102                                                  
4103                 ret = sscanf(buf, "0x%lx,0x%l    
4104                 if (ret != 2 && ret != 3) {      
4105                         ret = -EINVAL;           
4106                         goto out;                
4107                 }                                
4108                 ret = split_huge_pages_in_fil    
4109                 if (!ret)                        
4110                         ret = input_len;         
4111                                                  
4112                 goto out;                        
4113         }                                        
4114                                                  
4115         ret = sscanf(input_buf, "%d,0x%lx,0x%    
4116         if (ret == 1 && pid == 1) {              
4117                 split_huge_pages_all();          
4118                 ret = strlen(input_buf);         
4119                 goto out;                        
4120         } else if (ret != 3 && ret != 4) {       
4121                 ret = -EINVAL;                   
4122                 goto out;                        
4123         }                                        
4124                                                  
4125         ret = split_huge_pages_pid(pid, vaddr    
4126         if (!ret)                                
4127                 ret = strlen(input_buf);         
4128 out:                                             
4129         mutex_unlock(&split_debug_mutex);        
4130         return ret;                              
4131                                                  
4132 }                                                
4133                                                  
4134 static const struct file_operations split_hug    
4135         .owner   = THIS_MODULE,                  
4136         .write   = split_huge_pages_write,       
4137 };                                               
4138                                                  
4139 static int __init split_huge_pages_debugfs(vo    
4140 {                                                
4141         debugfs_create_file("split_huge_pages    
4142                             &split_huge_pages    
4143         return 0;                                
4144 }                                                
4145 late_initcall(split_huge_pages_debugfs);         
4146 #endif                                           
4147                                                  
4148 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          
4149 int set_pmd_migration_entry(struct page_vma_m    
4150                 struct page *page)               
4151 {                                                
4152         struct folio *folio = page_folio(page    
4153         struct vm_area_struct *vma = pvmw->vm    
4154         struct mm_struct *mm = vma->vm_mm;       
4155         unsigned long address = pvmw->address    
4156         bool anon_exclusive;                     
4157         pmd_t pmdval;                            
4158         swp_entry_t entry;                       
4159         pmd_t pmdswp;                            
4160                                                  
4161         if (!(pvmw->pmd && !pvmw->pte))          
4162                 return 0;                        
4163                                                  
4164         flush_cache_range(vma, address, addre    
4165         pmdval = pmdp_invalidate(vma, address    
4166                                                  
4167         /* See folio_try_share_anon_rmap_pmd(    
4168         anon_exclusive = folio_test_anon(foli    
4169         if (anon_exclusive && folio_try_share    
4170                 set_pmd_at(mm, address, pvmw-    
4171                 return -EBUSY;                   
4172         }                                        
4173                                                  
4174         if (pmd_dirty(pmdval))                   
4175                 folio_mark_dirty(folio);         
4176         if (pmd_write(pmdval))                   
4177                 entry = make_writable_migrati    
4178         else if (anon_exclusive)                 
4179                 entry = make_readable_exclusi    
4180         else                                     
4181                 entry = make_readable_migrati    
4182         if (pmd_young(pmdval))                   
4183                 entry = make_migration_entry_    
4184         if (pmd_dirty(pmdval))                   
4185                 entry = make_migration_entry_    
4186         pmdswp = swp_entry_to_pmd(entry);        
4187         if (pmd_soft_dirty(pmdval))              
4188                 pmdswp = pmd_swp_mksoft_dirty    
4189         if (pmd_uffd_wp(pmdval))                 
4190                 pmdswp = pmd_swp_mkuffd_wp(pm    
4191         set_pmd_at(mm, address, pvmw->pmd, pm    
4192         folio_remove_rmap_pmd(folio, page, vm    
4193         folio_put(folio);                        
4194         trace_set_migration_pmd(address, pmd_    
4195                                                  
4196         return 0;                                
4197 }                                                
4198                                                  
4199 void remove_migration_pmd(struct page_vma_map    
4200 {                                                
4201         struct folio *folio = page_folio(new)    
4202         struct vm_area_struct *vma = pvmw->vm    
4203         struct mm_struct *mm = vma->vm_mm;       
4204         unsigned long address = pvmw->address    
4205         unsigned long haddr = address & HPAGE    
4206         pmd_t pmde;                              
4207         swp_entry_t entry;                       
4208                                                  
4209         if (!(pvmw->pmd && !pvmw->pte))          
4210                 return;                          
4211                                                  
4212         entry = pmd_to_swp_entry(*pvmw->pmd);    
4213         folio_get(folio);                        
4214         pmde = mk_huge_pmd(new, READ_ONCE(vma    
4215         if (pmd_swp_soft_dirty(*pvmw->pmd))      
4216                 pmde = pmd_mksoft_dirty(pmde)    
4217         if (is_writable_migration_entry(entry    
4218                 pmde = pmd_mkwrite(pmde, vma)    
4219         if (pmd_swp_uffd_wp(*pvmw->pmd))         
4220                 pmde = pmd_mkuffd_wp(pmde);      
4221         if (!is_migration_entry_young(entry))    
4222                 pmde = pmd_mkold(pmde);          
4223         /* NOTE: this may contain setting sof    
4224         if (folio_test_dirty(folio) && is_mig    
4225                 pmde = pmd_mkdirty(pmde);        
4226                                                  
4227         if (folio_test_anon(folio)) {            
4228                 rmap_t rmap_flags = RMAP_NONE    
4229                                                  
4230                 if (!is_readable_migration_en    
4231                         rmap_flags |= RMAP_EX    
4232                                                  
4233                 folio_add_anon_rmap_pmd(folio    
4234         } else {                                 
4235                 folio_add_file_rmap_pmd(folio    
4236         }                                        
4237         VM_BUG_ON(pmd_write(pmde) && folio_te    
4238         set_pmd_at(mm, haddr, pvmw->pmd, pmde    
4239                                                  
4240         /* No need to invalidate - it was non    
4241         update_mmu_cache_pmd(vma, address, pv    
4242         trace_remove_migration_pmd(address, p    
4243 }                                                
4244 #endif                                           
4245
~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.
TOMOYO Linux Cross Reference Linux/mm/huge_memory.c

Diff markup

Differences between /mm/huge_memory.c (Version linux-6.12-rc7) and /mm/huge_memory.c (Version policy-sample)

TOMOYO Linux Cross Reference
Linux/mm/huge_memory.c