TOMOYO Linux Cross Reference
Linux/mm/huge_memory.c

Diff markup

Differences between /mm/huge_memory.c (Version linux-6.12-rc7) and /mm/huge_memory.c (Version linux-4.16.18)


  1 // SPDX-License-Identifier: GPL-2.0-only       << 
  2 /*                                                  1 /*
  3  *  Copyright (C) 2009  Red Hat, Inc.               2  *  Copyright (C) 2009  Red Hat, Inc.
                                                   >>   3  *
                                                   >>   4  *  This work is licensed under the terms of the GNU GPL, version 2. See
                                                   >>   5  *  the COPYING file in the top-level directory.
  4  */                                                 6  */
  5                                                     7 
  6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt         8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  7                                                     9 
  8 #include <linux/mm.h>                              10 #include <linux/mm.h>
  9 #include <linux/sched.h>                           11 #include <linux/sched.h>
 10 #include <linux/sched/mm.h>                    << 
 11 #include <linux/sched/coredump.h>                  12 #include <linux/sched/coredump.h>
 12 #include <linux/sched/numa_balancing.h>            13 #include <linux/sched/numa_balancing.h>
 13 #include <linux/highmem.h>                         14 #include <linux/highmem.h>
 14 #include <linux/hugetlb.h>                         15 #include <linux/hugetlb.h>
 15 #include <linux/mmu_notifier.h>                    16 #include <linux/mmu_notifier.h>
 16 #include <linux/rmap.h>                            17 #include <linux/rmap.h>
 17 #include <linux/swap.h>                            18 #include <linux/swap.h>
 18 #include <linux/shrinker.h>                        19 #include <linux/shrinker.h>
 19 #include <linux/mm_inline.h>                       20 #include <linux/mm_inline.h>
 20 #include <linux/swapops.h>                         21 #include <linux/swapops.h>
 21 #include <linux/backing-dev.h>                 << 
 22 #include <linux/dax.h>                             22 #include <linux/dax.h>
 23 #include <linux/mm_types.h>                    << 
 24 #include <linux/khugepaged.h>                      23 #include <linux/khugepaged.h>
 25 #include <linux/freezer.h>                         24 #include <linux/freezer.h>
 26 #include <linux/pfn_t.h>                           25 #include <linux/pfn_t.h>
 27 #include <linux/mman.h>                            26 #include <linux/mman.h>
 28 #include <linux/memremap.h>                        27 #include <linux/memremap.h>
 29 #include <linux/pagemap.h>                         28 #include <linux/pagemap.h>
 30 #include <linux/debugfs.h>                         29 #include <linux/debugfs.h>
 31 #include <linux/migrate.h>                         30 #include <linux/migrate.h>
 32 #include <linux/hashtable.h>                       31 #include <linux/hashtable.h>
 33 #include <linux/userfaultfd_k.h>                   32 #include <linux/userfaultfd_k.h>
 34 #include <linux/page_idle.h>                       33 #include <linux/page_idle.h>
 35 #include <linux/shmem_fs.h>                        34 #include <linux/shmem_fs.h>
 36 #include <linux/oom.h>                             35 #include <linux/oom.h>
 37 #include <linux/numa.h>                        << 
 38 #include <linux/page_owner.h>                  << 
 39 #include <linux/sched/sysctl.h>                << 
 40 #include <linux/memory-tiers.h>                << 
 41 #include <linux/compat.h>                      << 
 42 #include <linux/pgalloc_tag.h>                 << 
 43 #include <linux/pagewalk.h>                    << 
 44                                                    36 
 45 #include <asm/tlb.h>                               37 #include <asm/tlb.h>
 46 #include <asm/pgalloc.h>                           38 #include <asm/pgalloc.h>
 47 #include "internal.h"                              39 #include "internal.h"
 48 #include "swap.h"                              << 
 49                                                << 
 50 #define CREATE_TRACE_POINTS                    << 
 51 #include <trace/events/thp.h>                  << 
 52                                                    40 
 53 /*                                                 41 /*
 54  * By default, transparent hugepage support is     42  * By default, transparent hugepage support is disabled in order to avoid
 55  * risking an increased memory footprint for a     43  * risking an increased memory footprint for applications that are not
 56  * guaranteed to benefit from it. When transpa     44  * guaranteed to benefit from it. When transparent hugepage support is
 57  * enabled, it is for all mappings, and khugep     45  * enabled, it is for all mappings, and khugepaged scans all mappings.
 58  * Defrag is invoked by khugepaged hugepage al     46  * Defrag is invoked by khugepaged hugepage allocations and by page faults
 59  * for all hugepage allocations.                   47  * for all hugepage allocations.
 60  */                                                48  */
 61 unsigned long transparent_hugepage_flags __rea     49 unsigned long transparent_hugepage_flags __read_mostly =
 62 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS          50 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
 63         (1<<TRANSPARENT_HUGEPAGE_FLAG)|            51         (1<<TRANSPARENT_HUGEPAGE_FLAG)|
 64 #endif                                             52 #endif
 65 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE         53 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
 66         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG     54         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 67 #endif                                             55 #endif
 68         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MA     56         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
 69         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEP     57         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
 70         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE     58         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 71                                                    59 
 72 static struct shrinker *deferred_split_shrinke !!  60 static struct shrinker deferred_split_shrinker;
 73 static unsigned long deferred_split_count(stru << 
 74                                           stru << 
 75 static unsigned long deferred_split_scan(struc << 
 76                                          struc << 
 77 static bool split_underused_thp = true;        << 
 78                                                    61 
 79 static atomic_t huge_zero_refcount;                62 static atomic_t huge_zero_refcount;
 80 struct folio *huge_zero_folio __read_mostly;   !!  63 struct page *huge_zero_page __read_mostly;
 81 unsigned long huge_zero_pfn __read_mostly = ~0 << 
 82 unsigned long huge_anon_orders_always __read_m << 
 83 unsigned long huge_anon_orders_madvise __read_ << 
 84 unsigned long huge_anon_orders_inherit __read_ << 
 85 static bool anon_orders_configured __initdata; << 
 86                                                << 
 87 unsigned long __thp_vma_allowable_orders(struc << 
 88                                          unsig << 
 89                                          unsig << 
 90                                          unsig << 
 91 {                                              << 
 92         bool smaps = tva_flags & TVA_SMAPS;    << 
 93         bool in_pf = tva_flags & TVA_IN_PF;    << 
 94         bool enforce_sysfs = tva_flags & TVA_E << 
 95         unsigned long supported_orders;        << 
 96                                                << 
 97         /* Check the intersection of requested << 
 98         if (vma_is_anonymous(vma))             << 
 99                 supported_orders = THP_ORDERS_ << 
100         else if (vma_is_special_huge(vma))     << 
101                 supported_orders = THP_ORDERS_ << 
102         else                                   << 
103                 supported_orders = THP_ORDERS_ << 
104                                                << 
105         orders &= supported_orders;            << 
106         if (!orders)                           << 
107                 return 0;                      << 
108                                                << 
109         if (!vma->vm_mm)                /* vds << 
110                 return 0;                      << 
111                                                << 
112         if (thp_disabled_by_hw() || vma_thp_di << 
113                 return 0;                      << 
114                                                << 
115         /* khugepaged doesn't collapse DAX vma << 
116         if (vma_is_dax(vma))                   << 
117                 return in_pf ? orders : 0;     << 
118                                                << 
119         /*                                     << 
120          * khugepaged special VMA and hugetlb  << 
121          * Must be checked after dax since som << 
122          * VM_MIXEDMAP set.                    << 
123          */                                    << 
124         if (!in_pf && !smaps && (vm_flags & VM << 
125                 return 0;                      << 
126                                                << 
127         /*                                     << 
128          * Check alignment for file vma and si << 
129          * filtering out the unsuitable orders << 
130          *                                     << 
131          * Skip the check for page fault. Huge << 
132          * handlers.                           << 
133          */                                    << 
134         if (!in_pf) {                          << 
135                 int order = highest_order(orde << 
136                 unsigned long addr;            << 
137                                                << 
138                 while (orders) {               << 
139                         addr = vma->vm_end - ( << 
140                         if (thp_vma_suitable_o << 
141                                 break;         << 
142                         order = next_order(&or << 
143                 }                              << 
144                                                << 
145                 if (!orders)                   << 
146                         return 0;              << 
147         }                                      << 
148                                                << 
149         /*                                     << 
150          * Enabled via shmem mount options or  << 
151          * Must be done before hugepage flags  << 
152          * own flags.                          << 
153          */                                    << 
154         if (!in_pf && shmem_file(vma->vm_file) << 
155                 return shmem_allowable_huge_or << 
156                                                << 
157                                                << 
158                                                << 
159         if (!vma_is_anonymous(vma)) {          << 
160                 /*                             << 
161                  * Enforce sysfs THP requireme << 
162                  * were already handled in thp << 
163                  */                            << 
164                 if (enforce_sysfs &&           << 
165                     (!hugepage_global_enabled( << 
166                                                << 
167                         return 0;              << 
168                                                << 
169                 /*                             << 
170                  * Trust that ->huge_fault() h << 
171                  * in fault path.              << 
172                  */                            << 
173                 if (((in_pf || smaps)) && vma- << 
174                         return orders;         << 
175                 /* Only regular file is valid  << 
176                 if (((!in_pf || smaps)) && fil << 
177                         return orders;         << 
178                 return 0;                      << 
179         }                                      << 
180                                                << 
181         if (vma_is_temporary_stack(vma))       << 
182                 return 0;                      << 
183                                                << 
184         /*                                     << 
185          * THPeligible bit of smaps should sho << 
186          * though anon_vma is not initialized  << 
187          *                                     << 
188          * Allow page fault since anon_vma may << 
189          * the first page fault.               << 
190          */                                    << 
191         if (!vma->anon_vma)                    << 
192                 return (smaps || in_pf) ? orde << 
193                                                << 
194         return orders;                         << 
195 }                                              << 
196                                                    64 
197 static bool get_huge_zero_page(void)           !!  65 static struct page *get_huge_zero_page(void)
198 {                                                  66 {
199         struct folio *zero_folio;              !!  67         struct page *zero_page;
200 retry:                                             68 retry:
201         if (likely(atomic_inc_not_zero(&huge_z     69         if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
202                 return true;                   !!  70                 return READ_ONCE(huge_zero_page);
203                                                    71 
204         zero_folio = folio_alloc((GFP_TRANSHUG !!  72         zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
205                         HPAGE_PMD_ORDER);          73                         HPAGE_PMD_ORDER);
206         if (!zero_folio) {                     !!  74         if (!zero_page) {
207                 count_vm_event(THP_ZERO_PAGE_A     75                 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
208                 return false;                  !!  76                 return NULL;
209         }                                          77         }
210         /* Ensure zero folio won't have large_ !!  78         count_vm_event(THP_ZERO_PAGE_ALLOC);
211         folio_clear_large_rmappable(zero_folio << 
212         preempt_disable();                         79         preempt_disable();
213         if (cmpxchg(&huge_zero_folio, NULL, ze !!  80         if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
214                 preempt_enable();                  81                 preempt_enable();
215                 folio_put(zero_folio);         !!  82                 __free_pages(zero_page, compound_order(zero_page));
216                 goto retry;                        83                 goto retry;
217         }                                          84         }
218         WRITE_ONCE(huge_zero_pfn, folio_pfn(ze << 
219                                                    85 
220         /* We take additional reference here.      86         /* We take additional reference here. It will be put back by shrinker */
221         atomic_set(&huge_zero_refcount, 2);        87         atomic_set(&huge_zero_refcount, 2);
222         preempt_enable();                          88         preempt_enable();
223         count_vm_event(THP_ZERO_PAGE_ALLOC);   !!  89         return READ_ONCE(huge_zero_page);
224         return true;                           << 
225 }                                                  90 }
226                                                    91 
227 static void put_huge_zero_page(void)               92 static void put_huge_zero_page(void)
228 {                                                  93 {
229         /*                                         94         /*
230          * Counter should never go to zero her     95          * Counter should never go to zero here. Only shrinker can put
231          * last reference.                         96          * last reference.
232          */                                        97          */
233         BUG_ON(atomic_dec_and_test(&huge_zero_     98         BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
234 }                                                  99 }
235                                                   100 
236 struct folio *mm_get_huge_zero_folio(struct mm !! 101 struct page *mm_get_huge_zero_page(struct mm_struct *mm)
237 {                                                 102 {
238         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->    103         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
239                 return READ_ONCE(huge_zero_fol !! 104                 return READ_ONCE(huge_zero_page);
240                                                   105 
241         if (!get_huge_zero_page())                106         if (!get_huge_zero_page())
242                 return NULL;                      107                 return NULL;
243                                                   108 
244         if (test_and_set_bit(MMF_HUGE_ZERO_PAG    109         if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
245                 put_huge_zero_page();             110                 put_huge_zero_page();
246                                                   111 
247         return READ_ONCE(huge_zero_folio);     !! 112         return READ_ONCE(huge_zero_page);
248 }                                                 113 }
249                                                   114 
250 void mm_put_huge_zero_folio(struct mm_struct * !! 115 void mm_put_huge_zero_page(struct mm_struct *mm)
251 {                                                 116 {
252         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->    117         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
253                 put_huge_zero_page();             118                 put_huge_zero_page();
254 }                                                 119 }
255                                                   120 
256 static unsigned long shrink_huge_zero_page_cou    121 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
257                                         struct    122                                         struct shrink_control *sc)
258 {                                                 123 {
259         /* we can free zero page only if last     124         /* we can free zero page only if last reference remains */
260         return atomic_read(&huge_zero_refcount    125         return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
261 }                                                 126 }
262                                                   127 
263 static unsigned long shrink_huge_zero_page_sca    128 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
264                                        struct     129                                        struct shrink_control *sc)
265 {                                                 130 {
266         if (atomic_cmpxchg(&huge_zero_refcount    131         if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
267                 struct folio *zero_folio = xch !! 132                 struct page *zero_page = xchg(&huge_zero_page, NULL);
268                 BUG_ON(zero_folio == NULL);    !! 133                 BUG_ON(zero_page == NULL);
269                 WRITE_ONCE(huge_zero_pfn, ~0UL !! 134                 __free_pages(zero_page, compound_order(zero_page));
270                 folio_put(zero_folio);         << 
271                 return HPAGE_PMD_NR;              135                 return HPAGE_PMD_NR;
272         }                                         136         }
273                                                   137 
274         return 0;                                 138         return 0;
275 }                                                 139 }
276                                                   140 
277 static struct shrinker *huge_zero_page_shrinke !! 141 static struct shrinker huge_zero_page_shrinker = {
                                                   >> 142         .count_objects = shrink_huge_zero_page_count,
                                                   >> 143         .scan_objects = shrink_huge_zero_page_scan,
                                                   >> 144         .seeks = DEFAULT_SEEKS,
                                                   >> 145 };
278                                                   146 
279 #ifdef CONFIG_SYSFS                               147 #ifdef CONFIG_SYSFS
280 static ssize_t enabled_show(struct kobject *ko    148 static ssize_t enabled_show(struct kobject *kobj,
281                             struct kobj_attrib    149                             struct kobj_attribute *attr, char *buf)
282 {                                                 150 {
283         const char *output;                    << 
284                                                << 
285         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG    151         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
286                 output = "[always] madvise nev !! 152                 return sprintf(buf, "[always] madvise never\n");
287         else if (test_bit(TRANSPARENT_HUGEPAGE !! 153         else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
288                           &transparent_hugepag !! 154                 return sprintf(buf, "always [madvise] never\n");
289                 output = "always [madvise] nev << 
290         else                                      155         else
291                 output = "always madvise [neve !! 156                 return sprintf(buf, "always madvise [never]\n");
292                                                << 
293         return sysfs_emit(buf, "%s\n", output) << 
294 }                                                 157 }
295                                                   158 
296 static ssize_t enabled_store(struct kobject *k    159 static ssize_t enabled_store(struct kobject *kobj,
297                              struct kobj_attri    160                              struct kobj_attribute *attr,
298                              const char *buf,     161                              const char *buf, size_t count)
299 {                                                 162 {
300         ssize_t ret = count;                      163         ssize_t ret = count;
301                                                   164 
302         if (sysfs_streq(buf, "always")) {      !! 165         if (!memcmp("always", buf,
                                                   >> 166                     min(sizeof("always")-1, count))) {
303                 clear_bit(TRANSPARENT_HUGEPAGE    167                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
304                 set_bit(TRANSPARENT_HUGEPAGE_F    168                 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
305         } else if (sysfs_streq(buf, "madvise") !! 169         } else if (!memcmp("madvise", buf,
                                                   >> 170                            min(sizeof("madvise")-1, count))) {
306                 clear_bit(TRANSPARENT_HUGEPAGE    171                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
307                 set_bit(TRANSPARENT_HUGEPAGE_R    172                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
308         } else if (sysfs_streq(buf, "never"))  !! 173         } else if (!memcmp("never", buf,
                                                   >> 174                            min(sizeof("never")-1, count))) {
309                 clear_bit(TRANSPARENT_HUGEPAGE    175                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
310                 clear_bit(TRANSPARENT_HUGEPAGE    176                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
311         } else                                    177         } else
312                 ret = -EINVAL;                    178                 ret = -EINVAL;
313                                                   179 
314         if (ret > 0) {                            180         if (ret > 0) {
315                 int err = start_stop_khugepage    181                 int err = start_stop_khugepaged();
316                 if (err)                          182                 if (err)
317                         ret = err;                183                         ret = err;
318         }                                         184         }
319         return ret;                               185         return ret;
320 }                                                 186 }
321                                                !! 187 static struct kobj_attribute enabled_attr =
322 static struct kobj_attribute enabled_attr = __ !! 188         __ATTR(enabled, 0644, enabled_show, enabled_store);
323                                                   189 
324 ssize_t single_hugepage_flag_show(struct kobje    190 ssize_t single_hugepage_flag_show(struct kobject *kobj,
325                                   struct kobj_ !! 191                                 struct kobj_attribute *attr, char *buf,
326                                   enum transpa !! 192                                 enum transparent_hugepage_flag flag)
327 {                                                 193 {
328         return sysfs_emit(buf, "%d\n",         !! 194         return sprintf(buf, "%d\n",
329                           !!test_bit(flag, &tr !! 195                        !!test_bit(flag, &transparent_hugepage_flags));
330 }                                                 196 }
331                                                   197 
332 ssize_t single_hugepage_flag_store(struct kobj    198 ssize_t single_hugepage_flag_store(struct kobject *kobj,
333                                  struct kobj_a    199                                  struct kobj_attribute *attr,
334                                  const char *b    200                                  const char *buf, size_t count,
335                                  enum transpar    201                                  enum transparent_hugepage_flag flag)
336 {                                                 202 {
337         unsigned long value;                      203         unsigned long value;
338         int ret;                                  204         int ret;
339                                                   205 
340         ret = kstrtoul(buf, 10, &value);          206         ret = kstrtoul(buf, 10, &value);
341         if (ret < 0)                              207         if (ret < 0)
342                 return ret;                       208                 return ret;
343         if (value > 1)                            209         if (value > 1)
344                 return -EINVAL;                   210                 return -EINVAL;
345                                                   211 
346         if (value)                                212         if (value)
347                 set_bit(flag, &transparent_hug    213                 set_bit(flag, &transparent_hugepage_flags);
348         else                                      214         else
349                 clear_bit(flag, &transparent_h    215                 clear_bit(flag, &transparent_hugepage_flags);
350                                                   216 
351         return count;                             217         return count;
352 }                                                 218 }
353                                                   219 
354 static ssize_t defrag_show(struct kobject *kob    220 static ssize_t defrag_show(struct kobject *kobj,
355                            struct kobj_attribu    221                            struct kobj_attribute *attr, char *buf)
356 {                                                 222 {
357         const char *output;                    !! 223         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
358                                                !! 224                 return sprintf(buf, "[always] defer defer+madvise madvise never\n");
359         if (test_bit(TRANSPARENT_HUGEPAGE_DEFR !! 225         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
360                      &transparent_hugepage_fla !! 226                 return sprintf(buf, "always [defer] defer+madvise madvise never\n");
361                 output = "[always] defer defer !! 227         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
362         else if (test_bit(TRANSPARENT_HUGEPAGE !! 228                 return sprintf(buf, "always defer [defer+madvise] madvise never\n");
363                           &transparent_hugepag !! 229         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
364                 output = "always [defer] defer !! 230                 return sprintf(buf, "always defer defer+madvise [madvise] never\n");
365         else if (test_bit(TRANSPARENT_HUGEPAGE !! 231         return sprintf(buf, "always defer defer+madvise madvise [never]\n");
366                           &transparent_hugepag << 
367                 output = "always defer [defer+ << 
368         else if (test_bit(TRANSPARENT_HUGEPAGE << 
369                           &transparent_hugepag << 
370                 output = "always defer defer+m << 
371         else                                   << 
372                 output = "always defer defer+m << 
373                                                << 
374         return sysfs_emit(buf, "%s\n", output) << 
375 }                                                 232 }
376                                                   233 
377 static ssize_t defrag_store(struct kobject *ko    234 static ssize_t defrag_store(struct kobject *kobj,
378                             struct kobj_attrib    235                             struct kobj_attribute *attr,
379                             const char *buf, s    236                             const char *buf, size_t count)
380 {                                                 237 {
381         if (sysfs_streq(buf, "always")) {      !! 238         if (!memcmp("always", buf,
                                                   >> 239                     min(sizeof("always")-1, count))) {
382                 clear_bit(TRANSPARENT_HUGEPAGE    240                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
383                 clear_bit(TRANSPARENT_HUGEPAGE    241                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
384                 clear_bit(TRANSPARENT_HUGEPAGE    242                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
385                 set_bit(TRANSPARENT_HUGEPAGE_D    243                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
386         } else if (sysfs_streq(buf, "defer+mad !! 244         } else if (!memcmp("defer+madvise", buf,
                                                   >> 245                     min(sizeof("defer+madvise")-1, count))) {
387                 clear_bit(TRANSPARENT_HUGEPAGE    246                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
388                 clear_bit(TRANSPARENT_HUGEPAGE    247                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
389                 clear_bit(TRANSPARENT_HUGEPAGE    248                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
390                 set_bit(TRANSPARENT_HUGEPAGE_D    249                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
391         } else if (sysfs_streq(buf, "defer"))  !! 250         } else if (!memcmp("defer", buf,
                                                   >> 251                     min(sizeof("defer")-1, count))) {
392                 clear_bit(TRANSPARENT_HUGEPAGE    252                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
393                 clear_bit(TRANSPARENT_HUGEPAGE    253                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
394                 clear_bit(TRANSPARENT_HUGEPAGE    254                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
395                 set_bit(TRANSPARENT_HUGEPAGE_D    255                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
396         } else if (sysfs_streq(buf, "madvise") !! 256         } else if (!memcmp("madvise", buf,
                                                   >> 257                            min(sizeof("madvise")-1, count))) {
397                 clear_bit(TRANSPARENT_HUGEPAGE    258                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
398                 clear_bit(TRANSPARENT_HUGEPAGE    259                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
399                 clear_bit(TRANSPARENT_HUGEPAGE    260                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
400                 set_bit(TRANSPARENT_HUGEPAGE_D    261                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
401         } else if (sysfs_streq(buf, "never"))  !! 262         } else if (!memcmp("never", buf,
                                                   >> 263                            min(sizeof("never")-1, count))) {
402                 clear_bit(TRANSPARENT_HUGEPAGE    264                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
403                 clear_bit(TRANSPARENT_HUGEPAGE    265                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
404                 clear_bit(TRANSPARENT_HUGEPAGE    266                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
405                 clear_bit(TRANSPARENT_HUGEPAGE    267                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
406         } else                                    268         } else
407                 return -EINVAL;                   269                 return -EINVAL;
408                                                   270 
409         return count;                             271         return count;
410 }                                                 272 }
411 static struct kobj_attribute defrag_attr = __A !! 273 static struct kobj_attribute defrag_attr =
                                                   >> 274         __ATTR(defrag, 0644, defrag_show, defrag_store);
412                                                   275 
413 static ssize_t use_zero_page_show(struct kobje    276 static ssize_t use_zero_page_show(struct kobject *kobj,
414                                   struct kobj_ !! 277                 struct kobj_attribute *attr, char *buf)
415 {                                                 278 {
416         return single_hugepage_flag_show(kobj,    279         return single_hugepage_flag_show(kobj, attr, buf,
417                                          TRANS !! 280                                 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
418 }                                                 281 }
419 static ssize_t use_zero_page_store(struct kobj    282 static ssize_t use_zero_page_store(struct kobject *kobj,
420                 struct kobj_attribute *attr, c    283                 struct kobj_attribute *attr, const char *buf, size_t count)
421 {                                                 284 {
422         return single_hugepage_flag_store(kobj    285         return single_hugepage_flag_store(kobj, attr, buf, count,
423                                  TRANSPARENT_H    286                                  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
424 }                                                 287 }
425 static struct kobj_attribute use_zero_page_att !! 288 static struct kobj_attribute use_zero_page_attr =
                                                   >> 289         __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
426                                                   290 
427 static ssize_t hpage_pmd_size_show(struct kobj    291 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
428                                    struct kobj !! 292                 struct kobj_attribute *attr, char *buf)
429 {                                                 293 {
430         return sysfs_emit(buf, "%lu\n", HPAGE_ !! 294         return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
431 }                                                 295 }
432 static struct kobj_attribute hpage_pmd_size_at    296 static struct kobj_attribute hpage_pmd_size_attr =
433         __ATTR_RO(hpage_pmd_size);                297         __ATTR_RO(hpage_pmd_size);
434                                                   298 
435 static ssize_t split_underused_thp_show(struct !! 299 #ifdef CONFIG_DEBUG_VM
436                             struct kobj_attrib !! 300 static ssize_t debug_cow_show(struct kobject *kobj,
                                                   >> 301                                 struct kobj_attribute *attr, char *buf)
437 {                                                 302 {
438         return sysfs_emit(buf, "%d\n", split_u !! 303         return single_hugepage_flag_show(kobj, attr, buf,
                                                   >> 304                                 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
439 }                                                 305 }
440                                                !! 306 static ssize_t debug_cow_store(struct kobject *kobj,
441 static ssize_t split_underused_thp_store(struc !! 307                                struct kobj_attribute *attr,
442                              struct kobj_attri !! 308                                const char *buf, size_t count)
443                              const char *buf,  << 
444 {                                                 309 {
445         int err = kstrtobool(buf, &split_under !! 310         return single_hugepage_flag_store(kobj, attr, buf, count,
446                                                !! 311                                  TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
447         if (err < 0)                           << 
448                 return err;                    << 
449                                                << 
450         return count;                          << 
451 }                                                 312 }
452                                                !! 313 static struct kobj_attribute debug_cow_attr =
453 static struct kobj_attribute split_underused_t !! 314         __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
454         shrink_underused, 0644, split_underuse !! 315 #endif /* CONFIG_DEBUG_VM */
455                                                   316 
456 static struct attribute *hugepage_attr[] = {      317 static struct attribute *hugepage_attr[] = {
457         &enabled_attr.attr,                       318         &enabled_attr.attr,
458         &defrag_attr.attr,                        319         &defrag_attr.attr,
459         &use_zero_page_attr.attr,                 320         &use_zero_page_attr.attr,
460         &hpage_pmd_size_attr.attr,                321         &hpage_pmd_size_attr.attr,
461 #ifdef CONFIG_SHMEM                            !! 322 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
462         &shmem_enabled_attr.attr,                 323         &shmem_enabled_attr.attr,
463 #endif                                            324 #endif
464         &split_underused_thp_attr.attr,        !! 325 #ifdef CONFIG_DEBUG_VM
                                                   >> 326         &debug_cow_attr.attr,
                                                   >> 327 #endif
465         NULL,                                     328         NULL,
466 };                                                329 };
467                                                   330 
468 static const struct attribute_group hugepage_a    331 static const struct attribute_group hugepage_attr_group = {
469         .attrs = hugepage_attr,                   332         .attrs = hugepage_attr,
470 };                                                333 };
471                                                   334 
472 static void hugepage_exit_sysfs(struct kobject << 
473 static void thpsize_release(struct kobject *ko << 
474 static DEFINE_SPINLOCK(huge_anon_orders_lock); << 
475 static LIST_HEAD(thpsize_list);                << 
476                                                << 
477 static ssize_t anon_enabled_show(struct kobjec << 
478                                  struct kobj_a << 
479 {                                              << 
480         int order = to_thpsize(kobj)->order;   << 
481         const char *output;                    << 
482                                                << 
483         if (test_bit(order, &huge_anon_orders_ << 
484                 output = "[always] inherit mad << 
485         else if (test_bit(order, &huge_anon_or << 
486                 output = "always [inherit] mad << 
487         else if (test_bit(order, &huge_anon_or << 
488                 output = "always inherit [madv << 
489         else                                   << 
490                 output = "always inherit madvi << 
491                                                << 
492         return sysfs_emit(buf, "%s\n", output) << 
493 }                                              << 
494                                                << 
495 static ssize_t anon_enabled_store(struct kobje << 
496                                   struct kobj_ << 
497                                   const char * << 
498 {                                              << 
499         int order = to_thpsize(kobj)->order;   << 
500         ssize_t ret = count;                   << 
501                                                << 
502         if (sysfs_streq(buf, "always")) {      << 
503                 spin_lock(&huge_anon_orders_lo << 
504                 clear_bit(order, &huge_anon_or << 
505                 clear_bit(order, &huge_anon_or << 
506                 set_bit(order, &huge_anon_orde << 
507                 spin_unlock(&huge_anon_orders_ << 
508         } else if (sysfs_streq(buf, "inherit") << 
509                 spin_lock(&huge_anon_orders_lo << 
510                 clear_bit(order, &huge_anon_or << 
511                 clear_bit(order, &huge_anon_or << 
512                 set_bit(order, &huge_anon_orde << 
513                 spin_unlock(&huge_anon_orders_ << 
514         } else if (sysfs_streq(buf, "madvise") << 
515                 spin_lock(&huge_anon_orders_lo << 
516                 clear_bit(order, &huge_anon_or << 
517                 clear_bit(order, &huge_anon_or << 
518                 set_bit(order, &huge_anon_orde << 
519                 spin_unlock(&huge_anon_orders_ << 
520         } else if (sysfs_streq(buf, "never"))  << 
521                 spin_lock(&huge_anon_orders_lo << 
522                 clear_bit(order, &huge_anon_or << 
523                 clear_bit(order, &huge_anon_or << 
524                 clear_bit(order, &huge_anon_or << 
525                 spin_unlock(&huge_anon_orders_ << 
526         } else                                 << 
527                 ret = -EINVAL;                 << 
528                                                << 
529         if (ret > 0) {                         << 
530                 int err;                       << 
531                                                << 
532                 err = start_stop_khugepaged(); << 
533                 if (err)                       << 
534                         ret = err;             << 
535         }                                      << 
536         return ret;                            << 
537 }                                              << 
538                                                << 
539 static struct kobj_attribute anon_enabled_attr << 
540         __ATTR(enabled, 0644, anon_enabled_sho << 
541                                                << 
542 static struct attribute *anon_ctrl_attrs[] = { << 
543         &anon_enabled_attr.attr,               << 
544         NULL,                                  << 
545 };                                             << 
546                                                << 
547 static const struct attribute_group anon_ctrl_ << 
548         .attrs = anon_ctrl_attrs,              << 
549 };                                             << 
550                                                << 
551 static struct attribute *file_ctrl_attrs[] = { << 
552 #ifdef CONFIG_SHMEM                            << 
553         &thpsize_shmem_enabled_attr.attr,      << 
554 #endif                                         << 
555         NULL,                                  << 
556 };                                             << 
557                                                << 
558 static const struct attribute_group file_ctrl_ << 
559         .attrs = file_ctrl_attrs,              << 
560 };                                             << 
561                                                << 
562 static struct attribute *any_ctrl_attrs[] = {  << 
563         NULL,                                  << 
564 };                                             << 
565                                                << 
566 static const struct attribute_group any_ctrl_a << 
567         .attrs = any_ctrl_attrs,               << 
568 };                                             << 
569                                                << 
570 static const struct kobj_type thpsize_ktype =  << 
571         .release = &thpsize_release,           << 
572         .sysfs_ops = &kobj_sysfs_ops,          << 
573 };                                             << 
574                                                << 
575 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = << 
576                                                << 
577 static unsigned long sum_mthp_stat(int order,  << 
578 {                                              << 
579         unsigned long sum = 0;                 << 
580         int cpu;                               << 
581                                                << 
582         for_each_possible_cpu(cpu) {           << 
583                 struct mthp_stat *this = &per_ << 
584                                                << 
585                 sum += this->stats[order][item << 
586         }                                      << 
587                                                << 
588         return sum;                            << 
589 }                                              << 
590                                                << 
591 #define DEFINE_MTHP_STAT_ATTR(_name, _index)   << 
592 static ssize_t _name##_show(struct kobject *ko << 
593                         struct kobj_attribute  << 
594 {                                              << 
595         int order = to_thpsize(kobj)->order;   << 
596                                                << 
597         return sysfs_emit(buf, "%lu\n", sum_mt << 
598 }                                              << 
599 static struct kobj_attribute _name##_attr = __ << 
600                                                << 
601 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_S << 
602 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTH << 
603 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_char << 
604 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT << 
605 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_ST << 
606 #ifdef CONFIG_SHMEM                            << 
607 DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_S << 
608 DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STA << 
609 DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, M << 
610 #endif                                         << 
611 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); << 
612 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_ << 
613 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STA << 
614 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_AN << 
615 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped << 
616                                                << 
617 static struct attribute *anon_stats_attrs[] =  << 
618         &anon_fault_alloc_attr.attr,           << 
619         &anon_fault_fallback_attr.attr,        << 
620         &anon_fault_fallback_charge_attr.attr, << 
621 #ifndef CONFIG_SHMEM                           << 
622         &swpout_attr.attr,                     << 
623         &swpout_fallback_attr.attr,            << 
624 #endif                                         << 
625         &split_deferred_attr.attr,             << 
626         &nr_anon_attr.attr,                    << 
627         &nr_anon_partially_mapped_attr.attr,   << 
628         NULL,                                  << 
629 };                                             << 
630                                                << 
631 static struct attribute_group anon_stats_attr_ << 
632         .name = "stats",                       << 
633         .attrs = anon_stats_attrs,             << 
634 };                                             << 
635                                                << 
636 static struct attribute *file_stats_attrs[] =  << 
637 #ifdef CONFIG_SHMEM                            << 
638         &shmem_alloc_attr.attr,                << 
639         &shmem_fallback_attr.attr,             << 
640         &shmem_fallback_charge_attr.attr,      << 
641 #endif                                         << 
642         NULL,                                  << 
643 };                                             << 
644                                                << 
645 static struct attribute_group file_stats_attr_ << 
646         .name = "stats",                       << 
647         .attrs = file_stats_attrs,             << 
648 };                                             << 
649                                                << 
650 static struct attribute *any_stats_attrs[] = { << 
651 #ifdef CONFIG_SHMEM                            << 
652         &swpout_attr.attr,                     << 
653         &swpout_fallback_attr.attr,            << 
654 #endif                                         << 
655         &split_attr.attr,                      << 
656         &split_failed_attr.attr,               << 
657         NULL,                                  << 
658 };                                             << 
659                                                << 
660 static struct attribute_group any_stats_attr_g << 
661         .name = "stats",                       << 
662         .attrs = any_stats_attrs,              << 
663 };                                             << 
664                                                << 
665 static int sysfs_add_group(struct kobject *kob << 
666                            const struct attrib << 
667 {                                              << 
668         int ret = -ENOENT;                     << 
669                                                << 
670         /*                                     << 
671          * If the group is named, try to merge << 
672          * was already created. This avoids th << 
673          * sysfs_create_group() if the directo << 
674          */                                    << 
675         if (grp->name)                         << 
676                 ret = sysfs_merge_group(kobj,  << 
677         if (ret)                               << 
678                 ret = sysfs_create_group(kobj, << 
679                                                << 
680         return ret;                            << 
681 }                                              << 
682                                                << 
683 static struct thpsize *thpsize_create(int orde << 
684 {                                              << 
685         unsigned long size = (PAGE_SIZE << ord << 
686         struct thpsize *thpsize;               << 
687         int ret = -ENOMEM;                     << 
688                                                << 
689         thpsize = kzalloc(sizeof(*thpsize), GF << 
690         if (!thpsize)                          << 
691                 goto err;                      << 
692                                                << 
693         thpsize->order = order;                << 
694                                                << 
695         ret = kobject_init_and_add(&thpsize->k << 
696                                    "hugepages- << 
697         if (ret) {                             << 
698                 kfree(thpsize);                << 
699                 goto err;                      << 
700         }                                      << 
701                                                << 
702                                                << 
703         ret = sysfs_add_group(&thpsize->kobj,  << 
704         if (ret)                               << 
705                 goto err_put;                  << 
706                                                << 
707         ret = sysfs_add_group(&thpsize->kobj,  << 
708         if (ret)                               << 
709                 goto err_put;                  << 
710                                                << 
711         if (BIT(order) & THP_ORDERS_ALL_ANON)  << 
712                 ret = sysfs_add_group(&thpsize << 
713                 if (ret)                       << 
714                         goto err_put;          << 
715                                                << 
716                 ret = sysfs_add_group(&thpsize << 
717                 if (ret)                       << 
718                         goto err_put;          << 
719         }                                      << 
720                                                << 
721         if (BIT(order) & THP_ORDERS_ALL_FILE_D << 
722                 ret = sysfs_add_group(&thpsize << 
723                 if (ret)                       << 
724                         goto err_put;          << 
725                                                << 
726                 ret = sysfs_add_group(&thpsize << 
727                 if (ret)                       << 
728                         goto err_put;          << 
729         }                                      << 
730                                                << 
731         return thpsize;                        << 
732 err_put:                                       << 
733         kobject_put(&thpsize->kobj);           << 
734 err:                                           << 
735         return ERR_PTR(ret);                   << 
736 }                                              << 
737                                                << 
738 static void thpsize_release(struct kobject *ko << 
739 {                                              << 
740         kfree(to_thpsize(kobj));               << 
741 }                                              << 
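[Editor's note] thpsize_create() allocates one object per enabled order, names its kobject "hugepages-<size>kB", and attaches the attribute groups; failures after kobject_init_and_add() go through kobject_put(), so the memory is freed exactly once by thpsize_release() via the to_thpsize() (container_of-style) lookup. A hedged sketch of that embed-a-kobject lifecycle; the names and the ktype are invented, only the shape mirrors the code above:

#include <linux/err.h>
#include <linux/kobject.h>
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/slab.h>

struct example_size {
	struct kobject kobj;	/* embedded so the release callback can free us */
	int order;
};

static void example_size_release(struct kobject *kobj)
{
	/* Runs when the last reference is dropped by kobject_put(). */
	kfree(container_of(kobj, struct example_size, kobj));
}

static const struct kobj_type example_size_ktype = {
	.release   = example_size_release,
	.sysfs_ops = &kobj_sysfs_ops,
};

static struct example_size *example_size_create(struct kobject *parent, int order)
{
	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
	struct example_size *es;
	int ret;

	es = kzalloc(sizeof(*es), GFP_KERNEL);
	if (!es)
		return ERR_PTR(-ENOMEM);
	es->order = order;

	ret = kobject_init_and_add(&es->kobj, &example_size_ktype, parent,
				   "hugepages-%lukB", size);
	if (ret) {
		kfree(es);
		return ERR_PTR(ret);
	}

	/* Any later failure must use kobject_put() instead of kfree(), so
	 * that example_size_release() is the one and only free path. */
	return es;
}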
742                                                << 
743 static int __init hugepage_init_sysfs(struct k    335 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
744 {                                                 336 {
745         int err;                                  337         int err;
746         struct thpsize *thpsize;               << 
747         unsigned long orders;                  << 
748         int order;                             << 
749                                                << 
750         /*                                     << 
751          * Default to setting PMD-sized THP to << 
752          * disable all other sizes. powerpc's  << 
753          * constant so we have to do this here << 
754          */                                    << 
755         if (!anon_orders_configured)           << 
756                 huge_anon_orders_inherit = BIT << 
757                                                   338 
758         *hugepage_kobj = kobject_create_and_ad    339         *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
759         if (unlikely(!*hugepage_kobj)) {          340         if (unlikely(!*hugepage_kobj)) {
760                 pr_err("failed to create trans    341                 pr_err("failed to create transparent hugepage kobject\n");
761                 return -ENOMEM;                   342                 return -ENOMEM;
762         }                                         343         }
763                                                   344 
764         err = sysfs_create_group(*hugepage_kob    345         err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
765         if (err) {                                346         if (err) {
766                 pr_err("failed to register tra    347                 pr_err("failed to register transparent hugepage group\n");
767                 goto delete_obj;                  348                 goto delete_obj;
768         }                                         349         }
769                                                   350 
770         err = sysfs_create_group(*hugepage_kob    351         err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
771         if (err) {                                352         if (err) {
772                 pr_err("failed to register tra    353                 pr_err("failed to register transparent hugepage group\n");
773                 goto remove_hp_group;             354                 goto remove_hp_group;
774         }                                         355         }
775                                                   356 
776         orders = THP_ORDERS_ALL_ANON | THP_ORD << 
777         order = highest_order(orders);         << 
778         while (orders) {                       << 
779                 thpsize = thpsize_create(order << 
780                 if (IS_ERR(thpsize)) {         << 
781                         pr_err("failed to crea << 
782                         err = PTR_ERR(thpsize) << 
783                         goto remove_all;       << 
784                 }                              << 
785                 list_add(&thpsize->node, &thps << 
786                 order = next_order(&orders, or << 
787         }                                      << 
788                                                << 
789         return 0;                                 357         return 0;
790                                                   358 
791 remove_all:                                    << 
792         hugepage_exit_sysfs(*hugepage_kobj);   << 
793         return err;                            << 
794 remove_hp_group:                                  359 remove_hp_group:
795         sysfs_remove_group(*hugepage_kobj, &hu    360         sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
796 delete_obj:                                       361 delete_obj:
797         kobject_put(*hugepage_kobj);              362         kobject_put(*hugepage_kobj);
798         return err;                               363         return err;
799 }                                                 364 }
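[Editor's note] hugepage_init_sysfs() walks the combined orders bitmask from the highest set bit downwards, creating one "hugepages-<size>kB" kobject per order and chaining them on a list so hugepage_exit_sysfs() can tear them down. A userspace-style sketch of that highest-to-lowest iteration; highest_order()/next_order() are modelled here with compiler builtins, the mask value is made up, and 4 KiB pages are assumed:

#include <stdio.h>

/* Roughly what highest_order() does: index of the highest set bit. */
static int highest_order(unsigned long orders)
{
	return 8 * sizeof(orders) - 1 - __builtin_clzl(orders);
}

/* Roughly what next_order() does: clear the current bit and return the
 * next highest one (the caller's while (orders) guards the end). */
static int next_order(unsigned long *orders, int prev)
{
	*orders &= ~(1UL << prev);
	return *orders ? highest_order(*orders) : -1;
}

int main(void)
{
	unsigned long orders = (1UL << 9) | (1UL << 4) | (1UL << 3); /* made-up mask */
	int order = highest_order(orders);

	while (orders) {
		printf("create hugepages-%lukB\n", (4096UL << order) / 1024);
		order = next_order(&orders, order);
	}
	return 0;
}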
800                                                   365 
801 static void __init hugepage_exit_sysfs(struct     366 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
802 {                                                 367 {
803         struct thpsize *thpsize, *tmp;         << 
804                                                << 
805         list_for_each_entry_safe(thpsize, tmp, << 
806                 list_del(&thpsize->node);      << 
807                 kobject_put(&thpsize->kobj);   << 
808         }                                      << 
809                                                << 
810         sysfs_remove_group(hugepage_kobj, &khu    368         sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
811         sysfs_remove_group(hugepage_kobj, &hug    369         sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
812         kobject_put(hugepage_kobj);               370         kobject_put(hugepage_kobj);
813 }                                                 371 }
814 #else                                             372 #else
815 static inline int hugepage_init_sysfs(struct k    373 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
816 {                                                 374 {
817         return 0;                                 375         return 0;
818 }                                                 376 }
819                                                   377 
820 static inline void hugepage_exit_sysfs(struct     378 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
821 {                                                 379 {
822 }                                                 380 }
823 #endif /* CONFIG_SYSFS */                         381 #endif /* CONFIG_SYSFS */
824                                                   382 
825 static int __init thp_shrinker_init(void)      << 
826 {                                              << 
827         huge_zero_page_shrinker = shrinker_all << 
828         if (!huge_zero_page_shrinker)          << 
829                 return -ENOMEM;                << 
830                                                << 
831         deferred_split_shrinker = shrinker_all << 
832                                                << 
833                                                << 
834                                                << 
835         if (!deferred_split_shrinker) {        << 
836                 shrinker_free(huge_zero_page_s << 
837                 return -ENOMEM;                << 
838         }                                      << 
839                                                << 
840         huge_zero_page_shrinker->count_objects << 
841         huge_zero_page_shrinker->scan_objects  << 
842         shrinker_register(huge_zero_page_shrin << 
843                                                << 
844         deferred_split_shrinker->count_objects << 
845         deferred_split_shrinker->scan_objects  << 
846         shrinker_register(deferred_split_shrin << 
847                                                << 
848         return 0;                              << 
849 }                                              << 
850                                                << 
851 static void __init thp_shrinker_exit(void)     << 
852 {                                              << 
853         shrinker_free(huge_zero_page_shrinker) << 
854         shrinker_free(deferred_split_shrinker) << 
855 }                                              << 
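[Editor's note] thp_shrinker_init() uses the allocated-shrinker API (shrinker_alloc(), shrinker_register(), shrinker_free()), which replaces the static register_shrinker()/unregister_shrinker() calls still visible in the right-hand (older) column. A hedged sketch of the same register/teardown pattern, with dummy callbacks and an invented name:

#include <linux/shrinker.h>

static struct shrinker *example_shrinker;	/* hypothetical */

static unsigned long example_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	return 0;	/* nothing reclaimable in this sketch */
}

static unsigned long example_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static int example_shrinker_init(void)
{
	example_shrinker = shrinker_alloc(0, "example-shrinker");
	if (!example_shrinker)
		return -ENOMEM;

	/* Callbacks must be in place before the shrinker goes live. */
	example_shrinker->count_objects = example_count;
	example_shrinker->scan_objects = example_scan;
	shrinker_register(example_shrinker);
	return 0;
}

static void example_shrinker_exit(void)
{
	shrinker_free(example_shrinker);
}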
856                                                << 
857 static int __init hugepage_init(void)             383 static int __init hugepage_init(void)
858 {                                                 384 {
859         int err;                                  385         int err;
860         struct kobject *hugepage_kobj;            386         struct kobject *hugepage_kobj;
861                                                   387 
862         if (!has_transparent_hugepage()) {        388         if (!has_transparent_hugepage()) {
863                 transparent_hugepage_flags = 1 !! 389                 transparent_hugepage_flags = 0;
864                 return -EINVAL;                   390                 return -EINVAL;
865         }                                         391         }
866                                                   392 
867         /*                                        393         /*
868          * hugepages can't be allocated by the    394          * hugepages can't be allocated by the buddy allocator
869          */                                       395          */
870         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > M !! 396         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
                                                   >> 397         /*
                                                   >> 398          * we use page->mapping and page->index in second tail page
                                                   >> 399          * as list_head: assuming THP order >= 2
                                                   >> 400          */
                                                   >> 401         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
871                                                   402 
872         err = hugepage_init_sysfs(&hugepage_ko    403         err = hugepage_init_sysfs(&hugepage_kobj);
873         if (err)                                  404         if (err)
874                 goto err_sysfs;                   405                 goto err_sysfs;
875                                                   406 
876         err = khugepaged_init();                  407         err = khugepaged_init();
877         if (err)                                  408         if (err)
878                 goto err_slab;                    409                 goto err_slab;
879                                                   410 
880         err = thp_shrinker_init();             !! 411         err = register_shrinker(&huge_zero_page_shrinker);
                                                   >> 412         if (err)
                                                   >> 413                 goto err_hzp_shrinker;
                                                   >> 414         err = register_shrinker(&deferred_split_shrinker);
881         if (err)                                  415         if (err)
882                 goto err_shrinker;             !! 416                 goto err_split_shrinker;
883                                                   417 
884         /*                                        418         /*
885          * By default disable transparent huge    419          * By default disable transparent hugepages on smaller systems,
886          * where the extra memory used could h    420          * where the extra memory used could hurt more than TLB overhead
887          * is likely to save.  The admin can s    421          * is likely to save.  The admin can still enable it through /sys.
888          */                                       422          */
889         if (totalram_pages() < (512 << (20 - P !! 423         if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
890                 transparent_hugepage_flags = 0    424                 transparent_hugepage_flags = 0;
891                 return 0;                         425                 return 0;
892         }                                         426         }
893                                                   427 
894         err = start_stop_khugepaged();            428         err = start_stop_khugepaged();
895         if (err)                                  429         if (err)
896                 goto err_khugepaged;              430                 goto err_khugepaged;
897                                                   431 
898         return 0;                                 432         return 0;
899 err_khugepaged:                                   433 err_khugepaged:
900         thp_shrinker_exit();                   !! 434         unregister_shrinker(&deferred_split_shrinker);
901 err_shrinker:                                  !! 435 err_split_shrinker:
                                                   >> 436         unregister_shrinker(&huge_zero_page_shrinker);
                                                   >> 437 err_hzp_shrinker:
902         khugepaged_destroy();                     438         khugepaged_destroy();
903 err_slab:                                         439 err_slab:
904         hugepage_exit_sysfs(hugepage_kobj);       440         hugepage_exit_sysfs(hugepage_kobj);
905 err_sysfs:                                        441 err_sysfs:
906         return err;                               442         return err;
907 }                                                 443 }
908 subsys_initcall(hugepage_init);                   444 subsys_initcall(hugepage_init);
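[Editor's note] The "smaller systems" cutoff in hugepage_init() is 512 MiB expressed in pages: totalram_pages() < (512 << (20 - PAGE_SHIFT)). With 4 KiB pages (PAGE_SHIFT = 12) that is 512 << 8 = 131072 pages, i.e. exactly 512 MiB, below which THP stays disabled unless the admin turns it on via /sys. A tiny standalone check of that arithmetic, assuming a 4 KiB page size:

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;		/* 4 KiB pages assumed */
	unsigned long threshold_pages = 512UL << (20 - page_shift);

	printf("threshold: %lu pages = %lu MiB\n",
	       threshold_pages,
	       threshold_pages >> (20 - page_shift));	/* convert back to MiB */
	return 0;
}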
909                                                   445 
910 static int __init setup_transparent_hugepage(c    446 static int __init setup_transparent_hugepage(char *str)
911 {                                                 447 {
912         int ret = 0;                              448         int ret = 0;
913         if (!str)                                 449         if (!str)
914                 goto out;                         450                 goto out;
915         if (!strcmp(str, "always")) {             451         if (!strcmp(str, "always")) {
916                 set_bit(TRANSPARENT_HUGEPAGE_F    452                 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
917                         &transparent_hugepage_    453                         &transparent_hugepage_flags);
918                 clear_bit(TRANSPARENT_HUGEPAGE    454                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
919                           &transparent_hugepag    455                           &transparent_hugepage_flags);
920                 ret = 1;                          456                 ret = 1;
921         } else if (!strcmp(str, "madvise")) {     457         } else if (!strcmp(str, "madvise")) {
922                 clear_bit(TRANSPARENT_HUGEPAGE    458                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
923                           &transparent_hugepag    459                           &transparent_hugepage_flags);
924                 set_bit(TRANSPARENT_HUGEPAGE_R    460                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
925                         &transparent_hugepage_    461                         &transparent_hugepage_flags);
926                 ret = 1;                          462                 ret = 1;
927         } else if (!strcmp(str, "never")) {       463         } else if (!strcmp(str, "never")) {
928                 clear_bit(TRANSPARENT_HUGEPAGE    464                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
929                           &transparent_hugepag    465                           &transparent_hugepage_flags);
930                 clear_bit(TRANSPARENT_HUGEPAGE    466                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
931                           &transparent_hugepag    467                           &transparent_hugepage_flags);
932                 ret = 1;                          468                 ret = 1;
933         }                                         469         }
934 out:                                              470 out:
935         if (!ret)                                 471         if (!ret)
936                 pr_warn("transparent_hugepage=    472                 pr_warn("transparent_hugepage= cannot parse, ignored\n");
937         return ret;                               473         return ret;
938 }                                                 474 }
939 __setup("transparent_hugepage=", setup_transpa    475 __setup("transparent_hugepage=", setup_transparent_hugepage);
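[Editor's note] setup_transparent_hugepage() ties the transparent_hugepage= boot parameter to the same TRANSPARENT_HUGEPAGE_FLAG / TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG bits that the sysfs "enabled" file controls; anything other than always, madvise or never is rejected with the "cannot parse" warning. An illustrative command line (not taken from this file), which keeps THP off globally but allows it inside MADV_HUGEPAGE regions:

    transparent_hugepage=madvise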
940                                                   476 
941 static inline int get_order_from_str(const cha << 
942 {                                              << 
943         unsigned long size;                    << 
944         char *endptr;                          << 
945         int order;                             << 
946                                                << 
947         size = memparse(size_str, &endptr);    << 
948                                                << 
949         if (!is_power_of_2(size))              << 
950                 goto err;                      << 
951         order = get_order(size);               << 
952         if (BIT(order) & ~THP_ORDERS_ALL_ANON) << 
953                 goto err;                      << 
954                                                << 
955         return order;                          << 
956 err:                                           << 
957         pr_err("invalid size %s in thp_anon bo << 
958         return -EINVAL;                        << 
959 }                                              << 
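[Editor's note] get_order_from_str() runs the size string through memparse(), insists on a power of two, and converts it to a page order that must fall inside THP_ORDERS_ALL_ANON. A userspace-style sketch of the size-to-order step, assuming 4 KiB pages (the real get_order() also rounds up non-power-of-two sizes, which the helper above rejects anyway):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages assumed */

static bool is_power_of_2(unsigned long n)
{
	return n && !(n & (n - 1));
}

/* Roughly what get_order() computes for a power-of-two size. */
static int order_of(unsigned long size)
{
	int order = 0;

	size >>= PAGE_SHIFT;
	while (size > 1) {
		size >>= 1;
		order++;
	}
	return order;
}

int main(void)
{
	unsigned long sizes[] = { 16 * 1024, 64 * 1024, 2 * 1024 * 1024 };

	for (int i = 0; i < 3; i++) {
		if (!is_power_of_2(sizes[i]))
			continue;
		printf("%lukB -> order %d\n", sizes[i] / 1024, order_of(sizes[i]));
	}
	return 0;
}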
960                                                << 
961 static char str_dup[PAGE_SIZE] __initdata;     << 
962 static int __init setup_thp_anon(char *str)    << 
963 {                                              << 
964         char *token, *range, *policy, *subtoke << 
965         unsigned long always, inherit, madvise << 
966         char *start_size, *end_size;           << 
967         int start, end, nr;                    << 
968         char *p;                               << 
969                                                << 
970         if (!str || strlen(str) + 1 > PAGE_SIZ << 
971                 goto err;                      << 
972         strcpy(str_dup, str);                  << 
973                                                << 
974         always = huge_anon_orders_always;      << 
975         madvise = huge_anon_orders_madvise;    << 
976         inherit = huge_anon_orders_inherit;    << 
977         p = str_dup;                           << 
978         while ((token = strsep(&p, ";")) != NU << 
979                 range = strsep(&token, ":");   << 
980                 policy = token;                << 
981                                                << 
982                 if (!policy)                   << 
983                         goto err;              << 
984                                                << 
985                 while ((subtoken = strsep(&ran << 
986                         if (strchr(subtoken, ' << 
987                                 start_size = s << 
988                                 end_size = sub << 
989                                                << 
990                                 start = get_or << 
991                                 end = get_orde << 
992                         } else {               << 
993                                 start = end =  << 
994                         }                      << 
995                                                << 
996                         if (start < 0 || end < << 
997                                 goto err;      << 
998                                                << 
999                         nr = end - start + 1;  << 
1000                         if (!strcmp(policy, " << 
1001                                 bitmap_set(&a << 
1002                                 bitmap_clear( << 
1003                                 bitmap_clear( << 
1004                         } else if (!strcmp(po << 
1005                                 bitmap_set(&m << 
1006                                 bitmap_clear( << 
1007                                 bitmap_clear( << 
1008                         } else if (!strcmp(po << 
1009                                 bitmap_set(&i << 
1010                                 bitmap_clear( << 
1011                                 bitmap_clear( << 
1012                         } else if (!strcmp(po << 
1013                                 bitmap_clear( << 
1014                                 bitmap_clear( << 
1015                                 bitmap_clear( << 
1016                         } else {              << 
1017                                 pr_err("inval << 
1018                                 goto err;     << 
1019                         }                     << 
1020                 }                             << 
1021         }                                     << 
1022                                               << 
1023         huge_anon_orders_always = always;     << 
1024         huge_anon_orders_madvise = madvise;   << 
1025         huge_anon_orders_inherit = inherit;   << 
1026         anon_orders_configured = true;        << 
1027         return 1;                             << 
1028                                               << 
1029 err:                                          << 
1030         pr_warn("thp_anon=%s: error parsing s << 
1031         return 0;                             << 
1032 }                                             << 
1033 __setup("thp_anon=", setup_thp_anon);         << 
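[Editor's note] setup_thp_anon() accepts semicolon-separated "<range>:<policy>" entries, where a range is a comma-separated list of sizes or <start>-<end> spans and the policy is one of always, inherit, madvise or never; the parsed bitmaps only replace huge_anon_orders_always/madvise/inherit once the whole string has parsed, so a malformed value leaves the defaults untouched and just prints the warning. An illustrative boot line (sizes picked arbitrarily, 4 KiB base pages assumed):

    thp_anon=16K-64K:always;128K,512K:inherit;256K:madvise;1M-2M:never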
1034                                               << 
1035 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_    477 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
1036 {                                                478 {
1037         if (likely(vma->vm_flags & VM_WRITE))    479         if (likely(vma->vm_flags & VM_WRITE))
1038                 pmd = pmd_mkwrite(pmd, vma);  !! 480                 pmd = pmd_mkwrite(pmd);
1039         return pmd;                              481         return pmd;
1040 }                                                482 }
1041                                                  483 
1042 #ifdef CONFIG_MEMCG                           !! 484 static inline struct list_head *page_deferred_list(struct page *page)
1043 static inline                                 << 
1044 struct deferred_split *get_deferred_split_que << 
1045 {                                                485 {
1046         struct mem_cgroup *memcg = folio_memc !! 486         /*
1047         struct pglist_data *pgdat = NODE_DATA !! 487          * ->lru in the tail pages is occupied by compound_head.
1048                                               !! 488          * Let's use ->mapping + ->index in the second tail page as list_head.
1049         if (memcg)                            !! 489          */
1050                 return &memcg->deferred_split !! 490         return (struct list_head *)&page[2].mapping;
1051         else                                  << 
1052                 return &pgdat->deferred_split << 
1053 }                                             << 
1054 #else                                         << 
1055 static inline                                 << 
1056 struct deferred_split *get_deferred_split_que << 
1057 {                                             << 
1058         struct pglist_data *pgdat = NODE_DATA << 
1059                                               << 
1060         return &pgdat->deferred_split_queue;  << 
1061 }                                                491 }
1062 #endif                                        << 
1063                                                  492 
1064 static inline bool is_transparent_hugepage(co !! 493 void prep_transhuge_page(struct page *page)
1065 {                                                494 {
1066         if (!folio_test_large(folio))         !! 495         /*
1067                 return false;                 !! 496          * we use page->mapping and page->indexlru in second tail page
                                                   >> 497          * as list_head: assuming THP order >= 2
                                                   >> 498          */
1068                                                  499 
1069         return is_huge_zero_folio(folio) ||   !! 500         INIT_LIST_HEAD(page_deferred_list(page));
1070                 folio_test_large_rmappable(fo !! 501         set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
1071 }                                                502 }
1072                                                  503 
1073 static unsigned long __thp_get_unmapped_area( !! 504 unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
1074                 unsigned long addr, unsigned  !! 505                 loff_t off, unsigned long flags, unsigned long size)
1075                 loff_t off, unsigned long fla << 
1076                 vm_flags_t vm_flags)          << 
1077 {                                                506 {
                                                   >> 507         unsigned long addr;
1078         loff_t off_end = off + len;              508         loff_t off_end = off + len;
1079         loff_t off_align = round_up(off, size    509         loff_t off_align = round_up(off, size);
1080         unsigned long len_pad, ret, off_sub;  !! 510         unsigned long len_pad;
1081                                               << 
1082         if (!IS_ENABLED(CONFIG_64BIT) || in_c << 
1083                 return 0;                     << 
1084                                                  511 
1085         if (off_end <= off_align || (off_end     512         if (off_end <= off_align || (off_end - off_align) < size)
1086                 return 0;                        513                 return 0;
1087                                                  514 
1088         len_pad = len + size;                    515         len_pad = len + size;
1089         if (len_pad < len || (off + len_pad)     516         if (len_pad < len || (off + len_pad) < off)
1090                 return 0;                        517                 return 0;
1091                                                  518 
1092         ret = mm_get_unmapped_area_vmflags(cu !! 519         addr = current->mm->get_unmapped_area(filp, 0, len_pad,
1093                                            of !! 520                                               off >> PAGE_SHIFT, flags);
1094                                               !! 521         if (IS_ERR_VALUE(addr))
1095         /*                                    << 
1096          * The failure might be due to length << 
1097          * without the padding.               << 
1098          */                                   << 
1099         if (IS_ERR_VALUE(ret))                << 
1100                 return 0;                        522                 return 0;
1101                                                  523 
1102         /*                                    !! 524         addr += (off - addr) & (size - 1);
1103          * Do not try to align to THP boundar !! 525         return addr;
1104          * hint succeeds.                     << 
1105          */                                   << 
1106         if (ret == addr)                      << 
1107                 return addr;                  << 
1108                                               << 
1109         off_sub = (off - ret) & (size - 1);   << 
1110                                               << 
1111         if (test_bit(MMF_TOPDOWN, &current->m << 
1112                 return ret + size;            << 
1113                                               << 
1114         ret += off_sub;                       << 
1115         return ret;                           << 
1116 }                                                526 }
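[Editor's note] __thp_get_unmapped_area() asks the regular allocator for len + size bytes and then slides the result by off_sub = (off - ret) & (size - 1), so that the mapping start and the file offset agree modulo the THP size; size-aligned file offsets then land at size-aligned virtual addresses, which is what makes PMD-sized page cache mappings possible. A small standalone check of that arithmetic with made-up numbers (64-bit, 2 MiB PMD size assumed):

#include <stdio.h>

int main(void)
{
	const unsigned long size = 2UL << 20;		/* PMD_SIZE: 2 MiB assumed */
	unsigned long off  = 0x345000;			/* file offset of the fault */
	unsigned long ret  = 0x7f1200021000;		/* address from get_unmapped_area */
	unsigned long off_sub = (off - ret) & (size - 1);
	unsigned long addr = ret + off_sub;

	/* (addr - off) is now a multiple of size, so size-aligned file
	 * offsets map to size-aligned addresses; the printed remainder is 0. */
	printf("addr = %#lx, (addr - off) %% size = %lu\n",
	       addr, (addr - off) % size);
	return 0;
}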
1117                                                  527 
1118 unsigned long thp_get_unmapped_area_vmflags(s !! 528 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
1119                 unsigned long len, unsigned l !! 529                 unsigned long len, unsigned long pgoff, unsigned long flags)
1120                 vm_flags_t vm_flags)          << 
1121 {                                                530 {
1122         unsigned long ret;                    << 
1123         loff_t off = (loff_t)pgoff << PAGE_SH    531         loff_t off = (loff_t)pgoff << PAGE_SHIFT;
1124                                                  532 
1125         ret = __thp_get_unmapped_area(filp, a !! 533         if (addr)
1126         if (ret)                              !! 534                 goto out;
1127                 return ret;                   !! 535         if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
                                                   >> 536                 goto out;
1128                                                  537 
1129         return mm_get_unmapped_area_vmflags(c !! 538         addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
1130                                             v !! 539         if (addr)
1131 }                                             !! 540                 return addr;
1132                                                  541 
1133 unsigned long thp_get_unmapped_area(struct fi !! 542  out:
1134                 unsigned long len, unsigned l !! 543         return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
1135 {                                             << 
1136         return thp_get_unmapped_area_vmflags( << 
1137 }                                                544 }
1138 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);        545 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
1139                                                  546 
1140 static vm_fault_t __do_huge_pmd_anonymous_pag !! 547 static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
1141                         struct page *page, gf !! 548                 gfp_t gfp)
1142 {                                                549 {
1143         struct vm_area_struct *vma = vmf->vma    550         struct vm_area_struct *vma = vmf->vma;
1144         struct folio *folio = page_folio(page !! 551         struct mem_cgroup *memcg;
1145         pgtable_t pgtable;                       552         pgtable_t pgtable;
1146         unsigned long haddr = vmf->address &     553         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1147         vm_fault_t ret = 0;                   !! 554         int ret = 0;
1148                                                  555 
1149         VM_BUG_ON_FOLIO(!folio_test_large(fol !! 556         VM_BUG_ON_PAGE(!PageCompound(page), page);
1150                                                  557 
1151         if (mem_cgroup_charge(folio, vma->vm_ !! 558         if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
1152                 folio_put(folio);             !! 559                                   true)) {
                                                   >> 560                 put_page(page);
1153                 count_vm_event(THP_FAULT_FALL    561                 count_vm_event(THP_FAULT_FALLBACK);
1154                 count_vm_event(THP_FAULT_FALL << 
1155                 count_mthp_stat(HPAGE_PMD_ORD << 
1156                 count_mthp_stat(HPAGE_PMD_ORD << 
1157                 return VM_FAULT_FALLBACK;        562                 return VM_FAULT_FALLBACK;
1158         }                                        563         }
1159         folio_throttle_swaprate(folio, gfp);  << 
1160                                                  564 
1161         pgtable = pte_alloc_one(vma->vm_mm);  !! 565         pgtable = pte_alloc_one(vma->vm_mm, haddr);
1162         if (unlikely(!pgtable)) {                566         if (unlikely(!pgtable)) {
1163                 ret = VM_FAULT_OOM;              567                 ret = VM_FAULT_OOM;
1164                 goto release;                    568                 goto release;
1165         }                                        569         }
1166                                                  570 
1167         folio_zero_user(folio, vmf->address); !! 571         clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
1168         /*                                       572         /*
1169          * The memory barrier inside __folio_ !! 573          * The memory barrier inside __SetPageUptodate makes sure that
1170          * folio_zero_user writes become visi !! 574          * clear_huge_page writes become visible before the set_pmd_at()
1171          * write.                                575          * write.
1172          */                                      576          */
1173         __folio_mark_uptodate(folio);         !! 577         __SetPageUptodate(page);
1174                                                  578 
1175         vmf->ptl = pmd_lock(vma->vm_mm, vmf->    579         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1176         if (unlikely(!pmd_none(*vmf->pmd))) {    580         if (unlikely(!pmd_none(*vmf->pmd))) {
1177                 goto unlock_release;             581                 goto unlock_release;
1178         } else {                                 582         } else {
1179                 pmd_t entry;                     583                 pmd_t entry;
1180                                                  584 
1181                 ret = check_stable_address_sp    585                 ret = check_stable_address_space(vma->vm_mm);
1182                 if (ret)                         586                 if (ret)
1183                         goto unlock_release;     587                         goto unlock_release;
1184                                                  588 
1185                 /* Deliver the page fault to     589                 /* Deliver the page fault to userland */
1186                 if (userfaultfd_missing(vma))    590                 if (userfaultfd_missing(vma)) {
                                                   >> 591                         int ret;
                                                   >> 592 
1187                         spin_unlock(vmf->ptl)    593                         spin_unlock(vmf->ptl);
1188                         folio_put(folio);     !! 594                         mem_cgroup_cancel_charge(page, memcg, true);
                                                   >> 595                         put_page(page);
1189                         pte_free(vma->vm_mm,     596                         pte_free(vma->vm_mm, pgtable);
1190                         ret = handle_userfaul    597                         ret = handle_userfault(vmf, VM_UFFD_MISSING);
1191                         VM_BUG_ON(ret & VM_FA    598                         VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1192                         return ret;              599                         return ret;
1193                 }                                600                 }
1194                                                  601 
1195                 entry = mk_huge_pmd(page, vma    602                 entry = mk_huge_pmd(page, vma->vm_page_prot);
1196                 entry = maybe_pmd_mkwrite(pmd    603                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1197                 folio_add_new_anon_rmap(folio !! 604                 page_add_new_anon_rmap(page, vma, haddr, true);
1198                 folio_add_lru_vma(folio, vma) !! 605                 mem_cgroup_commit_charge(page, memcg, false, true);
                                                   >> 606                 lru_cache_add_active_or_unevictable(page, vma);
1199                 pgtable_trans_huge_deposit(vm    607                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1200                 set_pmd_at(vma->vm_mm, haddr,    608                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
1201                 update_mmu_cache_pmd(vma, vmf << 
1202                 add_mm_counter(vma->vm_mm, MM    609                 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1203                 mm_inc_nr_ptes(vma->vm_mm);      610                 mm_inc_nr_ptes(vma->vm_mm);
1204                 deferred_split_folio(folio, f << 
1205                 spin_unlock(vmf->ptl);           611                 spin_unlock(vmf->ptl);
1206                 count_vm_event(THP_FAULT_ALLO    612                 count_vm_event(THP_FAULT_ALLOC);
1207                 count_mthp_stat(HPAGE_PMD_ORD << 
1208                 count_memcg_event_mm(vma->vm_ << 
1209         }                                        613         }
1210                                                  614 
1211         return 0;                                615         return 0;
1212 unlock_release:                                  616 unlock_release:
1213         spin_unlock(vmf->ptl);                   617         spin_unlock(vmf->ptl);
1214 release:                                         618 release:
1215         if (pgtable)                             619         if (pgtable)
1216                 pte_free(vma->vm_mm, pgtable)    620                 pte_free(vma->vm_mm, pgtable);
1217         folio_put(folio);                     !! 621         mem_cgroup_cancel_charge(page, memcg, true);
                                                   >> 622         put_page(page);
1218         return ret;                              623         return ret;
1219                                                  624 
1220 }                                                625 }
1221                                                  626 
1222 /*                                               627 /*
1223  * always: directly stall for all thp allocat    628  * always: directly stall for all thp allocations
1224  * defer: wake kswapd and fail if not immedia    629  * defer: wake kswapd and fail if not immediately available
1225  * defer+madvise: wake kswapd and directly st    630  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
1226  *                fail if not immediately ava    631  *                fail if not immediately available
1227  * madvise: directly stall for MADV_HUGEPAGE,    632  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
1228  *          available                            633  *          available
1229  * never: never stall for any thp allocation     634  * never: never stall for any thp allocation
1230  */                                              635  */
1231 gfp_t vma_thp_gfp_mask(struct vm_area_struct  !! 636 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
1232 {                                                637 {
1233         const bool vma_madvised = vma && (vma !! 638         const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
1234                                                  639 
1235         /* Always do synchronous compaction * << 
1236         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    640         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
1237                 return GFP_TRANSHUGE | (vma_m    641                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
1238                                               << 
1239         /* Kick kcompactd and fail quickly */ << 
1240         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    642         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
1241                 return GFP_TRANSHUGE_LIGHT |     643                 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
1242                                               << 
1243         /* Synchronous compaction if madvised << 
1244         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    644         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
1245                 return GFP_TRANSHUGE_LIGHT |  !! 645                 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
1246                         (vma_madvised ? __GFP !! 646                                                              __GFP_KSWAPD_RECLAIM);
1247                                         __GFP << 
1248                                               << 
1249         /* Only do synchronous compaction if  << 
1250         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    647         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
1251                 return GFP_TRANSHUGE_LIGHT |  !! 648                 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
1252                        (vma_madvised ? __GFP_ !! 649                                                              0);
1253                                               << 
1254         return GFP_TRANSHUGE_LIGHT;              650         return GFP_TRANSHUGE_LIGHT;
1255 }                                                651 }
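[Editor's note] Reading vma_thp_gfp_mask() branch by branch, the sysfs defrag setting maps to allocation masks roughly as follows (vma_madvised means the VMA has VM_HUGEPAGE set):

    always         -> GFP_TRANSHUGE, plus __GFP_NORETRY unless the VMA is madvised
    defer          -> GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM
    defer+madvise  -> GFP_TRANSHUGE_LIGHT, with __GFP_DIRECT_RECLAIM if madvised,
                      otherwise __GFP_KSWAPD_RECLAIM
    madvise        -> GFP_TRANSHUGE_LIGHT, with __GFP_DIRECT_RECLAIM only if madvised
    never          -> GFP_TRANSHUGE_LIGHT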
1256                                                  652 
1257 /* Caller must hold page table lock. */          653 /* Caller must hold page table lock. */
1258 static void set_huge_zero_folio(pgtable_t pgt !! 654 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
1259                 struct vm_area_struct *vma, u    655                 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
1260                 struct folio *zero_folio)     !! 656                 struct page *zero_page)
1261 {                                                657 {
1262         pmd_t entry;                             658         pmd_t entry;
1263         if (!pmd_none(*pmd))                     659         if (!pmd_none(*pmd))
1264                 return;                       !! 660                 return false;
1265         entry = mk_pmd(&zero_folio->page, vma !! 661         entry = mk_pmd(zero_page, vma->vm_page_prot);
1266         entry = pmd_mkhuge(entry);               662         entry = pmd_mkhuge(entry);
1267         pgtable_trans_huge_deposit(mm, pmd, p !! 663         if (pgtable)
                                                   >> 664                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1268         set_pmd_at(mm, haddr, pmd, entry);       665         set_pmd_at(mm, haddr, pmd, entry);
1269         mm_inc_nr_ptes(mm);                      666         mm_inc_nr_ptes(mm);
                                                   >> 667         return true;
1270 }                                                668 }
1271                                                  669 
1272 vm_fault_t do_huge_pmd_anonymous_page(struct  !! 670 int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1273 {                                                671 {
1274         struct vm_area_struct *vma = vmf->vma    672         struct vm_area_struct *vma = vmf->vma;
1275         gfp_t gfp;                               673         gfp_t gfp;
1276         struct folio *folio;                  !! 674         struct page *page;
1277         unsigned long haddr = vmf->address &     675         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1278         vm_fault_t ret;                       << 
1279                                                  676 
1280         if (!thp_vma_suitable_order(vma, hadd !! 677         if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
1281                 return VM_FAULT_FALLBACK;        678                 return VM_FAULT_FALLBACK;
1282         ret = vmf_anon_prepare(vmf);          !! 679         if (unlikely(anon_vma_prepare(vma)))
1283         if (ret)                              !! 680                 return VM_FAULT_OOM;
1284                 return ret;                   !! 681         if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
1285         khugepaged_enter_vma(vma, vma->vm_fla !! 682                 return VM_FAULT_OOM;
1286                                               << 
1287         if (!(vmf->flags & FAULT_FLAG_WRITE)     683         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1288                         !mm_forbids_zeropage(    684                         !mm_forbids_zeropage(vma->vm_mm) &&
1289                         transparent_hugepage_    685                         transparent_hugepage_use_zero_page()) {
1290                 pgtable_t pgtable;               686                 pgtable_t pgtable;
1291                 struct folio *zero_folio;     !! 687                 struct page *zero_page;
1292                 vm_fault_t ret;               !! 688                 bool set;
1293                                               !! 689                 int ret;
1294                 pgtable = pte_alloc_one(vma-> !! 690                 pgtable = pte_alloc_one(vma->vm_mm, haddr);
1295                 if (unlikely(!pgtable))          691                 if (unlikely(!pgtable))
1296                         return VM_FAULT_OOM;     692                         return VM_FAULT_OOM;
1297                 zero_folio = mm_get_huge_zero !! 693                 zero_page = mm_get_huge_zero_page(vma->vm_mm);
1298                 if (unlikely(!zero_folio)) {  !! 694                 if (unlikely(!zero_page)) {
1299                         pte_free(vma->vm_mm,     695                         pte_free(vma->vm_mm, pgtable);
1300                         count_vm_event(THP_FA    696                         count_vm_event(THP_FAULT_FALLBACK);
1301                         return VM_FAULT_FALLB    697                         return VM_FAULT_FALLBACK;
1302                 }                                698                 }
1303                 vmf->ptl = pmd_lock(vma->vm_m    699                 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1304                 ret = 0;                         700                 ret = 0;
                                                   >> 701                 set = false;
1305                 if (pmd_none(*vmf->pmd)) {       702                 if (pmd_none(*vmf->pmd)) {
1306                         ret = check_stable_ad    703                         ret = check_stable_address_space(vma->vm_mm);
1307                         if (ret) {               704                         if (ret) {
1308                                 spin_unlock(v    705                                 spin_unlock(vmf->ptl);
1309                                 pte_free(vma- << 
1310                         } else if (userfaultf    706                         } else if (userfaultfd_missing(vma)) {
1311                                 spin_unlock(v    707                                 spin_unlock(vmf->ptl);
1312                                 pte_free(vma- << 
1313                                 ret = handle_    708                                 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1314                                 VM_BUG_ON(ret    709                                 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1315                         } else {                 710                         } else {
1316                                 set_huge_zero !! 711                                 set_huge_zero_page(pgtable, vma->vm_mm, vma,
1317                                               !! 712                                                    haddr, vmf->pmd, zero_page);
1318                                 update_mmu_ca << 
1319                                 spin_unlock(v    713                                 spin_unlock(vmf->ptl);
                                                   >> 714                                 set = true;
1320                         }                        715                         }
1321                 } else {                      !! 716                 } else
1322                         spin_unlock(vmf->ptl)    717                         spin_unlock(vmf->ptl);
                                                   >> 718                 if (!set)
1323                         pte_free(vma->vm_mm,     719                         pte_free(vma->vm_mm, pgtable);
1324                 }                             << 
1325                 return ret;                      720                 return ret;
1326         }                                        721         }
1327         gfp = vma_thp_gfp_mask(vma);          !! 722         gfp = alloc_hugepage_direct_gfpmask(vma);
1328         folio = vma_alloc_folio(gfp, HPAGE_PM !! 723         page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
1329         if (unlikely(!folio)) {               !! 724         if (unlikely(!page)) {
1330                 count_vm_event(THP_FAULT_FALL    725                 count_vm_event(THP_FAULT_FALLBACK);
1331                 count_mthp_stat(HPAGE_PMD_ORD << 
1332                 return VM_FAULT_FALLBACK;        726                 return VM_FAULT_FALLBACK;
1333         }                                        727         }
1334         return __do_huge_pmd_anonymous_page(v !! 728         prep_transhuge_page(page);
                                                   >> 729         return __do_huge_pmd_anonymous_page(vmf, page, gfp);
1335 }                                                730 }
1336                                                  731 
1337 static void insert_pfn_pmd(struct vm_area_str    732 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
1338                 pmd_t *pmd, pfn_t pfn, pgprot    733                 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
1339                 pgtable_t pgtable)               734                 pgtable_t pgtable)
1340 {                                                735 {
1341         struct mm_struct *mm = vma->vm_mm;       736         struct mm_struct *mm = vma->vm_mm;
1342         pmd_t entry;                             737         pmd_t entry;
1343         spinlock_t *ptl;                         738         spinlock_t *ptl;
1344                                                  739 
1345         ptl = pmd_lock(mm, pmd);                 740         ptl = pmd_lock(mm, pmd);
1346         if (!pmd_none(*pmd)) {                << 
1347                 if (write) {                  << 
1348                         if (pmd_pfn(*pmd) !=  << 
1349                                 WARN_ON_ONCE( << 
1350                                 goto out_unlo << 
1351                         }                     << 
1352                         entry = pmd_mkyoung(* << 
1353                         entry = maybe_pmd_mkw << 
1354                         if (pmdp_set_access_f << 
1355                                 update_mmu_ca << 
1356                 }                             << 
1357                                               << 
1358                 goto out_unlock;              << 
1359         }                                     << 
1360                                               << 
1361         entry = pmd_mkhuge(pfn_t_pmd(pfn, pro    741         entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
1362         if (pfn_t_devmap(pfn))                   742         if (pfn_t_devmap(pfn))
1363                 entry = pmd_mkdevmap(entry);     743                 entry = pmd_mkdevmap(entry);
1364         else                                  << 
1365                 entry = pmd_mkspecial(entry); << 
1366         if (write) {                             744         if (write) {
1367                 entry = pmd_mkyoung(pmd_mkdir    745                 entry = pmd_mkyoung(pmd_mkdirty(entry));
1368                 entry = maybe_pmd_mkwrite(ent    746                 entry = maybe_pmd_mkwrite(entry, vma);
1369         }                                        747         }
1370                                                  748 
1371         if (pgtable) {                           749         if (pgtable) {
1372                 pgtable_trans_huge_deposit(mm    750                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1373                 mm_inc_nr_ptes(mm);              751                 mm_inc_nr_ptes(mm);
1374                 pgtable = NULL;               << 
1375         }                                        752         }
1376                                                  753 
1377         set_pmd_at(mm, addr, pmd, entry);        754         set_pmd_at(mm, addr, pmd, entry);
1378         update_mmu_cache_pmd(vma, addr, pmd);    755         update_mmu_cache_pmd(vma, addr, pmd);
1379                                               << 
1380 out_unlock:                                   << 
1381         spin_unlock(ptl);                        756         spin_unlock(ptl);
1382         if (pgtable)                          << 
1383                 pte_free(mm, pgtable);        << 
1384 }                                                757 }
1385                                                  758 
1386 /**                                           !! 759 int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
1387  * vmf_insert_pfn_pmd - insert a pmd size pfn !! 760                         pmd_t *pmd, pfn_t pfn, bool write)
1388  * @vmf: Structure describing the fault       << 
1389  * @pfn: pfn to insert                        << 
1390  * @write: whether it's a write fault         << 
1391  *                                            << 
1392  * Insert a pmd size pfn. See vmf_insert_pfn( << 
1393  *                                            << 
1394  * Return: vm_fault_t value.                  << 
1395  */                                           << 
1396 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault << 
1397 {                                                761 {
1398         unsigned long addr = vmf->address & P << 
1399         struct vm_area_struct *vma = vmf->vma << 
1400         pgprot_t pgprot = vma->vm_page_prot;     762         pgprot_t pgprot = vma->vm_page_prot;
1401         pgtable_t pgtable = NULL;                763         pgtable_t pgtable = NULL;
1402                                               << 
1403         /*                                       764         /*
1404          * If we had pmd_special, we could av    765          * If we had pmd_special, we could avoid all these restrictions,
1405          * but we need to be consistent with     766          * but we need to be consistent with PTEs and architectures that
1406          * can't support a 'special' bit.        767          * can't support a 'special' bit.
1407          */                                      768          */
1408         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V !! 769         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1409                         !pfn_t_devmap(pfn));  << 
1410         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM    770         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1411                                                  771                                                 (VM_PFNMAP|VM_MIXEDMAP));
1412         BUG_ON((vma->vm_flags & VM_PFNMAP) &&    772         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
                                                   >> 773         BUG_ON(!pfn_t_devmap(pfn));
1413                                                  774 
1414         if (addr < vma->vm_start || addr >= v    775         if (addr < vma->vm_start || addr >= vma->vm_end)
1415                 return VM_FAULT_SIGBUS;          776                 return VM_FAULT_SIGBUS;
1416                                                  777 
1417         if (arch_needs_pgtable_deposit()) {      778         if (arch_needs_pgtable_deposit()) {
1418                 pgtable = pte_alloc_one(vma-> !! 779                 pgtable = pte_alloc_one(vma->vm_mm, addr);
1419                 if (!pgtable)                    780                 if (!pgtable)
1420                         return VM_FAULT_OOM;     781                         return VM_FAULT_OOM;
1421         }                                        782         }
1422                                                  783 
1423         track_pfn_insert(vma, &pgprot, pfn);     784         track_pfn_insert(vma, &pgprot, pfn);
1424                                                  785 
1425         insert_pfn_pmd(vma, addr, vmf->pmd, p !! 786         insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
1426         return VM_FAULT_NOPAGE;                  787         return VM_FAULT_NOPAGE;
1427 }                                                788 }
1428 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);           789 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
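The exported helper above is what a DAX filesystem or a device driver calls from its huge-fault path. Below is a minimal sketch, not code from this file: everything named demo_* is an assumption, and the device region is assumed to be PMD aligned. phys_to_pfn_t(), PFN_DEV and PFN_MAP are the standard pfn_t helpers; the call uses the newer vmf-based signature shown in the left column, where the faulting address and pmd pointer come from vmf rather than being passed explicitly as in the 4.16 code on the right.

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/pfn_t.h>

/* Hypothetical device state; not part of huge_memory.c. */
struct demo_dev {
        phys_addr_t phys_base;  /* start of a PMD-aligned device region */
};

/* Sketch of a driver huge-fault style handler for the PMD order. */
static vm_fault_t demo_huge_fault_pmd(struct vm_fault *vmf)
{
        struct demo_dev *dev = vmf->vma->vm_private_data;
        /* Assume both the region and the fault offset are PMD aligned. */
        phys_addr_t phys = dev->phys_base +
                           ((phys_addr_t)vmf->pgoff << PAGE_SHIFT);
        pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

        /*
         * The helper range-checks vmf->address against the VMA, deposits a
         * page table when arch_needs_pgtable_deposit() says so, and installs
         * the huge entry under the pmd lock.
         */
        return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}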
1429                                                  790 
1430 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    791 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1431 static pud_t maybe_pud_mkwrite(pud_t pud, str    792 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1432 {                                                793 {
1433         if (likely(vma->vm_flags & VM_WRITE))    794         if (likely(vma->vm_flags & VM_WRITE))
1434                 pud = pud_mkwrite(pud);          795                 pud = pud_mkwrite(pud);
1435         return pud;                              796         return pud;
1436 }                                                797 }
1437                                                  798 
1438 static void insert_pfn_pud(struct vm_area_str    799 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
1439                 pud_t *pud, pfn_t pfn, bool w !! 800                 pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
1440 {                                                801 {
1441         struct mm_struct *mm = vma->vm_mm;       802         struct mm_struct *mm = vma->vm_mm;
1442         pgprot_t prot = vma->vm_page_prot;    << 
1443         pud_t entry;                             803         pud_t entry;
1444         spinlock_t *ptl;                         804         spinlock_t *ptl;
1445                                                  805 
1446         ptl = pud_lock(mm, pud);                 806         ptl = pud_lock(mm, pud);
1447         if (!pud_none(*pud)) {                << 
1448                 if (write) {                  << 
1449                         if (WARN_ON_ONCE(pud_ << 
1450                                 goto out_unlo << 
1451                         entry = pud_mkyoung(* << 
1452                         entry = maybe_pud_mkw << 
1453                         if (pudp_set_access_f << 
1454                                 update_mmu_ca << 
1455                 }                             << 
1456                 goto out_unlock;              << 
1457         }                                     << 
1458                                               << 
1459         entry = pud_mkhuge(pfn_t_pud(pfn, pro    807         entry = pud_mkhuge(pfn_t_pud(pfn, prot));
1460         if (pfn_t_devmap(pfn))                   808         if (pfn_t_devmap(pfn))
1461                 entry = pud_mkdevmap(entry);     809                 entry = pud_mkdevmap(entry);
1462         else                                  << 
1463                 entry = pud_mkspecial(entry); << 
1464         if (write) {                             810         if (write) {
1465                 entry = pud_mkyoung(pud_mkdir    811                 entry = pud_mkyoung(pud_mkdirty(entry));
1466                 entry = maybe_pud_mkwrite(ent    812                 entry = maybe_pud_mkwrite(entry, vma);
1467         }                                        813         }
1468         set_pud_at(mm, addr, pud, entry);        814         set_pud_at(mm, addr, pud, entry);
1469         update_mmu_cache_pud(vma, addr, pud);    815         update_mmu_cache_pud(vma, addr, pud);
1470                                               << 
1471 out_unlock:                                   << 
1472         spin_unlock(ptl);                        816         spin_unlock(ptl);
1473 }                                                817 }
1474                                                  818 
1475 /**                                           !! 819 int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
1476  * vmf_insert_pfn_pud - insert a pud size pfn !! 820                         pud_t *pud, pfn_t pfn, bool write)
1477  * @vmf: Structure describing the fault       << 
1478  * @pfn: pfn to insert                        << 
1479  * @write: whether it's a write fault         << 
1480  *                                            << 
1481  * Insert a pud size pfn. See vmf_insert_pfn( << 
1482  *                                            << 
1483  * Return: vm_fault_t value.                  << 
1484  */                                           << 
1485 vm_fault_t vmf_insert_pfn_pud(struct vm_fault << 
1486 {                                                821 {
1487         unsigned long addr = vmf->address & P << 
1488         struct vm_area_struct *vma = vmf->vma << 
1489         pgprot_t pgprot = vma->vm_page_prot;     822         pgprot_t pgprot = vma->vm_page_prot;
1490                                               << 
1491         /*                                       823         /*
1492          * If we had pud_special, we could av    824          * If we had pud_special, we could avoid all these restrictions,
1493          * but we need to be consistent with     825          * but we need to be consistent with PTEs and architectures that
1494          * can't support a 'special' bit.        826          * can't support a 'special' bit.
1495          */                                      827          */
1496         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V !! 828         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1497                         !pfn_t_devmap(pfn));  << 
1498         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM    829         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1499                                                  830                                                 (VM_PFNMAP|VM_MIXEDMAP));
1500         BUG_ON((vma->vm_flags & VM_PFNMAP) &&    831         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
                                                   >> 832         BUG_ON(!pfn_t_devmap(pfn));
1501                                                  833 
1502         if (addr < vma->vm_start || addr >= v    834         if (addr < vma->vm_start || addr >= vma->vm_end)
1503                 return VM_FAULT_SIGBUS;          835                 return VM_FAULT_SIGBUS;
1504                                                  836 
1505         track_pfn_insert(vma, &pgprot, pfn);     837         track_pfn_insert(vma, &pgprot, pfn);
1506                                                  838 
1507         insert_pfn_pud(vma, addr, vmf->pud, p !! 839         insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
1508         return VM_FAULT_NOPAGE;                  840         return VM_FAULT_NOPAGE;
1509 }                                                841 }
1510 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);           842 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1511 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    843 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
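A usage note on the PUD flavour above: vmf_insert_pfn_pud() follows the same vmf-based convention (the faulting address and pud pointer now come from vmf, where the 4.16 code in the right column still took them as explicit arguments) and only exists when the architecture selects CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD. A handler like the hypothetical demo_huge_fault_pmd() sketch above would typically attempt the PUD size only when the fault address and the backing extent are HPAGE_PUD_SIZE aligned, falling back to the PMD and then PTE paths otherwise.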
1512                                                  844 
1513 void touch_pmd(struct vm_area_struct *vma, un !! 845 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1514                pmd_t *pmd, bool write)        !! 846                 pmd_t *pmd, int flags)
1515 {                                                847 {
1516         pmd_t _pmd;                              848         pmd_t _pmd;
1517                                                  849 
1518         _pmd = pmd_mkyoung(*pmd);                850         _pmd = pmd_mkyoung(*pmd);
1519         if (write)                            !! 851         if (flags & FOLL_WRITE)
1520                 _pmd = pmd_mkdirty(_pmd);        852                 _pmd = pmd_mkdirty(_pmd);
1521         if (pmdp_set_access_flags(vma, addr &    853         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1522                                   pmd, _pmd,  !! 854                                 pmd, _pmd, flags & FOLL_WRITE))
1523                 update_mmu_cache_pmd(vma, add    855                 update_mmu_cache_pmd(vma, addr, pmd);
1524 }                                                856 }
1525                                                  857 
1526 struct page *follow_devmap_pmd(struct vm_area    858 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
1527                 pmd_t *pmd, int flags, struct !! 859                 pmd_t *pmd, int flags)
1528 {                                                860 {
1529         unsigned long pfn = pmd_pfn(*pmd);       861         unsigned long pfn = pmd_pfn(*pmd);
1530         struct mm_struct *mm = vma->vm_mm;       862         struct mm_struct *mm = vma->vm_mm;
                                                   >> 863         struct dev_pagemap *pgmap;
1531         struct page *page;                       864         struct page *page;
1532         int ret;                              << 
1533                                                  865 
1534         assert_spin_locked(pmd_lockptr(mm, pm    866         assert_spin_locked(pmd_lockptr(mm, pmd));
1535                                                  867 
                                                   >> 868         /*
                                                   >> 869          * When we COW a devmap PMD entry, we split it into PTEs, so we should
                                                   >> 870          * not be in this function with `flags & FOLL_COW` set.
                                                   >> 871          */
                                                   >> 872         WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
                                                   >> 873 
1536         if (flags & FOLL_WRITE && !pmd_write(    874         if (flags & FOLL_WRITE && !pmd_write(*pmd))
1537                 return NULL;                     875                 return NULL;
1538                                                  876 
1539         if (pmd_present(*pmd) && pmd_devmap(*    877         if (pmd_present(*pmd) && pmd_devmap(*pmd))
1540                 /* pass */;                      878                 /* pass */;
1541         else                                     879         else
1542                 return NULL;                     880                 return NULL;
1543                                                  881 
1544         if (flags & FOLL_TOUCH)                  882         if (flags & FOLL_TOUCH)
1545                 touch_pmd(vma, addr, pmd, fla !! 883                 touch_pmd(vma, addr, pmd, flags);
1546                                                  884 
1547         /*                                       885         /*
1548          * device mapped pages can only be re    886          * device mapped pages can only be returned if the
1549          * caller will manage the page refere    887          * caller will manage the page reference count.
1550          */                                      888          */
1551         if (!(flags & (FOLL_GET | FOLL_PIN))) !! 889         if (!(flags & FOLL_GET))
1552                 return ERR_PTR(-EEXIST);         890                 return ERR_PTR(-EEXIST);
1553                                                  891 
1554         pfn += (addr & ~PMD_MASK) >> PAGE_SHI    892         pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1555         *pgmap = get_dev_pagemap(pfn, *pgmap) !! 893         pgmap = get_dev_pagemap(pfn, NULL);
1556         if (!*pgmap)                          !! 894         if (!pgmap)
1557                 return ERR_PTR(-EFAULT);         895                 return ERR_PTR(-EFAULT);
1558         page = pfn_to_page(pfn);                 896         page = pfn_to_page(pfn);
1559         ret = try_grab_folio(page_folio(page) !! 897         get_page(page);
1560         if (ret)                              !! 898         put_dev_pagemap(pgmap);
1561                 page = ERR_PTR(ret);          << 
1562                                                  899 
1563         return page;                             900         return page;
1564 }                                                901 }
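The visible API change in follow_devmap_pmd() is that the newer code hands the device pagemap back to the caller through a struct dev_pagemap ** instead of taking and dropping the reference internally, and it now accepts FOLL_PIN as well as FOLL_GET. The sketch below illustrates the caller-side contract this implies; the real caller is the follow_page machinery in mm/gup.c, demo_walk_one() is purely illustrative, and the declaration of follow_devmap_pmd() is assumed to come from the mm-internal headers.

#include <linux/mm.h>
#include <linux/memremap.h>

/*
 * Illustrative only.  The pmd lock must already be held, and @flags must
 * include FOLL_GET or FOLL_PIN, otherwise the helper returns
 * ERR_PTR(-EEXIST) because nobody would own the page reference.
 */
static struct page *demo_walk_one(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t *pmd, int flags)
{
        struct dev_pagemap *pgmap = NULL;
        struct page *page;

        page = follow_devmap_pmd(vma, addr, pmd, flags, &pgmap);

        /* The caller now owns *pgmap and drops it once done with the page. */
        if (pgmap)
                put_dev_pagemap(pgmap);

        return page;
}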
1565                                                  902 
1566 int copy_huge_pmd(struct mm_struct *dst_mm, s    903 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1567                   pmd_t *dst_pmd, pmd_t *src_    904                   pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1568                   struct vm_area_struct *dst_ !! 905                   struct vm_area_struct *vma)
1569 {                                                906 {
1570         spinlock_t *dst_ptl, *src_ptl;           907         spinlock_t *dst_ptl, *src_ptl;
1571         struct page *src_page;                   908         struct page *src_page;
1572         struct folio *src_folio;              << 
1573         pmd_t pmd;                               909         pmd_t pmd;
1574         pgtable_t pgtable = NULL;                910         pgtable_t pgtable = NULL;
1575         int ret = -ENOMEM;                       911         int ret = -ENOMEM;
1576                                                  912 
1577         pmd = pmdp_get_lockless(src_pmd);     << 
1578         if (unlikely(pmd_present(pmd) && pmd_ << 
1579                 dst_ptl = pmd_lock(dst_mm, ds << 
1580                 src_ptl = pmd_lockptr(src_mm, << 
1581                 spin_lock_nested(src_ptl, SIN << 
1582                 /*                            << 
1583                  * No need to recheck the pmd << 
1584                  * mmap lock held here.       << 
1585                  *                            << 
1586                  * Meanwhile, making sure it' << 
1587                  * mapping, otherwise it mean << 
1588                  * applied special bit, or we << 
1589                  * able to wrongly write to t << 
1590                  */                           << 
1591                 VM_WARN_ON_ONCE(is_cow_mappin << 
1592                 goto set_pmd;                 << 
1593         }                                     << 
1594                                               << 
1595         /* Skip if can be re-fill on fault */    913         /* Skip if can be re-fill on fault */
1596         if (!vma_is_anonymous(dst_vma))       !! 914         if (!vma_is_anonymous(vma))
1597                 return 0;                        915                 return 0;
1598                                                  916 
1599         pgtable = pte_alloc_one(dst_mm);      !! 917         pgtable = pte_alloc_one(dst_mm, addr);
1600         if (unlikely(!pgtable))                  918         if (unlikely(!pgtable))
1601                 goto out;                        919                 goto out;
1602                                                  920 
1603         dst_ptl = pmd_lock(dst_mm, dst_pmd);     921         dst_ptl = pmd_lock(dst_mm, dst_pmd);
1604         src_ptl = pmd_lockptr(src_mm, src_pmd    922         src_ptl = pmd_lockptr(src_mm, src_pmd);
1605         spin_lock_nested(src_ptl, SINGLE_DEPT    923         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1606                                                  924 
1607         ret = -EAGAIN;                           925         ret = -EAGAIN;
1608         pmd = *src_pmd;                          926         pmd = *src_pmd;
1609                                                  927 
1610 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          928 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1611         if (unlikely(is_swap_pmd(pmd))) {        929         if (unlikely(is_swap_pmd(pmd))) {
1612                 swp_entry_t entry = pmd_to_sw    930                 swp_entry_t entry = pmd_to_swp_entry(pmd);
1613                                                  931 
1614                 VM_BUG_ON(!is_pmd_migration_e    932                 VM_BUG_ON(!is_pmd_migration_entry(pmd));
1615                 if (!is_readable_migration_en !! 933                 if (is_write_migration_entry(entry)) {
1616                         entry = make_readable !! 934                         make_migration_entry_read(&entry);
1617                                               << 
1618                         pmd = swp_entry_to_pm    935                         pmd = swp_entry_to_pmd(entry);
1619                         if (pmd_swp_soft_dirt    936                         if (pmd_swp_soft_dirty(*src_pmd))
1620                                 pmd = pmd_swp    937                                 pmd = pmd_swp_mksoft_dirty(pmd);
1621                         if (pmd_swp_uffd_wp(* << 
1622                                 pmd = pmd_swp << 
1623                         set_pmd_at(src_mm, ad    938                         set_pmd_at(src_mm, addr, src_pmd, pmd);
1624                 }                                939                 }
1625                 add_mm_counter(dst_mm, MM_ANO    940                 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1626                 mm_inc_nr_ptes(dst_mm);          941                 mm_inc_nr_ptes(dst_mm);
1627                 pgtable_trans_huge_deposit(ds    942                 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1628                 if (!userfaultfd_wp(dst_vma)) << 
1629                         pmd = pmd_swp_clear_u << 
1630                 set_pmd_at(dst_mm, addr, dst_    943                 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1631                 ret = 0;                         944                 ret = 0;
1632                 goto out_unlock;                 945                 goto out_unlock;
1633         }                                        946         }
1634 #endif                                           947 #endif
1635                                                  948 
1636         if (unlikely(!pmd_trans_huge(pmd))) {    949         if (unlikely(!pmd_trans_huge(pmd))) {
1637                 pte_free(dst_mm, pgtable);       950                 pte_free(dst_mm, pgtable);
1638                 goto out_unlock;                 951                 goto out_unlock;
1639         }                                        952         }
1640         /*                                       953         /*
1641          * When page table lock is held, the     954          * When page table lock is held, the huge zero pmd should not be
1642          * under splitting since we don't spl    955          * under splitting since we don't split the page itself, only pmd to
1643          * a page table.                         956          * a page table.
1644          */                                      957          */
1645         if (is_huge_zero_pmd(pmd)) {             958         if (is_huge_zero_pmd(pmd)) {
                                                   >> 959                 struct page *zero_page;
1646                 /*                               960                 /*
1647                  * mm_get_huge_zero_folio() w !! 961                  * get_huge_zero_page() will never allocate a new page here,
1648                  * folio here, since we alrea !! 962                  * since we already have a zero page to copy. It just takes a
1649                  * copy. It just takes a refe !! 963                  * reference.
1650                  */                              964                  */
1651                 mm_get_huge_zero_folio(dst_mm !! 965                 zero_page = mm_get_huge_zero_page(dst_mm);
1652                 goto out_zero_page;           !! 966                 set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
                                                   >> 967                                 zero_page);
                                                   >> 968                 ret = 0;
                                                   >> 969                 goto out_unlock;
1653         }                                        970         }
1654                                                  971 
1655         src_page = pmd_page(pmd);                972         src_page = pmd_page(pmd);
1656         VM_BUG_ON_PAGE(!PageHead(src_page), s    973         VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1657         src_folio = page_folio(src_page);     !! 974         get_page(src_page);
1658                                               !! 975         page_dup_rmap(src_page, true);
1659         folio_get(src_folio);                 << 
1660         if (unlikely(folio_try_dup_anon_rmap_ << 
1661                 /* Page maybe pinned: split a << 
1662                 folio_put(src_folio);         << 
1663                 pte_free(dst_mm, pgtable);    << 
1664                 spin_unlock(src_ptl);         << 
1665                 spin_unlock(dst_ptl);         << 
1666                 __split_huge_pmd(src_vma, src << 
1667                 return -EAGAIN;               << 
1668         }                                     << 
1669         add_mm_counter(dst_mm, MM_ANONPAGES,     976         add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1670 out_zero_page:                                << 
1671         mm_inc_nr_ptes(dst_mm);                  977         mm_inc_nr_ptes(dst_mm);
1672         pgtable_trans_huge_deposit(dst_mm, ds    978         pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
                                                   >> 979 
1673         pmdp_set_wrprotect(src_mm, addr, src_    980         pmdp_set_wrprotect(src_mm, addr, src_pmd);
1674         if (!userfaultfd_wp(dst_vma))         !! 981         pmd = pmd_mkold(pmd_wrprotect(pmd));
1675                 pmd = pmd_clear_uffd_wp(pmd); << 
1676         pmd = pmd_wrprotect(pmd);             << 
1677 set_pmd:                                      << 
1678         pmd = pmd_mkold(pmd);                 << 
1679         set_pmd_at(dst_mm, addr, dst_pmd, pmd    982         set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1680                                                  983 
1681         ret = 0;                                 984         ret = 0;
1682 out_unlock:                                      985 out_unlock:
1683         spin_unlock(src_ptl);                    986         spin_unlock(src_ptl);
1684         spin_unlock(dst_ptl);                    987         spin_unlock(dst_ptl);
1685 out:                                             988 out:
1686         return ret;                              989         return ret;
1687 }                                                990 }
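The return-value contract of copy_huge_pmd() is easiest to read from the call site's perspective: 0 means the huge PMD was duplicated into the child, -ENOMEM aborts the fork-time copy, and -EAGAIN means the entry could not be copied at PMD granularity (for instance the source was split because the anon folio may be pinned) and the range has to be copied at PTE level instead. The real caller is copy_pmd_range() in mm/memory.c; the wrapper below is only a hedged illustration of that contract, and demo_copy_one_pmd() is an assumption.

#include <linux/mm.h>
#include <linux/huge_mm.h>

/* Illustrative wrapper: translate the contract described above. */
static int demo_copy_one_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                             pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                             struct vm_area_struct *dst_vma,
                             struct vm_area_struct *src_vma)
{
        int err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
                                dst_vma, src_vma);

        if (err == -ENOMEM)
                return err;     /* propagate the allocation failure */
        if (err == -EAGAIN)
                return 1;       /* tell the caller to fall back to PTEs */
        return 0;               /* huge PMD copied */
}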
1688                                                  991 
1689 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    992 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1690 void touch_pud(struct vm_area_struct *vma, un !! 993 static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1691                pud_t *pud, bool write)        !! 994                 pud_t *pud, int flags)
1692 {                                                995 {
1693         pud_t _pud;                              996         pud_t _pud;
1694                                                  997 
1695         _pud = pud_mkyoung(*pud);                998         _pud = pud_mkyoung(*pud);
1696         if (write)                            !! 999         if (flags & FOLL_WRITE)
1697                 _pud = pud_mkdirty(_pud);        1000                 _pud = pud_mkdirty(_pud);
1698         if (pudp_set_access_flags(vma, addr &    1001         if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1699                                   pud, _pud,  !! 1002                                 pud, _pud, flags & FOLL_WRITE))
1700                 update_mmu_cache_pud(vma, add    1003                 update_mmu_cache_pud(vma, addr, pud);
1701 }                                                1004 }
1702                                                  1005 
                                                   >> 1006 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                                                   >> 1007                 pud_t *pud, int flags)
                                                   >> 1008 {
                                                   >> 1009         unsigned long pfn = pud_pfn(*pud);
                                                   >> 1010         struct mm_struct *mm = vma->vm_mm;
                                                   >> 1011         struct dev_pagemap *pgmap;
                                                   >> 1012         struct page *page;
                                                   >> 1013 
                                                   >> 1014         assert_spin_locked(pud_lockptr(mm, pud));
                                                   >> 1015 
                                                   >> 1016         if (flags & FOLL_WRITE && !pud_write(*pud))
                                                   >> 1017                 return NULL;
                                                   >> 1018 
                                                   >> 1019         if (pud_present(*pud) && pud_devmap(*pud))
                                                   >> 1020                 /* pass */;
                                                   >> 1021         else
                                                   >> 1022                 return NULL;
                                                   >> 1023 
                                                   >> 1024         if (flags & FOLL_TOUCH)
                                                   >> 1025                 touch_pud(vma, addr, pud, flags);
                                                   >> 1026 
                                                   >> 1027         /*
                                                   >> 1028          * device mapped pages can only be returned if the
                                                   >> 1029          * caller will manage the page reference count.
                                                   >> 1030          */
                                                   >> 1031         if (!(flags & FOLL_GET))
                                                   >> 1032                 return ERR_PTR(-EEXIST);
                                                   >> 1033 
                                                   >> 1034         pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
                                                   >> 1035         pgmap = get_dev_pagemap(pfn, NULL);
                                                   >> 1036         if (!pgmap)
                                                   >> 1037                 return ERR_PTR(-EFAULT);
                                                   >> 1038         page = pfn_to_page(pfn);
                                                   >> 1039         get_page(page);
                                                   >> 1040         put_dev_pagemap(pgmap);
                                                   >> 1041 
                                                   >> 1042         return page;
                                                   >> 1043 }
                                                   >> 1044 
1703 int copy_huge_pud(struct mm_struct *dst_mm, s    1045 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1704                   pud_t *dst_pud, pud_t *src_    1046                   pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1705                   struct vm_area_struct *vma)    1047                   struct vm_area_struct *vma)
1706 {                                                1048 {
1707         spinlock_t *dst_ptl, *src_ptl;           1049         spinlock_t *dst_ptl, *src_ptl;
1708         pud_t pud;                               1050         pud_t pud;
1709         int ret;                                 1051         int ret;
1710                                                  1052 
1711         dst_ptl = pud_lock(dst_mm, dst_pud);     1053         dst_ptl = pud_lock(dst_mm, dst_pud);
1712         src_ptl = pud_lockptr(src_mm, src_pud    1054         src_ptl = pud_lockptr(src_mm, src_pud);
1713         spin_lock_nested(src_ptl, SINGLE_DEPT    1055         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1714                                                  1056 
1715         ret = -EAGAIN;                           1057         ret = -EAGAIN;
1716         pud = *src_pud;                          1058         pud = *src_pud;
1717         if (unlikely(!pud_trans_huge(pud) &&     1059         if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1718                 goto out_unlock;                 1060                 goto out_unlock;
1719                                                  1061 
1720         /*                                       1062         /*
1721          * TODO: once we support anonymous pa !! 1063          * When page table lock is held, the huge zero pud should not be
1722          * folio_try_dup_anon_rmap_*() and sp !! 1064          * under splitting since we don't split the page itself, only pud to
                                                   >> 1065          * a page table.
1723          */                                      1066          */
1724         if (is_cow_mapping(vma->vm_flags) &&  !! 1067         if (is_huge_zero_pud(pud)) {
1725                 pudp_set_wrprotect(src_mm, ad !! 1068                 /* No huge zero pud yet */
1726                 pud = pud_wrprotect(pud);     << 
1727         }                                        1069         }
1728         pud = pud_mkold(pud);                 !! 1070 
                                                   >> 1071         pudp_set_wrprotect(src_mm, addr, src_pud);
                                                   >> 1072         pud = pud_mkold(pud_wrprotect(pud));
1729         set_pud_at(dst_mm, addr, dst_pud, pud    1073         set_pud_at(dst_mm, addr, dst_pud, pud);
1730                                                  1074 
1731         ret = 0;                                 1075         ret = 0;
1732 out_unlock:                                      1076 out_unlock:
1733         spin_unlock(src_ptl);                    1077         spin_unlock(src_ptl);
1734         spin_unlock(dst_ptl);                    1078         spin_unlock(dst_ptl);
1735         return ret;                              1079         return ret;
1736 }                                                1080 }
1737                                                  1081 
1738 void huge_pud_set_accessed(struct vm_fault *v    1082 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1739 {                                                1083 {
                                                   >> 1084         pud_t entry;
                                                   >> 1085         unsigned long haddr;
1740         bool write = vmf->flags & FAULT_FLAG_    1086         bool write = vmf->flags & FAULT_FLAG_WRITE;
1741                                                  1087 
1742         vmf->ptl = pud_lock(vmf->vma->vm_mm,     1088         vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1743         if (unlikely(!pud_same(*vmf->pud, ori    1089         if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1744                 goto unlock;                     1090                 goto unlock;
1745                                                  1091 
1746         touch_pud(vmf->vma, vmf->address, vmf !! 1092         entry = pud_mkyoung(orig_pud);
                                                   >> 1093         if (write)
                                                   >> 1094                 entry = pud_mkdirty(entry);
                                                   >> 1095         haddr = vmf->address & HPAGE_PUD_MASK;
                                                   >> 1096         if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
                                                   >> 1097                 update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
                                                   >> 1098 
1747 unlock:                                          1099 unlock:
1748         spin_unlock(vmf->ptl);                   1100         spin_unlock(vmf->ptl);
1749 }                                                1101 }
1750 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    1102 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1751                                                  1103 
1752 void huge_pmd_set_accessed(struct vm_fault *v !! 1104 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
1753 {                                                1105 {
                                                   >> 1106         pmd_t entry;
                                                   >> 1107         unsigned long haddr;
1754         bool write = vmf->flags & FAULT_FLAG_    1108         bool write = vmf->flags & FAULT_FLAG_WRITE;
1755                                                  1109 
1756         vmf->ptl = pmd_lock(vmf->vma->vm_mm,     1110         vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1757         if (unlikely(!pmd_same(*vmf->pmd, vmf !! 1111         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1758                 goto unlock;                     1112                 goto unlock;
1759                                                  1113 
1760         touch_pmd(vmf->vma, vmf->address, vmf !! 1114         entry = pmd_mkyoung(orig_pmd);
                                                   >> 1115         if (write)
                                                   >> 1116                 entry = pmd_mkdirty(entry);
                                                   >> 1117         haddr = vmf->address & HPAGE_PMD_MASK;
                                                   >> 1118         if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
                                                   >> 1119                 update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
1761                                                  1120 
1762 unlock:                                          1121 unlock:
1763         spin_unlock(vmf->ptl);                   1122         spin_unlock(vmf->ptl);
1764 }                                                1123 }
1765                                                  1124 
1766 vm_fault_t do_huge_pmd_wp_page(struct vm_faul !! 1125 static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
                                                   >> 1126                 struct page *page)
1767 {                                                1127 {
1768         const bool unshare = vmf->flags & FAU << 
1769         struct vm_area_struct *vma = vmf->vma    1128         struct vm_area_struct *vma = vmf->vma;
1770         struct folio *folio;                  << 
1771         struct page *page;                    << 
1772         unsigned long haddr = vmf->address &     1129         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1773         pmd_t orig_pmd = vmf->orig_pmd;       !! 1130         struct mem_cgroup *memcg;
1774                                               !! 1131         pgtable_t pgtable;
1775         vmf->ptl = pmd_lockptr(vma->vm_mm, vm !! 1132         pmd_t _pmd;
1776         VM_BUG_ON_VMA(!vma->anon_vma, vma);   !! 1133         int ret = 0, i;
1777                                               !! 1134         struct page **pages;
1778         if (is_huge_zero_pmd(orig_pmd))       !! 1135         unsigned long mmun_start;       /* For mmu_notifiers */
1779                 goto fallback;                !! 1136         unsigned long mmun_end;         /* For mmu_notifiers */
                                                   >> 1137 
                                                   >> 1138         pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
                                                   >> 1139                         GFP_KERNEL);
                                                   >> 1140         if (unlikely(!pages)) {
                                                   >> 1141                 ret |= VM_FAULT_OOM;
                                                   >> 1142                 goto out;
                                                   >> 1143         }
1780                                                  1144 
1781         spin_lock(vmf->ptl);                  !! 1145         for (i = 0; i < HPAGE_PMD_NR; i++) {
                                                   >> 1146                 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
                                                   >> 1147                                                vmf->address, page_to_nid(page));
                                                   >> 1148                 if (unlikely(!pages[i] ||
                                                   >> 1149                              mem_cgroup_try_charge(pages[i], vma->vm_mm,
                                                   >> 1150                                      GFP_KERNEL, &memcg, false))) {
                                                   >> 1151                         if (pages[i])
                                                   >> 1152                                 put_page(pages[i]);
                                                   >> 1153                         while (--i >= 0) {
                                                   >> 1154                                 memcg = (void *)page_private(pages[i]);
                                                   >> 1155                                 set_page_private(pages[i], 0);
                                                   >> 1156                                 mem_cgroup_cancel_charge(pages[i], memcg,
                                                   >> 1157                                                 false);
                                                   >> 1158                                 put_page(pages[i]);
                                                   >> 1159                         }
                                                   >> 1160                         kfree(pages);
                                                   >> 1161                         ret |= VM_FAULT_OOM;
                                                   >> 1162                         goto out;
                                                   >> 1163                 }
                                                   >> 1164                 set_page_private(pages[i], (unsigned long)memcg);
                                                   >> 1165         }
1782                                                  1166 
1783         if (unlikely(!pmd_same(*vmf->pmd, ori !! 1167         for (i = 0; i < HPAGE_PMD_NR; i++) {
1784                 spin_unlock(vmf->ptl);        !! 1168                 copy_user_highpage(pages[i], page + i,
1785                 return 0;                     !! 1169                                    haddr + PAGE_SIZE * i, vma);
                                                   >> 1170                 __SetPageUptodate(pages[i]);
                                                   >> 1171                 cond_resched();
1786         }                                        1172         }
1787                                                  1173 
1788         page = pmd_page(orig_pmd);            !! 1174         mmun_start = haddr;
1789         folio = page_folio(page);             !! 1175         mmun_end   = haddr + HPAGE_PMD_SIZE;
                                                   >> 1176         mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
                                                   >> 1177 
                                                   >> 1178         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
                                                   >> 1179         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
                                                   >> 1180                 goto out_free_pages;
1790         VM_BUG_ON_PAGE(!PageHead(page), page)    1181         VM_BUG_ON_PAGE(!PageHead(page), page);
1791                                                  1182 
1792         /* Early check when only holding the  !! 1183         /*
1793         if (PageAnonExclusive(page))          !! 1184          * Leave pmd empty until pte is filled note we must notify here as
1794                 goto reuse;                   !! 1185          * concurrent CPU thread might write to new page before the call to
                                                   >> 1186          * mmu_notifier_invalidate_range_end() happens which can lead to a
                                                   >> 1187          * device seeing memory write in different order than CPU.
                                                   >> 1188          *
                                                   >> 1189          * See Documentation/vm/mmu_notifier.txt
                                                   >> 1190          */
                                                   >> 1191         pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1795                                                  1192 
1796         if (!folio_trylock(folio)) {          !! 1193         pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
1797                 folio_get(folio);             !! 1194         pmd_populate(vma->vm_mm, &_pmd, pgtable);
1798                 spin_unlock(vmf->ptl);        << 
1799                 folio_lock(folio);            << 
1800                 spin_lock(vmf->ptl);          << 
1801                 if (unlikely(!pmd_same(*vmf-> << 
1802                         spin_unlock(vmf->ptl) << 
1803                         folio_unlock(folio);  << 
1804                         folio_put(folio);     << 
1805                         return 0;             << 
1806                 }                             << 
1807                 folio_put(folio);             << 
1808         }                                     << 
1809                                                  1195 
1810         /* Recheck after temporarily dropping !! 1196         for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1811         if (PageAnonExclusive(page)) {        !! 1197                 pte_t entry;
1812                 folio_unlock(folio);          !! 1198                 entry = mk_pte(pages[i], vma->vm_page_prot);
1813                 goto reuse;                   !! 1199                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                                                   >> 1200                 memcg = (void *)page_private(pages[i]);
                                                   >> 1201                 set_page_private(pages[i], 0);
                                                   >> 1202                 page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
                                                   >> 1203                 mem_cgroup_commit_charge(pages[i], memcg, false, false);
                                                   >> 1204                 lru_cache_add_active_or_unevictable(pages[i], vma);
                                                   >> 1205                 vmf->pte = pte_offset_map(&_pmd, haddr);
                                                   >> 1206                 VM_BUG_ON(!pte_none(*vmf->pte));
                                                   >> 1207                 set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
                                                   >> 1208                 pte_unmap(vmf->pte);
1814         }                                        1209         }
                                                   >> 1210         kfree(pages);
                                                   >> 1211 
                                                   >> 1212         smp_wmb(); /* make pte visible before pmd */
                                                   >> 1213         pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
                                                   >> 1214         page_remove_rmap(page, true);
                                                   >> 1215         spin_unlock(vmf->ptl);
1815                                                  1216 
1816         /*                                       1217         /*
1817          * See do_wp_page(): we can only reus !! 1218          * No need to double call mmu_notifier->invalidate_range() callback as
1818          * there are no additional references !! 1219          * the above pmdp_huge_clear_flush_notify() did already call it.
1819          * the LRU cache immediately after ad << 
1820          */                                      1220          */
1821         if (folio_ref_count(folio) >          !! 1221         mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
1822                         1 + folio_test_swapca !! 1222                                                 mmun_end);
1823                 goto unlock_fallback;         << 
1824         if (folio_test_swapcache(folio))      << 
1825                 folio_free_swap(folio);       << 
1826         if (folio_ref_count(folio) == 1) {    << 
1827                 pmd_t entry;                  << 
1828                                                  1223 
1829                 folio_move_anon_rmap(folio, v !! 1224         ret |= VM_FAULT_WRITE;
1830                 SetPageAnonExclusive(page);   !! 1225         put_page(page);
1831                 folio_unlock(folio);          !! 1226 
1832 reuse:                                        !! 1227 out:
1833                 if (unlikely(unshare)) {      !! 1228         return ret;
1834                         spin_unlock(vmf->ptl) !! 1229 
1835                         return 0;             !! 1230 out_free_pages:
                                                   >> 1231         spin_unlock(vmf->ptl);
                                                   >> 1232         mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
                                                   >> 1233         for (i = 0; i < HPAGE_PMD_NR; i++) {
                                                   >> 1234                 memcg = (void *)page_private(pages[i]);
                                                   >> 1235                 set_page_private(pages[i], 0);
                                                   >> 1236                 mem_cgroup_cancel_charge(pages[i], memcg, false);
                                                   >> 1237                 put_page(pages[i]);
                                                   >> 1238         }
                                                   >> 1239         kfree(pages);
                                                   >> 1240         goto out;
                                                   >> 1241 }
                                                   >> 1242 
                                                   >> 1243 int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
                                                   >> 1244 {
                                                   >> 1245         struct vm_area_struct *vma = vmf->vma;
                                                   >> 1246         struct page *page = NULL, *new_page;
                                                   >> 1247         struct mem_cgroup *memcg;
                                                   >> 1248         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
                                                   >> 1249         unsigned long mmun_start;       /* For mmu_notifiers */
                                                   >> 1250         unsigned long mmun_end;         /* For mmu_notifiers */
                                                   >> 1251         gfp_t huge_gfp;                 /* for allocation and charge */
                                                   >> 1252         int ret = 0;
                                                   >> 1253 
                                                   >> 1254         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
                                                   >> 1255         VM_BUG_ON_VMA(!vma->anon_vma, vma);
                                                   >> 1256         if (is_huge_zero_pmd(orig_pmd))
                                                   >> 1257                 goto alloc;
                                                   >> 1258         spin_lock(vmf->ptl);
                                                   >> 1259         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
                                                   >> 1260                 goto out_unlock;
                                                   >> 1261 
                                                   >> 1262         page = pmd_page(orig_pmd);
                                                   >> 1263         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
                                                   >> 1264         /*
                                                   >> 1265          * We can only reuse the page if nobody else maps the huge page or it's
                                                   >> 1266          * part.
                                                   >> 1267          */
                                                   >> 1268         if (!trylock_page(page)) {
                                                   >> 1269                 get_page(page);
                                                   >> 1270                 spin_unlock(vmf->ptl);
                                                   >> 1271                 lock_page(page);
                                                   >> 1272                 spin_lock(vmf->ptl);
                                                   >> 1273                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                                                   >> 1274                         unlock_page(page);
                                                   >> 1275                         put_page(page);
                                                   >> 1276                         goto out_unlock;
1836                 }                                1277                 }
                                                   >> 1278                 put_page(page);
                                                   >> 1279         }
                                                   >> 1280         if (reuse_swap_page(page, NULL)) {
                                                   >> 1281                 pmd_t entry;
1837                 entry = pmd_mkyoung(orig_pmd)    1282                 entry = pmd_mkyoung(orig_pmd);
1838                 entry = maybe_pmd_mkwrite(pmd    1283                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1839                 if (pmdp_set_access_flags(vma !! 1284                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
1840                         update_mmu_cache_pmd(    1285                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1841                 spin_unlock(vmf->ptl);        !! 1286                 ret |= VM_FAULT_WRITE;
1842                 return 0;                     !! 1287                 unlock_page(page);
                                                   >> 1288                 goto out_unlock;
                                                   >> 1289         }
                                                   >> 1290         unlock_page(page);
                                                   >> 1291         get_page(page);
                                                   >> 1292         spin_unlock(vmf->ptl);
                                                   >> 1293 alloc:
                                                   >> 1294         if (transparent_hugepage_enabled(vma) &&
                                                   >> 1295             !transparent_hugepage_debug_cow()) {
                                                   >> 1296                 huge_gfp = alloc_hugepage_direct_gfpmask(vma);
                                                   >> 1297                 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
                                                   >> 1298         } else
                                                   >> 1299                 new_page = NULL;
                                                   >> 1300 
                                                   >> 1301         if (likely(new_page)) {
                                                   >> 1302                 prep_transhuge_page(new_page);
                                                   >> 1303         } else {
                                                   >> 1304                 if (!page) {
                                                   >> 1305                         split_huge_pmd(vma, vmf->pmd, vmf->address);
                                                   >> 1306                         ret |= VM_FAULT_FALLBACK;
                                                   >> 1307                 } else {
                                                   >> 1308                         ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
                                                   >> 1309                         if (ret & VM_FAULT_OOM) {
                                                   >> 1310                                 split_huge_pmd(vma, vmf->pmd, vmf->address);
                                                   >> 1311                                 ret |= VM_FAULT_FALLBACK;
                                                   >> 1312                         }
                                                   >> 1313                         put_page(page);
                                                   >> 1314                 }
                                                   >> 1315                 count_vm_event(THP_FAULT_FALLBACK);
                                                   >> 1316                 goto out;
                                                   >> 1317         }
                                                   >> 1318 
                                                   >> 1319         if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
                                                   >> 1320                                 huge_gfp | __GFP_NORETRY, &memcg, true))) {
                                                   >> 1321                 put_page(new_page);
                                                   >> 1322                 split_huge_pmd(vma, vmf->pmd, vmf->address);
                                                   >> 1323                 if (page)
                                                   >> 1324                         put_page(page);
                                                   >> 1325                 ret |= VM_FAULT_FALLBACK;
                                                   >> 1326                 count_vm_event(THP_FAULT_FALLBACK);
                                                   >> 1327                 goto out;
1843         }                                        1328         }
1844                                                  1329 
1845 unlock_fallback:                              !! 1330         count_vm_event(THP_FAULT_ALLOC);
1846         folio_unlock(folio);                  !! 1331 
                                                   >> 1332         if (!page)
                                                   >> 1333                 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
                                                   >> 1334         else
                                                   >> 1335                 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
                                                   >> 1336         __SetPageUptodate(new_page);
                                                   >> 1337 
                                                   >> 1338         mmun_start = haddr;
                                                   >> 1339         mmun_end   = haddr + HPAGE_PMD_SIZE;
                                                   >> 1340         mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
                                                   >> 1341 
                                                   >> 1342         spin_lock(vmf->ptl);
                                                   >> 1343         if (page)
                                                   >> 1344                 put_page(page);
                                                   >> 1345         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                                                   >> 1346                 spin_unlock(vmf->ptl);
                                                   >> 1347                 mem_cgroup_cancel_charge(new_page, memcg, true);
                                                   >> 1348                 put_page(new_page);
                                                   >> 1349                 goto out_mn;
                                                   >> 1350         } else {
                                                   >> 1351                 pmd_t entry;
                                                   >> 1352                 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
                                                   >> 1353                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                                                   >> 1354                 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
                                                   >> 1355                 page_add_new_anon_rmap(new_page, vma, haddr, true);
                                                   >> 1356                 mem_cgroup_commit_charge(new_page, memcg, false, true);
                                                   >> 1357                 lru_cache_add_active_or_unevictable(new_page, vma);
                                                   >> 1358                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
                                                   >> 1359                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                                                   >> 1360                 if (!page) {
                                                   >> 1361                         add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                                                   >> 1362                 } else {
                                                   >> 1363                         VM_BUG_ON_PAGE(!PageHead(page), page);
                                                   >> 1364                         page_remove_rmap(page, true);
                                                   >> 1365                         put_page(page);
                                                   >> 1366                 }
                                                   >> 1367                 ret |= VM_FAULT_WRITE;
                                                   >> 1368         }
1847         spin_unlock(vmf->ptl);                   1369         spin_unlock(vmf->ptl);
1848 fallback:                                     !! 1370 out_mn:
1849         __split_huge_pmd(vma, vmf->pmd, vmf-> !! 1371         /*
1850         return VM_FAULT_FALLBACK;             !! 1372          * No need to double call mmu_notifier->invalidate_range() callback as
                                                   >> 1373          * the above pmdp_huge_clear_flush_notify() did already call it.
                                                   >> 1374          */
                                                   >> 1375         mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
                                                   >> 1376                                                mmun_end);
                                                   >> 1377 out:
                                                   >> 1378         return ret;
                                                   >> 1379 out_unlock:
                                                   >> 1380         spin_unlock(vmf->ptl);
                                                   >> 1381         return ret;
1851 }                                                1382 }
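
The block above is the tail of the huge-PMD write-protect (COW) fault handler: the 6.12 code falls back by splitting the PMD, while the 4.16 code allocates, charges and installs a replacement huge page. Below is a minimal userspace sketch of how this path gets exercised, assuming x86-64's 2 MB PMD size and THP enabled in "madvise" or "always" mode (the kernel may still serve the fault with base pages); it is an illustration, not part of this file.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL << 20)          /* assumed PMD huge page size (x86-64) */

int main(void)
{
        size_t len = 2 * HPAGE_SIZE;

        /* Over-allocate so we can carve out a PMD-aligned start address. */
        char *raw = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (raw == MAP_FAILED) { perror("mmap"); return 1; }
        char *buf = (char *)(((uintptr_t)raw + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1));

        madvise(buf, len, MADV_HUGEPAGE);       /* ask for THP backing (best effort) */
        memset(buf, 0xaa, len);                 /* populate the region */

        pid_t pid = fork();
        if (pid == 0) {
                /*
                 * After fork() the huge PMD is write-protected for COW in
                 * both processes; this store raises the PMD write fault
                 * that do_huge_pmd_wp_page() resolves (reuse the folio, or
                 * fall back and split/copy, depending on the kernel version).
                 */
                buf[0] = 0x55;
                _exit(0);
        }
        waitpid(pid, NULL, 0);
        printf("parent still sees 0x%02x after the child's private write\n",
               (unsigned char)buf[0]);
        return 0;
}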
1852                                                  1383 
1853 static inline bool can_change_pmd_writable(st !! 1384 /*
1854                                            un !! 1385  * FOLL_FORCE can write to even unwritable pmd's, but only
                                                   >> 1386  * after we've gone through a COW cycle and they are dirty.
                                                   >> 1387  */
                                                   >> 1388 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
1855 {                                                1389 {
1856         struct page *page;                    !! 1390         return pmd_write(pmd) ||
                                                   >> 1391                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
                                                   >> 1392 }
1857                                                  1393 
1858         if (WARN_ON_ONCE(!(vma->vm_flags & VM !! 1394 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1859                 return false;                 !! 1395                                    unsigned long addr,
                                                   >> 1396                                    pmd_t *pmd,
                                                   >> 1397                                    unsigned int flags)
                                                   >> 1398 {
                                                   >> 1399         struct mm_struct *mm = vma->vm_mm;
                                                   >> 1400         struct page *page = NULL;
1860                                                  1401 
1861         /* Don't touch entries that are not e !! 1402         assert_spin_locked(pmd_lockptr(mm, pmd));
1862         if (pmd_protnone(pmd))                << 
1863                 return false;                 << 
1864                                                  1403 
1865         /* Do we need write faults for softdi !! 1404         if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
1866         if (pmd_needs_soft_dirty_wp(vma, pmd) !! 1405                 goto out;
1867                 return false;                 << 
1868                                                  1406 
1869         /* Do we need write faults for uffd-w !! 1407         /* Avoid dumping huge zero page */
1870         if (userfaultfd_huge_pmd_wp(vma, pmd) !! 1408         if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1871                 return false;                 !! 1409                 return ERR_PTR(-EFAULT);
1872                                                  1410 
1873         if (!(vma->vm_flags & VM_SHARED)) {   !! 1411         /* Full NUMA hinting faults to serialise migration in fault paths */
1874                 /* See can_change_pte_writabl !! 1412         if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
1875                 page = vm_normal_page_pmd(vma !! 1413                 goto out;
1876                 return page && PageAnon(page) !! 1414 
1877         }                                     !! 1415         page = pmd_page(*pmd);
                                                   >> 1416         VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
                                                   >> 1417         if (flags & FOLL_TOUCH)
                                                   >> 1418                 touch_pmd(vma, addr, pmd, flags);
                                                   >> 1419         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                                                   >> 1420                 /*
                                                   >> 1421                  * We don't mlock() pte-mapped THPs. This way we can avoid
                                                   >> 1422                  * leaking mlocked pages into non-VM_LOCKED VMAs.
                                                   >> 1423                  *
                                                   >> 1424                  * For anon THP:
                                                   >> 1425                  *
                                                   >> 1426                  * In most cases the pmd is the only mapping of the page as we
                                                   >> 1427                  * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
                                                   >> 1428                  * writable private mappings in populate_vma_page_range().
                                                   >> 1429                  *
                                                   >> 1430                  * The only scenario when we have the page shared here is if we are
                                                   >> 1431                  * mlocking a read-only mapping shared over fork(). We skip

                                                   >> 1432                  * mlocking such pages.
                                                   >> 1433                  *
                                                   >> 1434                  * For file THP:
                                                   >> 1435                  *
                                                   >> 1436                  * We can expect PageDoubleMap() to be stable under page lock:
                                                   >> 1437                  * for file pages we set it in page_add_file_rmap(), which
                                                   >> 1438                  * requires page to be locked.
                                                   >> 1439                  */
                                                   >> 1440 
                                                   >> 1441                 if (PageAnon(page) && compound_mapcount(page) != 1)
                                                   >> 1442                         goto skip_mlock;
                                                   >> 1443                 if (PageDoubleMap(page) || !page->mapping)
                                                   >> 1444                         goto skip_mlock;
                                                   >> 1445                 if (!trylock_page(page))
                                                   >> 1446                         goto skip_mlock;
                                                   >> 1447                 lru_add_drain();
                                                   >> 1448                 if (page->mapping && !PageDoubleMap(page))
                                                   >> 1449                         mlock_vma_page(page);
                                                   >> 1450                 unlock_page(page);
                                                   >> 1451         }
                                                   >> 1452 skip_mlock:
                                                   >> 1453         page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
                                                   >> 1454         VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
                                                   >> 1455         if (flags & FOLL_GET)
                                                   >> 1456                 get_page(page);
1878                                                  1457 
1879         /* See can_change_pte_writable(). */  !! 1458 out:
1880         return pmd_dirty(pmd);                !! 1459         return page;
1881 }                                                1460 }
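
The 4.16 helper can_follow_write_pmd() above lets FOLL_FORCE callers write through a write-protected huge PMD once a COW cycle has made it dirty; the 6.12 helper can_change_pmd_writable() makes the analogous decision for protection faults. A userspace sketch of the FOLL_FORCE case, assuming 2 MB PMD pages and best-effort THP backing: writes through /proc/self/mem reach the mapping via get_user_pages() with FOLL_FORCE even though the VMA itself is read-only.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL << 20)          /* assumed PMD huge page size */

int main(void)
{
        char *buf = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) { perror("mmap"); return 1; }
        madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE);        /* best effort */
        memset(buf, 'a', HPAGE_SIZE);                   /* populate */

        /* Drop the write permission: the mapping becomes read-only. */
        if (mprotect(buf, HPAGE_SIZE, PROT_READ)) { perror("mprotect"); return 1; }

        /*
         * A write through /proc/self/mem goes through get_user_pages()
         * with FOLL_FORCE: the kernel breaks COW and performs the write
         * even though the VMA is not writable, the situation these
         * helpers are gating.
         */
        int fd = open("/proc/self/mem", O_RDWR);
        if (fd < 0) { perror("open /proc/self/mem"); return 1; }
        if (pwrite(fd, "B", 1, (off_t)(uintptr_t)buf) != 1)
                perror("pwrite");
        close(fd);

        printf("buf[0] = %c (mapping is still PROT_READ)\n", buf[0]);
        return 0;
}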
1882                                                  1461 
1883 /* NUMA hinting page fault entry point for tr    1462 /* NUMA hinting page fault entry point for trans huge pmds */
1884 vm_fault_t do_huge_pmd_numa_page(struct vm_fa !! 1463 int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1885 {                                                1464 {
1886         struct vm_area_struct *vma = vmf->vma    1465         struct vm_area_struct *vma = vmf->vma;
1887         struct folio *folio;                  !! 1466         struct anon_vma *anon_vma = NULL;
                                                   >> 1467         struct page *page;
1888         unsigned long haddr = vmf->address &     1468         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1889         int nid = NUMA_NO_NODE;               !! 1469         int page_nid = -1, this_nid = numa_node_id();
1890         int target_nid, last_cpupid;          !! 1470         int target_nid, last_cpupid = -1;
1891         pmd_t pmd, old_pmd;                   !! 1471         bool page_locked;
1892         bool writable = false;                !! 1472         bool migrated = false;
                                                   >> 1473         bool was_writable;
1893         int flags = 0;                           1474         int flags = 0;
1894                                                  1475 
1895         vmf->ptl = pmd_lock(vma->vm_mm, vmf->    1476         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1896         old_pmd = pmdp_get(vmf->pmd);         !! 1477         if (unlikely(!pmd_same(pmd, *vmf->pmd)))
                                                   >> 1478                 goto out_unlock;
1897                                                  1479 
1898         if (unlikely(!pmd_same(old_pmd, vmf-> !! 1480         /*
                                                   >> 1481          * If there are potential migrations, wait for completion and retry
                                                   >> 1482          * without disrupting NUMA hinting information. Do not relock and
                                                   >> 1483          * check_same as the page may no longer be mapped.
                                                   >> 1484          */
                                                   >> 1485         if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
                                                   >> 1486                 page = pmd_page(*vmf->pmd);
                                                   >> 1487                 if (!get_page_unless_zero(page))
                                                   >> 1488                         goto out_unlock;
1899                 spin_unlock(vmf->ptl);           1489                 spin_unlock(vmf->ptl);
1900                 return 0;                     !! 1490                 wait_on_page_locked(page);
                                                   >> 1491                 put_page(page);
                                                   >> 1492                 goto out;
1901         }                                        1493         }
1902                                                  1494 
1903         pmd = pmd_modify(old_pmd, vma->vm_pag !! 1495         page = pmd_page(pmd);
                                                   >> 1496         BUG_ON(is_huge_zero_page(page));
                                                   >> 1497         page_nid = page_to_nid(page);
                                                   >> 1498         last_cpupid = page_cpupid_last(page);
                                                   >> 1499         count_vm_numa_event(NUMA_HINT_FAULTS);
                                                   >> 1500         if (page_nid == this_nid) {
                                                   >> 1501                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
                                                   >> 1502                 flags |= TNF_FAULT_LOCAL;
                                                   >> 1503         }
                                                   >> 1504 
                                                   >> 1505         /* See similar comment in do_numa_page for explanation */
                                                   >> 1506         if (!pmd_savedwrite(pmd))
                                                   >> 1507                 flags |= TNF_NO_GROUP;
                                                   >> 1508 
                                                   >> 1509         /*
                                                   >> 1510          * Acquire the page lock to serialise THP migrations but avoid dropping
                                                   >> 1511          * page_table_lock if at all possible
                                                   >> 1512          */
                                                   >> 1513         page_locked = trylock_page(page);
                                                   >> 1514         target_nid = mpol_misplaced(page, vma, haddr);
                                                   >> 1515         if (target_nid == -1) {
                                                   >> 1516                 /* If the page was locked, there are no parallel migrations */
                                                   >> 1517                 if (page_locked)
                                                   >> 1518                         goto clear_pmdnuma;
                                                   >> 1519         }
                                                   >> 1520 
                                                   >> 1521         /* Migration could have started since the pmd_trans_migrating check */
                                                   >> 1522         if (!page_locked) {
                                                   >> 1523                 page_nid = -1;
                                                   >> 1524                 if (!get_page_unless_zero(page))
                                                   >> 1525                         goto out_unlock;
                                                   >> 1526                 spin_unlock(vmf->ptl);
                                                   >> 1527                 wait_on_page_locked(page);
                                                   >> 1528                 put_page(page);
                                                   >> 1529                 goto out;
                                                   >> 1530         }
1904                                                  1531 
1905         /*                                       1532         /*
1906          * Detect now whether the PMD could b !! 1533          * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
1907          * is only valid while holding the PT !! 1534          * to serialise splits
1908          */                                      1535          */
1909         writable = pmd_write(pmd);            !! 1536         get_page(page);
1910         if (!writable && vma_wants_manual_pte !! 1537         spin_unlock(vmf->ptl);
1911             can_change_pmd_writable(vma, vmf- !! 1538         anon_vma = page_lock_anon_vma_read(page);
1912                 writable = true;              << 
1913                                               << 
1914         folio = vm_normal_folio_pmd(vma, hadd << 
1915         if (!folio)                           << 
1916                 goto out_map;                 << 
1917                                                  1539 
1918         nid = folio_nid(folio);               !! 1540         /* Confirm the PMD did not change while page_table_lock was released */
                                                   >> 1541         spin_lock(vmf->ptl);
                                                   >> 1542         if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
                                                   >> 1543                 unlock_page(page);
                                                   >> 1544                 put_page(page);
                                                   >> 1545                 page_nid = -1;
                                                   >> 1546                 goto out_unlock;
                                                   >> 1547         }
1919                                                  1548 
1920         target_nid = numa_migrate_check(folio !! 1549         /* Bail if we fail to protect against THP splits for any reason */
1921                                         &last !! 1550         if (unlikely(!anon_vma)) {
1922         if (target_nid == NUMA_NO_NODE)       !! 1551                 put_page(page);
1923                 goto out_map;                 !! 1552                 page_nid = -1;
1924         if (migrate_misplaced_folio_prepare(f !! 1553                 goto clear_pmdnuma;
1925                 flags |= TNF_MIGRATE_FAIL;    << 
1926                 goto out_map;                 << 
1927         }                                        1554         }
1928         /* The folio is isolated and isolatio !! 1555 
                                                   >> 1556         /*
                                                   >> 1557          * Since we took the NUMA fault, we must have observed the !accessible
                                                   >> 1558          * bit. Make sure all other CPUs agree with that, to avoid them
                                                   >> 1559          * modifying the page we're about to migrate.
                                                   >> 1560          *
                                                   >> 1561          * Must be done under PTL such that we'll observe the relevant
                                                   >> 1562          * inc_tlb_flush_pending().
                                                   >> 1563          *
                                                   >> 1564          * We are not sure a pending tlb flush here is for a huge page
                                                   >> 1565          * mapping or not. Hence use the tlb range variant
                                                   >> 1566          */
                                                   >> 1567         if (mm_tlb_flush_pending(vma->vm_mm))
                                                   >> 1568                 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
                                                   >> 1569 
                                                   >> 1570         /*
                                                   >> 1571          * Migrate the THP to the requested node, returns with page unlocked
                                                   >> 1572          * and access rights restored.
                                                   >> 1573          */
1929         spin_unlock(vmf->ptl);                   1574         spin_unlock(vmf->ptl);
1930         writable = false;                     << 
1931                                                  1575 
1932         if (!migrate_misplaced_folio(folio, v !! 1576         migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
                                                   >> 1577                                 vmf->pmd, pmd, vmf->address, page, target_nid);
                                                   >> 1578         if (migrated) {
1933                 flags |= TNF_MIGRATED;           1579                 flags |= TNF_MIGRATED;
1934                 nid = target_nid;             !! 1580                 page_nid = target_nid;
1935                 task_numa_fault(last_cpupid,  !! 1581         } else
1936                 return 0;                     !! 1582                 flags |= TNF_MIGRATE_FAIL;
1937         }                                     << 
1938                                                  1583 
1939         flags |= TNF_MIGRATE_FAIL;            !! 1584         goto out;
1940         vmf->ptl = pmd_lock(vma->vm_mm, vmf-> !! 1585 clear_pmdnuma:
1941         if (unlikely(!pmd_same(pmdp_get(vmf-> !! 1586         BUG_ON(!PageLocked(page));
1942                 spin_unlock(vmf->ptl);        !! 1587         was_writable = pmd_savedwrite(pmd);
1943                 return 0;                     !! 1588         pmd = pmd_modify(pmd, vma->vm_page_prot);
1944         }                                     << 
1945 out_map:                                      << 
1946         /* Restore the PMD */                 << 
1947         pmd = pmd_modify(pmdp_get(vmf->pmd),  << 
1948         pmd = pmd_mkyoung(pmd);                  1589         pmd = pmd_mkyoung(pmd);
1949         if (writable)                         !! 1590         if (was_writable)
1950                 pmd = pmd_mkwrite(pmd, vma);  !! 1591                 pmd = pmd_mkwrite(pmd);
1951         set_pmd_at(vma->vm_mm, haddr, vmf->pm    1592         set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1952         update_mmu_cache_pmd(vma, vmf->addres    1593         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                                                   >> 1594         unlock_page(page);
                                                   >> 1595 out_unlock:
1953         spin_unlock(vmf->ptl);                   1596         spin_unlock(vmf->ptl);
1954                                                  1597 
1955         if (nid != NUMA_NO_NODE)              !! 1598 out:
1956                 task_numa_fault(last_cpupid,  !! 1599         if (anon_vma)
                                                   >> 1600                 page_unlock_anon_vma_read(anon_vma);
                                                   >> 1601 
                                                   >> 1602         if (page_nid != -1)
                                                   >> 1603                 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
                                                   >> 1604                                 flags);
                                                   >> 1605 
1957         return 0;                                1606         return 0;
1958 }                                                1607 }
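
do_huge_pmd_numa_page() runs when a task touches a huge PMD that the NUMA-balancing scanner has made inaccessible (PROT_NONE); the handler decides whether to migrate the folio and accounts a hinting fault either way. A rough userspace sketch for watching that accounting, assuming automatic NUMA balancing is enabled (/proc/sys/kernel/numa_balancing) and more than one memory node, since the counters stay flat otherwise; the counter names are the ones exported in /proc/vmstat.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Read one counter (e.g. "numa_hint_faults") out of /proc/vmstat. */
static long vmstat_counter(const char *name)
{
        char line[256];
        long val = -1;
        FILE *f = fopen("/proc/vmstat", "r");
        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, name, strlen(name)) &&
                    line[strlen(name)] == ' ') {
                        val = atol(line + strlen(name) + 1);
                        break;
                }
        }
        fclose(f);
        return val;
}

int main(void)
{
        size_t len = 256UL << 20;       /* 256 MB of anonymous memory */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) { perror("mmap"); return 1; }
        madvise(buf, len, MADV_HUGEPAGE);       /* ask for huge PMD mappings */

        long before = vmstat_counter("numa_hint_faults");
        for (int pass = 0; pass < 30; pass++) { /* keep touching the range */
                memset(buf, pass, len);
                usleep(200 * 1000);             /* give the NUMA scanner time */
        }
        long after = vmstat_counter("numa_hint_faults");

        printf("numa_hint_faults: %ld -> %ld\n", before, after);
        return 0;
}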
1959                                                  1608 
1960 /*                                               1609 /*
1961  * Return true if we do MADV_FREE successfull    1610  * Return true if we do MADV_FREE successfully on entire pmd page.
1962  * Otherwise, return false.                      1611  * Otherwise, return false.
1963  */                                              1612  */
1964 bool madvise_free_huge_pmd(struct mmu_gather     1613 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1965                 pmd_t *pmd, unsigned long add    1614                 pmd_t *pmd, unsigned long addr, unsigned long next)
1966 {                                                1615 {
1967         spinlock_t *ptl;                         1616         spinlock_t *ptl;
1968         pmd_t orig_pmd;                          1617         pmd_t orig_pmd;
1969         struct folio *folio;                  !! 1618         struct page *page;
1970         struct mm_struct *mm = tlb->mm;          1619         struct mm_struct *mm = tlb->mm;
1971         bool ret = false;                        1620         bool ret = false;
1972                                                  1621 
1973         tlb_change_page_size(tlb, HPAGE_PMD_S !! 1622         tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
1974                                                  1623 
1975         ptl = pmd_trans_huge_lock(pmd, vma);     1624         ptl = pmd_trans_huge_lock(pmd, vma);
1976         if (!ptl)                                1625         if (!ptl)
1977                 goto out_unlocked;               1626                 goto out_unlocked;
1978                                                  1627 
1979         orig_pmd = *pmd;                         1628         orig_pmd = *pmd;
1980         if (is_huge_zero_pmd(orig_pmd))          1629         if (is_huge_zero_pmd(orig_pmd))
1981                 goto out;                        1630                 goto out;
1982                                                  1631 
1983         if (unlikely(!pmd_present(orig_pmd)))    1632         if (unlikely(!pmd_present(orig_pmd))) {
1984                 VM_BUG_ON(thp_migration_suppo    1633                 VM_BUG_ON(thp_migration_supported() &&
1985                                   !is_pmd_mig    1634                                   !is_pmd_migration_entry(orig_pmd));
1986                 goto out;                        1635                 goto out;
1987         }                                        1636         }
1988                                                  1637 
1989         folio = pmd_folio(orig_pmd);          !! 1638         page = pmd_page(orig_pmd);
1990         /*                                       1639         /*
1991          * If other processes are mapping thi !! 1640          * If other processes are mapping this page, we couldn't discard
1992          * the folio unless they all do MADV_ !! 1641          * the page unless they all do MADV_FREE so let's skip the page.
1993          */                                      1642          */
1994         if (folio_likely_mapped_shared(folio) !! 1643         if (page_mapcount(page) != 1)
1995                 goto out;                        1644                 goto out;
1996                                                  1645 
1997         if (!folio_trylock(folio))            !! 1646         if (!trylock_page(page))
1998                 goto out;                        1647                 goto out;
1999                                                  1648 
2000         /*                                       1649         /*
2001          * If the user wants to discard part-    1650          * If the user wants to discard part-pages of THP, split it so MADV_FREE
2002          * will deactivate only them.            1651          * will deactivate only them.
2003          */                                      1652          */
2004         if (next - addr != HPAGE_PMD_SIZE) {     1653         if (next - addr != HPAGE_PMD_SIZE) {
2005                 folio_get(folio);             !! 1654                 get_page(page);
2006                 spin_unlock(ptl);                1655                 spin_unlock(ptl);
2007                 split_folio(folio);           !! 1656                 split_huge_page(page);
2008                 folio_unlock(folio);          !! 1657                 unlock_page(page);
2009                 folio_put(folio);             !! 1658                 put_page(page);
2010                 goto out_unlocked;               1659                 goto out_unlocked;
2011         }                                        1660         }
2012                                                  1661 
2013         if (folio_test_dirty(folio))          !! 1662         if (PageDirty(page))
2014                 folio_clear_dirty(folio);     !! 1663                 ClearPageDirty(page);
2015         folio_unlock(folio);                  !! 1664         unlock_page(page);
2016                                                  1665 
2017         if (pmd_young(orig_pmd) || pmd_dirty(    1666         if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
2018                 pmdp_invalidate(vma, addr, pm    1667                 pmdp_invalidate(vma, addr, pmd);
2019                 orig_pmd = pmd_mkold(orig_pmd    1668                 orig_pmd = pmd_mkold(orig_pmd);
2020                 orig_pmd = pmd_mkclean(orig_p    1669                 orig_pmd = pmd_mkclean(orig_pmd);
2021                                                  1670 
2022                 set_pmd_at(mm, addr, pmd, ori    1671                 set_pmd_at(mm, addr, pmd, orig_pmd);
2023                 tlb_remove_pmd_tlb_entry(tlb,    1672                 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2024         }                                        1673         }
2025                                                  1674 
2026         folio_mark_lazyfree(folio);           !! 1675         mark_page_lazyfree(page);
2027         ret = true;                              1676         ret = true;
2028 out:                                             1677 out:
2029         spin_unlock(ptl);                        1678         spin_unlock(ptl);
2030 out_unlocked:                                    1679 out_unlocked:
2031         return ret;                              1680         return ret;
2032 }                                                1681 }
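
madvise_free_huge_pmd() is reached from userspace via madvise(MADV_FREE). A minimal sketch, assuming a 2 MB PMD size and THP actually backing the region: a range that only partially covers a huge PMD forces the split path seen above, while a whole-PMD range is handled in place.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20)          /* assumed PMD huge page size */

int main(void)
{
        size_t len = 8 * HPAGE_SIZE;
        void *mem = NULL;

        /* A PMD-aligned anonymous region that THP can back with huge pages. */
        if (posix_memalign(&mem, HPAGE_SIZE, len)) {
                fprintf(stderr, "posix_memalign failed\n");
                return 1;
        }
        char *buf = mem;
        madvise(buf, len, MADV_HUGEPAGE);
        memset(buf, 0x5a, len);         /* fault the memory in */

        /*
         * A sub-PMD range: madvise_free_huge_pmd() splits the huge page
         * first so only the covered subpages become lazily freeable.
         */
        if (madvise(buf, HPAGE_SIZE / 2, MADV_FREE))
                perror("madvise(MADV_FREE, partial)");

        /*
         * Whole huge PMDs: the handler clears the dirty/young bits on the
         * PMD and marks the folio lazyfree in one step, no split needed.
         */
        if (madvise(buf + HPAGE_SIZE, len - HPAGE_SIZE, MADV_FREE))
                perror("madvise(MADV_FREE)");

        /* Until reclaim actually discards them, the old contents may remain. */
        printf("buf[HPAGE_SIZE] = 0x%02x\n", (unsigned char)buf[HPAGE_SIZE]);
        free(buf);
        return 0;
}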
2033                                                  1682 
2034 static inline void zap_deposited_table(struct    1683 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
2035 {                                                1684 {
2036         pgtable_t pgtable;                       1685         pgtable_t pgtable;
2037                                                  1686 
2038         pgtable = pgtable_trans_huge_withdraw    1687         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2039         pte_free(mm, pgtable);                   1688         pte_free(mm, pgtable);
2040         mm_dec_nr_ptes(mm);                      1689         mm_dec_nr_ptes(mm);
2041 }                                                1690 }
2042                                                  1691 
2043 int zap_huge_pmd(struct mmu_gather *tlb, stru    1692 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2044                  pmd_t *pmd, unsigned long ad    1693                  pmd_t *pmd, unsigned long addr)
2045 {                                                1694 {
2046         pmd_t orig_pmd;                          1695         pmd_t orig_pmd;
2047         spinlock_t *ptl;                         1696         spinlock_t *ptl;
2048                                                  1697 
2049         tlb_change_page_size(tlb, HPAGE_PMD_S !! 1698         tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
2050                                                  1699 
2051         ptl = __pmd_trans_huge_lock(pmd, vma)    1700         ptl = __pmd_trans_huge_lock(pmd, vma);
2052         if (!ptl)                                1701         if (!ptl)
2053                 return 0;                        1702                 return 0;
2054         /*                                       1703         /*
2055          * For architectures like ppc64 we lo    1704          * For architectures like ppc64 we look at deposited pgtable
2056          * when calling pmdp_huge_get_and_cle    1705          * when calling pmdp_huge_get_and_clear. So do the
2057          * pgtable_trans_huge_withdraw after     1706          * pgtable_trans_huge_withdraw after finishing pmdp related
2058          * operations.                           1707          * operations.
2059          */                                      1708          */
2060         orig_pmd = pmdp_huge_get_and_clear_fu !! 1709         orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
2061                                               !! 1710                         tlb->fullmm);
2062         arch_check_zapped_pmd(vma, orig_pmd); << 
2063         tlb_remove_pmd_tlb_entry(tlb, pmd, ad    1711         tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2064         if (vma_is_special_huge(vma)) {       !! 1712         if (vma_is_dax(vma)) {
2065                 if (arch_needs_pgtable_deposi    1713                 if (arch_needs_pgtable_deposit())
2066                         zap_deposited_table(t    1714                         zap_deposited_table(tlb->mm, pmd);
2067                 spin_unlock(ptl);                1715                 spin_unlock(ptl);
                                                   >> 1716                 if (is_huge_zero_pmd(orig_pmd))
                                                   >> 1717                         tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
2068         } else if (is_huge_zero_pmd(orig_pmd)    1718         } else if (is_huge_zero_pmd(orig_pmd)) {
2069                 zap_deposited_table(tlb->mm,     1719                 zap_deposited_table(tlb->mm, pmd);
2070                 spin_unlock(ptl);                1720                 spin_unlock(ptl);
                                                   >> 1721                 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
2071         } else {                                 1722         } else {
2072                 struct folio *folio = NULL;   !! 1723                 struct page *page = NULL;
2073                 int flush_needed = 1;            1724                 int flush_needed = 1;
2074                                                  1725 
2075                 if (pmd_present(orig_pmd)) {     1726                 if (pmd_present(orig_pmd)) {
2076                         struct page *page = p !! 1727                         page = pmd_page(orig_pmd);
2077                                               !! 1728                         page_remove_rmap(page, true);
2078                         folio = page_folio(pa !! 1729                         VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
2079                         folio_remove_rmap_pmd << 
2080                         WARN_ON_ONCE(folio_ma << 
2081                         VM_BUG_ON_PAGE(!PageH    1730                         VM_BUG_ON_PAGE(!PageHead(page), page);
2082                 } else if (thp_migration_supp    1731                 } else if (thp_migration_supported()) {
2083                         swp_entry_t entry;       1732                         swp_entry_t entry;
2084                                                  1733 
2085                         VM_BUG_ON(!is_pmd_mig    1734                         VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
2086                         entry = pmd_to_swp_en    1735                         entry = pmd_to_swp_entry(orig_pmd);
2087                         folio = pfn_swap_entr !! 1736                         page = pfn_to_page(swp_offset(entry));
2088                         flush_needed = 0;        1737                         flush_needed = 0;
2089                 } else                           1738                 } else
2090                         WARN_ONCE(1, "Non pre    1739                         WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
2091                                                  1740 
2092                 if (folio_test_anon(folio)) { !! 1741                 if (PageAnon(page)) {
2093                         zap_deposited_table(t    1742                         zap_deposited_table(tlb->mm, pmd);
2094                         add_mm_counter(tlb->m    1743                         add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
2095                 } else {                         1744                 } else {
2096                         if (arch_needs_pgtabl    1745                         if (arch_needs_pgtable_deposit())
2097                                 zap_deposited    1746                                 zap_deposited_table(tlb->mm, pmd);
2098                         add_mm_counter(tlb->m !! 1747                         add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
2099                                        -HPAGE << 
2100                 }                                1748                 }
2101                                                  1749 
2102                 spin_unlock(ptl);                1750                 spin_unlock(ptl);
2103                 if (flush_needed)                1751                 if (flush_needed)
2104                         tlb_remove_page_size( !! 1752                         tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
2105         }                                        1753         }
2106         return 1;                                1754         return 1;
2107 }                                                1755 }
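
zap_huge_pmd() is the teardown path for a mapped huge PMD, reached from munmap(), process exit, or madvise(MADV_DONTNEED). A small sketch, assuming 2 MB PMD pages and best-effort THP backing.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20)          /* assumed PMD huge page size */

int main(void)
{
        size_t len = 4 * HPAGE_SIZE;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) { perror("mmap"); return 1; }
        madvise(buf, len, MADV_HUGEPAGE);
        memset(buf, 1, len);            /* populate, possibly with huge PMDs */

        /*
         * MADV_DONTNEED walks the page tables with zap_pmd_range(); each
         * huge PMD in the range is torn down by zap_huge_pmd(), which drops
         * the mapping, the rmap and the MM_ANONPAGES counter in one step.
         */
        if (madvise(buf, len, MADV_DONTNEED))
                perror("madvise(MADV_DONTNEED)");

        /* Anonymous private memory reads back as zero after the zap. */
        printf("buf[0] after MADV_DONTNEED = %d\n", buf[0]);

        munmap(buf, len);               /* unmapping takes the same zap path */
        return 0;
}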
2108                                                  1756 
2109 #ifndef pmd_move_must_withdraw                   1757 #ifndef pmd_move_must_withdraw
2110 static inline int pmd_move_must_withdraw(spin    1758 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
2111                                          spin    1759                                          spinlock_t *old_pmd_ptl,
2112                                          stru    1760                                          struct vm_area_struct *vma)
2113 {                                                1761 {
2114         /*                                       1762         /*
2115          * With split pmd lock we also need t    1763          * With split pmd lock we also need to move preallocated
2116          * PTE page table if new_pmd is on di    1764          * PTE page table if new_pmd is on different PMD page table.
2117          *                                       1765          *
2118          * We also don't deposit and withdraw    1766          * We also don't deposit and withdraw tables for file pages.
2119          */                                      1767          */
2120         return (new_pmd_ptl != old_pmd_ptl) &    1768         return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
2121 }                                                1769 }
2122 #endif                                           1770 #endif
2123                                                  1771 
2124 static pmd_t move_soft_dirty_pmd(pmd_t pmd)      1772 static pmd_t move_soft_dirty_pmd(pmd_t pmd)
2125 {                                                1773 {
2126 #ifdef CONFIG_MEM_SOFT_DIRTY                     1774 #ifdef CONFIG_MEM_SOFT_DIRTY
2127         if (unlikely(is_pmd_migration_entry(p    1775         if (unlikely(is_pmd_migration_entry(pmd)))
2128                 pmd = pmd_swp_mksoft_dirty(pm    1776                 pmd = pmd_swp_mksoft_dirty(pmd);
2129         else if (pmd_present(pmd))               1777         else if (pmd_present(pmd))
2130                 pmd = pmd_mksoft_dirty(pmd);     1778                 pmd = pmd_mksoft_dirty(pmd);
2131 #endif                                           1779 #endif
2132         return pmd;                              1780         return pmd;
2133 }                                                1781 }
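
move_soft_dirty_pmd() keeps the soft-dirty state of a huge PMD consistent when mremap() relocates it. The soft-dirty mechanism itself is driven from userspace through /proc/<pid>/clear_refs and bit 55 of each /proc/<pid>/pagemap entry; a sketch, assuming CONFIG_MEM_SOFT_DIRTY is enabled (shown at base-page granularity here, the same bits cover THP-backed ranges).

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Read the pagemap entry for one virtual address of this process. */
static uint64_t pagemap_entry(void *addr)
{
        uint64_t ent = 0;
        long pagesize = sysconf(_SC_PAGESIZE);
        int fd = open("/proc/self/pagemap", O_RDONLY);
        if (fd < 0)
                return 0;
        pread(fd, &ent, sizeof(ent),
              ((uintptr_t)addr / pagesize) * sizeof(ent));
        close(fd);
        return ent;
}

int main(void)
{
        char *buf = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) { perror("mmap"); return 1; }
        memset(buf, 1, 2UL << 20);

        /* Writing "4" to clear_refs clears the soft-dirty bits. */
        int fd = open("/proc/self/clear_refs", O_WRONLY);
        if (fd < 0) { perror("open clear_refs"); return 1; }
        write(fd, "4", 1);
        close(fd);

        printf("soft-dirty before write: %d\n",
               (int)((pagemap_entry(buf) >> 55) & 1));
        buf[0] = 2;     /* dirty it again; bit 55 should flip back to 1 */
        printf("soft-dirty after  write: %d\n",
               (int)((pagemap_entry(buf) >> 55) & 1));
        return 0;
}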
2134                                                  1782 
2135 bool move_huge_pmd(struct vm_area_struct *vma    1783 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
2136                   unsigned long new_addr, pmd !! 1784                   unsigned long new_addr, unsigned long old_end,
                                                   >> 1785                   pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
2137 {                                                1786 {
2138         spinlock_t *old_ptl, *new_ptl;           1787         spinlock_t *old_ptl, *new_ptl;
2139         pmd_t pmd;                               1788         pmd_t pmd;
2140         struct mm_struct *mm = vma->vm_mm;       1789         struct mm_struct *mm = vma->vm_mm;
2141         bool force_flush = false;                1790         bool force_flush = false;
2142                                                  1791 
                                                   >> 1792         if ((old_addr & ~HPAGE_PMD_MASK) ||
                                                   >> 1793             (new_addr & ~HPAGE_PMD_MASK) ||
                                                   >> 1794             old_end - old_addr < HPAGE_PMD_SIZE)
                                                   >> 1795                 return false;
                                                   >> 1796 
2143         /*                                       1797         /*
2144          * The destination pmd shouldn't be e    1798          * The destination pmd shouldn't be established, free_pgtables()
2145          * should have released it; but move_ !! 1799          * should have released it.
2146          * inserted a page table, if racing a << 
2147          */                                      1800          */
2148         if (!pmd_none(*new_pmd)) {            !! 1801         if (WARN_ON(!pmd_none(*new_pmd))) {
2149                 VM_BUG_ON(pmd_trans_huge(*new    1802                 VM_BUG_ON(pmd_trans_huge(*new_pmd));
2150                 return false;                    1803                 return false;
2151         }                                        1804         }
2152                                                  1805 
2153         /*                                       1806         /*
2154          * We don't have to worry about the o    1807          * We don't have to worry about the ordering of src and dst
2155          * ptlocks because exclusive mmap_loc !! 1808          * ptlocks because exclusive mmap_sem prevents deadlock.
2156          */                                      1809          */
2157         old_ptl = __pmd_trans_huge_lock(old_p    1810         old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
2158         if (old_ptl) {                           1811         if (old_ptl) {
2159                 new_ptl = pmd_lockptr(mm, new    1812                 new_ptl = pmd_lockptr(mm, new_pmd);
2160                 if (new_ptl != old_ptl)          1813                 if (new_ptl != old_ptl)
2161                         spin_lock_nested(new_    1814                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
2162                 pmd = pmdp_huge_get_and_clear    1815                 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
2163                 if (pmd_present(pmd))         !! 1816                 if (pmd_present(pmd) && pmd_dirty(pmd))
2164                         force_flush = true;      1817                         force_flush = true;
2165                 VM_BUG_ON(!pmd_none(*new_pmd)    1818                 VM_BUG_ON(!pmd_none(*new_pmd));
2166                                                  1819 
2167                 if (pmd_move_must_withdraw(ne    1820                 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
2168                         pgtable_t pgtable;       1821                         pgtable_t pgtable;
2169                         pgtable = pgtable_tra    1822                         pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
2170                         pgtable_trans_huge_de    1823                         pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
2171                 }                                1824                 }
2172                 pmd = move_soft_dirty_pmd(pmd    1825                 pmd = move_soft_dirty_pmd(pmd);
2173                 set_pmd_at(mm, new_addr, new_    1826                 set_pmd_at(mm, new_addr, new_pmd, pmd);
2174                 if (force_flush)              << 
2175                         flush_pmd_tlb_range(v << 
2176                 if (new_ptl != old_ptl)          1827                 if (new_ptl != old_ptl)
2177                         spin_unlock(new_ptl);    1828                         spin_unlock(new_ptl);
                                                   >> 1829                 if (force_flush)
                                                   >> 1830                         flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
                                                   >> 1831                 else
                                                   >> 1832                         *need_flush = true;
2178                 spin_unlock(old_ptl);            1833                 spin_unlock(old_ptl);
2179                 return true;                     1834                 return true;
2180         }                                        1835         }
2181         return false;                            1836         return false;
2182 }                                                1837 }
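
move_huge_pmd() is mremap()'s fast path for relocating a whole huge PMD without splitting it. A userspace sketch, assuming 2 MB PMD pages and that both the old and the kernel-chosen new address end up PMD-aligned (otherwise the range is moved as base pages).

#define _GNU_SOURCE             /* for mremap() and MREMAP_MAYMOVE */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20)  /* assumed PMD huge page size */

int main(void)
{
        size_t len = 4 * HPAGE_SIZE;
        char *old_buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (old_buf == MAP_FAILED) { perror("mmap"); return 1; }
        madvise(old_buf, len, MADV_HUGEPAGE);
        memset(old_buf, 0x42, len);     /* populate, ideally with huge PMDs */

        /*
         * When a whole huge PMD is being moved between PMD-aligned
         * addresses, mremap() can relocate it with move_huge_pmd(): the
         * PMD entry (and, on some architectures, its deposited page
         * table) simply moves to the new slot instead of being split.
         */
        char *new_buf = mremap(old_buf, len, len, MREMAP_MAYMOVE);
        if (new_buf == MAP_FAILED) { perror("mremap"); return 1; }

        printf("moved from %p to %p, new_buf[0] = 0x%02x\n",
               (void *)old_buf, (void *)new_buf, (unsigned char)new_buf[0]);
        munmap(new_buf, len);
        return 0;
}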
2183                                                  1838 
2184 /*                                               1839 /*
2185  * Returns                                       1840  * Returns
2186  *  - 0 if PMD could not be locked               1841  *  - 0 if PMD could not be locked
2187  *  - 1 if PMD was locked but protections unc !! 1842  *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2188  *      or if prot_numa but THP migration is  !! 1843  *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
2189  *  - HPAGE_PMD_NR if protections changed and << 
2190  */                                              1844  */
2191 int change_huge_pmd(struct mmu_gather *tlb, s !! 1845 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2192                     pmd_t *pmd, unsigned long !! 1846                 unsigned long addr, pgprot_t newprot, int prot_numa)
2193                     unsigned long cp_flags)   << 
2194 {                                                1847 {
2195         struct mm_struct *mm = vma->vm_mm;       1848         struct mm_struct *mm = vma->vm_mm;
2196         spinlock_t *ptl;                         1849         spinlock_t *ptl;
2197         pmd_t oldpmd, entry;                  !! 1850         pmd_t entry;
2198         bool prot_numa = cp_flags & MM_CP_PRO !! 1851         bool preserve_write;
2199         bool uffd_wp = cp_flags & MM_CP_UFFD_ !! 1852         int ret;
2200         bool uffd_wp_resolve = cp_flags & MM_ << 
2201         int ret = 1;                          << 
2202                                               << 
2203         tlb_change_page_size(tlb, HPAGE_PMD_S << 
2204                                               << 
2205         if (prot_numa && !thp_migration_suppo << 
2206                 return 1;                     << 
2207                                                  1853 
2208         ptl = __pmd_trans_huge_lock(pmd, vma)    1854         ptl = __pmd_trans_huge_lock(pmd, vma);
2209         if (!ptl)                                1855         if (!ptl)
2210                 return 0;                        1856                 return 0;
2211                                                  1857 
                                                   >> 1858         preserve_write = prot_numa && pmd_write(*pmd);
                                                   >> 1859         ret = 1;
                                                   >> 1860 
2212 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          1861 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2213         if (is_swap_pmd(*pmd)) {                 1862         if (is_swap_pmd(*pmd)) {
2214                 swp_entry_t entry = pmd_to_sw    1863                 swp_entry_t entry = pmd_to_swp_entry(*pmd);
2215                 struct folio *folio = pfn_swa << 
2216                 pmd_t newpmd;                 << 
2217                                                  1864 
2218                 VM_BUG_ON(!is_pmd_migration_e    1865                 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
2219                 if (is_writable_migration_ent !! 1866                 if (is_write_migration_entry(entry)) {
                                                   >> 1867                         pmd_t newpmd;
2220                         /*                       1868                         /*
2221                          * A protection check    1869                          * A protection check is difficult so
2222                          * just be safe and d    1870                          * just be safe and disable write
2223                          */                      1871                          */
2224                         if (folio_test_anon(f !! 1872                         make_migration_entry_read(&entry);
2225                                 entry = make_ << 
2226                         else                  << 
2227                                 entry = make_ << 
2228                         newpmd = swp_entry_to    1873                         newpmd = swp_entry_to_pmd(entry);
2229                         if (pmd_swp_soft_dirt    1874                         if (pmd_swp_soft_dirty(*pmd))
2230                                 newpmd = pmd_    1875                                 newpmd = pmd_swp_mksoft_dirty(newpmd);
2231                 } else {                      << 
2232                         newpmd = *pmd;        << 
2233                 }                             << 
2234                                               << 
2235                 if (uffd_wp)                  << 
2236                         newpmd = pmd_swp_mkuf << 
2237                 else if (uffd_wp_resolve)     << 
2238                         newpmd = pmd_swp_clea << 
2239                 if (!pmd_same(*pmd, newpmd))  << 
2240                         set_pmd_at(mm, addr,     1876                         set_pmd_at(mm, addr, pmd, newpmd);
                                                   >> 1877                 }
2241                 goto unlock;                     1878                 goto unlock;
2242         }                                        1879         }
2243 #endif                                           1880 #endif
2244                                                  1881 
2245         if (prot_numa) {                      !! 1882         /*
2246                 struct folio *folio;          !! 1883          * Avoid trapping faults against the zero page. The read-only
2247                 bool toptier;                 !! 1884          * data is likely to be read-cached on the local CPU and
2248                 /*                            !! 1885          * local/remote hits to the zero page are not interesting.
2249                  * Avoid trapping faults agai !! 1886          */
2250                  * data is likely to be read- !! 1887         if (prot_numa && is_huge_zero_pmd(*pmd))
2251                  * local/remote hits to the z !! 1888                 goto unlock;
2252                  */                           << 
2253                 if (is_huge_zero_pmd(*pmd))   << 
2254                         goto unlock;          << 
2255                                               << 
2256                 if (pmd_protnone(*pmd))       << 
2257                         goto unlock;          << 
2258                                                  1889 
2259                 folio = pmd_folio(*pmd);      !! 1890         if (prot_numa && pmd_protnone(*pmd))
2260                 toptier = node_is_toptier(fol !! 1891                 goto unlock;
2261                 /*                            << 
2262                  * Skip scanning top tier nod << 
2263                  * balancing is disabled      << 
2264                  */                           << 
2265                 if (!(sysctl_numa_balancing_m << 
2266                     toptier)                  << 
2267                         goto unlock;          << 
2268                                                  1892 
2269                 if (folio_use_access_time(fol << 
2270                         folio_xchg_access_tim << 
2271                                               << 
2272         }                                     << 
2273         /*                                       1893         /*
2274          * In case prot_numa, we are under mm !! 1894          * In case prot_numa, we are under down_read(mmap_sem). It's critical
2275          * to not clear pmd intermittently to    1895          * to not clear pmd intermittently to avoid race with MADV_DONTNEED
2276          * which is also under mmap_read_lock !! 1896          * which is also under down_read(mmap_sem):
2277          *                                       1897          *
2278          *      CPU0:                            1898          *      CPU0:                           CPU1:
2279          *                              chang    1899          *                              change_huge_pmd(prot_numa=1)
2280          *                               pmdp    1900          *                               pmdp_huge_get_and_clear_notify()
2281          * madvise_dontneed()                    1901          * madvise_dontneed()
2282          *  zap_pmd_range()                      1902          *  zap_pmd_range()
2283          *   pmd_trans_huge(*pmd) == 0 (witho    1903          *   pmd_trans_huge(*pmd) == 0 (without ptl)
2284          *   // skip the pmd                     1904          *   // skip the pmd
2285          *                               set_    1905          *                               set_pmd_at();
2286          *                               // p    1906          *                               // pmd is re-established
2287          *                                       1907          *
2288          * The race makes MADV_DONTNEED miss     1908          * The race makes MADV_DONTNEED miss the huge pmd and not clear it
2289          * which may break userspace.            1909          * which may break userspace.
2290          *                                       1910          *
2291          * pmdp_invalidate_ad() is required t !! 1911          * pmdp_invalidate() is required to make sure we don't miss
2292          * dirty/young flags set by hardware.    1912          * dirty/young flags set by hardware.
2293          */                                      1913          */
2294         oldpmd = pmdp_invalidate_ad(vma, addr !! 1914         entry = pmdp_invalidate(vma, addr, pmd);
2295                                               << 
2296         entry = pmd_modify(oldpmd, newprot);  << 
2297         if (uffd_wp)                          << 
2298                 entry = pmd_mkuffd_wp(entry); << 
2299         else if (uffd_wp_resolve)             << 
2300                 /*                            << 
2301                  * Leave the write bit to be  << 
2302                  * handler, then things like  << 
2303                  * handled.                   << 
2304                  */                           << 
2305                 entry = pmd_clear_uffd_wp(ent << 
2306                                               << 
2307         /* See change_pte_range(). */         << 
2308         if ((cp_flags & MM_CP_TRY_CHANGE_WRIT << 
2309             can_change_pmd_writable(vma, addr << 
2310                 entry = pmd_mkwrite(entry, vm << 
2311                                                  1915 
                                                   >> 1916         entry = pmd_modify(entry, newprot);
                                                   >> 1917         if (preserve_write)
                                                   >> 1918                 entry = pmd_mk_savedwrite(entry);
2312         ret = HPAGE_PMD_NR;                      1919         ret = HPAGE_PMD_NR;
2313         set_pmd_at(mm, addr, pmd, entry);        1920         set_pmd_at(mm, addr, pmd, entry);
2314                                               !! 1921         BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
2315         if (huge_pmd_needs_flush(oldpmd, entr << 
2316                 tlb_flush_pmd_range(tlb, addr << 
2317 unlock:                                          1922 unlock:
2318         spin_unlock(ptl);                        1923         spin_unlock(ptl);
2319         return ret;                              1924         return ret;
2320 }                                                1925 }
2321                                                  1926 
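For illustration, a minimal userspace sketch of the kind of operation that reaches change_huge_pmd() above: an mprotect() covering a whole PMD-mapped range can update the huge entry in place, while a sub-PMD range would split the PMD first. The 2MiB size, the manual alignment and the MADV_HUGEPAGE hint are illustrative assumptions, not taken from this file.

#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define SZ_2M (2UL * 1024 * 1024)

int main(void)
{
        /* Over-allocate so a 2MiB-aligned window exists inside the mapping. */
        char *map = mmap(NULL, 2 * SZ_2M, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map == MAP_FAILED)
                return 1;

        char *huge = (char *)(((uintptr_t)map + SZ_2M - 1) & ~(SZ_2M - 1));
        madvise(huge, SZ_2M, MADV_HUGEPAGE);    /* hint THP for this window */
        memset(huge, 1, SZ_2M);                 /* may fault in a huge PMD */

        /* Whole-PMD protection change: may take the huge-pmd path above. */
        mprotect(huge, SZ_2M, PROT_READ);
        return 0;
}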
2322 /*                                               1927 /*
2323  * Returns:                                   << 
2324  *                                            << 
2325  * - 0: if pud leaf changed from under us     << 
2326  * - 1: if pud can be skipped                 << 
2327  * - HPAGE_PUD_NR: if pud was successfully pr << 
2328  */                                           << 
2329 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_ << 
2330 int change_huge_pud(struct mmu_gather *tlb, s << 
2331                     pud_t *pudp, unsigned lon << 
2332                     unsigned long cp_flags)   << 
2333 {                                             << 
2334         struct mm_struct *mm = vma->vm_mm;    << 
2335         pud_t oldpud, entry;                  << 
2336         spinlock_t *ptl;                      << 
2337                                               << 
2338         tlb_change_page_size(tlb, HPAGE_PUD_S << 
2339                                               << 
2340         /* NUMA balancing doesn't apply to da << 
2341         if (cp_flags & MM_CP_PROT_NUMA)       << 
2342                 return 1;                     << 
2343                                               << 
2344         /*                                    << 
2345          * Huge entries on userfault-wp only  << 
2346          * don't have anonymous PUDs yet.     << 
2347          */                                   << 
2348         if (WARN_ON_ONCE(cp_flags & MM_CP_UFF << 
2349                 return 1;                     << 
2350                                               << 
2351         ptl = __pud_trans_huge_lock(pudp, vma << 
2352         if (!ptl)                             << 
2353                 return 0;                     << 
2354                                               << 
2355         /*                                    << 
2356          * Can't clear PUD or it can race wit << 
2357          * change_huge_pmd().                 << 
2358          */                                   << 
2359         oldpud = pudp_invalidate(vma, addr, p << 
2360         entry = pud_modify(oldpud, newprot);  << 
2361         set_pud_at(mm, addr, pudp, entry);    << 
2362         tlb_flush_pud_range(tlb, addr, HPAGE_ << 
2363                                               << 
2364         spin_unlock(ptl);                     << 
2365         return HPAGE_PUD_NR;                  << 
2366 }                                             << 
2367 #endif                                        << 
2368                                               << 
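change_huge_pud() above only deals with huge PUD leaves, which in practice means DAX-backed mappings (the code itself notes that NUMA balancing and anonymous/uffd-wp cases do not apply). A hedged sketch of a call pattern that could reach it, assuming a device-DAX node configured for 1GiB-aligned mappings; the device path and size are assumptions.

#include <fcntl.h>
#include <sys/mman.h>

#define SZ_1G (1UL << 30)

int main(void)
{
        /* Assumed: /dev/dax0.0 exists and supports 1GiB-aligned mappings. */
        int fd = open("/dev/dax0.0", O_RDWR);
        if (fd < 0)
                return 1;

        char *p = mmap(NULL, SZ_1G, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        p[0] = 1;                       /* fault may install a PUD leaf */
        /* Whole-PUD protection change: a candidate for change_huge_pud(). */
        mprotect(p, SZ_1G, PROT_READ);
        return 0;
}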
2369 #ifdef CONFIG_USERFAULTFD                     << 
2370 /*                                            << 
2371  * The PT lock for src_pmd and dst_vma/src_vm << 
2372  * the caller, but it must return after relea << 
2373  * Just move the page from src_pmd to dst_pmd << 
2374  * Return zero if succeeded in moving the pag << 
2375  * repeated by the caller, or other errors in << 
2376  */                                           << 
2377 int move_pages_huge_pmd(struct mm_struct *mm, << 
2378                         struct vm_area_struct << 
2379                         unsigned long dst_add << 
2380 {                                             << 
2381         pmd_t _dst_pmd, src_pmdval;           << 
2382         struct page *src_page;                << 
2383         struct folio *src_folio;              << 
2384         struct anon_vma *src_anon_vma;        << 
2385         spinlock_t *src_ptl, *dst_ptl;        << 
2386         pgtable_t src_pgtable;                << 
2387         struct mmu_notifier_range range;      << 
2388         int err = 0;                          << 
2389                                               << 
2390         src_pmdval = *src_pmd;                << 
2391         src_ptl = pmd_lockptr(mm, src_pmd);   << 
2392                                               << 
2393         lockdep_assert_held(src_ptl);         << 
2394         vma_assert_locked(src_vma);           << 
2395         vma_assert_locked(dst_vma);           << 
2396                                               << 
2397         /* Sanity checks before the operation << 
2398         if (WARN_ON_ONCE(!pmd_none(dst_pmdval << 
2399             WARN_ON_ONCE(dst_addr & ~HPAGE_PM << 
2400                 spin_unlock(src_ptl);         << 
2401                 return -EINVAL;               << 
2402         }                                     << 
2403                                               << 
2404         if (!pmd_trans_huge(src_pmdval)) {    << 
2405                 spin_unlock(src_ptl);         << 
2406                 if (is_pmd_migration_entry(sr << 
2407                         pmd_migration_entry_w << 
2408                         return -EAGAIN;       << 
2409                 }                             << 
2410                 return -ENOENT;               << 
2411         }                                     << 
2412                                               << 
2413         src_page = pmd_page(src_pmdval);      << 
2414                                               << 
2415         if (!is_huge_zero_pmd(src_pmdval)) {  << 
2416                 if (unlikely(!PageAnonExclusi << 
2417                         spin_unlock(src_ptl); << 
2418                         return -EBUSY;        << 
2419                 }                             << 
2420                                               << 
2421                 src_folio = page_folio(src_pa << 
2422                 folio_get(src_folio);         << 
2423         } else                                << 
2424                 src_folio = NULL;             << 
2425                                               << 
2426         spin_unlock(src_ptl);                 << 
2427                                               << 
2428         flush_cache_range(src_vma, src_addr,  << 
2429         mmu_notifier_range_init(&range, MMU_N << 
2430                                 src_addr + HP << 
2431         mmu_notifier_invalidate_range_start(& << 
2432                                               << 
2433         if (src_folio) {                      << 
2434                 folio_lock(src_folio);        << 
2435                                               << 
2436                 /*                            << 
2437                  * split_huge_page walks the  << 
2438                  * lock. Serialize against it << 
2439                  * lock is not enough.        << 
2440                  */                           << 
2441                 src_anon_vma = folio_get_anon << 
2442                 if (!src_anon_vma) {          << 
2443                         err = -EAGAIN;        << 
2444                         goto unlock_folio;    << 
2445                 }                             << 
2446                 anon_vma_lock_write(src_anon_ << 
2447         } else                                << 
2448                 src_anon_vma = NULL;          << 
2449                                               << 
2450         dst_ptl = pmd_lockptr(mm, dst_pmd);   << 
2451         double_pt_lock(src_ptl, dst_ptl);     << 
2452         if (unlikely(!pmd_same(*src_pmd, src_ << 
2453                      !pmd_same(*dst_pmd, dst_ << 
2454                 err = -EAGAIN;                << 
2455                 goto unlock_ptls;             << 
2456         }                                     << 
2457         if (src_folio) {                      << 
2458                 if (folio_maybe_dma_pinned(sr << 
2459                     !PageAnonExclusive(&src_f << 
2460                         err = -EBUSY;         << 
2461                         goto unlock_ptls;     << 
2462                 }                             << 
2463                                               << 
2464                 if (WARN_ON_ONCE(!folio_test_ << 
2465                     WARN_ON_ONCE(!folio_test_ << 
2466                         err = -EBUSY;         << 
2467                         goto unlock_ptls;     << 
2468                 }                             << 
2469                                               << 
2470                 src_pmdval = pmdp_huge_clear_ << 
2471                 /* Folio got pinned from unde << 
2472                 if (folio_maybe_dma_pinned(sr << 
2473                         set_pmd_at(mm, src_ad << 
2474                         err = -EBUSY;         << 
2475                         goto unlock_ptls;     << 
2476                 }                             << 
2477                                               << 
2478                 folio_move_anon_rmap(src_foli << 
2479                 src_folio->index = linear_pag << 
2480                                               << 
2481                 _dst_pmd = mk_huge_pmd(&src_f << 
2482                 /* Follow mremap() behavior a << 
2483                 _dst_pmd = pmd_mkwrite(pmd_mk << 
2484         } else {                              << 
2485                 src_pmdval = pmdp_huge_clear_ << 
2486                 _dst_pmd = mk_huge_pmd(src_pa << 
2487         }                                     << 
2488         set_pmd_at(mm, dst_addr, dst_pmd, _ds << 
2489                                               << 
2490         src_pgtable = pgtable_trans_huge_with << 
2491         pgtable_trans_huge_deposit(mm, dst_pm << 
2492 unlock_ptls:                                  << 
2493         double_pt_unlock(src_ptl, dst_ptl);   << 
2494         if (src_anon_vma) {                   << 
2495                 anon_vma_unlock_write(src_ano << 
2496                 put_anon_vma(src_anon_vma);   << 
2497         }                                     << 
2498 unlock_folio:                                 << 
2499         /* unblock rmap walks */              << 
2500         if (src_folio)                        << 
2501                 folio_unlock(src_folio);      << 
2502         mmu_notifier_invalidate_range_end(&ra << 
2503         if (src_folio)                        << 
2504                 folio_put(src_folio);         << 
2505         return err;                           << 
2506 }                                             << 
2507 #endif /* CONFIG_USERFAULTFD */               << 
2508                                               << 
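move_pages_huge_pmd() above backs the userfaultfd UFFDIO_MOVE path when a whole PMD can be moved. A hedged userspace sketch of the ioctl that can end up here, assuming a userfaultfd already registered over both ranges and a kernel that exposes UFFDIO_MOVE; the field names follow the uapi struct uffdio_move and should be checked against the installed headers.

#include <stddef.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* uffd: a userfaultfd registered over both src and dst (assumption). */
static long move_pmd_sized_range(int uffd, void *dst, void *src, size_t len)
{
        struct uffdio_move mv = {
                .dst  = (unsigned long)dst,
                .src  = (unsigned long)src,
                .len  = len,    /* PMD-sized and -aligned to stay on this path */
                .mode = 0,
        };

        if (ioctl(uffd, UFFDIO_MOVE, &mv) == -1)
                return -1;
        return mv.move;         /* bytes moved, per the uapi contract */
}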
2509 /*                                            << 
2510  * Returns page table lock pointer if a given    1928  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2511  *                                               1929  *
2512  * Note that if it returns page table lock po    1930  * Note that if it returns page table lock pointer, this routine returns without
2513  * unlocking page table lock. So callers must    1931  * unlocking page table lock. So callers must unlock it.
2514  */                                              1932  */
2515 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,    1933 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2516 {                                                1934 {
2517         spinlock_t *ptl;                         1935         spinlock_t *ptl;
2518         ptl = pmd_lock(vma->vm_mm, pmd);         1936         ptl = pmd_lock(vma->vm_mm, pmd);
2519         if (likely(is_swap_pmd(*pmd) || pmd_t    1937         if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
2520                         pmd_devmap(*pmd)))       1938                         pmd_devmap(*pmd)))
2521                 return ptl;                      1939                 return ptl;
2522         spin_unlock(ptl);                        1940         spin_unlock(ptl);
2523         return NULL;                             1941         return NULL;
2524 }                                                1942 }
2525                                                  1943 
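The helper above is used as a try-lock style gate; a minimal sketch of the caller idiom, mirroring in-tree callers such as zap_huge_pmd() (the function name below is made up for illustration):

static int example_huge_pmd_user(struct vm_area_struct *vma, pmd_t *pmd)
{
        spinlock_t *ptl;

        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;       /* not huge: fall back to the pte-level path */

        /* *pmd is a huge/swap/devmap entry and stays stable under ptl. */

        spin_unlock(ptl);
        return 1;
}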
2526 /*                                               1944 /*
2527  * Returns page table lock pointer if a given !! 1945  * Returns true if a given pud maps a thp, false otherwise.
2528  *                                               1946  *
2529  * Note that if it returns page table lock po !! 1947  * Note that if it returns true, this routine returns without unlocking page
2530  * unlocking page table lock. So callers must !! 1948  * table lock. So callers must unlock it.
2531  */                                              1949  */
2532 spinlock_t *__pud_trans_huge_lock(pud_t *pud,    1950 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2533 {                                                1951 {
2534         spinlock_t *ptl;                         1952         spinlock_t *ptl;
2535                                                  1953 
2536         ptl = pud_lock(vma->vm_mm, pud);         1954         ptl = pud_lock(vma->vm_mm, pud);
2537         if (likely(pud_trans_huge(*pud) || pu    1955         if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2538                 return ptl;                      1956                 return ptl;
2539         spin_unlock(ptl);                        1957         spin_unlock(ptl);
2540         return NULL;                             1958         return NULL;
2541 }                                                1959 }
2542                                                  1960 
2543 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    1961 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2544 int zap_huge_pud(struct mmu_gather *tlb, stru    1962 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2545                  pud_t *pud, unsigned long ad    1963                  pud_t *pud, unsigned long addr)
2546 {                                                1964 {
2547         spinlock_t *ptl;                      << 
2548         pud_t orig_pud;                          1965         pud_t orig_pud;
                                                   >> 1966         spinlock_t *ptl;
2549                                                  1967 
2550         ptl = __pud_trans_huge_lock(pud, vma)    1968         ptl = __pud_trans_huge_lock(pud, vma);
2551         if (!ptl)                                1969         if (!ptl)
2552                 return 0;                        1970                 return 0;
2553                                               !! 1971         /*
2554         orig_pud = pudp_huge_get_and_clear_fu !! 1972          * For architectures like ppc64 we look at deposited pgtable
2555         arch_check_zapped_pud(vma, orig_pud); !! 1973          * when calling pudp_huge_get_and_clear. So do the
                                                   >> 1974          * pgtable_trans_huge_withdraw after finishing pudp related
                                                   >> 1975          * operations.
                                                   >> 1976          */
                                                   >> 1977         orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
                                                   >> 1978                         tlb->fullmm);
2556         tlb_remove_pud_tlb_entry(tlb, pud, ad    1979         tlb_remove_pud_tlb_entry(tlb, pud, addr);
2557         if (vma_is_special_huge(vma)) {       !! 1980         if (vma_is_dax(vma)) {
2558                 spin_unlock(ptl);                1981                 spin_unlock(ptl);
2559                 /* No zero page support yet *    1982                 /* No zero page support yet */
2560         } else {                                 1983         } else {
2561                 /* No support for anonymous P    1984                 /* No support for anonymous PUD pages yet */
2562                 BUG();                           1985                 BUG();
2563         }                                        1986         }
2564         return 1;                                1987         return 1;
2565 }                                                1988 }
2566                                                  1989 
2567 static void __split_huge_pud_locked(struct vm    1990 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2568                 unsigned long haddr)             1991                 unsigned long haddr)
2569 {                                                1992 {
2570         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);      1993         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2571         VM_BUG_ON_VMA(vma->vm_start > haddr,     1994         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2572         VM_BUG_ON_VMA(vma->vm_end < haddr + H    1995         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2573         VM_BUG_ON(!pud_trans_huge(*pud) && !p    1996         VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2574                                                  1997 
2575         count_vm_event(THP_SPLIT_PUD);           1998         count_vm_event(THP_SPLIT_PUD);
2576                                                  1999 
2577         pudp_huge_clear_flush(vma, haddr, pud !! 2000         pudp_huge_clear_flush_notify(vma, haddr, pud);
2578 }                                                2001 }
2579                                                  2002 
2580 void __split_huge_pud(struct vm_area_struct *    2003 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2581                 unsigned long address)           2004                 unsigned long address)
2582 {                                                2005 {
2583         spinlock_t *ptl;                         2006         spinlock_t *ptl;
2584         struct mmu_notifier_range range;      !! 2007         struct mm_struct *mm = vma->vm_mm;
                                                   >> 2008         unsigned long haddr = address & HPAGE_PUD_MASK;
2585                                                  2009 
2586         mmu_notifier_range_init(&range, MMU_N !! 2010         mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
2587                                 address & HPA !! 2011         ptl = pud_lock(mm, pud);
2588                                 (address & HP << 
2589         mmu_notifier_invalidate_range_start(& << 
2590         ptl = pud_lock(vma->vm_mm, pud);      << 
2591         if (unlikely(!pud_trans_huge(*pud) &&    2012         if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2592                 goto out;                        2013                 goto out;
2593         __split_huge_pud_locked(vma, pud, ran !! 2014         __split_huge_pud_locked(vma, pud, haddr);
2594                                                  2015 
2595 out:                                             2016 out:
2596         spin_unlock(ptl);                        2017         spin_unlock(ptl);
2597         mmu_notifier_invalidate_range_end(&ra !! 2018         /*
2598 }                                             !! 2019          * No need to double call mmu_notifier->invalidate_range() callback as
2599 #else                                         !! 2020          * the above pudp_huge_clear_flush_notify() did already call it.
2600 void __split_huge_pud(struct vm_area_struct * !! 2021          */
2601                 unsigned long address)        !! 2022         mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
2602 {                                             !! 2023                                                HPAGE_PUD_SIZE);
2603 }                                                2024 }
2604 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    2025 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2605                                                  2026 
2606 static void __split_huge_zero_page_pmd(struct    2027 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2607                 unsigned long haddr, pmd_t *p    2028                 unsigned long haddr, pmd_t *pmd)
2608 {                                                2029 {
2609         struct mm_struct *mm = vma->vm_mm;       2030         struct mm_struct *mm = vma->vm_mm;
2610         pgtable_t pgtable;                       2031         pgtable_t pgtable;
2611         pmd_t _pmd, old_pmd;                  !! 2032         pmd_t _pmd;
2612         unsigned long addr;                   << 
2613         pte_t *pte;                           << 
2614         int i;                                   2033         int i;
2615                                                  2034 
2616         /*                                       2035         /*
2617          * Leave pmd empty until pte is fille    2036          * Leave pmd empty until pte is filled; note that it is fine to delay
2618          * notification until mmu_notifier_in    2037          * notification until mmu_notifier_invalidate_range_end() as we are
2619          * replacing a zero pmd write protect    2038          * replacing a zero pmd write protected page with a zero pte write
2620          * protected page.                       2039          * protected page.
2621          *                                       2040          *
2622          * See Documentation/mm/mmu_notifier. !! 2041          * See Documentation/vm/mmu_notifier.txt
2623          */                                      2042          */
2624         old_pmd = pmdp_huge_clear_flush(vma,  !! 2043         pmdp_huge_clear_flush(vma, haddr, pmd);
2625                                                  2044 
2626         pgtable = pgtable_trans_huge_withdraw    2045         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2627         pmd_populate(mm, &_pmd, pgtable);        2046         pmd_populate(mm, &_pmd, pgtable);
2628                                                  2047 
2629         pte = pte_offset_map(&_pmd, haddr);   !! 2048         for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2630         VM_BUG_ON(!pte);                      !! 2049                 pte_t *pte, entry;
2631         for (i = 0, addr = haddr; i < HPAGE_P !! 2050                 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2632                 pte_t entry;                  << 
2633                                               << 
2634                 entry = pfn_pte(my_zero_pfn(a << 
2635                 entry = pte_mkspecial(entry);    2051                 entry = pte_mkspecial(entry);
2636                 if (pmd_uffd_wp(old_pmd))     !! 2052                 pte = pte_offset_map(&_pmd, haddr);
2637                         entry = pte_mkuffd_wp !! 2053                 VM_BUG_ON(!pte_none(*pte));
2638                 VM_BUG_ON(!pte_none(ptep_get( !! 2054                 set_pte_at(mm, haddr, pte, entry);
2639                 set_pte_at(mm, addr, pte, ent !! 2055                 pte_unmap(pte);
2640                 pte++;                        << 
2641         }                                        2056         }
2642         pte_unmap(pte - 1);                   << 
2643         smp_wmb(); /* make pte visible before    2057         smp_wmb(); /* make pte visible before pmd */
2644         pmd_populate(mm, pmd, pgtable);          2058         pmd_populate(mm, pmd, pgtable);
2645 }                                                2059 }
2646                                                  2060 
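A hedged userspace sketch of a sequence that can exercise the zero-page split above: a read-only touch of a THP-eligible anonymous range may install the huge zero page, and a later operation on a sub-PMD range (here MADV_DONTNEED on one 4KiB page) forces that PMD to be rewritten as individual zero-page PTEs first. Sizes and the MADV_HUGEPAGE hint are illustrative assumptions.

#include <stdint.h>
#include <sys/mman.h>

#define SZ_2M (2UL * 1024 * 1024)

int main(void)
{
        char *map = mmap(NULL, 2 * SZ_2M, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map == MAP_FAILED)
                return 1;

        char *huge = (char *)(((uintptr_t)map + SZ_2M - 1) & ~(SZ_2M - 1));
        madvise(huge, SZ_2M, MADV_HUGEPAGE);

        volatile char c = huge[0];      /* read fault: may map the huge zero page */
        (void)c;

        /* Sub-PMD zap: the zero pmd has to be split into ptes first. */
        madvise(huge + 4096, 4096, MADV_DONTNEED);
        return 0;
}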
2647 static void __split_huge_pmd_locked(struct vm    2061 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2648                 unsigned long haddr, bool fre    2062                 unsigned long haddr, bool freeze)
2649 {                                                2063 {
2650         struct mm_struct *mm = vma->vm_mm;       2064         struct mm_struct *mm = vma->vm_mm;
2651         struct folio *folio;                  << 
2652         struct page *page;                       2065         struct page *page;
2653         pgtable_t pgtable;                       2066         pgtable_t pgtable;
2654         pmd_t old_pmd, _pmd;                     2067         pmd_t old_pmd, _pmd;
2655         bool young, write, soft_dirty, pmd_mi !! 2068         bool young, write, soft_dirty, pmd_migration = false;
2656         bool anon_exclusive = false, dirty =  << 
2657         unsigned long addr;                      2069         unsigned long addr;
2658         pte_t *pte;                           << 
2659         int i;                                   2070         int i;
2660                                                  2071 
2661         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);      2072         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2662         VM_BUG_ON_VMA(vma->vm_start > haddr,     2073         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2663         VM_BUG_ON_VMA(vma->vm_end < haddr + H    2074         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2664         VM_BUG_ON(!is_pmd_migration_entry(*pm    2075         VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2665                                 && !pmd_devma    2076                                 && !pmd_devmap(*pmd));
2666                                                  2077 
2667         count_vm_event(THP_SPLIT_PMD);           2078         count_vm_event(THP_SPLIT_PMD);
2668                                                  2079 
2669         if (!vma_is_anonymous(vma)) {            2080         if (!vma_is_anonymous(vma)) {
2670                 old_pmd = pmdp_huge_clear_flu !! 2081                 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2671                 /*                               2082                 /*
2672                  * We are going to unmap this    2083                  * We are going to unmap this huge page. So
2673                  * just go ahead and zap it      2084                  * just go ahead and zap it
2674                  */                              2085                  */
2675                 if (arch_needs_pgtable_deposi    2086                 if (arch_needs_pgtable_deposit())
2676                         zap_deposited_table(m    2087                         zap_deposited_table(mm, pmd);
2677                 if (vma_is_special_huge(vma)) !! 2088                 if (vma_is_dax(vma))
2678                         return;                  2089                         return;
2679                 if (unlikely(is_pmd_migration !! 2090                 page = pmd_page(_pmd);
2680                         swp_entry_t entry;    !! 2091                 if (!PageReferenced(page) && pmd_young(_pmd))
2681                                               !! 2092                         SetPageReferenced(page);
2682                         entry = pmd_to_swp_en !! 2093                 page_remove_rmap(page, true);
2683                         folio = pfn_swap_entr !! 2094                 put_page(page);
2684                 } else {                      !! 2095                 add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
2685                         page = pmd_page(old_p << 
2686                         folio = page_folio(pa << 
2687                         if (!folio_test_dirty << 
2688                                 folio_mark_di << 
2689                         if (!folio_test_refer << 
2690                                 folio_set_ref << 
2691                         folio_remove_rmap_pmd << 
2692                         folio_put(folio);     << 
2693                 }                             << 
2694                 add_mm_counter(mm, mm_counter << 
2695                 return;                          2096                 return;
2696         }                                     !! 2097         } else if (is_huge_zero_pmd(*pmd)) {
2697                                               << 
2698         if (is_huge_zero_pmd(*pmd)) {         << 
2699                 /*                               2098                 /*
2700                  * FIXME: Do we want to inval    2099                  * FIXME: Do we want to invalidate secondary mmu by calling
2701                  * mmu_notifier_arch_invalida !! 2100                  * mmu_notifier_invalidate_range() see comments below inside
2702                  * inside __split_huge_pmd()  !! 2101                  * __split_huge_pmd() ?
2703                  *                               2102                  *
2704                  * We are going from a zero h    2103                  * We are going from a zero huge page write protected to zero
2705                  * small page also write prot    2104                  * small page also write protected so it does not seem useful
2706                  * to invalidate secondary mm    2105                  * to invalidate secondary mmu at this time.
2707                  */                              2106                  */
2708                 return __split_huge_zero_page    2107                 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2709         }                                        2108         }
2710                                                  2109 
2711         pmd_migration = is_pmd_migration_entr !! 2110         /*
2712         if (unlikely(pmd_migration)) {        !! 2111          * Up to this point the pmd is present and huge and userland has the
                                                   >> 2112          * whole access to the hugepage during the split (which happens in
                                                   >> 2113          * place). If we overwrite the pmd with the not-huge version pointing
                                                   >> 2114          * to the pte here (which of course we could if all CPUs were bug
                                                   >> 2115          * free), userland could trigger a small page size TLB miss on the
                                                   >> 2116          * small sized TLB while the hugepage TLB entry is still established in
                                                   >> 2117          * the huge TLB. Some CPUs don't like that.
                                                   >> 2118          * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
                                                   >> 2119          * 383 on page 93. Intel should be safe but it also warns that it's
                                                   >> 2120          * only safe if the permission and cache attributes of the two entries
                                                   >> 2121          * loaded in the two TLB is identical (which should be the case here).
                                                   >> 2122          * But it is generally safer to never allow small and huge TLB entries
                                                   >> 2123          * for the same virtual address to be loaded simultaneously. So instead
                                                   >> 2124          * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
                                                   >> 2125          * current pmd notpresent (atomically because here the pmd_trans_huge
                                                   >> 2126          * must remain set at all times on the pmd until the split is complete
                                                   >> 2127          * for this pmd), then we flush the SMP TLB and finally we write the
                                                   >> 2128          * non-huge version of the pmd entry with pmd_populate.
                                                   >> 2129          */
                                                   >> 2130         old_pmd = pmdp_invalidate(vma, haddr, pmd);
                                                   >> 2131 
                                                   >> 2132 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                                                   >> 2133         pmd_migration = is_pmd_migration_entry(old_pmd);
                                                   >> 2134         if (pmd_migration) {
2713                 swp_entry_t entry;               2135                 swp_entry_t entry;
2714                                                  2136 
2715                 old_pmd = *pmd;               << 
2716                 entry = pmd_to_swp_entry(old_    2137                 entry = pmd_to_swp_entry(old_pmd);
2717                 page = pfn_swap_entry_to_page !! 2138                 page = pfn_to_page(swp_offset(entry));
2718                 write = is_writable_migration !! 2139         } else
2719                 if (PageAnon(page))           !! 2140 #endif
2720                         anon_exclusive = is_r << 
2721                 young = is_migration_entry_yo << 
2722                 dirty = is_migration_entry_di << 
2723                 soft_dirty = pmd_swp_soft_dir << 
2724                 uffd_wp = pmd_swp_uffd_wp(old << 
2725         } else {                              << 
2726                 /*                            << 
2727                  * Up to this point the pmd i << 
2728                  * the whole access to the hu << 
2729                  * happens in place). If we o << 
2730                  * version pointing to the pt << 
2731                  * all CPUs were bug free), u << 
2732                  * size TLB miss on the small << 
2733                  * entry is still established << 
2734                  * like that. See             << 
2735                  * http://support.amd.com/Tec << 
2736                  * 383 on page 105. Intel sho << 
2737                  * it's only safe if the perm << 
2738                  * two entries loaded in the  << 
2739                  * be the case here). But it  << 
2740                  * small and huge TLB entries << 
2741                  * loaded simultaneously. So  << 
2742                  * flush_pmd_tlb_range();" we << 
2743                  * notpresent (atomically bec << 
2744                  * remain set at all times on << 
2745                  * complete for this pmd), th << 
2746                  * we write the non-huge vers << 
2747                  * pmd_populate.              << 
2748                  */                           << 
2749                 old_pmd = pmdp_invalidate(vma << 
2750                 page = pmd_page(old_pmd);        2141                 page = pmd_page(old_pmd);
2751                 folio = page_folio(page);     !! 2142         VM_BUG_ON_PAGE(!page_count(page), page);
2752                 if (pmd_dirty(old_pmd)) {     !! 2143         page_ref_add(page, HPAGE_PMD_NR - 1);
2753                         dirty = true;         !! 2144         if (pmd_dirty(old_pmd))
2754                         folio_set_dirty(folio !! 2145                 SetPageDirty(page);
2755                 }                             !! 2146         write = pmd_write(old_pmd);
2756                 write = pmd_write(old_pmd);   !! 2147         young = pmd_young(old_pmd);
2757                 young = pmd_young(old_pmd);   !! 2148         soft_dirty = pmd_soft_dirty(old_pmd);
2758                 soft_dirty = pmd_soft_dirty(o << 
2759                 uffd_wp = pmd_uffd_wp(old_pmd << 
2760                                               << 
2761                 VM_WARN_ON_FOLIO(!folio_ref_c << 
2762                 VM_WARN_ON_FOLIO(!folio_test_ << 
2763                                               << 
2764                 /*                            << 
2765                  * Without "freeze", we'll si << 
2766                  * PageAnonExclusive() flag f << 
2767                  * each subpage -- no need to << 
2768                  *                            << 
2769                  * With "freeze" we want to r << 
2770                  * migration entries right aw << 
2771                  * managed to clear PageAnonE << 
2772                  * set_pmd_migration_entry(). << 
2773                  *                            << 
2774                  * In case we cannot clear Pa << 
2775                  * only and let try_to_migrat << 
2776                  *                            << 
2777                  * See folio_try_share_anon_r << 
2778                  */                           << 
2779                 anon_exclusive = PageAnonExcl << 
2780                 if (freeze && anon_exclusive  << 
2781                     folio_try_share_anon_rmap << 
2782                         freeze = false;       << 
2783                 if (!freeze) {                << 
2784                         rmap_t rmap_flags = R << 
2785                                               << 
2786                         folio_ref_add(folio,  << 
2787                         if (anon_exclusive)   << 
2788                                 rmap_flags |= << 
2789                         folio_add_anon_rmap_p << 
2790                                               << 
2791                 }                             << 
2792         }                                     << 
2793                                                  2149 
2794         /*                                       2150         /*
2795          * Withdraw the table only after we m    2151          * Withdraw the table only after we mark the pmd entry invalid.
2796          * This's critical for some architect    2152          * This's critical for some architectures (Power).
2797          */                                      2153          */
2798         pgtable = pgtable_trans_huge_withdraw    2154         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2799         pmd_populate(mm, &_pmd, pgtable);        2155         pmd_populate(mm, &_pmd, pgtable);
2800                                                  2156 
2801         pte = pte_offset_map(&_pmd, haddr);   !! 2157         for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2802         VM_BUG_ON(!pte);                      !! 2158                 pte_t entry, *pte;
2803                                               !! 2159                 /*
2804         /*                                    !! 2160                  * Note that NUMA hinting access restrictions are not
2805          * Note that NUMA hinting access rest !! 2161                  * transferred to avoid any possibility of altering
2806          * avoid any possibility of altering  !! 2162                  * permissions across VMAs.
2807          */                                   !! 2163                  */
2808         if (freeze || pmd_migration) {        !! 2164                 if (freeze || pmd_migration) {
2809                 for (i = 0, addr = haddr; i < << 
2810                         pte_t entry;          << 
2811                         swp_entry_t swp_entry    2165                         swp_entry_t swp_entry;
2812                                               !! 2166                         swp_entry = make_migration_entry(page + i, write);
2813                         if (write)            << 
2814                                 swp_entry = m << 
2815                                               << 
2816                         else if (anon_exclusi << 
2817                                 swp_entry = m << 
2818                                               << 
2819                         else                  << 
2820                                 swp_entry = m << 
2821                                               << 
2822                         if (young)            << 
2823                                 swp_entry = m << 
2824                         if (dirty)            << 
2825                                 swp_entry = m << 
2826                         entry = swp_entry_to_    2167                         entry = swp_entry_to_pte(swp_entry);
2827                         if (soft_dirty)          2168                         if (soft_dirty)
2828                                 entry = pte_s    2169                                 entry = pte_swp_mksoft_dirty(entry);
2829                         if (uffd_wp)          !! 2170                 } else {
2830                                 entry = pte_s !! 2171                         entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
2831                                               !! 2172                         entry = maybe_mkwrite(entry, vma);
2832                         VM_WARN_ON(!pte_none( !! 2173                         if (!write)
2833                         set_pte_at(mm, addr,  !! 2174                                 entry = pte_wrprotect(entry);
                                                   >> 2175                         if (!young)
                                                   >> 2176                                 entry = pte_mkold(entry);
                                                   >> 2177                         if (soft_dirty)
                                                   >> 2178                                 entry = pte_mksoft_dirty(entry);
2834                 }                                2179                 }
2835         } else {                              !! 2180                 pte = pte_offset_map(&_pmd, addr);
2836                 pte_t entry;                  !! 2181                 BUG_ON(!pte_none(*pte));
2837                                               !! 2182                 set_pte_at(mm, addr, pte, entry);
2838                 entry = mk_pte(page, READ_ONC !! 2183                 atomic_inc(&page[i]._mapcount);
2839                 if (write)                    !! 2184                 pte_unmap(pte);
2840                         entry = pte_mkwrite(e !! 2185         }
2841                 if (!young)                   << 
2842                         entry = pte_mkold(ent << 
2843                 /* NOTE: this may set soft-di << 
2844                 if (dirty)                    << 
2845                         entry = pte_mkdirty(e << 
2846                 if (soft_dirty)               << 
2847                         entry = pte_mksoft_di << 
2848                 if (uffd_wp)                  << 
2849                         entry = pte_mkuffd_wp << 
2850                                                  2186 
                                                   >> 2187         /*
                                                   >> 2188          * Set PG_double_map before dropping compound_mapcount to avoid
                                                   >> 2189          * false-negative page_mapped().
                                                   >> 2190          */
                                                   >> 2191         if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
2851                 for (i = 0; i < HPAGE_PMD_NR;    2192                 for (i = 0; i < HPAGE_PMD_NR; i++)
2852                         VM_WARN_ON(!pte_none( !! 2193                         atomic_inc(&page[i]._mapcount);
2853                                               << 
2854                 set_ptes(mm, haddr, pte, entr << 
2855         }                                        2194         }
2856         pte_unmap(pte);                       << 
2857                                                  2195 
2858         if (!pmd_migration)                   !! 2196         if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
2859                 folio_remove_rmap_pmd(folio,  !! 2197                 /* Last compound_mapcount is gone. */
2860         if (freeze)                           !! 2198                 __dec_node_page_state(page, NR_ANON_THPS);
2861                 put_page(page);               !! 2199                 if (TestClearPageDoubleMap(page)) {
                                                   >> 2200                         /* No need in mapcount reference anymore */
                                                   >> 2201                         for (i = 0; i < HPAGE_PMD_NR; i++)
                                                   >> 2202                                 atomic_dec(&page[i]._mapcount);
                                                   >> 2203                 }
                                                   >> 2204         }
2862                                                  2205 
2863         smp_wmb(); /* make pte visible before    2206         smp_wmb(); /* make pte visible before pmd */
2864         pmd_populate(mm, pmd, pgtable);          2207         pmd_populate(mm, pmd, pgtable);
2865 }                                             << 
2866                                                  2208 
2867 void split_huge_pmd_locked(struct vm_area_str !! 2209         if (freeze) {
2868                            pmd_t *pmd, bool f !! 2210                 for (i = 0; i < HPAGE_PMD_NR; i++) {
2869 {                                             !! 2211                         page_remove_rmap(page + i, false);
2870         VM_WARN_ON_ONCE(folio && !folio_test_ !! 2212                         put_page(page + i);
2871         VM_WARN_ON_ONCE(!IS_ALIGNED(address,  !! 2213                 }
2872         VM_WARN_ON_ONCE(folio && !folio_test_ << 
2873         VM_BUG_ON(freeze && !folio);          << 
2874                                               << 
2875         /*                                    << 
2876          * When the caller requests to set up << 
2877          * require a folio to check the PMD a << 
2878          * is a risk of replacing the wrong f << 
2879          */                                   << 
2880         if (pmd_trans_huge(*pmd) || pmd_devma << 
2881             is_pmd_migration_entry(*pmd)) {   << 
2882                 if (folio && folio != pmd_fol << 
2883                         return;               << 
2884                 __split_huge_pmd_locked(vma,  << 
2885         }                                        2214         }
2886 }                                                2215 }
2887                                                  2216 
2888 void __split_huge_pmd(struct vm_area_struct *    2217 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2889                 unsigned long address, bool f !! 2218                 unsigned long address, bool freeze, struct page *page)
2890 {                                                2219 {
2891         spinlock_t *ptl;                         2220         spinlock_t *ptl;
2892         struct mmu_notifier_range range;      !! 2221         struct mm_struct *mm = vma->vm_mm;
                                                   >> 2222         unsigned long haddr = address & HPAGE_PMD_MASK;
2893                                                  2223 
2894         mmu_notifier_range_init(&range, MMU_N !! 2224         mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
2895                                 address & HPA !! 2225         ptl = pmd_lock(mm, pmd);
2896                                 (address & HP !! 2226 
2897         mmu_notifier_invalidate_range_start(& !! 2227         /*
2898         ptl = pmd_lock(vma->vm_mm, pmd);      !! 2228          * If caller asks to setup a migration entries, we need a page to check
2899         split_huge_pmd_locked(vma, range.star !! 2229          * pmd against. Otherwise we can end up replacing wrong page.
                                                   >> 2230          */
                                                   >> 2231         VM_BUG_ON(freeze && !page);
                                                   >> 2232         if (page && page != pmd_page(*pmd))
                                                   >> 2233                 goto out;
                                                   >> 2234 
                                                   >> 2235         if (pmd_trans_huge(*pmd)) {
                                                   >> 2236                 page = pmd_page(*pmd);
                                                   >> 2237                 if (PageMlocked(page))
                                                   >> 2238                         clear_page_mlock(page);
                                                   >> 2239         } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
                                                   >> 2240                 goto out;
                                                   >> 2241         __split_huge_pmd_locked(vma, pmd, haddr, freeze);
                                                   >> 2242 out:
2900         spin_unlock(ptl);                        2243         spin_unlock(ptl);
2901         mmu_notifier_invalidate_range_end(&ra !! 2244         /*
                                                   >> 2245          * No need to double call mmu_notifier->invalidate_range() callback.
                                                   >> 2246          * There are 3 cases to consider inside __split_huge_pmd_locked():
                                                   >> 2247          *  1) pmdp_huge_clear_flush_notify() calls invalidate_range(), obviously
                                                   >> 2248          *  2) __split_huge_zero_page_pmd() read only zero page and any write
                                                   >> 2249          *    fault will trigger a flush_notify before pointing to a new page
                                                   >> 2250          *    (it is fine if the secondary mmu keeps pointing to the old zero
                                                   >> 2251          *    page in the meantime)
                                                   >> 2252          *  3) Split a huge pmd into ptes pointing to the same page. No need
                                                   >> 2253          *     to invalidate secondary tlb entries; they are all still valid.
                                                   >> 2254          *     Any further changes to individual ptes will notify. So no need
                                                   >> 2255          *     to call mmu_notifier->invalidate_range()
                                                   >> 2256          */
                                                   >> 2257         mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
                                                   >> 2258                                                HPAGE_PMD_SIZE);
2902 }                                                2259 }
2903                                                  2260 
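Condensing the ordering that the erratum comment in __split_huge_pmd_locked() (above) insists on, a minimal kernel-style sketch; rmap, migration-entry and dirty/uffd-wp bookkeeping are deliberately omitted, and the function name is made up for illustration:

static void split_order_sketch(struct vm_area_struct *vma, pmd_t *pmd,
                               unsigned long haddr)
{
        struct mm_struct *mm = vma->vm_mm;
        pgtable_t pgtable;
        pmd_t _pmd;

        /* 1) Make the pmd not-present and flush the TLB (the returned old
         *    value is what the real code mines for dirty/young/write bits). */
        pmdp_invalidate(vma, haddr, pmd);
        /* 2) Withdraw the deposited pte table only after invalidating. */
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        /* 3) Build the ptes against a local pmd, off to the side. */
        pmd_populate(mm, &_pmd, pgtable);
        /* ... set_pte_at() for each of the HPAGE_PMD_NR subpages ... */
        /* 4) Make the ptes visible before exposing the pte table. */
        smp_wmb();
        pmd_populate(mm, pmd, pgtable);
}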
2904 void split_huge_pmd_address(struct vm_area_st    2261 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2905                 bool freeze, struct folio *fo !! 2262                 bool freeze, struct page *page)
2906 {                                                2263 {
2907         pmd_t *pmd = mm_find_pmd(vma->vm_mm,  !! 2264         pgd_t *pgd;
                                                   >> 2265         p4d_t *p4d;
                                                   >> 2266         pud_t *pud;
                                                   >> 2267         pmd_t *pmd;
2908                                                  2268 
2909         if (!pmd)                             !! 2269         pgd = pgd_offset(vma->vm_mm, address);
                                                   >> 2270         if (!pgd_present(*pgd))
2910                 return;                          2271                 return;
2911                                                  2272 
2912         __split_huge_pmd(vma, pmd, address, f !! 2273         p4d = p4d_offset(pgd, address);
2913 }                                             !! 2274         if (!p4d_present(*p4d))
                                                   >> 2275                 return;
2914                                                  2276 
2915 static inline void split_huge_pmd_if_needed(s !! 2277         pud = pud_offset(p4d, address);
2916 {                                             !! 2278         if (!pud_present(*pud))
2917         /*                                    !! 2279                 return;
2918          * If the new address isn't hpage ali !! 2280 
2919          * contain an hugepage: check if we n !! 2281         pmd = pmd_offset(pud, address);
2920          */                                   !! 2282 
2921         if (!IS_ALIGNED(address, HPAGE_PMD_SI !! 2283         __split_huge_pmd(vma, pmd, address, freeze, page);
2922             range_in_vma(vma, ALIGN_DOWN(addr << 
2923                          ALIGN(address, HPAGE << 
2924                 split_huge_pmd_address(vma, a << 
2925 }                                                2284 }
2926                                                  2285 
2927 void vma_adjust_trans_huge(struct vm_area_str    2286 void vma_adjust_trans_huge(struct vm_area_struct *vma,
2928                              unsigned long st    2287                              unsigned long start,
2929                              unsigned long en    2288                              unsigned long end,
2930                              long adjust_next    2289                              long adjust_next)
2931 {                                                2290 {
2932         /* Check if we need to split start fi << 
2933         split_huge_pmd_if_needed(vma, start); << 
2934                                               << 
2935         /* Check if we need to split end next << 
2936         split_huge_pmd_if_needed(vma, end);   << 
2937                                               << 
2938         /*                                    << 
2939          * If we're also updating the next vm << 
2940          * check if we need to split it.      << 
2941          */                                   << 
2942         if (adjust_next > 0) {                << 
2943                 struct vm_area_struct *next = << 
2944                 unsigned long nstart = next-> << 
2945                 nstart += adjust_next;        << 
2946                 split_huge_pmd_if_needed(next << 
2947         }                                     << 
2948 }                                             << 
2949                                               << 
2950 static void unmap_folio(struct folio *folio)  << 
2951 {                                             << 
2952         enum ttu_flags ttu_flags = TTU_RMAP_L << 
2953                 TTU_BATCH_FLUSH;              << 
2954                                               << 
2955         VM_BUG_ON_FOLIO(!folio_test_large(fol << 
2956                                               << 
2957         if (folio_test_pmd_mappable(folio))   << 
2958                 ttu_flags |= TTU_SPLIT_HUGE_P << 
2959                                               << 
2960         /*                                    << 
2961          * Anon pages need migration entries  << 
2962          * pages can simply be left unmapped, << 
2963          * If that is ever changed (perhaps f << 
2964          */                                   << 
2965         if (folio_test_anon(folio))           << 
2966                 try_to_migrate(folio, ttu_fla << 
2967         else                                  << 
2968                 try_to_unmap(folio, ttu_flags << 
2969                                               << 
2970         try_to_unmap_flush();                 << 
2971 }                                             << 
2972                                               << 
2973 static bool __discard_anon_folio_pmd_locked(s << 
2974                                             u << 
2975                                             s << 
2976 {                                             << 
2977         struct mm_struct *mm = vma->vm_mm;    << 
2978         int ref_count, map_count;             << 
2979         pmd_t orig_pmd = *pmdp;               << 
2980                                               << 
2981         if (folio_test_dirty(folio) || pmd_di << 
2982                 return false;                 << 
2983                                               << 
2984         orig_pmd = pmdp_huge_clear_flush(vma, << 
2985                                               << 
2986         /*                                       2291         /*
2987          * Syncing against concurrent GUP-fas !! 2292          * If the new start address isn't hpage aligned and it could
2988          * - clear PMD; barrier; read refcoun !! 2293          * previously contain a hugepage: check if we need to split
2989          * - inc refcount; barrier; read PMD  !! 2294          * a huge pmd.
2990          */                                      2295          */
2991         smp_mb();                             !! 2296         if (start & ~HPAGE_PMD_MASK &&
2992                                               !! 2297             (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2993         ref_count = folio_ref_count(folio);   !! 2298             (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2994         map_count = folio_mapcount(folio);    !! 2299                 split_huge_pmd_address(vma, start, false, NULL);
2995                                                  2300 
2996         /*                                       2301         /*
2997          * Order reads for folio refcount and !! 2302          * If the new end address isn't hpage aligned and it could
2998          * (see comments in __remove_mapping( !! 2303          * previously contain a hugepage: check if we need to split
                                                   >> 2304          * a huge pmd.
2999          */                                      2305          */
3000         smp_rmb();                            !! 2306         if (end & ~HPAGE_PMD_MASK &&
                                                   >> 2307             (end & HPAGE_PMD_MASK) >= vma->vm_start &&
                                                   >> 2308             (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
                                                   >> 2309                 split_huge_pmd_address(vma, end, false, NULL);
3001                                                  2310 
3002         /*                                       2311         /*
3003          * If the folio or its PMD is redirti !! 2312          * If we're also updating the vma->vm_next->vm_start, if the new
3004          * are unexpected references, we will !! 2313          * vm_next->vm_start isn't page aligned and it could previously
3005          * and remap it.                      !! 2314          * contain a hugepage: check if we need to split a huge pmd.
3006          *                                    << 
3007          * The only folio refs must be one fr << 
3008          */                                      2315          */
3009         if (folio_test_dirty(folio) || pmd_di !! 2316         if (adjust_next > 0) {
3010             ref_count != map_count + 1) {     !! 2317                 struct vm_area_struct *next = vma->vm_next;
3011                 set_pmd_at(mm, addr, pmdp, or !! 2318                 unsigned long nstart = next->vm_start;
3012                 return false;                 !! 2319                 nstart += adjust_next << PAGE_SHIFT;
                                                   >> 2320                 if (nstart & ~HPAGE_PMD_MASK &&
                                                   >> 2321                     (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
                                                   >> 2322                     (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
                                                   >> 2323                         split_huge_pmd_address(next, nstart, false, NULL);
3013         }                                        2324         }
3014                                               << 
3015         folio_remove_rmap_pmd(folio, pmd_page << 
3016         zap_deposited_table(mm, pmdp);        << 
3017         add_mm_counter(mm, MM_ANONPAGES, -HPA << 
3018         if (vma->vm_flags & VM_LOCKED)        << 
3019                 mlock_drain_local();          << 
3020         folio_put(folio);                     << 
3021                                               << 
3022         return true;                          << 
3023 }                                                2325 }
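
The comment in __discard_anon_folio_pmd_locked() (6.12 column) pairs "clear PMD; barrier; read refcount" on the zap side against "inc refcount; barrier; read PMD" on the GUP-fast side, so at least one of the two parties notices the other. A minimal userspace sketch of that pairing using C11 seq_cst atomics; pmd_cleared, refcount and the thread names are stand-ins, not kernel objects:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pmd_cleared;        /* set after the "clear PMD" step */
static atomic_int refcount = 1;       /* stand-in for the folio refcount */
static int zap_saw_extra_ref, gup_saw_cleared_pmd;

static void *zap_side(void *arg)
{
	(void)arg;
	atomic_store(&pmd_cleared, 1);                   /* clear PMD; seq_cst barrier */
	zap_saw_extra_ref = atomic_load(&refcount) > 1;  /* then read the refcount */
	return NULL;
}

static void *gup_side(void *arg)
{
	(void)arg;
	atomic_fetch_add(&refcount, 1);                  /* inc refcount; seq_cst barrier */
	gup_saw_cleared_pmd = atomic_load(&pmd_cleared); /* then re-check the PMD */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, zap_side, NULL);
	pthread_create(&b, NULL, gup_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With full ordering on both sides, at least one flag is always set:
	 * the zap side cannot miss the new reference while the GUP side also
	 * misses the cleared PMD.
	 */
	printf("zap_saw_extra_ref=%d gup_saw_cleared_pmd=%d\n",
	       zap_saw_extra_ref, gup_saw_cleared_pmd);
	return 0;
}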
3024                                                  2326 
3025 bool unmap_huge_pmd_locked(struct vm_area_str !! 2327 static void freeze_page(struct page *page)
3026                            pmd_t *pmdp, struc << 
3027 {                                                2328 {
3028         VM_WARN_ON_FOLIO(!folio_test_pmd_mapp !! 2329         enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
3029         VM_WARN_ON_FOLIO(!folio_test_locked(f !! 2330                 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
3030         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPA !! 2331         bool unmap_success;
3031                                               << 
3032         if (folio_test_anon(folio) && !folio_ << 
3033                 return __discard_anon_folio_p << 
3034                                                  2332 
3035         return false;                         !! 2333         VM_BUG_ON_PAGE(!PageHead(page), page);
3036 }                                             << 
3037                                                  2334 
3038 static void remap_page(struct folio *folio, u !! 2335         if (PageAnon(page))
3039 {                                             !! 2336                 ttu_flags |= TTU_SPLIT_FREEZE;
3040         int i = 0;                            << 
3041                                                  2337 
3042         /* If unmap_folio() uses try_to_migra !! 2338         unmap_success = try_to_unmap(page, ttu_flags);
3043         if (!folio_test_anon(folio))          !! 2339         VM_BUG_ON_PAGE(!unmap_success, page);
3044                 return;                       << 
3045         for (;;) {                            << 
3046                 remove_migration_ptes(folio,  << 
3047                 i += folio_nr_pages(folio);   << 
3048                 if (i >= nr)                  << 
3049                         break;                << 
3050                 folio = folio_next(folio);    << 
3051         }                                     << 
3052 }                                                2340 }
3053                                                  2341 
3054 static void lru_add_page_tail(struct folio *f !! 2342 static void unfreeze_page(struct page *page)
3055                 struct lruvec *lruvec, struct << 
3056 {                                                2343 {
3057         VM_BUG_ON_FOLIO(!folio_test_large(fol !! 2344         int i;
3058         VM_BUG_ON_FOLIO(PageLRU(tail), folio) !! 2345         if (PageTransHuge(page)) {
3059         lockdep_assert_held(&lruvec->lru_lock !! 2346                 remove_migration_ptes(page, page, true);
3060                                               << 
3061         if (list) {                           << 
3062                 /* page reclaim is reclaiming << 
3063                 VM_WARN_ON(folio_test_lru(fol << 
3064                 get_page(tail);               << 
3065                 list_add_tail(&tail->lru, lis << 
3066         } else {                                 2347         } else {
3067                 /* head is still on lru (and  !! 2348                 for (i = 0; i < HPAGE_PMD_NR; i++)
3068                 VM_WARN_ON(!folio_test_lru(fo !! 2349                         remove_migration_ptes(page + i, page + i, true);
3069                 if (folio_test_unevictable(fo << 
3070                         tail->mlock_count = 0 << 
3071                 else                          << 
3072                         list_add_tail(&tail-> << 
3073                 SetPageLRU(tail);             << 
3074         }                                        2350         }
3075 }                                                2351 }
3076                                                  2352 
3077 static void __split_huge_page_tail(struct fol !! 2353 static void __split_huge_page_tail(struct page *head, int tail,
3078                 struct lruvec *lruvec, struct !! 2354                 struct lruvec *lruvec, struct list_head *list)
3079                 unsigned int new_order)       << 
3080 {                                                2355 {
3081         struct page *head = &folio->page;     << 
3082         struct page *page_tail = head + tail;    2356         struct page *page_tail = head + tail;
3083         /*                                    << 
3084          * Careful: new_folio is not a "real" << 
3085          * Don't pass it around before clear_ << 
3086          */                                   << 
3087         struct folio *new_folio = (struct fol << 
3088                                                  2357 
3089         VM_BUG_ON_PAGE(atomic_read(&page_tail    2358         VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
                                                   >> 2359         VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
3090                                                  2360 
3091         /*                                       2361         /*
3092          * Clone page flags before unfreezing !! 2362          * tail_page->_refcount is zero and not changing from under us. But
3093          *                                    !! 2363          * get_page_unless_zero() may be running from under us on the
3094          * After successful get_page_unless_z !! 2364          * tail_page. If we used atomic_set() below instead of atomic_inc() or
3095          * for example lock_page() which set  !! 2365          * atomic_add(), we would then run atomic_set() concurrently with
3096          *                                    !! 2366          * get_page_unless_zero(), and atomic_set() is implemented in C not
3097          * Note that for mapped sub-pages of  !! 2367          * using locked ops. spin_unlock on x86 sometimes uses locked ops
3098          * PG_anon_exclusive has been cleared !! 2368          * because of PPro errata 66, 92, so unless somebody can guarantee
3099          * the migration entry instead from w !! 2369          * atomic_set() here would be safe on all archs (and not only on x86),
3100          * We can still have PG_anon_exclusiv !! 2370          * it's safer to use atomic_inc()/atomic_add().
3101          * unreferenced sub-pages of an anony << 
3102          * PG_anon_exclusive (-> PG_mappedtod << 
3103          */                                      2371          */
                                                   >> 2372         if (PageAnon(head) && !PageSwapCache(head)) {
                                                   >> 2373                 page_ref_inc(page_tail);
                                                   >> 2374         } else {
                                                   >> 2375                 /* Additional pin to radix tree */
                                                   >> 2376                 page_ref_add(page_tail, 2);
                                                   >> 2377         }
                                                   >> 2378 
3104         page_tail->flags &= ~PAGE_FLAGS_CHECK    2379         page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3105         page_tail->flags |= (head->flags &       2380         page_tail->flags |= (head->flags &
3106                         ((1L << PG_referenced    2381                         ((1L << PG_referenced) |
3107                          (1L << PG_swapbacked    2382                          (1L << PG_swapbacked) |
3108                          (1L << PG_swapcache)    2383                          (1L << PG_swapcache) |
3109                          (1L << PG_mlocked) |    2384                          (1L << PG_mlocked) |
3110                          (1L << PG_uptodate)     2385                          (1L << PG_uptodate) |
3111                          (1L << PG_active) |     2386                          (1L << PG_active) |
3112                          (1L << PG_workingset << 
3113                          (1L << PG_locked) |     2387                          (1L << PG_locked) |
3114                          (1L << PG_unevictabl    2388                          (1L << PG_unevictable) |
3115 #ifdef CONFIG_ARCH_USES_PG_ARCH_2             !! 2389                          (1L << PG_dirty)));
3116                          (1L << PG_arch_2) |  << 
3117 #endif                                        << 
3118 #ifdef CONFIG_ARCH_USES_PG_ARCH_3             << 
3119                          (1L << PG_arch_3) |  << 
3120 #endif                                        << 
3121                          (1L << PG_dirty) |   << 
3122                          LRU_GEN_MASK | LRU_R << 
3123                                               << 
3124         /* ->mapping in first and second tail << 
3125         VM_BUG_ON_PAGE(tail > 2 && page_tail- << 
3126                         page_tail);           << 
3127         page_tail->mapping = head->mapping;   << 
3128         page_tail->index = head->index + tail << 
3129                                                  2390 
3130         /*                                       2391         /*
3131          * page->private should not be set in !! 2392          * After clearing PageTail the gup refcount can be released.
3132          * if private is unexpectedly set.    !! 2393          * Page flags also must be visible before we make the page non-compound.
3133          */                                      2394          */
3134         if (unlikely(page_tail->private)) {   << 
3135                 VM_WARN_ON_ONCE_PAGE(true, pa << 
3136                 page_tail->private = 0;       << 
3137         }                                     << 
3138         if (folio_test_swapcache(folio))      << 
3139                 new_folio->swap.val = folio-> << 
3140                                               << 
3141         /* Page flags must be visible before  << 
3142         smp_wmb();                               2395         smp_wmb();
3143                                                  2396 
3144         /*                                    << 
3145          * Clear PageTail before unfreezing p << 
3146          *                                    << 
3147          * After successful get_page_unless_z << 
3148          * which needs correct compound_head( << 
3149          */                                   << 
3150         clear_compound_head(page_tail);          2397         clear_compound_head(page_tail);
3151         if (new_order) {                      << 
3152                 prep_compound_page(page_tail, << 
3153                 folio_set_large_rmappable(new << 
3154         }                                     << 
3155                                                  2398 
3156         /* Finally unfreeze refcount. Additio !! 2399         if (page_is_young(head))
3157         page_ref_unfreeze(page_tail,          !! 2400                 set_page_young(page_tail);
3158                 1 + ((!folio_test_anon(folio) !! 2401         if (page_is_idle(head))
3159                              folio_nr_pages(n !! 2402                 set_page_idle(page_tail);
3160                                                  2403 
3161         if (folio_test_young(folio))          !! 2404         /* ->mapping in first tail page is compound_mapcount */
3162                 folio_set_young(new_folio);   !! 2405         VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
3163         if (folio_test_idle(folio))           !! 2406                         page_tail);
3164                 folio_set_idle(new_folio);    !! 2407         page_tail->mapping = head->mapping;
3165                                               << 
3166         folio_xchg_last_cpupid(new_folio, fol << 
3167                                                  2408 
3168         /*                                    !! 2409         page_tail->index = head->index + tail;
3169          * always add to the tail because som !! 2410         page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
3170          * pages to show after the currently  !! 2411         lru_add_page_tail(head, page_tail, lruvec, list);
3171          * migrate_pages                      << 
3172          */                                   << 
3173         lru_add_page_tail(folio, page_tail, l << 
3174 }                                                2412 }
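
The tail refcount handling differs in form but not in intent across the two versions: the 4.16 side bumps each tail with page_ref_inc()/page_ref_add(..., 2) before clearing PageTail, while the 6.12 side freezes the folio and later unfreezes each tail via page_ref_unfreeze() with 1 plus one reference per page when the folio is file-backed or in the swap cache. A small sketch of that expression, assuming new_nr is 1 when splitting all the way to base pages; tail_refs_after_split() is a made-up name:

#include <stdbool.h>
#include <stdio.h>

/* mirrors the page_ref_unfreeze() expression in the 6.12 column; when
 * new_nr == 1 it also matches the 4.16 page_ref_inc()/page_ref_add(..., 2)
 * pattern: anon-only tails get one reference, cache-backed tails get one
 * extra per page for the page cache / swap cache.
 */
static int tail_refs_after_split(bool is_anon, bool in_swapcache, int new_nr)
{
	return 1 + ((!is_anon || in_swapcache) ? new_nr : 0);
}

int main(void)
{
	printf("anon, not in swap cache: %d\n", tail_refs_after_split(true,  false, 1)); /* 1 */
	printf("anon, in swap cache:     %d\n", tail_refs_after_split(true,  true,  1)); /* 2 */
	printf("file-backed:             %d\n", tail_refs_after_split(false, false, 1)); /* 2 */
	return 0;
}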
3175                                                  2413 
3176 static void __split_huge_page(struct page *pa    2414 static void __split_huge_page(struct page *page, struct list_head *list,
3177                 pgoff_t end, unsigned int new !! 2415                 unsigned long flags)
3178 {                                                2416 {
3179         struct folio *folio = page_folio(page !! 2417         struct page *head = compound_head(page);
3180         struct page *head = &folio->page;     !! 2418         struct zone *zone = page_zone(head);
3181         struct lruvec *lruvec;                   2419         struct lruvec *lruvec;
3182         struct address_space *swap_cache = NU !! 2420         pgoff_t end = -1;
3183         unsigned long offset = 0;             !! 2421         int i;
3184         int i, nr_dropped = 0;                << 
3185         unsigned int new_nr = 1 << new_order; << 
3186         int order = folio_order(folio);       << 
3187         unsigned int nr = 1 << order;         << 
3188                                               << 
3189         /* complete memcg works before add pa << 
3190         split_page_memcg(head, order, new_ord << 
3191                                                  2422 
3192         if (folio_test_anon(folio) && folio_t !! 2423         lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
3193                 offset = swap_cache_index(fol << 
3194                 swap_cache = swap_address_spa << 
3195                 xa_lock(&swap_cache->i_pages) << 
3196         }                                     << 
3197                                                  2424 
3198         /* lock lru list/PageCompound, ref fr !! 2425         /* complete memcg works before add pages to LRU */
3199         lruvec = folio_lruvec_lock(folio);    !! 2426         mem_cgroup_split_huge_fixup(head);
3200                                                  2427 
3201         ClearPageHasHWPoisoned(head);         !! 2428         if (!PageAnon(page))
                                                   >> 2429                 end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
3202                                                  2430 
3203         for (i = nr - new_nr; i >= new_nr; i  !! 2431         for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
3204                 __split_huge_page_tail(folio, !! 2432                 __split_huge_page_tail(head, i, lruvec, list);
3205                 /* Some pages can be beyond E !! 2433                 /* Some pages can be beyond i_size: drop them from page cache */
3206                 if (head[i].index >= end) {      2434                 if (head[i].index >= end) {
3207                         struct folio *tail =  !! 2435                         ClearPageDirty(head + i);
3208                                               !! 2436                         __delete_from_page_cache(head + i, NULL);
3209                         if (shmem_mapping(fol !! 2437                         if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
3210                                 nr_dropped++; !! 2438                                 shmem_uncharge(head->mapping->host, 1);
3211                         else if (folio_test_c !! 2439                         put_page(head + i);
3212                                 folio_account << 
3213                                         inode << 
3214                         __filemap_remove_foli << 
3215                         folio_put(tail);      << 
3216                 } else if (!PageAnon(page)) { << 
3217                         __xa_store(&folio->ma << 
3218                                         head  << 
3219                 } else if (swap_cache) {      << 
3220                         __xa_store(&swap_cach << 
3221                                         head  << 
3222                 }                                2440                 }
3223         }                                        2441         }
3224                                                  2442 
3225         if (!new_order)                       !! 2443         ClearPageCompound(head);
3226                 ClearPageCompound(head);      << 
3227         else {                                << 
3228                 struct folio *new_folio = (st << 
3229                                               << 
3230                 folio_set_order(new_folio, ne << 
3231         }                                     << 
3232         unlock_page_lruvec(lruvec);           << 
3233         /* Caller disabled irqs, so they are  << 
3234                                               << 
3235         split_page_owner(head, order, new_ord << 
3236         pgalloc_tag_split(folio, order, new_o << 
3237                                               << 
3238         /* See comment in __split_huge_page_t    2444         /* See comment in __split_huge_page_tail() */
3239         if (folio_test_anon(folio)) {         !! 2445         if (PageAnon(head)) {
3240                 /* Additional pin to swap cac !! 2446                 /* Additional pin to radix tree of swap cache */
3241                 if (folio_test_swapcache(foli !! 2447                 if (PageSwapCache(head))
3242                         folio_ref_add(folio,  !! 2448                         page_ref_add(head, 2);
3243                         xa_unlock(&swap_cache !! 2449                 else
3244                 } else {                      !! 2450                         page_ref_inc(head);
3245                         folio_ref_inc(folio); << 
3246                 }                             << 
3247         } else {                                 2451         } else {
3248                 /* Additional pin to page cac !! 2452                 /* Additional pin to radix tree */
3249                 folio_ref_add(folio, 1 + new_ !! 2453                 page_ref_add(head, 2);
3250                 xa_unlock(&folio->mapping->i_ !! 2454                 spin_unlock(&head->mapping->tree_lock);
3251         }                                        2455         }
3252         local_irq_enable();                   << 
3253                                                  2456 
3254         if (nr_dropped)                       !! 2457         spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
3255                 shmem_uncharge(folio->mapping << 
3256         remap_page(folio, nr, PageAnon(head)  << 
3257                                                  2458 
3258         /*                                    !! 2459         unfreeze_page(head);
3259          * set page to its compound_head when << 
3260          * we can skip unlocking it below, si << 
3261          * the compound_head of the page and  << 
3262          */                                   << 
3263         if (new_order)                        << 
3264                 page = compound_head(page);   << 
3265                                                  2460 
3266         for (i = 0; i < nr; i += new_nr) {    !! 2461         for (i = 0; i < HPAGE_PMD_NR; i++) {
3267                 struct page *subpage = head +    2462                 struct page *subpage = head + i;
3268                 struct folio *new_folio = pag << 
3269                 if (subpage == page)             2463                 if (subpage == page)
3270                         continue;                2464                         continue;
3271                 folio_unlock(new_folio);      !! 2465                 unlock_page(subpage);
3272                                                  2466 
3273                 /*                               2467                 /*
3274                  * Subpages may be freed if t    2468                  * Subpages may be freed if there wasn't any mapping
3275                  * like if add_to_swap() is r    2469                  * like if add_to_swap() is running on a lru page that
3276                  * had its mapping zapped. An    2470                  * had its mapping zapped. And freeing these pages
3277                  * requires taking the lru_lo    2471                  * requires taking the lru_lock so we do the put_page
3278                  * of the tail pages after th    2472                  * of the tail pages after the split is complete.
3279                  */                              2473                  */
3280                 free_page_and_swap_cache(subp !! 2474                 put_page(subpage);
3281         }                                        2475         }
3282 }                                                2476 }
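
For file-backed folios, __split_huge_page() drops any tail whose index lands at or beyond EOF (end is derived from i_size in the 4.16 column, and computed earlier under the folio lock in the 6.12 column). A small sketch of that cut-off arithmetic, assuming 4 KiB base pages and a 512-page PMD folio; the file size and indices are invented for illustration:

#include <stdio.h>

#define PAGE_SIZE     4096UL
#define HPAGE_PMD_NR  512
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long i_size = 1536 * 1024;                    /* 1.5 MiB file */
	unsigned long head_index = 0;                          /* THP starts at offset 0 */
	unsigned long end = DIV_ROUND_UP(i_size, PAGE_SIZE);   /* first index past EOF */
	int dropped = 0;

	for (int i = 0; i < HPAGE_PMD_NR; i++)
		if (head_index + i >= end)
			dropped++;

	printf("end=%lu, dropped=%d of %d subpages\n", end, dropped, HPAGE_PMD_NR);
	/* 1.5 MiB / 4 KiB = 384, so indices 384..511 (128 pages) are dropped */
	return 0;
}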
3283                                                  2477 
                                                   >> 2478 int total_mapcount(struct page *page)
                                                   >> 2479 {
                                                   >> 2480         int i, compound, ret;
                                                   >> 2481 
                                                   >> 2482         VM_BUG_ON_PAGE(PageTail(page), page);
                                                   >> 2483 
                                                   >> 2484         if (likely(!PageCompound(page)))
                                                   >> 2485                 return atomic_read(&page->_mapcount) + 1;
                                                   >> 2486 
                                                   >> 2487         compound = compound_mapcount(page);
                                                   >> 2488         if (PageHuge(page))
                                                   >> 2489                 return compound;
                                                   >> 2490         ret = compound;
                                                   >> 2491         for (i = 0; i < HPAGE_PMD_NR; i++)
                                                   >> 2492                 ret += atomic_read(&page[i]._mapcount) + 1;
                                                   >> 2493         /* File pages have compound_mapcount included in _mapcount */
                                                   >> 2494         if (!PageAnon(page))
                                                   >> 2495                 return ret - compound * HPAGE_PMD_NR;
                                                   >> 2496         if (PageDoubleMap(page))
                                                   >> 2497                 ret -= HPAGE_PMD_NR;
                                                   >> 2498         return ret;
                                                   >> 2499 }
                                                   >> 2500 
                                                   >> 2501 /*
                                                   >> 2502  * This calculates accurately how many mappings a transparent hugepage
                                                   >> 2503  * has (unlike page_mapcount() which isn't fully accurate). This full
                                                   >> 2504  * accuracy is primarily needed to know if copy-on-write faults can
                                                   >> 2505  * reuse the page and change the mapping to read-write instead of
                                                   >> 2506  * copying it. At the same time this returns the total_mapcount too.
                                                   >> 2507  *
                                                   >> 2508  * The function returns the highest mapcount any one of the subpages
                                                   >> 2509  * has. If the return value is one, even if different processes are
                                                   >> 2510  * mapping different subpages of the transparent hugepage, they can
                                                   >> 2511  * all reuse it, because each process is reusing a different subpage.
                                                   >> 2512  *
                                                   >> 2513  * The total_mapcount is instead counting all virtual mappings of the
                                                   >> 2514  * subpages. If the total_mapcount is equal to "one", it tells the
                                                   >> 2515  * caller all mappings belong to the same "mm" and in turn the
                                                   >> 2516  * anon_vma of the transparent hugepage can become the vma->anon_vma
                                                   >> 2517  * local one as no other process may be mapping any of the subpages.
                                                   >> 2518  *
                                                   >> 2519  * It would be more accurate to replace page_mapcount() with
                                                   >> 2520  * page_trans_huge_mapcount(), however we only use
                                                   >> 2521  * page_trans_huge_mapcount() in the copy-on-write faults where we
                                                   >> 2522  * need full accuracy to avoid breaking page pinning, because
                                                   >> 2523  * page_trans_huge_mapcount() is slower than page_mapcount().
                                                   >> 2524  */
                                                   >> 2525 int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
                                                   >> 2526 {
                                                   >> 2527         int i, ret, _total_mapcount, mapcount;
                                                   >> 2528 
                                                   >> 2529         /* hugetlbfs shouldn't call it */
                                                   >> 2530         VM_BUG_ON_PAGE(PageHuge(page), page);
                                                   >> 2531 
                                                   >> 2532         if (likely(!PageTransCompound(page))) {
                                                   >> 2533                 mapcount = atomic_read(&page->_mapcount) + 1;
                                                   >> 2534                 if (total_mapcount)
                                                   >> 2535                         *total_mapcount = mapcount;
                                                   >> 2536                 return mapcount;
                                                   >> 2537         }
                                                   >> 2538 
                                                   >> 2539         page = compound_head(page);
                                                   >> 2540 
                                                   >> 2541         _total_mapcount = ret = 0;
                                                   >> 2542         for (i = 0; i < HPAGE_PMD_NR; i++) {
                                                   >> 2543                 mapcount = atomic_read(&page[i]._mapcount) + 1;
                                                   >> 2544                 ret = max(ret, mapcount);
                                                   >> 2545                 _total_mapcount += mapcount;
                                                   >> 2546         }
                                                   >> 2547         if (PageDoubleMap(page)) {
                                                   >> 2548                 ret -= 1;
                                                   >> 2549                 _total_mapcount -= HPAGE_PMD_NR;
                                                   >> 2550         }
                                                   >> 2551         mapcount = compound_mapcount(page);
                                                   >> 2552         ret += mapcount;
                                                   >> 2553         _total_mapcount += mapcount;
                                                   >> 2554         if (total_mapcount)
                                                   >> 2555                 *total_mapcount = _total_mapcount;
                                                   >> 2556         return ret;
                                                   >> 2557 }
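
The removed helpers in the right-hand column encode the old mapcount bookkeeping: subpage _mapcount values are stored biased by -1, file pages fold the compound mapcount into every subpage count, and PageDoubleMap anon pages need one HPAGE_PMD_NR correction. A sketch of the total_mapcount() arithmetic as a plain function; old_total_mapcount() and the sample inputs are illustrative only:

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512   /* assumed: 2 MiB THP of 4 KiB pages */

static int old_total_mapcount(int compound_mapcount,
			      const int *subpage_mapcount, /* raw _mapcount, biased by -1 */
			      bool is_anon, bool double_map)
{
	int ret = compound_mapcount;

	for (int i = 0; i < HPAGE_PMD_NR; i++)
		ret += subpage_mapcount[i] + 1;

	if (!is_anon)                  /* file: compound already folded into each subpage */
		return ret - compound_mapcount * HPAGE_PMD_NR;
	if (double_map)                /* PageDoubleMap correction for anon */
		ret -= HPAGE_PMD_NR;
	return ret;
}

int main(void)
{
	int subpages[HPAGE_PMD_NR];

	for (int i = 0; i < HPAGE_PMD_NR; i++)
		subpages[i] = -1;      /* no PTE-level mappings: raw _mapcount stays -1 */

	/* anon THP mapped once as a PMD */
	printf("%d\n", old_total_mapcount(1, subpages, true, false));  /* 1 */
	/* same THP PMD-mapped by two processes */
	printf("%d\n", old_total_mapcount(2, subpages, true, false));  /* 2 */
	return 0;
}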
                                                   >> 2558 
3284 /* Racy check whether the huge page can be sp    2559 /* Racy check whether the huge page can be split */
3285 bool can_split_folio(struct folio *folio, int !! 2560 bool can_split_huge_page(struct page *page, int *pextra_pins)
3286 {                                                2561 {
3287         int extra_pins;                          2562         int extra_pins;
3288                                                  2563 
3289         /* Additional pins from page cache */ !! 2564         /* Additional pins from radix tree */
3290         if (folio_test_anon(folio))           !! 2565         if (PageAnon(page))
3291                 extra_pins = folio_test_swapc !! 2566                 extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
3292                                 folio_nr_page << 
3293         else                                     2567         else
3294                 extra_pins = folio_nr_pages(f !! 2568                 extra_pins = HPAGE_PMD_NR;
3295         if (pextra_pins)                         2569         if (pextra_pins)
3296                 *pextra_pins = extra_pins;       2570                 *pextra_pins = extra_pins;
3297         return folio_mapcount(folio) == folio !! 2571         return total_mapcount(page) == page_count(page) - extra_pins - 1;
3298                                         calle << 
3299 }                                                2572 }
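
can_split_folio() (can_split_huge_page() in 4.16) is a racy bookkeeping check: every mapping accounts for one reference, the page/swap cache accounts for one per subpage, the caller holds its own pins, and any reference left over is an unexpected pin (e.g. GUP) that forbids the split. A small sketch of that balance with made-up input values; can_split() is not the kernel function:

#include <stdbool.h>
#include <stdio.h>

static bool can_split(int refcount, int mapcount, int nr_pages,
		      bool is_anon, bool in_swapcache, int caller_pins)
{
	/* page cache / swap cache holds one reference per subpage */
	int extra_pins = is_anon ? (in_swapcache ? nr_pages : 0) : nr_pages;

	return mapcount == refcount - extra_pins - caller_pins;
}

int main(void)
{
	/* PMD-mapped anon folio: one mapping reference plus the caller's pin */
	printf("%d\n", can_split(2, 1, 512, true, false, 1));   /* 1: split allowed */
	/* same folio with one unexpected (e.g. GUP) pin */
	printf("%d\n", can_split(3, 1, 512, true, false, 1));   /* 0: refused */
	return 0;
}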
3300                                                  2573 
3301 /*                                               2574 /*
3302  * This function splits a large folio into sm !! 2575  * This function splits huge page into normal pages. @page can point to any
3303  * @page can point to any page of the large f !! 2576  * subpage of huge page to split. Split doesn't change the position of @page.
3304  * does not change the position of @page.     << 
3305  *                                            << 
3306  * Prerequisites:                             << 
3307  *                                            << 
3308  * 1) The caller must hold a reference on the << 
3309  *    as the large folio.                     << 
3310  *                                            << 
3311  * 2) The large folio must be locked.         << 
3312  *                                            << 
3313  * 3) The folio must not be pinned. Any unexp << 
3314  *    GUP pins, will result in the folio not  << 
3315  *    will receive an -EAGAIN.                << 
3316  *                                               2577  *
3317  *    supported for non-file-backed folios, b !! 2578  * The caller must hold the only pin on the @page, otherwise the split fails with -EBUSY.
3318  *    supported for non-file-backed folios, b !! 2579  * The huge page must be locked.
3319  *    is used by partially mapped folios, is  << 
3320  *    folio only has subpages 0 and 1. File-b << 
3321  *    since they do not use _deferred_list.   << 
3322  *                                            << 
3323  * After splitting, the caller's folio refere << 
3324  * resulting in a raised refcount of @page af << 
3325  * be freed if they are not mapped.           << 
3326  *                                               2580  *
3327  * If @list is null, tail pages will be added    2581  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
3328  *                                               2582  *
3329  * Pages in @new_order will inherit the mappi !! 2583  * Both head page and tail pages will inherit mapping, flags, and so on from
3330  * huge page.                                 !! 2584  * the hugepage.
3331  *                                            << 
3332  * Returns 0 if the huge page was split succe << 
3333  *                                            << 
3334  * Returns -EAGAIN if the folio has unexpecte << 
3335  * the folio was concurrently removed from th << 
3336  *                                            << 
3337  * Returns -EBUSY when trying to split the hu << 
3338  * under writeback, if fs-specific folio meta << 
3339  * released, or if some unexpected race happe << 
3340  * truncation).                               << 
3341  *                                               2585  *
3342  * Callers should ensure that the order respe !! 2586  * The GUP pin and PG_locked are transferred to @page. The remaining subpages can be freed if
3343  * min-order if one is set for non-anonymous  !! 2587  * they are not mapped.
3344  *                                               2588  *
3345  * Returns -EINVAL when trying to split to an !! 2589  * Returns 0 if the hugepage is split successfully.
3346  * with the folio. Splitting to order 0 is co !! 2590  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
                                                   >> 2591  * us.
3347  */                                              2592  */
3348 int split_huge_page_to_list_to_order(struct p !! 2593 int split_huge_page_to_list(struct page *page, struct list_head *list)
3349                                      unsigned << 
3350 {                                                2594 {
3351         struct folio *folio = page_folio(page !! 2595         struct page *head = compound_head(page);
3352         struct deferred_split *ds_queue = get !! 2596         struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
3353         /* reset xarray order to new order af << 
3354         XA_STATE_ORDER(xas, &folio->mapping-> << 
3355         bool is_anon = folio_test_anon(folio) << 
3356         struct address_space *mapping = NULL; << 
3357         struct anon_vma *anon_vma = NULL;        2597         struct anon_vma *anon_vma = NULL;
3358         int order = folio_order(folio);       !! 2598         struct address_space *mapping = NULL;
3359         int extra_pins, ret;                  !! 2599         int count, mapcount, extra_pins, ret;
3360         pgoff_t end;                          !! 2600         bool mlocked;
3361         bool is_hzp;                          !! 2601         unsigned long flags;
3362                                               << 
3363         VM_BUG_ON_FOLIO(!folio_test_locked(fo << 
3364         VM_BUG_ON_FOLIO(!folio_test_large(fol << 
3365                                               << 
3366         if (new_order >= folio_order(folio))  << 
3367                 return -EINVAL;               << 
3368                                               << 
3369         if (is_anon) {                        << 
3370                 /* order-1 is not supported f << 
3371                 if (new_order == 1) {         << 
3372                         VM_WARN_ONCE(1, "Cann << 
3373                         return -EINVAL;       << 
3374                 }                             << 
3375         } else if (new_order) {               << 
3376                 /* Split shmem folio to non-z << 
3377                 if (shmem_mapping(folio->mapp << 
3378                         VM_WARN_ONCE(1,       << 
3379                                 "Cannot split << 
3380                         return -EINVAL;       << 
3381                 }                             << 
3382                 /*                            << 
3383                  * No split if the file syste << 
3384                  * Note that we might still h << 
3385                  * CONFIG_READ_ONLY_THP_FOR_F << 
3386                  * does not actually support  << 
3387                  */                           << 
3388                 if (IS_ENABLED(CONFIG_READ_ON << 
3389                     !mapping_large_folio_supp << 
3390                         VM_WARN_ONCE(1,       << 
3391                                 "Cannot split << 
3392                         return -EINVAL;       << 
3393                 }                             << 
3394         }                                     << 
3395                                               << 
3396         /* Only swapping a whole PMD-mapped f << 
3397         if (folio_test_swapcache(folio) && ne << 
3398                 return -EINVAL;               << 
3399                                                  2602 
3400         is_hzp = is_huge_zero_folio(folio);   !! 2603         VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
3401         if (is_hzp) {                         !! 2604         VM_BUG_ON_PAGE(!PageLocked(page), page);
3402                 pr_warn_ratelimited("Called s !! 2605         VM_BUG_ON_PAGE(!PageCompound(page), page);
3403                 return -EBUSY;                << 
3404         }                                     << 
3405                                                  2606 
3406         if (folio_test_writeback(folio))      !! 2607         if (PageWriteback(page))
3407                 return -EBUSY;                   2608                 return -EBUSY;
3408                                                  2609 
3409         if (is_anon) {                        !! 2610         if (PageAnon(head)) {
3410                 /*                               2611                 /*
3411                  * The caller does not necess !! 2612                  * The caller does not necessarily hold an mmap_sem that would
3412                  * prevent the anon_vma disap    2613                  * prevent the anon_vma disappearing so we first take a
3413                  * reference to it and then l    2614                  * reference to it and then lock the anon_vma for write. This
3414                  * is similar to folio_lock_a !! 2615                  * is similar to page_lock_anon_vma_read except the write lock
3415                  * is taken to serialise agai    2616                  * is taken to serialise against parallel split or collapse
3416                  * operations.                   2617                  * operations.
3417                  */                              2618                  */
3418                 anon_vma = folio_get_anon_vma !! 2619                 anon_vma = page_get_anon_vma(head);
3419                 if (!anon_vma) {                 2620                 if (!anon_vma) {
3420                         ret = -EBUSY;            2621                         ret = -EBUSY;
3421                         goto out;                2622                         goto out;
3422                 }                                2623                 }
3423                 end = -1;                     << 
3424                 mapping = NULL;                  2624                 mapping = NULL;
3425                 anon_vma_lock_write(anon_vma)    2625                 anon_vma_lock_write(anon_vma);
3426         } else {                                 2626         } else {
3427                 unsigned int min_order;       !! 2627                 mapping = head->mapping;
3428                 gfp_t gfp;                    << 
3429                                               << 
3430                 mapping = folio->mapping;     << 
3431                                                  2628 
3432                 /* Truncated ? */                2629                 /* Truncated ? */
3433                 if (!mapping) {                  2630                 if (!mapping) {
3434                         ret = -EBUSY;            2631                         ret = -EBUSY;
3435                         goto out;                2632                         goto out;
3436                 }                                2633                 }
3437                                                  2634 
3438                 min_order = mapping_min_folio << 
3439                 if (new_order < min_order) {  << 
3440                         VM_WARN_ONCE(1, "Cann << 
3441                                      min_orde << 
3442                         ret = -EINVAL;        << 
3443                         goto out;             << 
3444                 }                             << 
3445                                               << 
3446                 gfp = current_gfp_context(map << 
3447                                               << 
3448                                               << 
3449                 if (!filemap_release_folio(fo << 
3450                         ret = -EBUSY;         << 
3451                         goto out;             << 
3452                 }                             << 
3453                                               << 
3454                 xas_split_alloc(&xas, folio,  << 
3455                 if (xas_error(&xas)) {        << 
3456                         ret = xas_error(&xas) << 
3457                         goto out;             << 
3458                 }                             << 
3459                                               << 
3460                 anon_vma = NULL;                 2635                 anon_vma = NULL;
3461                 i_mmap_lock_read(mapping);       2636                 i_mmap_lock_read(mapping);
3462                                               << 
3463                 /*                            << 
3464                  *__split_huge_page() may nee << 
3465                  * but on 32-bit, i_size_read << 
3466                  * which cannot be nested ins << 
3467                  * end now: i_size itself may << 
3468                  * folio lock is good enough  << 
3469                  */                           << 
3470                 end = DIV_ROUND_UP(i_size_rea << 
3471                 if (shmem_mapping(mapping))   << 
3472                         end = shmem_fallocend << 
3473         }                                        2637         }
3474                                                  2638 
3475         /*                                       2639         /*
3476          * Racy check if we can split the pag !! 2640          * Racy check if we can split the page, before freeze_page() will
3477          * split PMDs                            2641          * split PMDs
3478          */                                      2642          */
3479         if (!can_split_folio(folio, 1, &extra !! 2643         if (!can_split_huge_page(head, &extra_pins)) {
3480                 ret = -EAGAIN;                !! 2644                 ret = -EBUSY;
3481                 goto out_unlock;                 2645                 goto out_unlock;
3482         }                                        2646         }
3483                                                  2647 
3484         unmap_folio(folio);                   !! 2648         mlocked = PageMlocked(page);
                                                   >> 2649         freeze_page(head);
                                                   >> 2650         VM_BUG_ON_PAGE(compound_mapcount(head), head);
                                                   >> 2651 
                                                   >> 2652         /* Make sure the page is not on per-CPU pagevec as it takes pin */
                                                   >> 2653         if (mlocked)
                                                   >> 2654                 lru_add_drain();
                                                   >> 2655 
                                                   >> 2656         /* prevent PageLRU to go away from under us, and freeze lru stats */
                                                   >> 2657         spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
3485                                                  2658 
3486         /* block interrupt reentry in xa_lock << 
3487         local_irq_disable();                  << 
3488         if (mapping) {                           2659         if (mapping) {
                                                   >> 2660                 void **pslot;
                                                   >> 2661 
                                                   >> 2662                 spin_lock(&mapping->tree_lock);
                                                   >> 2663                 pslot = radix_tree_lookup_slot(&mapping->page_tree,
                                                   >> 2664                                 page_index(head));
3489                 /*                               2665                 /*
3490                  * Check if the folio is pres !! 2666                  * Check if the head page is present in the radix tree.
3491                  * We assume all tail are pre !! 2667                  * We assume all tails are present too, if the head is there.
3492                  */                              2668                  */
3493                 xas_lock(&xas);               !! 2669                 if (radix_tree_deref_slot_protected(pslot,
3494                 xas_reset(&xas);              !! 2670                                         &mapping->tree_lock) != head)
3495                 if (xas_load(&xas) != folio)  << 
3496                         goto fail;               2671                         goto fail;
3497         }                                        2672         }
3498                                                  2673 
3499         /* Prevent deferred_split_scan() touc    2674         /* Prevent deferred_split_scan() touching ->_refcount */
3500         spin_lock(&ds_queue->split_queue_lock !! 2675         spin_lock(&pgdata->split_queue_lock);
3501         if (folio_ref_freeze(folio, 1 + extra !! 2676         count = page_count(head);
3502                 if (folio_order(folio) > 1 && !! 2677         mapcount = total_mapcount(head);
3503                     !list_empty(&folio->_defe !! 2678         if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
3504                         ds_queue->split_queue !! 2679                 if (!list_empty(page_deferred_list(head))) {
3505                         if (folio_test_partia !! 2680                         pgdata->split_queue_len--;
3506                                 __folio_clear !! 2681                         list_del(page_deferred_list(head));
3507                                 mod_mthp_stat << 
3508                                               << 
3509                         }                     << 
3510                         /*                    << 
3511                          * Reinitialize page_ << 
3512                          * page from the spli << 
3513                          * split will see lis << 
3514                          * page_deferred_list << 
3515                          */                   << 
3516                         list_del_init(&folio- << 
3517                 }                             << 
3518                 spin_unlock(&ds_queue->split_ << 
3519                 if (mapping) {                << 
3520                         int nr = folio_nr_pag << 
3521                                               << 
3522                         xas_split(&xas, folio << 
3523                         if (folio_test_pmd_ma << 
3524                             new_order < HPAGE << 
3525                                 if (folio_tes << 
3526                                         __lru << 
3527                                               << 
3528                                 } else {      << 
3529                                         __lru << 
3530                                               << 
3531                                         filem << 
3532                                 }             << 
3533                         }                     << 
3534                 }                                2682                 }
                                                   >> 2683                 if (mapping)
                                                   >> 2684                         __dec_node_page_state(page, NR_SHMEM_THPS);
                                                   >> 2685                 spin_unlock(&pgdata->split_queue_lock);
                                                   >> 2686                 __split_huge_page(page, list, flags);
                                                   >> 2687                 if (PageSwapCache(head)) {
                                                   >> 2688                         swp_entry_t entry = { .val = page_private(head) };
3535                                                  2689 
3536                 if (is_anon) {                !! 2690                         ret = split_swap_cluster(entry);
3537                         mod_mthp_stat(order,  !! 2691                 } else
3538                         mod_mthp_stat(new_ord !! 2692                         ret = 0;
3539                 }                             << 
3540                 __split_huge_page(page, list, << 
3541                 ret = 0;                      << 
3542         } else {                                 2693         } else {
3543                 spin_unlock(&ds_queue->split_ !! 2694                 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
3544 fail:                                         !! 2695                         pr_alert("total_mapcount: %u, page_count(): %u\n",
3545                 if (mapping)                  !! 2696                                         mapcount, count);
3546                         xas_unlock(&xas);     !! 2697                         if (PageTail(page))
3547                 local_irq_enable();           !! 2698                                 dump_page(head, NULL);
3548                 remap_page(folio, folio_nr_pa !! 2699                         dump_page(page, "total_mapcount(head) > 0");
3549                 ret = -EAGAIN;                !! 2700                         BUG();
                                                   >> 2701                 }
                                                   >> 2702                 spin_unlock(&pgdata->split_queue_lock);
                                                   >> 2703 fail:           if (mapping)
                                                   >> 2704                         spin_unlock(&mapping->tree_lock);
                                                   >> 2705                 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
                                                   >> 2706                 unfreeze_page(head);
                                                   >> 2707                 ret = -EBUSY;
3550         }                                        2708         }
3551                                                  2709 
3552 out_unlock:                                      2710 out_unlock:
3553         if (anon_vma) {                          2711         if (anon_vma) {
3554                 anon_vma_unlock_write(anon_vm    2712                 anon_vma_unlock_write(anon_vma);
3555                 put_anon_vma(anon_vma);          2713                 put_anon_vma(anon_vma);
3556         }                                        2714         }
3557         if (mapping)                             2715         if (mapping)
3558                 i_mmap_unlock_read(mapping);     2716                 i_mmap_unlock_read(mapping);
3559 out:                                             2717 out:
3560         xas_destroy(&xas);                    !! 2718         count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3561         if (order == HPAGE_PMD_ORDER)         << 
3562                 count_vm_event(!ret ? THP_SPL << 
3563         count_mthp_stat(order, !ret ? MTHP_ST << 
3564         return ret;                              2719         return ret;
3565 }                                                2720 }
3566                                                  2721 
3567 int min_order_for_split(struct folio *folio)  !! 2722 void free_transhuge_page(struct page *page)
3568 {                                             << 
3569         if (folio_test_anon(folio))           << 
3570                 return 0;                     << 
3571                                               << 
3572         if (!folio->mapping) {                << 
3573                 if (folio_test_pmd_mappable(f << 
3574                         count_vm_event(THP_SP << 
3575                 return -EBUSY;                << 
3576         }                                     << 
3577                                               << 
3578         return mapping_min_folio_order(folio- << 
3579 }                                             << 
3580                                               << 
3581 int split_folio_to_list(struct folio *folio,  << 
3582 {                                             << 
3583         int ret = min_order_for_split(folio); << 
3584                                               << 
3585         if (ret < 0)                          << 
3586                 return ret;                   << 
3587                                               << 
3588         return split_huge_page_to_list_to_ord << 
3589 }                                             << 
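
Note: later parts of this listing call split_huge_page(), split_folio() and split_folio_to_order() without showing them; they are header wrappers, not defined in this file. As a hedged sketch only (the real definitions live in include/linux/huge_mm.h and may differ in detail), they forward to the functions above with a NULL discard list and, for the plain variants, new_order 0:

/* Hedged sketch of the huge_mm.h convenience wrappers used below. */
static inline int split_huge_page(struct page *page)
{
	/* split all the way down to base pages, no list to collect tails */
	return split_huge_page_to_list_to_order(page, NULL, 0);
}

#define split_folio_to_order(f, new_order) \
	split_huge_page_to_list_to_order(&(f)->page, NULL, new_order)
#define split_folio(f)	split_folio_to_order(f, 0)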
3590                                               << 
3591 /*                                            << 
3592  * __folio_unqueue_deferred_split() is not to << 
3593  * the folio_unqueue_deferred_split() inline  << 
3594  * limits its calls to those folios which may << 
3595  * queueing THP splits, and that list is (rac << 
3596  *                                            << 
3597  * It is unsafe to call folio_unqueue_deferre << 
3598  * zero: because even when split_queue_lock i << 
3599  * might be in use on deferred_split_scan()'s << 
3600  *                                            << 
3601  * If memory cgroups are enabled, split_queue << 
3602  * therefore important to unqueue deferred sp << 
3603  */                                           << 
3604 bool __folio_unqueue_deferred_split(struct fo << 
3605 {                                                2723 {
3606         struct deferred_split *ds_queue;      !! 2724         struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
3607         unsigned long flags;                     2725         unsigned long flags;
3608         bool unqueued = false;                << 
3609                                               << 
3610         WARN_ON_ONCE(folio_ref_count(folio)); << 
3611         WARN_ON_ONCE(!mem_cgroup_disabled() & << 
3612                                                  2726 
3613         ds_queue = get_deferred_split_queue(f !! 2727         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
3614         spin_lock_irqsave(&ds_queue->split_qu !! 2728         if (!list_empty(page_deferred_list(page))) {
3615         if (!list_empty(&folio->_deferred_lis !! 2729                 pgdata->split_queue_len--;
3616                 ds_queue->split_queue_len--;  !! 2730                 list_del(page_deferred_list(page));
3617                 if (folio_test_partially_mapp << 
3618                         __folio_clear_partial << 
3619                         mod_mthp_stat(folio_o << 
3620                                       MTHP_ST << 
3621                 }                             << 
3622                 list_del_init(&folio->_deferr << 
3623                 unqueued = true;              << 
3624         }                                        2731         }
3625         spin_unlock_irqrestore(&ds_queue->spl !! 2732         spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3626                                               !! 2733         free_compound_page(page);
3627         return unqueued;        /* useful for << 
3628 }                                                2734 }
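
The gate that the comment block above refers to is an inline helper in mm/internal.h and is therefore not part of this listing. A hedged sketch of its shape (helper names recalled from that header, not taken from this diff):

static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
	/* only large rmappable folios of order > 1 can ever be queued */
	if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
		return false;

	/*
	 * Racy list_empty() check is fine: a false negative means the
	 * folio was never queued, a false positive merely takes the
	 * split_queue_lock in __folio_unqueue_deferred_split() and
	 * finds nothing to do.
	 */
	if (data_race(list_empty(&folio->_deferred_list)))
		return false;

	return __folio_unqueue_deferred_split(folio);
}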
3629                                                  2735 
3630 /* partially_mapped=false won't clear PG_part !! 2736 void deferred_split_huge_page(struct page *page)
3631 void deferred_split_folio(struct folio *folio << 
3632 {                                                2737 {
3633         struct deferred_split *ds_queue = get !! 2738         struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
3634 #ifdef CONFIG_MEMCG                           << 
3635         struct mem_cgroup *memcg = folio_memc << 
3636 #endif                                        << 
3637         unsigned long flags;                     2739         unsigned long flags;
3638                                                  2740 
3639         /*                                    !! 2741         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3640          * Order 1 folios have no space for a << 
3641          * won't waste much memory by not add << 
3642          */                                   << 
3643         if (folio_order(folio) <= 1)          << 
3644                 return;                       << 
3645                                               << 
3646         if (!partially_mapped && !split_under << 
3647                 return;                       << 
3648                                               << 
3649         /*                                    << 
3650          * Exclude swapcache: originally to a << 
3651          * queue. Nowadays that is fully prev << 
3652          * but if page reclaim is already han << 
3653          * unnecessary to handle it again in  << 
3654          * swapcache here may still be a usef << 
3655          */                                   << 
3656         if (folio_test_swapcache(folio))      << 
3657                 return;                       << 
3658                                               << 
3659         spin_lock_irqsave(&ds_queue->split_qu << 
3660         if (partially_mapped) {               << 
3661                 if (!folio_test_partially_map << 
3662                         __folio_set_partially << 
3663                         if (folio_test_pmd_ma << 
3664                                 count_vm_even << 
3665                         count_mthp_stat(folio << 
3666                         mod_mthp_stat(folio_o << 
3667                                                  2742 
3668                 }                             !! 2743         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
3669         } else {                              !! 2744         if (list_empty(page_deferred_list(page))) {
3670                 /* partially mapped folios ca !! 2745                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
3671                 VM_WARN_ON_FOLIO(folio_test_p !! 2746                 list_add_tail(page_deferred_list(page), &pgdata->split_queue);
3672         }                                     !! 2747                 pgdata->split_queue_len++;
3673         if (list_empty(&folio->_deferred_list << 
3674                 list_add_tail(&folio->_deferr << 
3675                 ds_queue->split_queue_len++;  << 
3676 #ifdef CONFIG_MEMCG                           << 
3677                 if (memcg)                    << 
3678                         set_shrinker_bit(memc << 
3679                                          defe << 
3680 #endif                                        << 
3681         }                                        2748         }
3682         spin_unlock_irqrestore(&ds_queue->spl !! 2749         spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3683 }                                                2750 }
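
Both the unqueue and queue paths above select their queue through get_deferred_split_queue(), which is defined earlier in this file (outside the excerpt) and picks the memcg queue when the folio is charged, otherwise the per-node one. A hedged sketch of the CONFIG_MEMCG variant:

static struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	/* charged folios use the per-memcg queue, others the per-node one */
	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}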
3684                                                  2751 
3685 static unsigned long deferred_split_count(str    2752 static unsigned long deferred_split_count(struct shrinker *shrink,
3686                 struct shrink_control *sc)       2753                 struct shrink_control *sc)
3687 {                                                2754 {
3688         struct pglist_data *pgdata = NODE_DAT    2755         struct pglist_data *pgdata = NODE_DATA(sc->nid);
3689         struct deferred_split *ds_queue = &pg !! 2756         return READ_ONCE(pgdata->split_queue_len);
3690                                               << 
3691 #ifdef CONFIG_MEMCG                           << 
3692         if (sc->memcg)                        << 
3693                 ds_queue = &sc->memcg->deferr << 
3694 #endif                                        << 
3695         return READ_ONCE(ds_queue->split_queu << 
3696 }                                             << 
3697                                               << 
3698 static bool thp_underused(struct folio *folio << 
3699 {                                             << 
3700         int num_zero_pages = 0, num_filled_pa << 
3701         void *kaddr;                          << 
3702         int i;                                << 
3703                                               << 
3704         if (khugepaged_max_ptes_none == HPAGE << 
3705                 return false;                 << 
3706                                               << 
3707         for (i = 0; i < folio_nr_pages(folio) << 
3708                 kaddr = kmap_local_folio(foli << 
3709                 if (!memchr_inv(kaddr, 0, PAG << 
3710                         num_zero_pages++;     << 
3711                         if (num_zero_pages >  << 
3712                                 kunmap_local( << 
3713                                 return true;  << 
3714                         }                     << 
3715                 } else {                      << 
3716                         /*                    << 
3717                          * Another path for e << 
3718                          * of non-zero filled << 
3719                          */                   << 
3720                         num_filled_pages++;   << 
3721                         if (num_filled_pages  << 
3722                                 kunmap_local( << 
3723                                 return false; << 
3724                         }                     << 
3725                 }                             << 
3726                 kunmap_local(kaddr);          << 
3727         }                                     << 
3728         return false;                         << 
3729 }                                                2757 }
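
thp_underused() above is the heuristic behind the underused-THP shrinker path: count zero-filled 4 KiB subpages and report the folio as underused once more than khugepaged_max_ptes_none of them are empty, bailing out early when enough populated subpages have been seen (and never reporting underuse when the tunable is at its permissive maximum). The same counting, stripped of kernel plumbing, is shown in the standalone sketch below; every name in it is made up for the illustration.

/*
 * Standalone userspace illustration of the counting done by
 * thp_underused() above (hypothetical names; the kernel scans the
 * folio with kmap_local_folio()/memchr_inv() and compares against
 * khugepaged_max_ptes_none).  Build with: cc -O2 -o underused underused.c
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE	4096UL
#define NR_PAGES	512UL	/* subpages of a 2M THP on x86-64 */

static bool region_is_underused(const unsigned char *buf, size_t max_none)
{
	size_t zero = 0, filled = 0;

	for (size_t i = 0; i < NR_PAGES; i++) {
		const unsigned char *page = buf + i * PAGE_SIZE;
		bool page_is_zero = true;

		for (size_t j = 0; j < PAGE_SIZE; j++) {
			if (page[j]) {
				page_is_zero = false;
				break;
			}
		}
		if (page_is_zero) {
			/* too many zero-filled subpages: worth splitting */
			if (++zero > max_none)
				return true;
		} else {
			/* enough populated subpages: keep it huge */
			if (++filled >= NR_PAGES - max_none)
				return false;
		}
	}
	return false;
}

int main(void)
{
	unsigned char *buf = calloc(NR_PAGES, PAGE_SIZE);

	if (!buf)
		return 1;
	memset(buf, 0xaa, 3 * PAGE_SIZE);	/* only three subpages touched */
	/* 64 is an arbitrary demo threshold, not the kernel default */
	printf("underused: %s\n", region_is_underused(buf, 64) ? "yes" : "no");
	free(buf);
	return 0;
}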
3730                                                  2758 
3731 static unsigned long deferred_split_scan(stru    2759 static unsigned long deferred_split_scan(struct shrinker *shrink,
3732                 struct shrink_control *sc)       2760                 struct shrink_control *sc)
3733 {                                                2761 {
3734         struct pglist_data *pgdata = NODE_DAT    2762         struct pglist_data *pgdata = NODE_DATA(sc->nid);
3735         struct deferred_split *ds_queue = &pg << 
3736         unsigned long flags;                     2763         unsigned long flags;
3737         LIST_HEAD(list);                      !! 2764         LIST_HEAD(list), *pos, *next;
3738         struct folio *folio, *next, *prev = N !! 2765         struct page *page;
3739         int split = 0, removed = 0;           !! 2766         int split = 0;
3740                                               << 
3741 #ifdef CONFIG_MEMCG                           << 
3742         if (sc->memcg)                        << 
3743                 ds_queue = &sc->memcg->deferr << 
3744 #endif                                        << 
3745                                                  2767 
3746         spin_lock_irqsave(&ds_queue->split_qu !! 2768         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
3747         /* Take pin on all head pages to avoi    2769         /* Take pin on all head pages to avoid freeing them under us */
3748         list_for_each_entry_safe(folio, next, !! 2770         list_for_each_safe(pos, next, &pgdata->split_queue) {
3749                                               !! 2771                 page = list_entry((void *)pos, struct page, mapping);
3750                 if (folio_try_get(folio)) {   !! 2772                 page = compound_head(page);
3751                         list_move(&folio->_de !! 2773                 if (get_page_unless_zero(page)) {
                                                   >> 2774                         list_move(page_deferred_list(page), &list);
3752                 } else {                         2775                 } else {
3753                         /* We lost race with  !! 2776                         /* We lost race with put_compound_page() */
3754                         if (folio_test_partia !! 2777                         list_del_init(page_deferred_list(page));
3755                                 __folio_clear !! 2778                         pgdata->split_queue_len--;
3756                                 mod_mthp_stat << 
3757                                               << 
3758                         }                     << 
3759                         list_del_init(&folio- << 
3760                         ds_queue->split_queue << 
3761                 }                                2779                 }
3762                 if (!--sc->nr_to_scan)           2780                 if (!--sc->nr_to_scan)
3763                         break;                   2781                         break;
3764         }                                        2782         }
3765         spin_unlock_irqrestore(&ds_queue->spl !! 2783         spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3766                                                  2784 
3767         list_for_each_entry_safe(folio, next, !! 2785         list_for_each_safe(pos, next, &list) {
3768                 bool did_split = false;       !! 2786                 page = list_entry((void *)pos, struct page, mapping);
3769                 bool underused = false;       !! 2787                 if (!trylock_page(page))
3770                                               << 
3771                 if (!folio_test_partially_map << 
3772                         underused = thp_under << 
3773                         if (!underused)       << 
3774                                 goto next;    << 
3775                 }                             << 
3776                 if (!folio_trylock(folio))    << 
3777                         goto next;               2788                         goto next;
3778                 if (!split_folio(folio)) {    !! 2789                 /* split_huge_page() removes page from list on success */
3779                         did_split = true;     !! 2790                 if (!split_huge_page(page))
3780                         if (underused)        << 
3781                                 count_vm_even << 
3782                         split++;                 2791                         split++;
3783                 }                             !! 2792                 unlock_page(page);
3784                 folio_unlock(folio);          << 
3785 next:                                            2793 next:
3786                 /*                            !! 2794                 put_page(page);
3787                  * split_folio() removes foli << 
3788                  * Only add back to the queue << 
3789                  * If thp_underused returns f << 
3790                  * in the case it was underus << 
3791                  * don't add it back to split << 
3792                  */                           << 
3793                 if (!did_split && !folio_test << 
3794                         list_del_init(&folio- << 
3795                         removed++;            << 
3796                 } else {                      << 
3797                         /*                    << 
3798                          * That unlocked list << 
3799                          * unless its folio i << 
3800                          * left on the list ( << 
3801                          * by one safe folio  << 
3802                          */                   << 
3803                         swap(folio, prev);    << 
3804                 }                             << 
3805                 if (folio)                    << 
3806                         folio_put(folio);     << 
3807         }                                        2795         }
3808                                                  2796 
3809         spin_lock_irqsave(&ds_queue->split_qu !! 2797         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
3810         list_splice_tail(&list, &ds_queue->sp !! 2798         list_splice_tail(&list, &pgdata->split_queue);
3811         ds_queue->split_queue_len -= removed; !! 2799         spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3812         spin_unlock_irqrestore(&ds_queue->spl << 
3813                                               << 
3814         if (prev)                             << 
3815                 folio_put(prev);              << 
3816                                                  2800 
3817         /*                                       2801         /*
3818          * Stop shrinker if we didn't split a    2802          * Stop shrinker if we didn't split any page, but the queue is empty.
3819          * This can happen if pages were free    2803          * This can happen if pages were freed under us.
3820          */                                      2804          */
3821         if (!split && list_empty(&ds_queue->s !! 2805         if (!split && list_empty(&pgdata->split_queue))
3822                 return SHRINK_STOP;              2806                 return SHRINK_STOP;
3823         return split;                            2807         return split;
3824 }                                                2808 }
3825                                                  2809 
                                                   >> 2810 static struct shrinker deferred_split_shrinker = {
                                                   >> 2811         .count_objects = deferred_split_count,
                                                   >> 2812         .scan_objects = deferred_split_scan,
                                                   >> 2813         .seeks = DEFAULT_SEEKS,
                                                   >> 2814         .flags = SHRINKER_NUMA_AWARE,
                                                   >> 2815 };
                                                   >> 2816 
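
The static deferred_split_shrinker on the right-hand side exists only in the old tree. In the 6.12-era code the shrinker is allocated at init time so it can also be registered memcg aware, matching the per-memcg split queues above. A rough, hedged sketch using the current shrinker API (the init function name and exact flag set are assumptions, not taken from this listing):

static struct shrinker *deferred_split_shrinker;

static int __init thp_shrinker_setup(void)	/* hypothetical name */
{
	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE,
						 "thp-deferred_split");
	if (!deferred_split_shrinker)
		return -ENOMEM;

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);
	return 0;
}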
3826 #ifdef CONFIG_DEBUG_FS                           2817 #ifdef CONFIG_DEBUG_FS
3827 static void split_huge_pages_all(void)        !! 2818 static int split_huge_pages_set(void *data, u64 val)
3828 {                                                2819 {
3829         struct zone *zone;                       2820         struct zone *zone;
3830         struct page *page;                       2821         struct page *page;
3831         struct folio *folio;                  << 
3832         unsigned long pfn, max_zone_pfn;         2822         unsigned long pfn, max_zone_pfn;
3833         unsigned long total = 0, split = 0;      2823         unsigned long total = 0, split = 0;
3834                                                  2824 
3835         pr_debug("Split all THPs\n");         !! 2825         if (val != 1)
3836         for_each_zone(zone) {                 !! 2826                 return -EINVAL;
3837                 if (!managed_zone(zone))      !! 2827 
3838                         continue;             !! 2828         for_each_populated_zone(zone) {
3839                 max_zone_pfn = zone_end_pfn(z    2829                 max_zone_pfn = zone_end_pfn(zone);
3840                 for (pfn = zone->zone_start_p    2830                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
3841                         int nr_pages;         !! 2831                         if (!pfn_valid(pfn))
3842                                               << 
3843                         page = pfn_to_online_ << 
3844                         if (!page || PageTail << 
3845                                 continue;     << 
3846                         folio = page_folio(pa << 
3847                         if (!folio_try_get(fo << 
3848                                 continue;        2832                                 continue;
3849                                                  2833 
3850                         if (unlikely(page_fol !! 2834                         page = pfn_to_page(pfn);
3851                                 goto next;    !! 2835                         if (!get_page_unless_zero(page))
                                                   >> 2836                                 continue;
3852                                                  2837 
3853                         if (zone != folio_zon !! 2838                         if (zone != page_zone(page))
3854                                 goto next;       2839                                 goto next;
3855                                                  2840 
3856                         if (!folio_test_large !! 2841                         if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
3857                                 || folio_test << 
3858                                 || !folio_tes << 
3859                                 goto next;       2842                                 goto next;
3860                                                  2843 
3861                         total++;                 2844                         total++;
3862                         folio_lock(folio);    !! 2845                         lock_page(page);
3863                         nr_pages = folio_nr_p !! 2846                         if (!split_huge_page(page))
3864                         if (!split_folio(foli << 
3865                                 split++;         2847                                 split++;
3866                         pfn += nr_pages - 1;  !! 2848                         unlock_page(page);
3867                         folio_unlock(folio);  << 
3868 next:                                            2849 next:
3869                         folio_put(folio);     !! 2850                         put_page(page);
3870                         cond_resched();       << 
3871                 }                                2851                 }
3872         }                                        2852         }
3873                                                  2853 
3874         pr_debug("%lu of %lu THP split\n", sp !! 2854         pr_info("%lu of %lu THP split\n", split, total);
3875 }                                             << 
3876                                               << 
3877 static inline bool vma_not_suitable_for_thp_s << 
3878 {                                             << 
3879         return vma_is_special_huge(vma) || (v << 
3880                     is_vm_hugetlb_page(vma);  << 
3881 }                                             << 
3882                                                  2855 
3883 static int split_huge_pages_pid(int pid, unsi !! 2856         return 0;
3884                                 unsigned long << 
3885 {                                             << 
3886         int ret = 0;                          << 
3887         struct task_struct *task;             << 
3888         struct mm_struct *mm;                 << 
3889         unsigned long total = 0, split = 0;   << 
3890         unsigned long addr;                   << 
3891                                               << 
3892         vaddr_start &= PAGE_MASK;             << 
3893         vaddr_end &= PAGE_MASK;               << 
3894                                               << 
3895         task = find_get_task_by_vpid(pid);    << 
3896         if (!task) {                          << 
3897                 ret = -ESRCH;                 << 
3898                 goto out;                     << 
3899         }                                     << 
3900                                               << 
3901         /* Find the mm_struct */              << 
3902         mm = get_task_mm(task);               << 
3903         put_task_struct(task);                << 
3904                                               << 
3905         if (!mm) {                            << 
3906                 ret = -EINVAL;                << 
3907                 goto out;                     << 
3908         }                                     << 
3909                                               << 
3910         pr_debug("Split huge pages in pid: %d << 
3911                  pid, vaddr_start, vaddr_end) << 
3912                                               << 
3913         mmap_read_lock(mm);                   << 
3914         /*                                    << 
3915          * always increase addr by PAGE_SIZE, << 
3916          * table filled with PTE-mapped THPs, << 
3917          */                                   << 
3918         for (addr = vaddr_start; addr < vaddr << 
3919                 struct vm_area_struct *vma =  << 
3920                 struct folio_walk fw;         << 
3921                 struct folio *folio;          << 
3922                 struct address_space *mapping << 
3923                 unsigned int target_order = n << 
3924                                               << 
3925                 if (!vma)                     << 
3926                         break;                << 
3927                                               << 
3928                 /* skip special VMA and huget << 
3929                 if (vma_not_suitable_for_thp_ << 
3930                         addr = vma->vm_end;   << 
3931                         continue;             << 
3932                 }                             << 
3933                                               << 
3934                 folio = folio_walk_start(&fw, << 
3935                 if (!folio)                   << 
3936                         continue;             << 
3937                                               << 
3938                 if (!is_transparent_hugepage( << 
3939                         goto next;            << 
3940                                               << 
3941                 if (!folio_test_anon(folio))  << 
3942                         mapping = folio->mapp << 
3943                         target_order = max(ne << 
3944                                            ma << 
3945                 }                             << 
3946                                               << 
3947                 if (target_order >= folio_ord << 
3948                         goto next;            << 
3949                                               << 
3950                 total++;                      << 
3951                 /*                            << 
3952                  * For folios with private, s << 
3953                  * will try to drop it before << 
3954                  * can be split or not. So sk << 
3955                  */                           << 
3956                 if (!folio_test_private(folio << 
3957                     !can_split_folio(folio, 0 << 
3958                         goto next;            << 
3959                                               << 
3960                 if (!folio_trylock(folio))    << 
3961                         goto next;            << 
3962                 folio_get(folio);             << 
3963                 folio_walk_end(&fw, vma);     << 
3964                                               << 
3965                 if (!folio_test_anon(folio) & << 
3966                         goto unlock;          << 
3967                                               << 
3968                 if (!split_folio_to_order(fol << 
3969                         split++;              << 
3970                                               << 
3971 unlock:                                       << 
3972                                               << 
3973                 folio_unlock(folio);          << 
3974                 folio_put(folio);             << 
3975                                               << 
3976                 cond_resched();               << 
3977                 continue;                     << 
3978 next:                                         << 
3979                 folio_walk_end(&fw, vma);     << 
3980                 cond_resched();               << 
3981         }                                     << 
3982         mmap_read_unlock(mm);                 << 
3983         mmput(mm);                            << 
3984                                               << 
3985         pr_debug("%lu of %lu THP split\n", sp << 
3986                                               << 
3987 out:                                          << 
3988         return ret;                           << 
3989 }                                             << 
3990                                               << 
3991 static int split_huge_pages_in_file(const cha << 
3992                                 pgoff_t off_e << 
3993 {                                             << 
3994         struct filename *file;                << 
3995         struct file *candidate;               << 
3996         struct address_space *mapping;        << 
3997         int ret = -EINVAL;                    << 
3998         pgoff_t index;                        << 
3999         int nr_pages = 1;                     << 
4000         unsigned long total = 0, split = 0;   << 
4001         unsigned int min_order;               << 
4002         unsigned int target_order;            << 
4003                                               << 
4004         file = getname_kernel(file_path);     << 
4005         if (IS_ERR(file))                     << 
4006                 return ret;                   << 
4007                                               << 
4008         candidate = file_open_name(file, O_RD << 
4009         if (IS_ERR(candidate))                << 
4010                 goto out;                     << 
4011                                               << 
4012         pr_debug("split file-backed THPs in f << 
4013                  file_path, off_start, off_en << 
4014                                               << 
4015         mapping = candidate->f_mapping;       << 
4016         min_order = mapping_min_folio_order(m << 
4017         target_order = max(new_order, min_ord << 
4018                                               << 
4019         for (index = off_start; index < off_e << 
4020                 struct folio *folio = filemap << 
4021                                               << 
4022                 nr_pages = 1;                 << 
4023                 if (IS_ERR(folio))            << 
4024                         continue;             << 
4025                                               << 
4026                 if (!folio_test_large(folio)) << 
4027                         goto next;            << 
4028                                               << 
4029                 total++;                      << 
4030                 nr_pages = folio_nr_pages(fol << 
4031                                               << 
4032                 if (target_order >= folio_ord << 
4033                         goto next;            << 
4034                                               << 
4035                 if (!folio_trylock(folio))    << 
4036                         goto next;            << 
4037                                               << 
4038                 if (folio->mapping != mapping << 
4039                         goto unlock;          << 
4040                                               << 
4041                 if (!split_folio_to_order(fol << 
4042                         split++;              << 
4043                                               << 
4044 unlock:                                       << 
4045                 folio_unlock(folio);          << 
4046 next:                                         << 
4047                 folio_put(folio);             << 
4048                 cond_resched();               << 
4049         }                                     << 
4050                                               << 
4051         filp_close(candidate, NULL);          << 
4052         ret = 0;                              << 
4053                                               << 
4054         pr_debug("%lu of %lu file-backed THP  << 
4055 out:                                          << 
4056         putname(file);                        << 
4057         return ret;                           << 
4058 }                                                2857 }
                                                   >> 2858 DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
                                                   >> 2859                 "%llu\n");
4059                                                  2860 
4060 #define MAX_INPUT_BUF_SZ 255                  !! 2861 static int __init split_huge_pages_debugfs(void)
4061                                               << 
4062 static ssize_t split_huge_pages_write(struct  << 
4063                                 size_t count, << 
4064 {                                                2862 {
4065         static DEFINE_MUTEX(split_debug_mutex !! 2863         void *ret;
4066         ssize_t ret;                          << 
4067         /*                                    << 
4068          * hold pid, start_vaddr, end_vaddr,  << 
4069          * file_path, off_start, off_end, new << 
4070          */                                   << 
4071         char input_buf[MAX_INPUT_BUF_SZ];     << 
4072         int pid;                              << 
4073         unsigned long vaddr_start, vaddr_end; << 
4074         unsigned int new_order = 0;           << 
4075                                               << 
4076         ret = mutex_lock_interruptible(&split << 
4077         if (ret)                              << 
4078                 return ret;                   << 
4079                                               << 
4080         ret = -EFAULT;                        << 
4081                                               << 
4082         memset(input_buf, 0, MAX_INPUT_BUF_SZ << 
4083         if (copy_from_user(input_buf, buf, mi << 
4084                 goto out;                     << 
4085                                               << 
4086         input_buf[MAX_INPUT_BUF_SZ - 1] = '\0 << 
4087                                                  2864 
4088         if (input_buf[0] == '/') {            !! 2865         ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
4089                 char *tok;                    !! 2866                         &split_huge_pages_fops);
4090                 char *buf = input_buf;        << 
4091                 char file_path[MAX_INPUT_BUF_ << 
4092                 pgoff_t off_start = 0, off_en << 
4093                 size_t input_len = strlen(inp << 
4094                                               << 
4095                 tok = strsep(&buf, ",");      << 
4096                 if (tok) {                    << 
4097                         strcpy(file_path, tok << 
4098                 } else {                      << 
4099                         ret = -EINVAL;        << 
4100                         goto out;             << 
4101                 }                             << 
4102                                               << 
4103                 ret = sscanf(buf, "0x%lx,0x%l << 
4104                 if (ret != 2 && ret != 3) {   << 
4105                         ret = -EINVAL;        << 
4106                         goto out;             << 
4107                 }                             << 
4108                 ret = split_huge_pages_in_fil << 
4109                 if (!ret)                     << 
4110                         ret = input_len;      << 
4111                                               << 
4112                 goto out;                     << 
4113         }                                     << 
4114                                               << 
4115         ret = sscanf(input_buf, "%d,0x%lx,0x% << 
4116         if (ret == 1 && pid == 1) {           << 
4117                 split_huge_pages_all();       << 
4118                 ret = strlen(input_buf);      << 
4119                 goto out;                     << 
4120         } else if (ret != 3 && ret != 4) {    << 
4121                 ret = -EINVAL;                << 
4122                 goto out;                     << 
4123         }                                     << 
4124                                               << 
4125         ret = split_huge_pages_pid(pid, vaddr << 
4126         if (!ret)                                2867         if (!ret)
4127                 ret = strlen(input_buf);      !! 2868                 pr_warn("Failed to create split_huge_pages in debugfs");
4128 out:                                          << 
4129         mutex_unlock(&split_debug_mutex);     << 
4130         return ret;                           << 
4131                                               << 
4132 }                                             << 
4133                                               << 
4134 static const struct file_operations split_hug << 
4135         .owner   = THIS_MODULE,               << 
4136         .write   = split_huge_pages_write,    << 
4137 };                                            << 
4138                                               << 
4139 static int __init split_huge_pages_debugfs(vo << 
4140 {                                             << 
4141         debugfs_create_file("split_huge_pages << 
4142                             &split_huge_pages << 
4143         return 0;                                2869         return 0;
4144 }                                                2870 }
4145 late_initcall(split_huge_pages_debugfs);         2871 late_initcall(split_huge_pages_debugfs);
4146 #endif                                           2872 #endif
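
For reference, the write handler above accepts three comma-separated forms: a bare "1" splits every THP in the system, "<pid>,0x<vaddr_start>,0x<vaddr_end>[,<new_order>]" splits THPs mapped by a process in the given range, and "/<path>,0x<off_start>,0x<off_end>[,<new_order>]" splits file-backed THPs. A minimal userspace driver for the pid form (pid and addresses are placeholders; needs root, CONFIG_DEBUG_FS and debugfs mounted at the usual /sys/kernel/debug):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);
	char buf[128];

	if (fd < 0) {
		perror("open split_huge_pages");
		return 1;
	}
	/* placeholder pid and VA range: split whatever THPs map into it */
	snprintf(buf, sizeof(buf), "%d,0x%lx,0x%lx",
		 1234, 0x7f0000000000UL, 0x7f0000200000UL);
	if (write(fd, buf, strlen(buf)) < 0)
		perror("write split_huge_pages");
	close(fd);
	return 0;
}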
4147                                                  2873 
4148 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          2874 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
4149 int set_pmd_migration_entry(struct page_vma_m !! 2875 void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
4150                 struct page *page)               2876                 struct page *page)
4151 {                                                2877 {
4152         struct folio *folio = page_folio(page << 
4153         struct vm_area_struct *vma = pvmw->vm    2878         struct vm_area_struct *vma = pvmw->vma;
4154         struct mm_struct *mm = vma->vm_mm;       2879         struct mm_struct *mm = vma->vm_mm;
4155         unsigned long address = pvmw->address    2880         unsigned long address = pvmw->address;
4156         bool anon_exclusive;                  << 
4157         pmd_t pmdval;                            2881         pmd_t pmdval;
4158         swp_entry_t entry;                       2882         swp_entry_t entry;
4159         pmd_t pmdswp;                            2883         pmd_t pmdswp;
4160                                                  2884 
4161         if (!(pvmw->pmd && !pvmw->pte))          2885         if (!(pvmw->pmd && !pvmw->pte))
4162                 return 0;                     !! 2886                 return;
4163                                               << 
4164         flush_cache_range(vma, address, addre << 
4165         pmdval = pmdp_invalidate(vma, address << 
4166                                                  2887 
4167         /* See folio_try_share_anon_rmap_pmd( !! 2888         mmu_notifier_invalidate_range_start(mm, address,
4168         anon_exclusive = folio_test_anon(foli !! 2889                         address + HPAGE_PMD_SIZE);
4169         if (anon_exclusive && folio_try_share << 
4170                 set_pmd_at(mm, address, pvmw- << 
4171                 return -EBUSY;                << 
4172         }                                     << 
4173                                                  2890 
                                                   >> 2891         flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
                                                   >> 2892         pmdval = *pvmw->pmd;
                                                   >> 2893         pmdp_invalidate(vma, address, pvmw->pmd);
4174         if (pmd_dirty(pmdval))                   2894         if (pmd_dirty(pmdval))
4175                 folio_mark_dirty(folio);      !! 2895                 set_page_dirty(page);
4176         if (pmd_write(pmdval))                !! 2896         entry = make_migration_entry(page, pmd_write(pmdval));
4177                 entry = make_writable_migrati << 
4178         else if (anon_exclusive)              << 
4179                 entry = make_readable_exclusi << 
4180         else                                  << 
4181                 entry = make_readable_migrati << 
4182         if (pmd_young(pmdval))                << 
4183                 entry = make_migration_entry_ << 
4184         if (pmd_dirty(pmdval))                << 
4185                 entry = make_migration_entry_ << 
4186         pmdswp = swp_entry_to_pmd(entry);        2897         pmdswp = swp_entry_to_pmd(entry);
4187         if (pmd_soft_dirty(pmdval))              2898         if (pmd_soft_dirty(pmdval))
4188                 pmdswp = pmd_swp_mksoft_dirty    2899                 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
4189         if (pmd_uffd_wp(pmdval))              << 
4190                 pmdswp = pmd_swp_mkuffd_wp(pm << 
4191         set_pmd_at(mm, address, pvmw->pmd, pm    2900         set_pmd_at(mm, address, pvmw->pmd, pmdswp);
4192         folio_remove_rmap_pmd(folio, page, vm !! 2901         page_remove_rmap(page, true);
4193         folio_put(folio);                     !! 2902         put_page(page);
4194         trace_set_migration_pmd(address, pmd_ << 
4195                                                  2903 
4196         return 0;                             !! 2904         mmu_notifier_invalidate_range_end(mm, address,
                                                   >> 2905                         address + HPAGE_PMD_SIZE);
4197 }                                                2906 }
4198                                                  2907 
4199 void remove_migration_pmd(struct page_vma_map    2908 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
4200 {                                                2909 {
4201         struct folio *folio = page_folio(new) << 
4202         struct vm_area_struct *vma = pvmw->vm    2910         struct vm_area_struct *vma = pvmw->vma;
4203         struct mm_struct *mm = vma->vm_mm;       2911         struct mm_struct *mm = vma->vm_mm;
4204         unsigned long address = pvmw->address    2912         unsigned long address = pvmw->address;
4205         unsigned long haddr = address & HPAGE !! 2913         unsigned long mmun_start = address & HPAGE_PMD_MASK;
4206         pmd_t pmde;                              2914         pmd_t pmde;
4207         swp_entry_t entry;                       2915         swp_entry_t entry;
4208                                                  2916 
4209         if (!(pvmw->pmd && !pvmw->pte))          2917         if (!(pvmw->pmd && !pvmw->pte))
4210                 return;                          2918                 return;
4211                                                  2919 
4212         entry = pmd_to_swp_entry(*pvmw->pmd);    2920         entry = pmd_to_swp_entry(*pvmw->pmd);
4213         folio_get(folio);                     !! 2921         get_page(new);
4214         pmde = mk_huge_pmd(new, READ_ONCE(vma !! 2922         pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
4215         if (pmd_swp_soft_dirty(*pvmw->pmd))      2923         if (pmd_swp_soft_dirty(*pvmw->pmd))
4216                 pmde = pmd_mksoft_dirty(pmde)    2924                 pmde = pmd_mksoft_dirty(pmde);
4217         if (is_writable_migration_entry(entry !! 2925         if (is_write_migration_entry(entry))
4218                 pmde = pmd_mkwrite(pmde, vma) !! 2926                 pmde = maybe_pmd_mkwrite(pmde, vma);
4219         if (pmd_swp_uffd_wp(*pvmw->pmd))      << 
4220                 pmde = pmd_mkuffd_wp(pmde);   << 
4221         if (!is_migration_entry_young(entry)) << 
4222                 pmde = pmd_mkold(pmde);       << 
4223         /* NOTE: this may contain setting sof << 
4224         if (folio_test_dirty(folio) && is_mig << 
4225                 pmde = pmd_mkdirty(pmde);     << 
4226                                               << 
4227         if (folio_test_anon(folio)) {         << 
4228                 rmap_t rmap_flags = RMAP_NONE << 
4229                                               << 
4230                 if (!is_readable_migration_en << 
4231                         rmap_flags |= RMAP_EX << 
4232                                                  2927 
4233                 folio_add_anon_rmap_pmd(folio !! 2928         flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
4234         } else {                              !! 2929         page_add_anon_rmap(new, vma, mmun_start, true);
4235                 folio_add_file_rmap_pmd(folio !! 2930         set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
4236         }                                     !! 2931         if (vma->vm_flags & VM_LOCKED)
4237         VM_BUG_ON(pmd_write(pmde) && folio_te !! 2932                 mlock_vma_page(new);
4238         set_pmd_at(mm, haddr, pvmw->pmd, pmde << 
4239                                               << 
4240         /* No need to invalidate - it was non << 
4241         update_mmu_cache_pmd(vma, address, pv    2933         update_mmu_cache_pmd(vma, address, pvmw->pmd);
4242         trace_remove_migration_pmd(address, p << 
4243 }                                                2934 }
4244 #endif                                           2935 #endif
4245                                                  2936 
