
TOMOYO Linux Cross Reference
Linux/mm/huge_memory.c


Diff markup

Differences between /mm/huge_memory.c (Version linux-6.12-rc7, left column) and /mm/huge_memory.c (Version linux-4.20.17, right column). Lines marked "<<" exist only in linux-6.12-rc7, lines marked ">>" exist only in linux-4.20.17, and "!!" marks lines that differ between the two versions.


  1 // SPDX-License-Identifier: GPL-2.0-only       << 
  2 /*                                                  1 /*
  3  *  Copyright (C) 2009  Red Hat, Inc.               2  *  Copyright (C) 2009  Red Hat, Inc.
                                                   >>   3  *
                                                   >>   4  *  This work is licensed under the terms of the GNU GPL, version 2. See
                                                   >>   5  *  the COPYING file in the top-level directory.
  4  */                                                 6  */
  5                                                     7 
  6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt         8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  7                                                     9 
  8 #include <linux/mm.h>                              10 #include <linux/mm.h>
  9 #include <linux/sched.h>                           11 #include <linux/sched.h>
 10 #include <linux/sched/mm.h>                    << 
 11 #include <linux/sched/coredump.h>                  12 #include <linux/sched/coredump.h>
 12 #include <linux/sched/numa_balancing.h>            13 #include <linux/sched/numa_balancing.h>
 13 #include <linux/highmem.h>                         14 #include <linux/highmem.h>
 14 #include <linux/hugetlb.h>                         15 #include <linux/hugetlb.h>
 15 #include <linux/mmu_notifier.h>                    16 #include <linux/mmu_notifier.h>
 16 #include <linux/rmap.h>                            17 #include <linux/rmap.h>
 17 #include <linux/swap.h>                            18 #include <linux/swap.h>
 18 #include <linux/shrinker.h>                        19 #include <linux/shrinker.h>
 19 #include <linux/mm_inline.h>                       20 #include <linux/mm_inline.h>
 20 #include <linux/swapops.h>                         21 #include <linux/swapops.h>
 21 #include <linux/backing-dev.h>                 << 
 22 #include <linux/dax.h>                             22 #include <linux/dax.h>
 23 #include <linux/mm_types.h>                    << 
 24 #include <linux/khugepaged.h>                      23 #include <linux/khugepaged.h>
 25 #include <linux/freezer.h>                         24 #include <linux/freezer.h>
 26 #include <linux/pfn_t.h>                           25 #include <linux/pfn_t.h>
 27 #include <linux/mman.h>                            26 #include <linux/mman.h>
 28 #include <linux/memremap.h>                        27 #include <linux/memremap.h>
 29 #include <linux/pagemap.h>                         28 #include <linux/pagemap.h>
 30 #include <linux/debugfs.h>                         29 #include <linux/debugfs.h>
 31 #include <linux/migrate.h>                         30 #include <linux/migrate.h>
 32 #include <linux/hashtable.h>                       31 #include <linux/hashtable.h>
 33 #include <linux/userfaultfd_k.h>                   32 #include <linux/userfaultfd_k.h>
 34 #include <linux/page_idle.h>                       33 #include <linux/page_idle.h>
 35 #include <linux/shmem_fs.h>                        34 #include <linux/shmem_fs.h>
 36 #include <linux/oom.h>                             35 #include <linux/oom.h>
 37 #include <linux/numa.h>                        << 
 38 #include <linux/page_owner.h>                  << 
 39 #include <linux/sched/sysctl.h>                << 
 40 #include <linux/memory-tiers.h>                << 
 41 #include <linux/compat.h>                      << 
 42 #include <linux/pgalloc_tag.h>                 << 
 43 #include <linux/pagewalk.h>                    << 
 44                                                    36 
 45 #include <asm/tlb.h>                               37 #include <asm/tlb.h>
 46 #include <asm/pgalloc.h>                           38 #include <asm/pgalloc.h>
 47 #include "internal.h"                              39 #include "internal.h"
 48 #include "swap.h"                              << 
 49                                                << 
 50 #define CREATE_TRACE_POINTS                    << 
 51 #include <trace/events/thp.h>                  << 
 52                                                    40 
 53 /*                                                 41 /*
 54  * By default, transparent hugepage support is     42  * By default, transparent hugepage support is disabled in order to avoid
 55  * risking an increased memory footprint for a     43  * risking an increased memory footprint for applications that are not
 56  * guaranteed to benefit from it. When transpa     44  * guaranteed to benefit from it. When transparent hugepage support is
 57  * enabled, it is for all mappings, and khugep     45  * enabled, it is for all mappings, and khugepaged scans all mappings.
 58  * Defrag is invoked by khugepaged hugepage al     46  * Defrag is invoked by khugepaged hugepage allocations and by page faults
 59  * for all hugepage allocations.                   47  * for all hugepage allocations.
 60  */                                                48  */
 61 unsigned long transparent_hugepage_flags __rea     49 unsigned long transparent_hugepage_flags __read_mostly =
 62 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS          50 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
 63         (1<<TRANSPARENT_HUGEPAGE_FLAG)|            51         (1<<TRANSPARENT_HUGEPAGE_FLAG)|
 64 #endif                                             52 #endif
 65 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE         53 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
 66         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG     54         (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 67 #endif                                             55 #endif
 68         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MA     56         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
 69         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEP     57         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
 70         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE     58         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 71                                                    59 
 72 static struct shrinker *deferred_split_shrinke !!  60 static struct shrinker deferred_split_shrinker;
 73 static unsigned long deferred_split_count(stru << 
 74                                           stru << 
 75 static unsigned long deferred_split_scan(struc << 
 76                                          struc << 
 77 static bool split_underused_thp = true;        << 
 78                                                    61 
 79 static atomic_t huge_zero_refcount;                62 static atomic_t huge_zero_refcount;
 80 struct folio *huge_zero_folio __read_mostly;   !!  63 struct page *huge_zero_page __read_mostly;
 81 unsigned long huge_zero_pfn __read_mostly = ~0 << 
 82 unsigned long huge_anon_orders_always __read_m << 
 83 unsigned long huge_anon_orders_madvise __read_ << 
 84 unsigned long huge_anon_orders_inherit __read_ << 
 85 static bool anon_orders_configured __initdata; << 
 86                                                << 
 87 unsigned long __thp_vma_allowable_orders(struc << 
 88                                          unsig << 
 89                                          unsig << 
 90                                          unsig << 
 91 {                                              << 
 92         bool smaps = tva_flags & TVA_SMAPS;    << 
 93         bool in_pf = tva_flags & TVA_IN_PF;    << 
 94         bool enforce_sysfs = tva_flags & TVA_E << 
 95         unsigned long supported_orders;        << 
 96                                                << 
 97         /* Check the intersection of requested << 
 98         if (vma_is_anonymous(vma))             << 
 99                 supported_orders = THP_ORDERS_ << 
100         else if (vma_is_special_huge(vma))     << 
101                 supported_orders = THP_ORDERS_ << 
102         else                                   << 
103                 supported_orders = THP_ORDERS_ << 
104                                                << 
105         orders &= supported_orders;            << 
106         if (!orders)                           << 
107                 return 0;                      << 
108                                                << 
109         if (!vma->vm_mm)                /* vds << 
110                 return 0;                      << 
111                                                << 
112         if (thp_disabled_by_hw() || vma_thp_di << 
113                 return 0;                      << 
114                                                << 
115         /* khugepaged doesn't collapse DAX vma << 
116         if (vma_is_dax(vma))                   << 
117                 return in_pf ? orders : 0;     << 
118                                                << 
119         /*                                     << 
120          * khugepaged special VMA and hugetlb  << 
121          * Must be checked after dax since som << 
122          * VM_MIXEDMAP set.                    << 
123          */                                    << 
124         if (!in_pf && !smaps && (vm_flags & VM << 
125                 return 0;                      << 
126                                                << 
127         /*                                     << 
128          * Check alignment for file vma and si << 
129          * filtering out the unsuitable orders << 
130          *                                     << 
131          * Skip the check for page fault. Huge << 
132          * handlers.                           << 
133          */                                    << 
134         if (!in_pf) {                          << 
135                 int order = highest_order(orde << 
136                 unsigned long addr;            << 
137                                                << 
138                 while (orders) {               << 
139                         addr = vma->vm_end - ( << 
140                         if (thp_vma_suitable_o << 
141                                 break;         << 
142                         order = next_order(&or << 
143                 }                              << 
144                                                << 
145                 if (!orders)                   << 
146                         return 0;              << 
147         }                                      << 
148                                                << 
149         /*                                     << 
150          * Enabled via shmem mount options or  << 
151          * Must be done before hugepage flags  << 
152          * own flags.                          << 
153          */                                    << 
154         if (!in_pf && shmem_file(vma->vm_file) << 
155                 return shmem_allowable_huge_or << 
156                                                << 
157                                                << 
158                                                << 
159         if (!vma_is_anonymous(vma)) {          << 
160                 /*                             << 
161                  * Enforce sysfs THP requireme << 
162                  * were already handled in thp << 
163                  */                            << 
164                 if (enforce_sysfs &&           << 
165                     (!hugepage_global_enabled( << 
166                                                << 
167                         return 0;              << 
168                                                << 
169                 /*                             << 
170                  * Trust that ->huge_fault() h << 
171                  * in fault path.              << 
172                  */                            << 
173                 if (((in_pf || smaps)) && vma- << 
174                         return orders;         << 
175                 /* Only regular file is valid  << 
176                 if (((!in_pf || smaps)) && fil << 
177                         return orders;         << 
178                 return 0;                      << 
179         }                                      << 
180                                                    64 
181         if (vma_is_temporary_stack(vma))       !!  65 static struct page *get_huge_zero_page(void)
182                 return 0;                      << 
183                                                << 
184         /*                                     << 
185          * THPeligible bit of smaps should sho << 
186          * though anon_vma is not initialized  << 
187          *                                     << 
188          * Allow page fault since anon_vma may << 
189          * the first page fault.               << 
190          */                                    << 
191         if (!vma->anon_vma)                    << 
192                 return (smaps || in_pf) ? orde << 
193                                                << 
194         return orders;                         << 
195 }                                              << 
196                                                << 
197 static bool get_huge_zero_page(void)           << 
198 {                                                  66 {
199         struct folio *zero_folio;              !!  67         struct page *zero_page;
200 retry:                                             68 retry:
201         if (likely(atomic_inc_not_zero(&huge_z     69         if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
202                 return true;                   !!  70                 return READ_ONCE(huge_zero_page);
203                                                    71 
204         zero_folio = folio_alloc((GFP_TRANSHUG !!  72         zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
205                         HPAGE_PMD_ORDER);          73                         HPAGE_PMD_ORDER);
206         if (!zero_folio) {                     !!  74         if (!zero_page) {
207                 count_vm_event(THP_ZERO_PAGE_A     75                 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
208                 return false;                  !!  76                 return NULL;
209         }                                          77         }
210         /* Ensure zero folio won't have large_ !!  78         count_vm_event(THP_ZERO_PAGE_ALLOC);
211         folio_clear_large_rmappable(zero_folio << 
212         preempt_disable();                         79         preempt_disable();
213         if (cmpxchg(&huge_zero_folio, NULL, ze !!  80         if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
214                 preempt_enable();                  81                 preempt_enable();
215                 folio_put(zero_folio);         !!  82                 __free_pages(zero_page, compound_order(zero_page));
216                 goto retry;                        83                 goto retry;
217         }                                          84         }
218         WRITE_ONCE(huge_zero_pfn, folio_pfn(ze << 
219                                                    85 
220         /* We take additional reference here.      86         /* We take additional reference here. It will be put back by shrinker */
221         atomic_set(&huge_zero_refcount, 2);        87         atomic_set(&huge_zero_refcount, 2);
222         preempt_enable();                          88         preempt_enable();
223         count_vm_event(THP_ZERO_PAGE_ALLOC);   !!  89         return READ_ONCE(huge_zero_page);
224         return true;                           << 
225 }                                                  90 }
226                                                    91 
227 static void put_huge_zero_page(void)               92 static void put_huge_zero_page(void)
228 {                                                  93 {
229         /*                                         94         /*
230          * Counter should never go to zero her     95          * Counter should never go to zero here. Only shrinker can put
231          * last reference.                         96          * last reference.
232          */                                        97          */
233         BUG_ON(atomic_dec_and_test(&huge_zero_     98         BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
234 }                                                  99 }
235                                                   100 
236 struct folio *mm_get_huge_zero_folio(struct mm !! 101 struct page *mm_get_huge_zero_page(struct mm_struct *mm)
237 {                                                 102 {
238         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->    103         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
239                 return READ_ONCE(huge_zero_fol !! 104                 return READ_ONCE(huge_zero_page);
240                                                   105 
241         if (!get_huge_zero_page())                106         if (!get_huge_zero_page())
242                 return NULL;                      107                 return NULL;
243                                                   108 
244         if (test_and_set_bit(MMF_HUGE_ZERO_PAG    109         if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
245                 put_huge_zero_page();             110                 put_huge_zero_page();
246                                                   111 
247         return READ_ONCE(huge_zero_folio);     !! 112         return READ_ONCE(huge_zero_page);
248 }                                                 113 }
249                                                   114 
250 void mm_put_huge_zero_folio(struct mm_struct * !! 115 void mm_put_huge_zero_page(struct mm_struct *mm)
251 {                                                 116 {
252         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->    117         if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
253                 put_huge_zero_page();             118                 put_huge_zero_page();
254 }                                                 119 }
255                                                   120 
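For context, a minimal caller-side sketch of the zero-folio helpers above. The function name demo_map_huge_zero_folio() is hypothetical; the real PMD fault-path caller appears later in huge_memory.c and is not part of this excerpt.

/*
 * Hypothetical sketch only: how a PMD fault handler would typically consume
 * mm_get_huge_zero_folio(). The reference is taken at most once per mm and
 * tracked via the MMF_HUGE_ZERO_PAGE bit, as shown above.
 */
static vm_fault_t demo_map_huge_zero_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *zero_folio;

        zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
        if (!zero_folio)
                return VM_FAULT_FALLBACK;       /* fall back to small pages */

        /* ... install a read-only PMD mapping of zero_folio here ... */
        return 0;
}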
256 static unsigned long shrink_huge_zero_page_cou    121 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
257                                         struct    122                                         struct shrink_control *sc)
258 {                                                 123 {
259         /* we can free zero page only if last     124         /* we can free zero page only if last reference remains */
260         return atomic_read(&huge_zero_refcount    125         return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
261 }                                                 126 }
262                                                   127 
263 static unsigned long shrink_huge_zero_page_sca    128 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
264                                        struct     129                                        struct shrink_control *sc)
265 {                                                 130 {
266         if (atomic_cmpxchg(&huge_zero_refcount    131         if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
267                 struct folio *zero_folio = xch !! 132                 struct page *zero_page = xchg(&huge_zero_page, NULL);
268                 BUG_ON(zero_folio == NULL);    !! 133                 BUG_ON(zero_page == NULL);
269                 WRITE_ONCE(huge_zero_pfn, ~0UL !! 134                 __free_pages(zero_page, compound_order(zero_page));
270                 folio_put(zero_folio);         << 
271                 return HPAGE_PMD_NR;              135                 return HPAGE_PMD_NR;
272         }                                         136         }
273                                                   137 
274         return 0;                                 138         return 0;
275 }                                                 139 }
276                                                   140 
277 static struct shrinker *huge_zero_page_shrinke !! 141 static struct shrinker huge_zero_page_shrinker = {
                                                   >> 142         .count_objects = shrink_huge_zero_page_count,
                                                   >> 143         .scan_objects = shrink_huge_zero_page_scan,
                                                   >> 144         .seeks = DEFAULT_SEEKS,
                                                   >> 145 };
278                                                   146 
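On the linux-6.12-rc7 side, huge_zero_page_shrinker (like deferred_split_shrinker) is only declared as a pointer here; it is allocated and registered later in the file, outside this excerpt, using the dynamic shrinker API added around v6.7. A minimal sketch of that pattern, assuming the generic shrinker_alloc()/shrinker_register() interface; the init-function name and the "thp-zero" label are illustrative, not taken from this excerpt.

static int __init demo_thp_shrinker_init(void)
{
        /* Allocate the shrinker instead of embedding a static struct. */
        huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
        if (!huge_zero_page_shrinker)
                return -ENOMEM;

        /* Same callbacks and seek cost as the old static definition. */
        huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
        huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
        huge_zero_page_shrinker->seeks = DEFAULT_SEEKS;

        /* Publish it to memory reclaim. */
        shrinker_register(huge_zero_page_shrinker);
        return 0;
}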
279 #ifdef CONFIG_SYSFS                               147 #ifdef CONFIG_SYSFS
280 static ssize_t enabled_show(struct kobject *ko    148 static ssize_t enabled_show(struct kobject *kobj,
281                             struct kobj_attrib    149                             struct kobj_attribute *attr, char *buf)
282 {                                                 150 {
283         const char *output;                    << 
284                                                << 
285         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG    151         if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
286                 output = "[always] madvise nev !! 152                 return sprintf(buf, "[always] madvise never\n");
287         else if (test_bit(TRANSPARENT_HUGEPAGE !! 153         else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
288                           &transparent_hugepag !! 154                 return sprintf(buf, "always [madvise] never\n");
289                 output = "always [madvise] nev << 
290         else                                      155         else
291                 output = "always madvise [neve !! 156                 return sprintf(buf, "always madvise [never]\n");
292                                                << 
293         return sysfs_emit(buf, "%s\n", output) << 
294 }                                                 157 }
295                                                   158 
296 static ssize_t enabled_store(struct kobject *k    159 static ssize_t enabled_store(struct kobject *kobj,
297                              struct kobj_attri    160                              struct kobj_attribute *attr,
298                              const char *buf,     161                              const char *buf, size_t count)
299 {                                                 162 {
300         ssize_t ret = count;                      163         ssize_t ret = count;
301                                                   164 
302         if (sysfs_streq(buf, "always")) {      !! 165         if (!memcmp("always", buf,
                                                   >> 166                     min(sizeof("always")-1, count))) {
303                 clear_bit(TRANSPARENT_HUGEPAGE    167                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
304                 set_bit(TRANSPARENT_HUGEPAGE_F    168                 set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
305         } else if (sysfs_streq(buf, "madvise") !! 169         } else if (!memcmp("madvise", buf,
                                                   >> 170                            min(sizeof("madvise")-1, count))) {
306                 clear_bit(TRANSPARENT_HUGEPAGE    171                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
307                 set_bit(TRANSPARENT_HUGEPAGE_R    172                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
308         } else if (sysfs_streq(buf, "never"))  !! 173         } else if (!memcmp("never", buf,
                                                   >> 174                            min(sizeof("never")-1, count))) {
309                 clear_bit(TRANSPARENT_HUGEPAGE    175                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
310                 clear_bit(TRANSPARENT_HUGEPAGE    176                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
311         } else                                    177         } else
312                 ret = -EINVAL;                    178                 ret = -EINVAL;
313                                                   179 
314         if (ret > 0) {                            180         if (ret > 0) {
315                 int err = start_stop_khugepage    181                 int err = start_stop_khugepaged();
316                 if (err)                          182                 if (err)
317                         ret = err;                183                         ret = err;
318         }                                         184         }
319         return ret;                               185         return ret;
320 }                                                 186 }
321                                                !! 187 static struct kobj_attribute enabled_attr =
322 static struct kobj_attribute enabled_attr = __ !! 188         __ATTR(enabled, 0644, enabled_show, enabled_store);
323                                                   189 
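The enabled_show()/enabled_store() pair above also illustrates an interface change visible throughout this diff: the linux-4.20.17 column parses input with open-coded memcmp()/min() and formats output with sprintf(), while linux-6.12-rc7 uses sysfs_streq() and sysfs_emit(). A small self-contained sketch of the newer pattern, using a hypothetical demo attribute that is not part of this file:

static bool demo_enabled;

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
                         char *buf)
{
        /* sysfs_emit() bounds the output to the PAGE_SIZE sysfs buffer. */
        return sysfs_emit(buf, "%s\n", demo_enabled ? "always" : "never");
}

static ssize_t demo_store(struct kobject *kobj, struct kobj_attribute *attr,
                          const char *buf, size_t count)
{
        /* sysfs_streq() matches the whole token and ignores a trailing newline. */
        if (sysfs_streq(buf, "always"))
                demo_enabled = true;
        else if (sysfs_streq(buf, "never"))
                demo_enabled = false;
        else
                return -EINVAL;
        return count;
}

static struct kobj_attribute demo_attr = __ATTR(demo, 0644, demo_show, demo_store);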
324 ssize_t single_hugepage_flag_show(struct kobje    190 ssize_t single_hugepage_flag_show(struct kobject *kobj,
325                                   struct kobj_ !! 191                                 struct kobj_attribute *attr, char *buf,
326                                   enum transpa !! 192                                 enum transparent_hugepage_flag flag)
327 {                                                 193 {
328         return sysfs_emit(buf, "%d\n",         !! 194         return sprintf(buf, "%d\n",
329                           !!test_bit(flag, &tr !! 195                        !!test_bit(flag, &transparent_hugepage_flags));
330 }                                                 196 }
331                                                   197 
332 ssize_t single_hugepage_flag_store(struct kobj    198 ssize_t single_hugepage_flag_store(struct kobject *kobj,
333                                  struct kobj_a    199                                  struct kobj_attribute *attr,
334                                  const char *b    200                                  const char *buf, size_t count,
335                                  enum transpar    201                                  enum transparent_hugepage_flag flag)
336 {                                                 202 {
337         unsigned long value;                      203         unsigned long value;
338         int ret;                                  204         int ret;
339                                                   205 
340         ret = kstrtoul(buf, 10, &value);          206         ret = kstrtoul(buf, 10, &value);
341         if (ret < 0)                              207         if (ret < 0)
342                 return ret;                       208                 return ret;
343         if (value > 1)                            209         if (value > 1)
344                 return -EINVAL;                   210                 return -EINVAL;
345                                                   211 
346         if (value)                                212         if (value)
347                 set_bit(flag, &transparent_hug    213                 set_bit(flag, &transparent_hugepage_flags);
348         else                                      214         else
349                 clear_bit(flag, &transparent_h    215                 clear_bit(flag, &transparent_hugepage_flags);
350                                                   216 
351         return count;                             217         return count;
352 }                                                 218 }
353                                                   219 
354 static ssize_t defrag_show(struct kobject *kob    220 static ssize_t defrag_show(struct kobject *kobj,
355                            struct kobj_attribu    221                            struct kobj_attribute *attr, char *buf)
356 {                                                 222 {
357         const char *output;                    !! 223         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
358                                                !! 224                 return sprintf(buf, "[always] defer defer+madvise madvise never\n");
359         if (test_bit(TRANSPARENT_HUGEPAGE_DEFR !! 225         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
360                      &transparent_hugepage_fla !! 226                 return sprintf(buf, "always [defer] defer+madvise madvise never\n");
361                 output = "[always] defer defer !! 227         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
362         else if (test_bit(TRANSPARENT_HUGEPAGE !! 228                 return sprintf(buf, "always defer [defer+madvise] madvise never\n");
363                           &transparent_hugepag !! 229         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
364                 output = "always [defer] defer !! 230                 return sprintf(buf, "always defer defer+madvise [madvise] never\n");
365         else if (test_bit(TRANSPARENT_HUGEPAGE !! 231         return sprintf(buf, "always defer defer+madvise madvise [never]\n");
366                           &transparent_hugepag << 
367                 output = "always defer [defer+ << 
368         else if (test_bit(TRANSPARENT_HUGEPAGE << 
369                           &transparent_hugepag << 
370                 output = "always defer defer+m << 
371         else                                   << 
372                 output = "always defer defer+m << 
373                                                << 
374         return sysfs_emit(buf, "%s\n", output) << 
375 }                                                 232 }
376                                                   233 
377 static ssize_t defrag_store(struct kobject *ko    234 static ssize_t defrag_store(struct kobject *kobj,
378                             struct kobj_attrib    235                             struct kobj_attribute *attr,
379                             const char *buf, s    236                             const char *buf, size_t count)
380 {                                                 237 {
381         if (sysfs_streq(buf, "always")) {      !! 238         if (!memcmp("always", buf,
                                                   >> 239                     min(sizeof("always")-1, count))) {
382                 clear_bit(TRANSPARENT_HUGEPAGE    240                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
383                 clear_bit(TRANSPARENT_HUGEPAGE    241                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
384                 clear_bit(TRANSPARENT_HUGEPAGE    242                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
385                 set_bit(TRANSPARENT_HUGEPAGE_D    243                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
386         } else if (sysfs_streq(buf, "defer+mad !! 244         } else if (!memcmp("defer+madvise", buf,
                                                   >> 245                     min(sizeof("defer+madvise")-1, count))) {
387                 clear_bit(TRANSPARENT_HUGEPAGE    246                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
388                 clear_bit(TRANSPARENT_HUGEPAGE    247                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
389                 clear_bit(TRANSPARENT_HUGEPAGE    248                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
390                 set_bit(TRANSPARENT_HUGEPAGE_D    249                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
391         } else if (sysfs_streq(buf, "defer"))  !! 250         } else if (!memcmp("defer", buf,
                                                   >> 251                     min(sizeof("defer")-1, count))) {
392                 clear_bit(TRANSPARENT_HUGEPAGE    252                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
393                 clear_bit(TRANSPARENT_HUGEPAGE    253                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
394                 clear_bit(TRANSPARENT_HUGEPAGE    254                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
395                 set_bit(TRANSPARENT_HUGEPAGE_D    255                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
396         } else if (sysfs_streq(buf, "madvise") !! 256         } else if (!memcmp("madvise", buf,
                                                   >> 257                            min(sizeof("madvise")-1, count))) {
397                 clear_bit(TRANSPARENT_HUGEPAGE    258                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
398                 clear_bit(TRANSPARENT_HUGEPAGE    259                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
399                 clear_bit(TRANSPARENT_HUGEPAGE    260                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
400                 set_bit(TRANSPARENT_HUGEPAGE_D    261                 set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
401         } else if (sysfs_streq(buf, "never"))  !! 262         } else if (!memcmp("never", buf,
                                                   >> 263                            min(sizeof("never")-1, count))) {
402                 clear_bit(TRANSPARENT_HUGEPAGE    264                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
403                 clear_bit(TRANSPARENT_HUGEPAGE    265                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
404                 clear_bit(TRANSPARENT_HUGEPAGE    266                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
405                 clear_bit(TRANSPARENT_HUGEPAGE    267                 clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
406         } else                                    268         } else
407                 return -EINVAL;                   269                 return -EINVAL;
408                                                   270 
409         return count;                             271         return count;
410 }                                                 272 }
411 static struct kobj_attribute defrag_attr = __A !! 273 static struct kobj_attribute defrag_attr =
                                                   >> 274         __ATTR(defrag, 0644, defrag_show, defrag_store);
412                                                   275 
413 static ssize_t use_zero_page_show(struct kobje    276 static ssize_t use_zero_page_show(struct kobject *kobj,
414                                   struct kobj_ !! 277                 struct kobj_attribute *attr, char *buf)
415 {                                                 278 {
416         return single_hugepage_flag_show(kobj,    279         return single_hugepage_flag_show(kobj, attr, buf,
417                                          TRANS !! 280                                 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
418 }                                                 281 }
419 static ssize_t use_zero_page_store(struct kobj    282 static ssize_t use_zero_page_store(struct kobject *kobj,
420                 struct kobj_attribute *attr, c    283                 struct kobj_attribute *attr, const char *buf, size_t count)
421 {                                                 284 {
422         return single_hugepage_flag_store(kobj    285         return single_hugepage_flag_store(kobj, attr, buf, count,
423                                  TRANSPARENT_H    286                                  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
424 }                                                 287 }
425 static struct kobj_attribute use_zero_page_att !! 288 static struct kobj_attribute use_zero_page_attr =
                                                   >> 289         __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
426                                                   290 
427 static ssize_t hpage_pmd_size_show(struct kobj    291 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
428                                    struct kobj !! 292                 struct kobj_attribute *attr, char *buf)
429 {                                                 293 {
430         return sysfs_emit(buf, "%lu\n", HPAGE_ !! 294         return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
431 }                                                 295 }
432 static struct kobj_attribute hpage_pmd_size_at    296 static struct kobj_attribute hpage_pmd_size_attr =
433         __ATTR_RO(hpage_pmd_size);                297         __ATTR_RO(hpage_pmd_size);
434                                                   298 
435 static ssize_t split_underused_thp_show(struct !! 299 #ifdef CONFIG_DEBUG_VM
436                             struct kobj_attrib !! 300 static ssize_t debug_cow_show(struct kobject *kobj,
                                                   >> 301                                 struct kobj_attribute *attr, char *buf)
437 {                                                 302 {
438         return sysfs_emit(buf, "%d\n", split_u !! 303         return single_hugepage_flag_show(kobj, attr, buf,
                                                   >> 304                                 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
439 }                                                 305 }
440                                                !! 306 static ssize_t debug_cow_store(struct kobject *kobj,
441 static ssize_t split_underused_thp_store(struc !! 307                                struct kobj_attribute *attr,
442                              struct kobj_attri !! 308                                const char *buf, size_t count)
443                              const char *buf,  << 
444 {                                                 309 {
445         int err = kstrtobool(buf, &split_under !! 310         return single_hugepage_flag_store(kobj, attr, buf, count,
446                                                !! 311                                  TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
447         if (err < 0)                           << 
448                 return err;                    << 
449                                                << 
450         return count;                          << 
451 }                                                 312 }
452                                                !! 313 static struct kobj_attribute debug_cow_attr =
453 static struct kobj_attribute split_underused_t !! 314         __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
454         shrink_underused, 0644, split_underuse !! 315 #endif /* CONFIG_DEBUG_VM */
455                                                   316 
456 static struct attribute *hugepage_attr[] = {      317 static struct attribute *hugepage_attr[] = {
457         &enabled_attr.attr,                       318         &enabled_attr.attr,
458         &defrag_attr.attr,                        319         &defrag_attr.attr,
459         &use_zero_page_attr.attr,                 320         &use_zero_page_attr.attr,
460         &hpage_pmd_size_attr.attr,                321         &hpage_pmd_size_attr.attr,
461 #ifdef CONFIG_SHMEM                            !! 322 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
462         &shmem_enabled_attr.attr,                 323         &shmem_enabled_attr.attr,
463 #endif                                            324 #endif
464         &split_underused_thp_attr.attr,        !! 325 #ifdef CONFIG_DEBUG_VM
                                                   >> 326         &debug_cow_attr.attr,
                                                   >> 327 #endif
465         NULL,                                     328         NULL,
466 };                                                329 };
467                                                   330 
468 static const struct attribute_group hugepage_a    331 static const struct attribute_group hugepage_attr_group = {
469         .attrs = hugepage_attr,                   332         .attrs = hugepage_attr,
470 };                                                333 };
471                                                   334 
472 static void hugepage_exit_sysfs(struct kobject << 
473 static void thpsize_release(struct kobject *ko << 
474 static DEFINE_SPINLOCK(huge_anon_orders_lock); << 
475 static LIST_HEAD(thpsize_list);                << 
476                                                << 
477 static ssize_t anon_enabled_show(struct kobjec << 
478                                  struct kobj_a << 
479 {                                              << 
480         int order = to_thpsize(kobj)->order;   << 
481         const char *output;                    << 
482                                                << 
483         if (test_bit(order, &huge_anon_orders_ << 
484                 output = "[always] inherit mad << 
485         else if (test_bit(order, &huge_anon_or << 
486                 output = "always [inherit] mad << 
487         else if (test_bit(order, &huge_anon_or << 
488                 output = "always inherit [madv << 
489         else                                   << 
490                 output = "always inherit madvi << 
491                                                << 
492         return sysfs_emit(buf, "%s\n", output) << 
493 }                                              << 
494                                                << 
495 static ssize_t anon_enabled_store(struct kobje << 
496                                   struct kobj_ << 
497                                   const char * << 
498 {                                              << 
499         int order = to_thpsize(kobj)->order;   << 
500         ssize_t ret = count;                   << 
501                                                << 
502         if (sysfs_streq(buf, "always")) {      << 
503                 spin_lock(&huge_anon_orders_lo << 
504                 clear_bit(order, &huge_anon_or << 
505                 clear_bit(order, &huge_anon_or << 
506                 set_bit(order, &huge_anon_orde << 
507                 spin_unlock(&huge_anon_orders_ << 
508         } else if (sysfs_streq(buf, "inherit") << 
509                 spin_lock(&huge_anon_orders_lo << 
510                 clear_bit(order, &huge_anon_or << 
511                 clear_bit(order, &huge_anon_or << 
512                 set_bit(order, &huge_anon_orde << 
513                 spin_unlock(&huge_anon_orders_ << 
514         } else if (sysfs_streq(buf, "madvise") << 
515                 spin_lock(&huge_anon_orders_lo << 
516                 clear_bit(order, &huge_anon_or << 
517                 clear_bit(order, &huge_anon_or << 
518                 set_bit(order, &huge_anon_orde << 
519                 spin_unlock(&huge_anon_orders_ << 
520         } else if (sysfs_streq(buf, "never"))  << 
521                 spin_lock(&huge_anon_orders_lo << 
522                 clear_bit(order, &huge_anon_or << 
523                 clear_bit(order, &huge_anon_or << 
524                 clear_bit(order, &huge_anon_or << 
525                 spin_unlock(&huge_anon_orders_ << 
526         } else                                 << 
527                 ret = -EINVAL;                 << 
528                                                << 
529         if (ret > 0) {                         << 
530                 int err;                       << 
531                                                << 
532                 err = start_stop_khugepaged(); << 
533                 if (err)                       << 
534                         ret = err;             << 
535         }                                      << 
536         return ret;                            << 
537 }                                              << 
538                                                << 
539 static struct kobj_attribute anon_enabled_attr << 
540         __ATTR(enabled, 0644, anon_enabled_sho << 
541                                                << 
542 static struct attribute *anon_ctrl_attrs[] = { << 
543         &anon_enabled_attr.attr,               << 
544         NULL,                                  << 
545 };                                             << 
546                                                << 
547 static const struct attribute_group anon_ctrl_ << 
548         .attrs = anon_ctrl_attrs,              << 
549 };                                             << 
550                                                << 
551 static struct attribute *file_ctrl_attrs[] = { << 
552 #ifdef CONFIG_SHMEM                            << 
553         &thpsize_shmem_enabled_attr.attr,      << 
554 #endif                                         << 
555         NULL,                                  << 
556 };                                             << 
557                                                << 
558 static const struct attribute_group file_ctrl_ << 
559         .attrs = file_ctrl_attrs,              << 
560 };                                             << 
561                                                << 
562 static struct attribute *any_ctrl_attrs[] = {  << 
563         NULL,                                  << 
564 };                                             << 
565                                                << 
566 static const struct attribute_group any_ctrl_a << 
567         .attrs = any_ctrl_attrs,               << 
568 };                                             << 
569                                                << 
570 static const struct kobj_type thpsize_ktype =  << 
571         .release = &thpsize_release,           << 
572         .sysfs_ops = &kobj_sysfs_ops,          << 
573 };                                             << 
574                                                << 
575 DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = << 
576                                                << 
577 static unsigned long sum_mthp_stat(int order,  << 
578 {                                              << 
579         unsigned long sum = 0;                 << 
580         int cpu;                               << 
581                                                << 
582         for_each_possible_cpu(cpu) {           << 
583                 struct mthp_stat *this = &per_ << 
584                                                << 
585                 sum += this->stats[order][item << 
586         }                                      << 
587                                                << 
588         return sum;                            << 
589 }                                              << 
590                                                << 
591 #define DEFINE_MTHP_STAT_ATTR(_name, _index)   << 
592 static ssize_t _name##_show(struct kobject *ko << 
593                         struct kobj_attribute  << 
594 {                                              << 
595         int order = to_thpsize(kobj)->order;   << 
596                                                << 
597         return sysfs_emit(buf, "%lu\n", sum_mt << 
598 }                                              << 
599 static struct kobj_attribute _name##_attr = __ << 
600                                                << 
601 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_S << 
602 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTH << 
603 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_char << 
604 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT << 
605 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_ST << 
606 #ifdef CONFIG_SHMEM                            << 
607 DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_S << 
608 DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STA << 
609 DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, M << 
610 #endif                                         << 
611 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); << 
612 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_ << 
613 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STA << 
614 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_AN << 
615 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped << 
616                                                << 
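The DEFINE_MTHP_STAT_ATTR() instances above expose per-order counters that sum_mthp_stat() folds across all possible CPUs for the sysfs "stats" groups below. The increment side lives outside this excerpt; a sketch of its usual shape, assuming the real helper keeps a this_cpu_inc() form (demo_count_mthp_stat() is an illustrative stand-in, not the kernel's helper):

/*
 * Sketch only: bump one per-CPU, per-order counter. The real helper is
 * declared in the THP headers, not in this file.
 */
static inline void demo_count_mthp_stat(int order, enum mthp_stat_item item)
{
        if (order <= 0 || order > PMD_ORDER)
                return;
        this_cpu_inc(mthp_stats.stats[order][item]);
}

/*
 * e.g. after a successful PMD-sized anonymous fault allocation:
 *      demo_count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
 */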
617 static struct attribute *anon_stats_attrs[] =  << 
618         &anon_fault_alloc_attr.attr,           << 
619         &anon_fault_fallback_attr.attr,        << 
620         &anon_fault_fallback_charge_attr.attr, << 
621 #ifndef CONFIG_SHMEM                           << 
622         &swpout_attr.attr,                     << 
623         &swpout_fallback_attr.attr,            << 
624 #endif                                         << 
625         &split_deferred_attr.attr,             << 
626         &nr_anon_attr.attr,                    << 
627         &nr_anon_partially_mapped_attr.attr,   << 
628         NULL,                                  << 
629 };                                             << 
630                                                << 
631 static struct attribute_group anon_stats_attr_ << 
632         .name = "stats",                       << 
633         .attrs = anon_stats_attrs,             << 
634 };                                             << 
635                                                << 
636 static struct attribute *file_stats_attrs[] =  << 
637 #ifdef CONFIG_SHMEM                            << 
638         &shmem_alloc_attr.attr,                << 
639         &shmem_fallback_attr.attr,             << 
640         &shmem_fallback_charge_attr.attr,      << 
641 #endif                                         << 
642         NULL,                                  << 
643 };                                             << 
644                                                << 
645 static struct attribute_group file_stats_attr_ << 
646         .name = "stats",                       << 
647         .attrs = file_stats_attrs,             << 
648 };                                             << 
649                                                << 
650 static struct attribute *any_stats_attrs[] = { << 
651 #ifdef CONFIG_SHMEM                            << 
652         &swpout_attr.attr,                     << 
653         &swpout_fallback_attr.attr,            << 
654 #endif                                         << 
655         &split_attr.attr,                      << 
656         &split_failed_attr.attr,               << 
657         NULL,                                  << 
658 };                                             << 
659                                                << 
660 static struct attribute_group any_stats_attr_g << 
661         .name = "stats",                       << 
662         .attrs = any_stats_attrs,              << 
663 };                                             << 
664                                                << 
665 static int sysfs_add_group(struct kobject *kob << 
666                            const struct attrib << 
667 {                                              << 
668         int ret = -ENOENT;                     << 
669                                                << 
670         /*                                     << 
671          * If the group is named, try to merge << 
672          * was already created. This avoids th << 
673          * sysfs_create_group() if the directo << 
674          */                                    << 
675         if (grp->name)                         << 
676                 ret = sysfs_merge_group(kobj,  << 
677         if (ret)                               << 
678                 ret = sysfs_create_group(kobj, << 
679                                                << 
680         return ret;                            << 
681 }                                              << 
682                                                << 
683 static struct thpsize *thpsize_create(int orde << 
684 {                                              << 
685         unsigned long size = (PAGE_SIZE << ord << 
686         struct thpsize *thpsize;               << 
687         int ret = -ENOMEM;                     << 
688                                                << 
689         thpsize = kzalloc(sizeof(*thpsize), GF << 
690         if (!thpsize)                          << 
691                 goto err;                      << 
692                                                << 
693         thpsize->order = order;                << 
694                                                << 
695         ret = kobject_init_and_add(&thpsize->k << 
696                                    "hugepages- << 
697         if (ret) {                             << 
698                 kfree(thpsize);                << 
699                 goto err;                      << 
700         }                                      << 
701                                                << 
702                                                << 
703         ret = sysfs_add_group(&thpsize->kobj,  << 
704         if (ret)                               << 
705                 goto err_put;                  << 
706                                                << 
707         ret = sysfs_add_group(&thpsize->kobj,  << 
708         if (ret)                               << 
709                 goto err_put;                  << 
710                                                << 
711         if (BIT(order) & THP_ORDERS_ALL_ANON)  << 
712                 ret = sysfs_add_group(&thpsize << 
713                 if (ret)                       << 
714                         goto err_put;          << 
715                                                << 
716                 ret = sysfs_add_group(&thpsize << 
717                 if (ret)                       << 
718                         goto err_put;          << 
719         }                                      << 
720                                                << 
721         if (BIT(order) & THP_ORDERS_ALL_FILE_D << 
722                 ret = sysfs_add_group(&thpsize << 
723                 if (ret)                       << 
724                         goto err_put;          << 
725                                                << 
726                 ret = sysfs_add_group(&thpsize << 
727                 if (ret)                       << 
728                         goto err_put;          << 
729         }                                      << 
730                                                << 
731         return thpsize;                        << 
732 err_put:                                       << 
733         kobject_put(&thpsize->kobj);           << 
734 err:                                           << 
735         return ERR_PTR(ret);                   << 
736 }                                              << 
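As a concrete illustration of the per-size sysfs layout built by thpsize_create() above (one hugepages-<size>kB directory per supported order, each carrying an 'enabled' policy file), here is a minimal userspace sketch that enumerates those directories and prints their policies. It only assumes the documented /sys/kernel/mm/transparent_hugepage paths and is not part of the kernel code being diffed.

#include <glob.h>
#include <stdio.h>

int main(void)
{
        glob_t g;

        /* One directory per order, e.g. hugepages-64kB, hugepages-2048kB. */
        if (glob("/sys/kernel/mm/transparent_hugepage/hugepages-*kB/enabled",
                 0, NULL, &g) != 0)
                return 1;

        for (size_t i = 0; i < g.gl_pathc; i++) {
                char buf[256];
                FILE *f = fopen(g.gl_pathv[i], "r");

                if (!f)
                        continue;
                if (fgets(buf, sizeof(buf), f))
                        printf("%s: %s", g.gl_pathv[i], buf); /* active value is bracketed */
                fclose(f);
        }
        globfree(&g);
        return 0;
}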
737                                                << 
738 static void thpsize_release(struct kobject *ko << 
739 {                                              << 
740         kfree(to_thpsize(kobj));               << 
741 }                                              << 
742                                                << 
743 static int __init hugepage_init_sysfs(struct k    335 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
744 {                                                 336 {
745         int err;                                  337         int err;
746         struct thpsize *thpsize;               << 
747         unsigned long orders;                  << 
748         int order;                             << 
749                                                << 
750         /*                                     << 
751          * Default to setting PMD-sized THP to << 
752          * disable all other sizes. powerpc's  << 
753          * constant so we have to do this here << 
754          */                                    << 
755         if (!anon_orders_configured)           << 
756                 huge_anon_orders_inherit = BIT << 
757                                                   338 
758         *hugepage_kobj = kobject_create_and_ad    339         *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
759         if (unlikely(!*hugepage_kobj)) {          340         if (unlikely(!*hugepage_kobj)) {
760                 pr_err("failed to create trans    341                 pr_err("failed to create transparent hugepage kobject\n");
761                 return -ENOMEM;                   342                 return -ENOMEM;
762         }                                         343         }
763                                                   344 
764         err = sysfs_create_group(*hugepage_kob    345         err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
765         if (err) {                                346         if (err) {
766                 pr_err("failed to register tra    347                 pr_err("failed to register transparent hugepage group\n");
767                 goto delete_obj;                  348                 goto delete_obj;
768         }                                         349         }
769                                                   350 
770         err = sysfs_create_group(*hugepage_kob    351         err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
771         if (err) {                                352         if (err) {
772                 pr_err("failed to register tra    353                 pr_err("failed to register transparent hugepage group\n");
773                 goto remove_hp_group;             354                 goto remove_hp_group;
774         }                                         355         }
775                                                   356 
776         orders = THP_ORDERS_ALL_ANON | THP_ORD << 
777         order = highest_order(orders);         << 
778         while (orders) {                       << 
779                 thpsize = thpsize_create(order << 
780                 if (IS_ERR(thpsize)) {         << 
781                         pr_err("failed to crea << 
782                         err = PTR_ERR(thpsize) << 
783                         goto remove_all;       << 
784                 }                              << 
785                 list_add(&thpsize->node, &thps << 
786                 order = next_order(&orders, or << 
787         }                                      << 
788                                                << 
789         return 0;                                 357         return 0;
790                                                   358 
791 remove_all:                                    << 
792         hugepage_exit_sysfs(*hugepage_kobj);   << 
793         return err;                            << 
794 remove_hp_group:                                  359 remove_hp_group:
795         sysfs_remove_group(*hugepage_kobj, &hu    360         sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
796 delete_obj:                                       361 delete_obj:
797         kobject_put(*hugepage_kobj);              362         kobject_put(*hugepage_kobj);
798         return err;                               363         return err;
799 }                                                 364 }
800                                                   365 
801 static void __init hugepage_exit_sysfs(struct     366 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
802 {                                                 367 {
803         struct thpsize *thpsize, *tmp;         << 
804                                                << 
805         list_for_each_entry_safe(thpsize, tmp, << 
806                 list_del(&thpsize->node);      << 
807                 kobject_put(&thpsize->kobj);   << 
808         }                                      << 
809                                                << 
810         sysfs_remove_group(hugepage_kobj, &khu    368         sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
811         sysfs_remove_group(hugepage_kobj, &hug    369         sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
812         kobject_put(hugepage_kobj);               370         kobject_put(hugepage_kobj);
813 }                                                 371 }
814 #else                                             372 #else
815 static inline int hugepage_init_sysfs(struct k    373 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
816 {                                                 374 {
817         return 0;                                 375         return 0;
818 }                                                 376 }
819                                                   377 
820 static inline void hugepage_exit_sysfs(struct     378 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
821 {                                                 379 {
822 }                                                 380 }
823 #endif /* CONFIG_SYSFS */                         381 #endif /* CONFIG_SYSFS */
824                                                   382 
825 static int __init thp_shrinker_init(void)      << 
826 {                                              << 
827         huge_zero_page_shrinker = shrinker_all << 
828         if (!huge_zero_page_shrinker)          << 
829                 return -ENOMEM;                << 
830                                                << 
831         deferred_split_shrinker = shrinker_all << 
832                                                << 
833                                                << 
834                                                << 
835         if (!deferred_split_shrinker) {        << 
836                 shrinker_free(huge_zero_page_s << 
837                 return -ENOMEM;                << 
838         }                                      << 
839                                                << 
840         huge_zero_page_shrinker->count_objects << 
841         huge_zero_page_shrinker->scan_objects  << 
842         shrinker_register(huge_zero_page_shrin << 
843                                                << 
844         deferred_split_shrinker->count_objects << 
845         deferred_split_shrinker->scan_objects  << 
846         shrinker_register(deferred_split_shrin << 
847                                                << 
848         return 0;                              << 
849 }                                              << 
850                                                << 
851 static void __init thp_shrinker_exit(void)     << 
852 {                                              << 
853         shrinker_free(huge_zero_page_shrinker) << 
854         shrinker_free(deferred_split_shrinker) << 
855 }                                              << 
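The two shrinkers registered above (huge zero page and deferred split) show up indirectly in the thp_* counters of /proc/vmstat, e.g. thp_zero_page_alloc and thp_deferred_split_page, alongside the fault counters bumped later in this file. A minimal sketch that dumps those counters, assuming only the standard procfs layout:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "thp_", 4))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}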
856                                                << 
857 static int __init hugepage_init(void)             383 static int __init hugepage_init(void)
858 {                                                 384 {
859         int err;                                  385         int err;
860         struct kobject *hugepage_kobj;            386         struct kobject *hugepage_kobj;
861                                                   387 
862         if (!has_transparent_hugepage()) {        388         if (!has_transparent_hugepage()) {
863                 transparent_hugepage_flags = 1 !! 389                 transparent_hugepage_flags = 0;
864                 return -EINVAL;                   390                 return -EINVAL;
865         }                                         391         }
866                                                   392 
867         /*                                        393         /*
868          * hugepages can't be allocated by the    394          * hugepages can't be allocated by the buddy allocator
869          */                                       395          */
870         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > M !! 396         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
                                                   >> 397         /*
                                                   >> 398          * we use page->mapping and page->index in second tail page
                                                   >> 399          * as list_head: assuming THP order >= 2
                                                   >> 400          */
                                                   >> 401         MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
871                                                   402 
872         err = hugepage_init_sysfs(&hugepage_ko    403         err = hugepage_init_sysfs(&hugepage_kobj);
873         if (err)                                  404         if (err)
874                 goto err_sysfs;                   405                 goto err_sysfs;
875                                                   406 
876         err = khugepaged_init();                  407         err = khugepaged_init();
877         if (err)                                  408         if (err)
878                 goto err_slab;                    409                 goto err_slab;
879                                                   410 
880         err = thp_shrinker_init();             !! 411         err = register_shrinker(&huge_zero_page_shrinker);
                                                   >> 412         if (err)
                                                   >> 413                 goto err_hzp_shrinker;
                                                   >> 414         err = register_shrinker(&deferred_split_shrinker);
881         if (err)                                  415         if (err)
882                 goto err_shrinker;             !! 416                 goto err_split_shrinker;
883                                                   417 
884         /*                                        418         /*
885          * By default disable transparent huge    419          * By default disable transparent hugepages on smaller systems,
886          * where the extra memory used could h    420          * where the extra memory used could hurt more than TLB overhead
887          * is likely to save.  The admin can s    421          * is likely to save.  The admin can still enable it through /sys.
888          */                                       422          */
889         if (totalram_pages() < (512 << (20 - P !! 423         if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
890                 transparent_hugepage_flags = 0    424                 transparent_hugepage_flags = 0;
891                 return 0;                         425                 return 0;
892         }                                         426         }
893                                                   427 
894         err = start_stop_khugepaged();            428         err = start_stop_khugepaged();
895         if (err)                                  429         if (err)
896                 goto err_khugepaged;              430                 goto err_khugepaged;
897                                                   431 
898         return 0;                                 432         return 0;
899 err_khugepaged:                                   433 err_khugepaged:
900         thp_shrinker_exit();                   !! 434         unregister_shrinker(&deferred_split_shrinker);
901 err_shrinker:                                  !! 435 err_split_shrinker:
                                                   >> 436         unregister_shrinker(&huge_zero_page_shrinker);
                                                   >> 437 err_hzp_shrinker:
902         khugepaged_destroy();                     438         khugepaged_destroy();
903 err_slab:                                         439 err_slab:
904         hugepage_exit_sysfs(hugepage_kobj);       440         hugepage_exit_sysfs(hugepage_kobj);
905 err_sysfs:                                        441 err_sysfs:
906         return err;                               442         return err;
907 }                                                 443 }
908 subsys_initcall(hugepage_init);                   444 subsys_initcall(hugepage_init);
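hugepage_init() above leaves THP disabled by default when totalram_pages() is below 512 << (20 - PAGE_SHIFT), i.e. 512 MiB expressed in pages (131072 pages with 4 KiB pages). A small userspace sketch of the same arithmetic, using sysconf() for the page size; the values are illustrative only:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        long page_size = sysconf(_SC_PAGESIZE);
        long page_shift = 0;

        while ((1L << page_shift) < page_size)
                page_shift++;

        /* 512 MiB expressed in pages: 512 << (20 - PAGE_SHIFT) */
        long threshold_pages = 512L << (20 - page_shift);

        printf("page size %ld, threshold %ld pages (%ld MiB)\n",
               page_size, threshold_pages,
               (threshold_pages * page_size) >> 20);
        return 0;
}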
909                                                   445 
910 static int __init setup_transparent_hugepage(c    446 static int __init setup_transparent_hugepage(char *str)
911 {                                                 447 {
912         int ret = 0;                              448         int ret = 0;
913         if (!str)                                 449         if (!str)
914                 goto out;                         450                 goto out;
915         if (!strcmp(str, "always")) {             451         if (!strcmp(str, "always")) {
916                 set_bit(TRANSPARENT_HUGEPAGE_F    452                 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
917                         &transparent_hugepage_    453                         &transparent_hugepage_flags);
918                 clear_bit(TRANSPARENT_HUGEPAGE    454                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
919                           &transparent_hugepag    455                           &transparent_hugepage_flags);
920                 ret = 1;                          456                 ret = 1;
921         } else if (!strcmp(str, "madvise")) {     457         } else if (!strcmp(str, "madvise")) {
922                 clear_bit(TRANSPARENT_HUGEPAGE    458                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
923                           &transparent_hugepag    459                           &transparent_hugepage_flags);
924                 set_bit(TRANSPARENT_HUGEPAGE_R    460                 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
925                         &transparent_hugepage_    461                         &transparent_hugepage_flags);
926                 ret = 1;                          462                 ret = 1;
927         } else if (!strcmp(str, "never")) {       463         } else if (!strcmp(str, "never")) {
928                 clear_bit(TRANSPARENT_HUGEPAGE    464                 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
929                           &transparent_hugepag    465                           &transparent_hugepage_flags);
930                 clear_bit(TRANSPARENT_HUGEPAGE    466                 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
931                           &transparent_hugepag    467                           &transparent_hugepage_flags);
932                 ret = 1;                          468                 ret = 1;
933         }                                         469         }
934 out:                                              470 out:
935         if (!ret)                                 471         if (!ret)
936                 pr_warn("transparent_hugepage=    472                 pr_warn("transparent_hugepage= cannot parse, ignored\n");
937         return ret;                               473         return ret;
938 }                                                 474 }
939 __setup("transparent_hugepage=", setup_transpa    475 __setup("transparent_hugepage=", setup_transparent_hugepage);
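When booted with transparent_hugepage=madvise (or with the sysfs 'enabled' knob later set to madvise), only ranges marked with MADV_HUGEPAGE are eligible for THP faults. A minimal sketch of the per-VMA opt-in and opt-out from userspace; the 8 MiB length is an arbitrary example:

#include <sys/mman.h>

int main(void)
{
        size_t len = 8UL << 20;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        madvise(p, len / 2, MADV_HUGEPAGE);             /* eligible for THPs */
        madvise(p + len / 2, len / 2, MADV_NOHUGEPAGE); /* never THP-backed */

        munmap(p, len);
        return 0;
}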
940                                                   476 
941 static inline int get_order_from_str(const cha << 
942 {                                              << 
943         unsigned long size;                    << 
944         char *endptr;                          << 
945         int order;                             << 
946                                                << 
947         size = memparse(size_str, &endptr);    << 
948                                                << 
949         if (!is_power_of_2(size))              << 
950                 goto err;                      << 
951         order = get_order(size);               << 
952         if (BIT(order) & ~THP_ORDERS_ALL_ANON) << 
953                 goto err;                      << 
954                                                << 
955         return order;                          << 
956 err:                                           << 
957         pr_err("invalid size %s in thp_anon bo << 
958         return -EINVAL;                        << 
959 }                                              << 
960                                                << 
961 static char str_dup[PAGE_SIZE] __initdata;     << 
962 static int __init setup_thp_anon(char *str)    << 
963 {                                              << 
964         char *token, *range, *policy, *subtoke << 
965         unsigned long always, inherit, madvise << 
966         char *start_size, *end_size;           << 
967         int start, end, nr;                    << 
968         char *p;                               << 
969                                                << 
970         if (!str || strlen(str) + 1 > PAGE_SIZ << 
971                 goto err;                      << 
972         strcpy(str_dup, str);                  << 
973                                                << 
974         always = huge_anon_orders_always;      << 
975         madvise = huge_anon_orders_madvise;    << 
976         inherit = huge_anon_orders_inherit;    << 
977         p = str_dup;                           << 
978         while ((token = strsep(&p, ";")) != NU << 
979                 range = strsep(&token, ":");   << 
980                 policy = token;                << 
981                                                << 
982                 if (!policy)                   << 
983                         goto err;              << 
984                                                << 
985                 while ((subtoken = strsep(&ran << 
986                         if (strchr(subtoken, ' << 
987                                 start_size = s << 
988                                 end_size = sub << 
989                                                << 
990                                 start = get_or << 
991                                 end = get_orde << 
992                         } else {               << 
993                                 start = end =  << 
994                         }                      << 
995                                                << 
996                         if (start < 0 || end < << 
997                                 goto err;      << 
998                                                << 
999                         nr = end - start + 1;  << 
1000                         if (!strcmp(policy, " << 
1001                                 bitmap_set(&a << 
1002                                 bitmap_clear( << 
1003                                 bitmap_clear( << 
1004                         } else if (!strcmp(po << 
1005                                 bitmap_set(&m << 
1006                                 bitmap_clear( << 
1007                                 bitmap_clear( << 
1008                         } else if (!strcmp(po << 
1009                                 bitmap_set(&i << 
1010                                 bitmap_clear( << 
1011                                 bitmap_clear( << 
1012                         } else if (!strcmp(po << 
1013                                 bitmap_clear( << 
1014                                 bitmap_clear( << 
1015                                 bitmap_clear( << 
1016                         } else {              << 
1017                                 pr_err("inval << 
1018                                 goto err;     << 
1019                         }                     << 
1020                 }                             << 
1021         }                                     << 
1022                                               << 
1023         huge_anon_orders_always = always;     << 
1024         huge_anon_orders_madvise = madvise;   << 
1025         huge_anon_orders_inherit = inherit;   << 
1026         anon_orders_configured = true;        << 
1027         return 1;                             << 
1028                                               << 
1029 err:                                          << 
1030         pr_warn("thp_anon=%s: error parsing s << 
1031         return 0;                             << 
1032 }                                             << 
1033 __setup("thp_anon=", setup_thp_anon);         << 
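get_order_from_str() above converts each size in a thp_anon= range (for example thp_anon=16K-64K:always;2M:madvise) into a folio order via memparse() and get_order(). A rough userspace equivalent of that conversion, assuming 4 KiB base pages and only the K/M/G suffixes; it does not enforce the kernel's THP_ORDERS_ALL_ANON restriction:

#include <stdio.h>
#include <stdlib.h>

/* "16K" -> 2, "64K" -> 4, "2M" -> 9; non-power-of-two sizes -> -1 (4 KiB pages assumed) */
static int order_from_str(const char *s)
{
        char *end;
        unsigned long long size = strtoull(s, &end, 0);
        int order = 0;

        switch (*end) {
        case 'G': case 'g': size <<= 10; /* fall through */
        case 'M': case 'm': size <<= 10; /* fall through */
        case 'K': case 'k': size <<= 10; break;
        default: break;
        }

        if (!size || (size & (size - 1)))       /* must be a power of two */
                return -1;
        while ((4096ULL << order) < size)
                order++;
        return (4096ULL << order) == size ? order : -1;
}

int main(void)
{
        const char *samples[] = { "16K", "64K", "2M" };

        for (int i = 0; i < 3; i++)
                printf("%s -> order %d\n", samples[i], order_from_str(samples[i]));
        return 0;
}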
1034                                               << 
1035 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_    477 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
1036 {                                                478 {
1037         if (likely(vma->vm_flags & VM_WRITE))    479         if (likely(vma->vm_flags & VM_WRITE))
1038                 pmd = pmd_mkwrite(pmd, vma);  !! 480                 pmd = pmd_mkwrite(pmd);
1039         return pmd;                              481         return pmd;
1040 }                                                482 }
1041                                                  483 
1042 #ifdef CONFIG_MEMCG                           !! 484 static inline struct list_head *page_deferred_list(struct page *page)
1043 static inline                                 << 
1044 struct deferred_split *get_deferred_split_que << 
1045 {                                                485 {
1046         struct mem_cgroup *memcg = folio_memc !! 486         /* ->lru in the tail pages is occupied by compound_head. */
1047         struct pglist_data *pgdat = NODE_DATA !! 487         return &page[2].deferred_list;
1048                                               << 
1049         if (memcg)                            << 
1050                 return &memcg->deferred_split << 
1051         else                                  << 
1052                 return &pgdat->deferred_split << 
1053 }                                                488 }
1054 #else                                         << 
1055 static inline                                 << 
1056 struct deferred_split *get_deferred_split_que << 
1057 {                                             << 
1058         struct pglist_data *pgdat = NODE_DATA << 
1059                                               << 
1060         return &pgdat->deferred_split_queue;  << 
1061 }                                             << 
1062 #endif                                        << 
1063                                                  489 
1064 static inline bool is_transparent_hugepage(co !! 490 void prep_transhuge_page(struct page *page)
1065 {                                                491 {
1066         if (!folio_test_large(folio))         !! 492         /*
1067                                               !! 493          * we use page->mapping and page->index in second tail page

                                                   >> 494          * as list_head: assuming THP order >= 2
                                                   >> 495          */
1068                                                  496 
1069         return is_huge_zero_folio(folio) ||   !! 497         INIT_LIST_HEAD(page_deferred_list(page));
1070                 folio_test_large_rmappable(fo !! 498         set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
1071 }                                                499 }
1072                                                  500 
1073 static unsigned long __thp_get_unmapped_area( !! 501 unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
1074                 unsigned long addr, unsigned  !! 502                 loff_t off, unsigned long flags, unsigned long size)
1075                 loff_t off, unsigned long fla << 
1076                 vm_flags_t vm_flags)          << 
1077 {                                                503 {
                                                   >> 504         unsigned long addr;
1078         loff_t off_end = off + len;              505         loff_t off_end = off + len;
1079         loff_t off_align = round_up(off, size    506         loff_t off_align = round_up(off, size);
1080         unsigned long len_pad, ret, off_sub;  !! 507         unsigned long len_pad;
1081                                               << 
1082         if (!IS_ENABLED(CONFIG_64BIT) || in_c << 
1083                 return 0;                     << 
1084                                                  508 
1085         if (off_end <= off_align || (off_end     509         if (off_end <= off_align || (off_end - off_align) < size)
1086                 return 0;                        510                 return 0;
1087                                                  511 
1088         len_pad = len + size;                    512         len_pad = len + size;
1089         if (len_pad < len || (off + len_pad)     513         if (len_pad < len || (off + len_pad) < off)
1090                 return 0;                        514                 return 0;
1091                                                  515 
1092         ret = mm_get_unmapped_area_vmflags(cu !! 516         addr = current->mm->get_unmapped_area(filp, 0, len_pad,
1093                                            of !! 517                                               off >> PAGE_SHIFT, flags);
1094                                               !! 518         if (IS_ERR_VALUE(addr))
1095         /*                                    << 
1096          * The failure might be due to length << 
1097          * without the padding.               << 
1098          */                                   << 
1099         if (IS_ERR_VALUE(ret))                << 
1100                 return 0;                        519                 return 0;
1101                                                  520 
1102         /*                                    !! 521         addr += (off - addr) & (size - 1);
1103          * Do not try to align to THP boundar !! 522         return addr;
1104          * hint succeeds.                     << 
1105          */                                   << 
1106         if (ret == addr)                      << 
1107                 return addr;                  << 
1108                                               << 
1109         off_sub = (off - ret) & (size - 1);   << 
1110                                               << 
1111         if (test_bit(MMF_TOPDOWN, &current->m << 
1112                 return ret + size;            << 
1113                                               << 
1114         ret += off_sub;                       << 
1115         return ret;                           << 
1116 }                                                523 }
1117                                                  524 
1118 unsigned long thp_get_unmapped_area_vmflags(s !! 525 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
1119                 unsigned long len, unsigned l !! 526                 unsigned long len, unsigned long pgoff, unsigned long flags)
1120                 vm_flags_t vm_flags)          << 
1121 {                                                527 {
1122         unsigned long ret;                    << 
1123         loff_t off = (loff_t)pgoff << PAGE_SH    528         loff_t off = (loff_t)pgoff << PAGE_SHIFT;
1124                                                  529 
1125         ret = __thp_get_unmapped_area(filp, a !! 530         if (addr)
1126         if (ret)                              !! 531                 goto out;
1127                 return ret;                   !! 532         if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
                                                   >> 533                 goto out;
1128                                                  534 
1129         return mm_get_unmapped_area_vmflags(c !! 535         addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
1130                                             v !! 536         if (addr)
1131 }                                             !! 537                 return addr;
1132                                                  538 
1133 unsigned long thp_get_unmapped_area(struct fi !! 539  out:
1134                 unsigned long len, unsigned l !! 540         return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
1135 {                                             << 
1136         return thp_get_unmapped_area_vmflags( << 
1137 }                                                541 }
1138 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);        542 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
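__thp_get_unmapped_area() above pads the search length by the THP size and then moves the returned address by (off - ret) & (size - 1), so that the chosen virtual address and the file offset end up congruent modulo the THP size and the range can be mapped with huge PMDs. A small arithmetic demo with made-up example values, assuming a 64-bit build and a 2 MiB PMD size:

#include <stdio.h>

#define PMD_SIZE (2UL << 20)    /* assumed 2 MiB PMD size */

int main(void)
{
        unsigned long ret = 0x7f1234567000UL;   /* candidate from get_unmapped_area */
        unsigned long off = 0x3ff000UL;         /* file offset of the mapping */
        unsigned long addr = ret + ((off - ret) & (PMD_SIZE - 1));

        /* prints 0: addr and off are now congruent modulo PMD_SIZE */
        printf("addr %#lx, (addr - off) %% PMD_SIZE = %lu\n",
               addr, (addr - off) % PMD_SIZE);
        return 0;
}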
1139                                                  543 
1140 static vm_fault_t __do_huge_pmd_anonymous_pag    544 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
1141                         struct page *page, gf    545                         struct page *page, gfp_t gfp)
1142 {                                                546 {
1143         struct vm_area_struct *vma = vmf->vma    547         struct vm_area_struct *vma = vmf->vma;
1144         struct folio *folio = page_folio(page !! 548         struct mem_cgroup *memcg;
1145         pgtable_t pgtable;                       549         pgtable_t pgtable;
1146         unsigned long haddr = vmf->address &     550         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1147         vm_fault_t ret = 0;                      551         vm_fault_t ret = 0;
1148                                                  552 
1149         VM_BUG_ON_FOLIO(!folio_test_large(fol !! 553         VM_BUG_ON_PAGE(!PageCompound(page), page);
1150                                                  554 
1151         if (mem_cgroup_charge(folio, vma->vm_ !! 555         if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
1152                 folio_put(folio);             !! 556                 put_page(page);
1153                 count_vm_event(THP_FAULT_FALL    557                 count_vm_event(THP_FAULT_FALLBACK);
1154                 count_vm_event(THP_FAULT_FALL << 
1155                 count_mthp_stat(HPAGE_PMD_ORD << 
1156                 count_mthp_stat(HPAGE_PMD_ORD << 
1157                 return VM_FAULT_FALLBACK;        558                 return VM_FAULT_FALLBACK;
1158         }                                        559         }
1159         folio_throttle_swaprate(folio, gfp);  << 
1160                                                  560 
1161         pgtable = pte_alloc_one(vma->vm_mm);  !! 561         pgtable = pte_alloc_one(vma->vm_mm, haddr);
1162         if (unlikely(!pgtable)) {                562         if (unlikely(!pgtable)) {
1163                 ret = VM_FAULT_OOM;              563                 ret = VM_FAULT_OOM;
1164                 goto release;                    564                 goto release;
1165         }                                        565         }
1166                                                  566 
1167         folio_zero_user(folio, vmf->address); !! 567         clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
1168         /*                                       568         /*
1169          * The memory barrier inside __folio_ !! 569          * The memory barrier inside __SetPageUptodate makes sure that
1170          * folio_zero_user writes become visi !! 570          * clear_huge_page writes become visible before the set_pmd_at()
1171          * write.                                571          * write.
1172          */                                      572          */
1173         __folio_mark_uptodate(folio);         !! 573         __SetPageUptodate(page);
1174                                                  574 
1175         vmf->ptl = pmd_lock(vma->vm_mm, vmf->    575         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1176         if (unlikely(!pmd_none(*vmf->pmd))) {    576         if (unlikely(!pmd_none(*vmf->pmd))) {
1177                 goto unlock_release;             577                 goto unlock_release;
1178         } else {                                 578         } else {
1179                 pmd_t entry;                     579                 pmd_t entry;
1180                                                  580 
1181                 ret = check_stable_address_sp    581                 ret = check_stable_address_space(vma->vm_mm);
1182                 if (ret)                         582                 if (ret)
1183                         goto unlock_release;     583                         goto unlock_release;
1184                                                  584 
1185                 /* Deliver the page fault to     585                 /* Deliver the page fault to userland */
1186                 if (userfaultfd_missing(vma))    586                 if (userfaultfd_missing(vma)) {
                                                   >> 587                         vm_fault_t ret2;
                                                   >> 588 
1187                         spin_unlock(vmf->ptl)    589                         spin_unlock(vmf->ptl);
1188                         folio_put(folio);     !! 590                         mem_cgroup_cancel_charge(page, memcg, true);
                                                   >> 591                         put_page(page);
1189                         pte_free(vma->vm_mm,     592                         pte_free(vma->vm_mm, pgtable);
1190                         ret = handle_userfaul !! 593                         ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
1191                         VM_BUG_ON(ret & VM_FA !! 594                         VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
1192                         return ret;           !! 595                         return ret2;
1193                 }                                596                 }
1194                                                  597 
1195                 entry = mk_huge_pmd(page, vma    598                 entry = mk_huge_pmd(page, vma->vm_page_prot);
1196                 entry = maybe_pmd_mkwrite(pmd    599                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1197                 folio_add_new_anon_rmap(folio !! 600                 page_add_new_anon_rmap(page, vma, haddr, true);
1198                 folio_add_lru_vma(folio, vma) !! 601                 mem_cgroup_commit_charge(page, memcg, false, true);
                                                   >> 602                 lru_cache_add_active_or_unevictable(page, vma);
1199                 pgtable_trans_huge_deposit(vm    603                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1200                 set_pmd_at(vma->vm_mm, haddr,    604                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
1201                 update_mmu_cache_pmd(vma, vmf << 
1202                 add_mm_counter(vma->vm_mm, MM    605                 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1203                 mm_inc_nr_ptes(vma->vm_mm);      606                 mm_inc_nr_ptes(vma->vm_mm);
1204                 deferred_split_folio(folio, f << 
1205                 spin_unlock(vmf->ptl);           607                 spin_unlock(vmf->ptl);
1206                 count_vm_event(THP_FAULT_ALLO    608                 count_vm_event(THP_FAULT_ALLOC);
1207                 count_mthp_stat(HPAGE_PMD_ORD << 
1208                 count_memcg_event_mm(vma->vm_ << 
1209         }                                        609         }
1210                                                  610 
1211         return 0;                                611         return 0;
1212 unlock_release:                                  612 unlock_release:
1213         spin_unlock(vmf->ptl);                   613         spin_unlock(vmf->ptl);
1214 release:                                         614 release:
1215         if (pgtable)                             615         if (pgtable)
1216                 pte_free(vma->vm_mm, pgtable)    616                 pte_free(vma->vm_mm, pgtable);
1217         folio_put(folio);                     !! 617         mem_cgroup_cancel_charge(page, memcg, true);
                                                   >> 618         put_page(page);
1218         return ret;                              619         return ret;
1219                                                  620 
1220 }                                                621 }
1221                                                  622 
1222 /*                                               623 /*
1223  * always: directly stall for all thp allocat    624  * always: directly stall for all thp allocations
1224  * defer: wake kswapd and fail if not immedia    625  * defer: wake kswapd and fail if not immediately available
1225  * defer+madvise: wake kswapd and directly st    626  * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
1226  *                fail if not immediately ava    627  *                fail if not immediately available
1227  * madvise: directly stall for MADV_HUGEPAGE,    628  * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
1228  *          available                            629  *          available
1229  * never: never stall for any thp allocation     630  * never: never stall for any thp allocation
1230  */                                              631  */
1231 gfp_t vma_thp_gfp_mask(struct vm_area_struct  !! 632 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
1232 {                                                633 {
1233         const bool vma_madvised = vma && (vma !! 634         const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
1234                                                  635 
1235         /* Always do synchronous compaction *    636         /* Always do synchronous compaction */
1236         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    637         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
1237                 return GFP_TRANSHUGE | (vma_m    638                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
1238                                                  639 
1239         /* Kick kcompactd and fail quickly */    640         /* Kick kcompactd and fail quickly */
1240         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    641         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
1241                 return GFP_TRANSHUGE_LIGHT |     642                 return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
1242                                                  643 
1243         /* Synchronous compaction if madvised    644         /* Synchronous compaction if madvised, otherwise kick kcompactd */
1244         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    645         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
1245                 return GFP_TRANSHUGE_LIGHT |     646                 return GFP_TRANSHUGE_LIGHT |
1246                         (vma_madvised ? __GFP    647                         (vma_madvised ? __GFP_DIRECT_RECLAIM :
1247                                         __GFP    648                                         __GFP_KSWAPD_RECLAIM);
1248                                                  649 
1249         /* Only do synchronous compaction if     650         /* Only do synchronous compaction if madvised */
1250         if (test_bit(TRANSPARENT_HUGEPAGE_DEF    651         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
1251                 return GFP_TRANSHUGE_LIGHT |     652                 return GFP_TRANSHUGE_LIGHT |
1252                        (vma_madvised ? __GFP_    653                        (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
1253                                                  654 
1254         return GFP_TRANSHUGE_LIGHT;              655         return GFP_TRANSHUGE_LIGHT;
1255 }                                                656 }
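The defrag policies described in the comment above are selected through /sys/kernel/mm/transparent_hugepage/defrag; reading that file shows the active token in brackets (e.g. 'always defer [defer+madvise] madvise never'). A minimal sketch that selects one of those tokens, assuming root privileges and the documented knob path:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/defrag", "w");

        if (!f) {
                perror("defrag");
                return 1;
        }
        if (fputs("defer+madvise", f) == EOF)   /* pick the defer+madvise policy */
                perror("write");
        fclose(f);
        return 0;
}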
1256                                                  657 
1257 /* Caller must hold page table lock. */          658 /* Caller must hold page table lock. */
1258 static void set_huge_zero_folio(pgtable_t pgt !! 659 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
1259                 struct vm_area_struct *vma, u    660                 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
1260                 struct folio *zero_folio)     !! 661                 struct page *zero_page)
1261 {                                                662 {
1262         pmd_t entry;                             663         pmd_t entry;
1263         if (!pmd_none(*pmd))                     664         if (!pmd_none(*pmd))
1264                 return;                       !! 665                 return false;
1265         entry = mk_pmd(&zero_folio->page, vma !! 666         entry = mk_pmd(zero_page, vma->vm_page_prot);
1266         entry = pmd_mkhuge(entry);               667         entry = pmd_mkhuge(entry);
1267         pgtable_trans_huge_deposit(mm, pmd, p !! 668         if (pgtable)
                                                   >> 669                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1268         set_pmd_at(mm, haddr, pmd, entry);       670         set_pmd_at(mm, haddr, pmd, entry);
1269         mm_inc_nr_ptes(mm);                      671         mm_inc_nr_ptes(mm);
                                                   >> 672         return true;
1270 }                                                673 }
1271                                                  674 
1272 vm_fault_t do_huge_pmd_anonymous_page(struct     675 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1273 {                                                676 {
1274         struct vm_area_struct *vma = vmf->vma    677         struct vm_area_struct *vma = vmf->vma;
1275         gfp_t gfp;                               678         gfp_t gfp;
1276         struct folio *folio;                  !! 679         struct page *page;
1277         unsigned long haddr = vmf->address &     680         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1278         vm_fault_t ret;                       << 
1279                                                  681 
1280         if (!thp_vma_suitable_order(vma, hadd !! 682         if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
1281                 return VM_FAULT_FALLBACK;        683                 return VM_FAULT_FALLBACK;
1282         ret = vmf_anon_prepare(vmf);          !! 684         if (unlikely(anon_vma_prepare(vma)))
1283         if (ret)                              !! 685                 return VM_FAULT_OOM;
1284                 return ret;                   !! 686         if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
1285         khugepaged_enter_vma(vma, vma->vm_fla !! 687                 return VM_FAULT_OOM;
1286                                               << 
1287         if (!(vmf->flags & FAULT_FLAG_WRITE)     688         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1288                         !mm_forbids_zeropage(    689                         !mm_forbids_zeropage(vma->vm_mm) &&
1289                         transparent_hugepage_    690                         transparent_hugepage_use_zero_page()) {
1290                 pgtable_t pgtable;               691                 pgtable_t pgtable;
1291                 struct folio *zero_folio;     !! 692                 struct page *zero_page;
                                                   >> 693                 bool set;
1292                 vm_fault_t ret;                  694                 vm_fault_t ret;
1293                                               !! 695                 pgtable = pte_alloc_one(vma->vm_mm, haddr);
1294                 pgtable = pte_alloc_one(vma-> << 
1295                 if (unlikely(!pgtable))          696                 if (unlikely(!pgtable))
1296                         return VM_FAULT_OOM;     697                         return VM_FAULT_OOM;
1297                 zero_folio = mm_get_huge_zero !! 698                 zero_page = mm_get_huge_zero_page(vma->vm_mm);
1298                 if (unlikely(!zero_folio)) {  !! 699                 if (unlikely(!zero_page)) {
1299                         pte_free(vma->vm_mm,     700                         pte_free(vma->vm_mm, pgtable);
1300                         count_vm_event(THP_FA    701                         count_vm_event(THP_FAULT_FALLBACK);
1301                         return VM_FAULT_FALLB    702                         return VM_FAULT_FALLBACK;
1302                 }                                703                 }
1303                 vmf->ptl = pmd_lock(vma->vm_m    704                 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1304                 ret = 0;                         705                 ret = 0;
                                                   >> 706                 set = false;
1305                 if (pmd_none(*vmf->pmd)) {       707                 if (pmd_none(*vmf->pmd)) {
1306                         ret = check_stable_ad    708                         ret = check_stable_address_space(vma->vm_mm);
1307                         if (ret) {               709                         if (ret) {
1308                                 spin_unlock(v    710                                 spin_unlock(vmf->ptl);
1309                                 pte_free(vma- << 
1310                         } else if (userfaultf    711                         } else if (userfaultfd_missing(vma)) {
1311                                 spin_unlock(v    712                                 spin_unlock(vmf->ptl);
1312                                 pte_free(vma- << 
1313                                 ret = handle_    713                                 ret = handle_userfault(vmf, VM_UFFD_MISSING);
1314                                 VM_BUG_ON(ret    714                                 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1315                         } else {                 715                         } else {
1316                                 set_huge_zero !! 716                                 set_huge_zero_page(pgtable, vma->vm_mm, vma,
1317                                               !! 717                                                    haddr, vmf->pmd, zero_page);
1318                                 update_mmu_ca << 
1319                                 spin_unlock(v    718                                 spin_unlock(vmf->ptl);
                                                   >> 719                                 set = true;
1320                         }                        720                         }
1321                 } else {                      !! 721                 } else
1322                         spin_unlock(vmf->ptl)    722                         spin_unlock(vmf->ptl);
                                                   >> 723                 if (!set)
1323                         pte_free(vma->vm_mm,     724                         pte_free(vma->vm_mm, pgtable);
1324                 }                             << 
1325                 return ret;                      725                 return ret;
1326         }                                        726         }
1327         gfp = vma_thp_gfp_mask(vma);          !! 727         gfp = alloc_hugepage_direct_gfpmask(vma);
1328         folio = vma_alloc_folio(gfp, HPAGE_PM !! 728         page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
1329         if (unlikely(!folio)) {               !! 729         if (unlikely(!page)) {
1330                 count_vm_event(THP_FAULT_FALL    730                 count_vm_event(THP_FAULT_FALLBACK);
1331                 count_mthp_stat(HPAGE_PMD_ORD << 
1332                 return VM_FAULT_FALLBACK;        731                 return VM_FAULT_FALLBACK;
1333         }                                        732         }
1334         return __do_huge_pmd_anonymous_page(v !! 733         prep_transhuge_page(page);
                                                   >> 734         return __do_huge_pmd_anonymous_page(vmf, page, gfp);
1335 }                                                735 }
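do_huge_pmd_anonymous_page() above is what backs a PMD-aligned anonymous write fault with a PMD-sized folio (or installs the huge zero folio for a read fault when the zero page is enabled). From userspace the result can be observed in the AnonHugePages field of /proc/self/smaps_rollup. A minimal sketch, assuming a 2 MiB PMD size and an 'always' or 'madvise' policy:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define PMD (2UL << 20) /* assumed PMD-sized THP */

int main(void)
{
        char line[128];
        FILE *f;
        /* Over-allocate so a PMD-aligned, PMD-sized subrange is guaranteed. */
        char *raw = mmap(NULL, 2 * PMD, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (raw == MAP_FAILED)
                return 1;

        char *aligned = (char *)(((unsigned long)raw + PMD - 1) & ~(PMD - 1));

        madvise(aligned, PMD, MADV_HUGEPAGE);
        memset(aligned, 1, PMD);        /* write fault -> huge pmd path */

        f = fopen("/proc/self/smaps_rollup", "r");
        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "AnonHugePages:", 14))
                        fputs(line, stdout);    /* > 0 kB => PMD-mapped THP present */
        fclose(f);
        return 0;
}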
1336                                                  736 
1337 static void insert_pfn_pmd(struct vm_area_str    737 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
1338                 pmd_t *pmd, pfn_t pfn, pgprot    738                 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
1339                 pgtable_t pgtable)               739                 pgtable_t pgtable)
1340 {                                                740 {
1341         struct mm_struct *mm = vma->vm_mm;       741         struct mm_struct *mm = vma->vm_mm;
1342         pmd_t entry;                             742         pmd_t entry;
1343         spinlock_t *ptl;                         743         spinlock_t *ptl;
1344                                                  744 
1345         ptl = pmd_lock(mm, pmd);                 745         ptl = pmd_lock(mm, pmd);
1346         if (!pmd_none(*pmd)) {                << 
1347                 if (write) {                  << 
1348                         if (pmd_pfn(*pmd) !=  << 
1349                                 WARN_ON_ONCE( << 
1350                                 goto out_unlo << 
1351                         }                     << 
1352                         entry = pmd_mkyoung(* << 
1353                         entry = maybe_pmd_mkw << 
1354                         if (pmdp_set_access_f << 
1355                                 update_mmu_ca << 
1356                 }                             << 
1357                                               << 
1358                 goto out_unlock;              << 
1359         }                                     << 
1360                                               << 
1361         entry = pmd_mkhuge(pfn_t_pmd(pfn, pro    746         entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
1362         if (pfn_t_devmap(pfn))                   747         if (pfn_t_devmap(pfn))
1363                 entry = pmd_mkdevmap(entry);     748                 entry = pmd_mkdevmap(entry);
1364         else                                  << 
1365                 entry = pmd_mkspecial(entry); << 
1366         if (write) {                             749         if (write) {
1367                 entry = pmd_mkyoung(pmd_mkdir    750                 entry = pmd_mkyoung(pmd_mkdirty(entry));
1368                 entry = maybe_pmd_mkwrite(ent    751                 entry = maybe_pmd_mkwrite(entry, vma);
1369         }                                        752         }
1370                                                  753 
1371         if (pgtable) {                           754         if (pgtable) {
1372                 pgtable_trans_huge_deposit(mm    755                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1373                 mm_inc_nr_ptes(mm);              756                 mm_inc_nr_ptes(mm);
1374                 pgtable = NULL;               << 
1375         }                                        757         }
1376                                                  758 
1377         set_pmd_at(mm, addr, pmd, entry);        759         set_pmd_at(mm, addr, pmd, entry);
1378         update_mmu_cache_pmd(vma, addr, pmd);    760         update_mmu_cache_pmd(vma, addr, pmd);
1379                                               << 
1380 out_unlock:                                   << 
1381         spin_unlock(ptl);                        761         spin_unlock(ptl);
1382         if (pgtable)                          << 
1383                 pte_free(mm, pgtable);        << 
1384 }                                                762 }
1385                                                  763 
1386 /**                                           !! 764 vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
1387  * vmf_insert_pfn_pmd - insert a pmd size pfn !! 765                         pmd_t *pmd, pfn_t pfn, bool write)
1388  * @vmf: Structure describing the fault       << 
1389  * @pfn: pfn to insert                        << 
1390  * @write: whether it's a write fault         << 
1391  *                                            << 
1392  * Insert a pmd size pfn. See vmf_insert_pfn( << 
1393  *                                            << 
1394  * Return: vm_fault_t value.                  << 
1395  */                                           << 
1396 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault << 
1397 {                                                766 {
1398         unsigned long addr = vmf->address & P << 
1399         struct vm_area_struct *vma = vmf->vma << 
1400         pgprot_t pgprot = vma->vm_page_prot;     767         pgprot_t pgprot = vma->vm_page_prot;
1401         pgtable_t pgtable = NULL;                768         pgtable_t pgtable = NULL;
1402                                               << 
1403         /*                                       769         /*
1404          * If we had pmd_special, we could av    770          * If we had pmd_special, we could avoid all these restrictions,
1405          * but we need to be consistent with     771          * but we need to be consistent with PTEs and architectures that
1406          * can't support a 'special' bit.        772          * can't support a 'special' bit.
1407          */                                      773          */
1408         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V    774         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1409                         !pfn_t_devmap(pfn));     775                         !pfn_t_devmap(pfn));
1410         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM    776         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1411                                                  777                                                 (VM_PFNMAP|VM_MIXEDMAP));
1412         BUG_ON((vma->vm_flags & VM_PFNMAP) &&    778         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1413                                                  779 
1414         if (addr < vma->vm_start || addr >= v    780         if (addr < vma->vm_start || addr >= vma->vm_end)
1415                 return VM_FAULT_SIGBUS;          781                 return VM_FAULT_SIGBUS;
1416                                                  782 
1417         if (arch_needs_pgtable_deposit()) {      783         if (arch_needs_pgtable_deposit()) {
1418                 pgtable = pte_alloc_one(vma-> !! 784                 pgtable = pte_alloc_one(vma->vm_mm, addr);
1419                 if (!pgtable)                    785                 if (!pgtable)
1420                         return VM_FAULT_OOM;     786                         return VM_FAULT_OOM;
1421         }                                        787         }
1422                                                  788 
1423         track_pfn_insert(vma, &pgprot, pfn);     789         track_pfn_insert(vma, &pgprot, pfn);
1424                                                  790 
1425         insert_pfn_pmd(vma, addr, vmf->pmd, p !! 791         insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
1426         return VM_FAULT_NOPAGE;                  792         return VM_FAULT_NOPAGE;
1427 }                                                793 }
1428 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);           794 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
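/*
 * Illustrative sketch, not part of huge_memory.c: roughly how a device
 * driver's huge fault handler could hand a PMD-sized pfn to
 * vmf_insert_pfn_pmd() for a VM_PFNMAP mapping (assuming
 * CONFIG_TRANSPARENT_HUGEPAGE). my_dev_base_pfn and my_dev_huge_fault()
 * are hypothetical names; linear_page_index(), pfn_to_pfn_t() and
 * FAULT_FLAG_WRITE are real, but the wiring into vm_operations_struct is
 * omitted because its signature varies between kernel versions.
 */
#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/pagemap.h>
#include <linux/pfn_t.h>

static unsigned long my_dev_base_pfn;	/* hypothetical: first pfn of device memory */

static vm_fault_t my_dev_huge_fault(struct vm_fault *vmf)
{
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pgoff_t pgoff = linear_page_index(vmf->vma, haddr);	/* page index of the huge page */
	pfn_t pfn = pfn_to_pfn_t(my_dev_base_pfn + pgoff);
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/* VM_FAULT_SIGBUS if haddr is outside the VMA, VM_FAULT_NOPAGE on success */
	return vmf_insert_pfn_pmd(vmf, pfn, write);
}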
1429                                                  795 
1430 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    796 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1431 static pud_t maybe_pud_mkwrite(pud_t pud, str    797 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1432 {                                                798 {
1433         if (likely(vma->vm_flags & VM_WRITE))    799         if (likely(vma->vm_flags & VM_WRITE))
1434                 pud = pud_mkwrite(pud);          800                 pud = pud_mkwrite(pud);
1435         return pud;                              801         return pud;
1436 }                                                802 }
1437                                                  803 
1438 static void insert_pfn_pud(struct vm_area_str    804 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
1439                 pud_t *pud, pfn_t pfn, bool w !! 805                 pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
1440 {                                                806 {
1441         struct mm_struct *mm = vma->vm_mm;       807         struct mm_struct *mm = vma->vm_mm;
1442         pgprot_t prot = vma->vm_page_prot;    << 
1443         pud_t entry;                             808         pud_t entry;
1444         spinlock_t *ptl;                         809         spinlock_t *ptl;
1445                                                  810 
1446         ptl = pud_lock(mm, pud);                 811         ptl = pud_lock(mm, pud);
1447         if (!pud_none(*pud)) {                << 
1448                 if (write) {                  << 
1449                         if (WARN_ON_ONCE(pud_ << 
1450                                 goto out_unlo << 
1451                         entry = pud_mkyoung(* << 
1452                         entry = maybe_pud_mkw << 
1453                         if (pudp_set_access_f << 
1454                                 update_mmu_ca << 
1455                 }                             << 
1456                 goto out_unlock;              << 
1457         }                                     << 
1458                                               << 
1459         entry = pud_mkhuge(pfn_t_pud(pfn, pro    812         entry = pud_mkhuge(pfn_t_pud(pfn, prot));
1460         if (pfn_t_devmap(pfn))                   813         if (pfn_t_devmap(pfn))
1461                 entry = pud_mkdevmap(entry);     814                 entry = pud_mkdevmap(entry);
1462         else                                  << 
1463                 entry = pud_mkspecial(entry); << 
1464         if (write) {                             815         if (write) {
1465                 entry = pud_mkyoung(pud_mkdir    816                 entry = pud_mkyoung(pud_mkdirty(entry));
1466                 entry = maybe_pud_mkwrite(ent    817                 entry = maybe_pud_mkwrite(entry, vma);
1467         }                                        818         }
1468         set_pud_at(mm, addr, pud, entry);        819         set_pud_at(mm, addr, pud, entry);
1469         update_mmu_cache_pud(vma, addr, pud);    820         update_mmu_cache_pud(vma, addr, pud);
1470                                               << 
1471 out_unlock:                                   << 
1472         spin_unlock(ptl);                        821         spin_unlock(ptl);
1473 }                                                822 }
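/*
 * Editorial annotation (not part of the kernel source): unlike the PMD
 * path, insert_pfn_pud() has no page table to deposit, so when the PUD is
 * already populated only the young/dirty bits are refreshed on a write
 * fault; the WARN_ON_ONCE() in that branch guards against an attempt to
 * re-insert a different pfn over an existing mapping.
 */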
1474                                                  823 
1475 /**                                           !! 824 vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
1476  * vmf_insert_pfn_pud - insert a pud size pfn !! 825                         pud_t *pud, pfn_t pfn, bool write)
1477  * @vmf: Structure describing the fault       << 
1478  * @pfn: pfn to insert                        << 
1479  * @write: whether it's a write fault         << 
1480  *                                            << 
1481  * Insert a pud size pfn. See vmf_insert_pfn( << 
1482  *                                            << 
1483  * Return: vm_fault_t value.                  << 
1484  */                                           << 
1485 vm_fault_t vmf_insert_pfn_pud(struct vm_fault << 
1486 {                                                826 {
1487         unsigned long addr = vmf->address & P << 
1488         struct vm_area_struct *vma = vmf->vma << 
1489         pgprot_t pgprot = vma->vm_page_prot;     827         pgprot_t pgprot = vma->vm_page_prot;
1490                                               << 
1491         /*                                       828         /*
1492          * If we had pud_special, we could av    829          * If we had pud_special, we could avoid all these restrictions,
1493          * but we need to be consistent with     830          * but we need to be consistent with PTEs and architectures that
1494          * can't support a 'special' bit.        831          * can't support a 'special' bit.
1495          */                                      832          */
1496         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V    833         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1497                         !pfn_t_devmap(pfn));     834                         !pfn_t_devmap(pfn));
1498         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM    835         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1499                                                  836                                                 (VM_PFNMAP|VM_MIXEDMAP));
1500         BUG_ON((vma->vm_flags & VM_PFNMAP) &&    837         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1501                                                  838 
1502         if (addr < vma->vm_start || addr >= v    839         if (addr < vma->vm_start || addr >= vma->vm_end)
1503                 return VM_FAULT_SIGBUS;          840                 return VM_FAULT_SIGBUS;
1504                                                  841 
1505         track_pfn_insert(vma, &pgprot, pfn);     842         track_pfn_insert(vma, &pgprot, pfn);
1506                                                  843 
1507         insert_pfn_pud(vma, addr, vmf->pud, p !! 844         insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
1508         return VM_FAULT_NOPAGE;                  845         return VM_FAULT_NOPAGE;
1509 }                                                846 }
1510 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);           847 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
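/*
 * Illustrative sketch, not part of huge_memory.c: the PUD-sized variant
 * is used the same way as vmf_insert_pfn_pmd() above, only with PUD
 * alignment, and exists only under
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD. my_dev_base_pfn is the same
 * hypothetical device base pfn as in the PMD sketch.
 */
static vm_fault_t my_dev_huge_fault_pud(struct vm_fault *vmf)
{
	unsigned long haddr = vmf->address & HPAGE_PUD_MASK;
	pfn_t pfn = pfn_to_pfn_t(my_dev_base_pfn +
				 linear_page_index(vmf->vma, haddr));
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	return vmf_insert_pfn_pud(vmf, pfn, write);
}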
1511 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    848 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1512                                                  849 
1513 void touch_pmd(struct vm_area_struct *vma, un !! 850 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1514                pmd_t *pmd, bool write)        !! 851                 pmd_t *pmd, int flags)
1515 {                                                852 {
1516         pmd_t _pmd;                              853         pmd_t _pmd;
1517                                                  854 
1518         _pmd = pmd_mkyoung(*pmd);                855         _pmd = pmd_mkyoung(*pmd);
1519         if (write)                            !! 856         if (flags & FOLL_WRITE)
1520                 _pmd = pmd_mkdirty(_pmd);        857                 _pmd = pmd_mkdirty(_pmd);
1521         if (pmdp_set_access_flags(vma, addr &    858         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1522                                   pmd, _pmd,  !! 859                                 pmd, _pmd, flags & FOLL_WRITE))
1523                 update_mmu_cache_pmd(vma, add    860                 update_mmu_cache_pmd(vma, addr, pmd);
1524 }                                                861 }
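/*
 * Editorial annotation (not part of the kernel source): touch_pmd()
 * performs the accessed/dirty update that hardware would normally do, on
 * behalf of software walkers such as follow_devmap_pmd() below and the
 * huge_pmd_set_accessed() fault path: the entry is marked young, and also
 * dirty when the access is a write, via pmdp_set_access_flags(), and the
 * MMU cache is only updated when the bits actually changed. touch_pud()
 * further down is the PUD analogue.
 */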
1525                                                  862 
1526 struct page *follow_devmap_pmd(struct vm_area    863 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
1527                 pmd_t *pmd, int flags, struct    864                 pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
1528 {                                                865 {
1529         unsigned long pfn = pmd_pfn(*pmd);       866         unsigned long pfn = pmd_pfn(*pmd);
1530         struct mm_struct *mm = vma->vm_mm;       867         struct mm_struct *mm = vma->vm_mm;
1531         struct page *page;                       868         struct page *page;
1532         int ret;                              << 
1533                                                  869 
1534         assert_spin_locked(pmd_lockptr(mm, pm    870         assert_spin_locked(pmd_lockptr(mm, pmd));
1535                                                  871 
                                                   >> 872         /*
                                                   >> 873          * When we COW a devmap PMD entry, we split it into PTEs, so we should
                                                   >> 874          * not be in this function with `flags & FOLL_COW` set.
                                                   >> 875          */
                                                   >> 876         WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
                                                   >> 877 
1536         if (flags & FOLL_WRITE && !pmd_write(    878         if (flags & FOLL_WRITE && !pmd_write(*pmd))
1537                 return NULL;                     879                 return NULL;
1538                                                  880 
1539         if (pmd_present(*pmd) && pmd_devmap(*    881         if (pmd_present(*pmd) && pmd_devmap(*pmd))
1540                 /* pass */;                      882                 /* pass */;
1541         else                                     883         else
1542                 return NULL;                     884                 return NULL;
1543                                                  885 
1544         if (flags & FOLL_TOUCH)                  886         if (flags & FOLL_TOUCH)
1545                 touch_pmd(vma, addr, pmd, fla !! 887                 touch_pmd(vma, addr, pmd, flags);
1546                                                  888 
1547         /*                                       889         /*
1548          * device mapped pages can only be re    890          * device mapped pages can only be returned if the
1549          * caller will manage the page refere    891          * caller will manage the page reference count.
1550          */                                      892          */
1551         if (!(flags & (FOLL_GET | FOLL_PIN))) !! 893         if (!(flags & FOLL_GET))
1552                 return ERR_PTR(-EEXIST);         894                 return ERR_PTR(-EEXIST);
1553                                                  895 
1554         pfn += (addr & ~PMD_MASK) >> PAGE_SHI    896         pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1555         *pgmap = get_dev_pagemap(pfn, *pgmap)    897         *pgmap = get_dev_pagemap(pfn, *pgmap);
1556         if (!*pgmap)                             898         if (!*pgmap)
1557                 return ERR_PTR(-EFAULT);         899                 return ERR_PTR(-EFAULT);
1558         page = pfn_to_page(pfn);                 900         page = pfn_to_page(pfn);
1559         ret = try_grab_folio(page_folio(page) !! 901         get_page(page);
1560         if (ret)                              << 
1561                 page = ERR_PTR(ret);          << 
1562                                                  902 
1563         return page;                             903         return page;
1564 }                                                904 }
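/*
 * Editorial annotation (not part of the kernel source):
 * follow_devmap_pmd() serves the GUP path and must run with the PMD lock
 * held (see the assert above). Callers have to request a reference with
 * FOLL_GET or FOLL_PIN, otherwise -EEXIST is returned, and they also
 * inherit a reference on the dev_pagemap via *pgmap that they must drop
 * when they are done with it.
 */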
1565                                                  905 
1566 int copy_huge_pmd(struct mm_struct *dst_mm, s    906 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1567                   pmd_t *dst_pmd, pmd_t *src_    907                   pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1568                   struct vm_area_struct *dst_ !! 908                   struct vm_area_struct *vma)
1569 {                                                909 {
1570         spinlock_t *dst_ptl, *src_ptl;           910         spinlock_t *dst_ptl, *src_ptl;
1571         struct page *src_page;                   911         struct page *src_page;
1572         struct folio *src_folio;              << 
1573         pmd_t pmd;                               912         pmd_t pmd;
1574         pgtable_t pgtable = NULL;                913         pgtable_t pgtable = NULL;
1575         int ret = -ENOMEM;                       914         int ret = -ENOMEM;
1576                                                  915 
1577         pmd = pmdp_get_lockless(src_pmd);     << 
1578         if (unlikely(pmd_present(pmd) && pmd_ << 
1579                 dst_ptl = pmd_lock(dst_mm, ds << 
1580                 src_ptl = pmd_lockptr(src_mm, << 
1581                 spin_lock_nested(src_ptl, SIN << 
1582                 /*                            << 
1583                  * No need to recheck the pmd << 
1584                  * mmap lock held here.       << 
1585                  *                            << 
1586                  * Meanwhile, making sure it' << 
1587                  * mapping, otherwise it mean << 
1588                  * applied special bit, or we << 
1589                  * able to wrongly write to t << 
1590                  */                           << 
1591                 VM_WARN_ON_ONCE(is_cow_mappin << 
1592                 goto set_pmd;                 << 
1593         }                                     << 
1594                                               << 
1595         /* Skip if can be re-fill on fault */    916         /* Skip if can be re-fill on fault */
1596         if (!vma_is_anonymous(dst_vma))       !! 917         if (!vma_is_anonymous(vma))
1597                 return 0;                        918                 return 0;
1598                                                  919 
1599         pgtable = pte_alloc_one(dst_mm);      !! 920         pgtable = pte_alloc_one(dst_mm, addr);
1600         if (unlikely(!pgtable))                  921         if (unlikely(!pgtable))
1601                 goto out;                        922                 goto out;
1602                                                  923 
1603         dst_ptl = pmd_lock(dst_mm, dst_pmd);     924         dst_ptl = pmd_lock(dst_mm, dst_pmd);
1604         src_ptl = pmd_lockptr(src_mm, src_pmd    925         src_ptl = pmd_lockptr(src_mm, src_pmd);
1605         spin_lock_nested(src_ptl, SINGLE_DEPT    926         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1606                                                  927 
1607         ret = -EAGAIN;                           928         ret = -EAGAIN;
1608         pmd = *src_pmd;                          929         pmd = *src_pmd;
1609                                                  930 
1610 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          931 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1611         if (unlikely(is_swap_pmd(pmd))) {        932         if (unlikely(is_swap_pmd(pmd))) {
1612                 swp_entry_t entry = pmd_to_sw    933                 swp_entry_t entry = pmd_to_swp_entry(pmd);
1613                                                  934 
1614                 VM_BUG_ON(!is_pmd_migration_e    935                 VM_BUG_ON(!is_pmd_migration_entry(pmd));
1615                 if (!is_readable_migration_en !! 936                 if (is_write_migration_entry(entry)) {
1616                         entry = make_readable !! 937                         make_migration_entry_read(&entry);
1617                                               << 
1618                         pmd = swp_entry_to_pm    938                         pmd = swp_entry_to_pmd(entry);
1619                         if (pmd_swp_soft_dirt    939                         if (pmd_swp_soft_dirty(*src_pmd))
1620                                 pmd = pmd_swp    940                                 pmd = pmd_swp_mksoft_dirty(pmd);
1621                         if (pmd_swp_uffd_wp(* << 
1622                                 pmd = pmd_swp << 
1623                         set_pmd_at(src_mm, ad    941                         set_pmd_at(src_mm, addr, src_pmd, pmd);
1624                 }                                942                 }
1625                 add_mm_counter(dst_mm, MM_ANO    943                 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1626                 mm_inc_nr_ptes(dst_mm);          944                 mm_inc_nr_ptes(dst_mm);
1627                 pgtable_trans_huge_deposit(ds    945                 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1628                 if (!userfaultfd_wp(dst_vma)) << 
1629                         pmd = pmd_swp_clear_u << 
1630                 set_pmd_at(dst_mm, addr, dst_    946                 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1631                 ret = 0;                         947                 ret = 0;
1632                 goto out_unlock;                 948                 goto out_unlock;
1633         }                                        949         }
1634 #endif                                           950 #endif
1635                                                  951 
1636         if (unlikely(!pmd_trans_huge(pmd))) {    952         if (unlikely(!pmd_trans_huge(pmd))) {
1637                 pte_free(dst_mm, pgtable);       953                 pte_free(dst_mm, pgtable);
1638                 goto out_unlock;                 954                 goto out_unlock;
1639         }                                        955         }
1640         /*                                       956         /*
1641          * When page table lock is held, the     957          * When page table lock is held, the huge zero pmd should not be
1642          * under splitting since we don't spl    958          * under splitting since we don't split the page itself, only pmd to
1643          * a page table.                         959          * a page table.
1644          */                                      960          */
1645         if (is_huge_zero_pmd(pmd)) {             961         if (is_huge_zero_pmd(pmd)) {
                                                   >> 962                 struct page *zero_page;
1646                 /*                               963                 /*
1647                  * mm_get_huge_zero_folio() w !! 964                  * get_huge_zero_page() will never allocate a new page here,
1648                  * folio here, since we alrea !! 965                  * since we already have a zero page to copy. It just takes a
1649                  * copy. It just takes a refe !! 966                  * reference.
1650                  */                              967                  */
1651                 mm_get_huge_zero_folio(dst_mm !! 968                 zero_page = mm_get_huge_zero_page(dst_mm);
1652                 goto out_zero_page;           !! 969                 set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
                                                   >> 970                                 zero_page);
                                                   >> 971                 ret = 0;
                                                   >> 972                 goto out_unlock;
1653         }                                        973         }
1654                                                  974 
1655         src_page = pmd_page(pmd);                975         src_page = pmd_page(pmd);
1656         VM_BUG_ON_PAGE(!PageHead(src_page), s    976         VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1657         src_folio = page_folio(src_page);     !! 977         get_page(src_page);
1658                                               !! 978         page_dup_rmap(src_page, true);
1659         folio_get(src_folio);                 << 
1660         if (unlikely(folio_try_dup_anon_rmap_ << 
1661                 /* Page maybe pinned: split a << 
1662                 folio_put(src_folio);         << 
1663                 pte_free(dst_mm, pgtable);    << 
1664                 spin_unlock(src_ptl);         << 
1665                 spin_unlock(dst_ptl);         << 
1666                 __split_huge_pmd(src_vma, src << 
1667                 return -EAGAIN;               << 
1668         }                                     << 
1669         add_mm_counter(dst_mm, MM_ANONPAGES,     979         add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1670 out_zero_page:                                << 
1671         mm_inc_nr_ptes(dst_mm);                  980         mm_inc_nr_ptes(dst_mm);
1672         pgtable_trans_huge_deposit(dst_mm, ds    981         pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
                                                   >> 982 
1673         pmdp_set_wrprotect(src_mm, addr, src_    983         pmdp_set_wrprotect(src_mm, addr, src_pmd);
1674         if (!userfaultfd_wp(dst_vma))         !! 984         pmd = pmd_mkold(pmd_wrprotect(pmd));
1675                 pmd = pmd_clear_uffd_wp(pmd); << 
1676         pmd = pmd_wrprotect(pmd);             << 
1677 set_pmd:                                      << 
1678         pmd = pmd_mkold(pmd);                 << 
1679         set_pmd_at(dst_mm, addr, dst_pmd, pmd    985         set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1680                                                  986 
1681         ret = 0;                                 987         ret = 0;
1682 out_unlock:                                      988 out_unlock:
1683         spin_unlock(src_ptl);                    989         spin_unlock(src_ptl);
1684         spin_unlock(dst_ptl);                    990         spin_unlock(dst_ptl);
1685 out:                                             991 out:
1686         return ret;                              992         return ret;
1687 }                                                993 }
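/*
 * Editorial annotation (not part of the kernel source): copy_huge_pmd()
 * returns 0 when the entry was copied as a huge PMD (or deliberately
 * skipped for non-anonymous mappings, which can simply refault), -ENOMEM
 * when the page table to deposit could not be allocated, and -EAGAIN when
 * a huge copy was not possible, e.g. the entry changed under the lock or
 * a pinned anonymous THP had to be split first; the fork path then falls
 * back to copying at the PTE level.
 */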
1688                                                  994 
1689 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    995 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1690 void touch_pud(struct vm_area_struct *vma, un !! 996 static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1691                pud_t *pud, bool write)        !! 997                 pud_t *pud, int flags)
1692 {                                                998 {
1693         pud_t _pud;                              999         pud_t _pud;
1694                                                  1000 
1695         _pud = pud_mkyoung(*pud);                1001         _pud = pud_mkyoung(*pud);
1696         if (write)                            !! 1002         if (flags & FOLL_WRITE)
1697                 _pud = pud_mkdirty(_pud);        1003                 _pud = pud_mkdirty(_pud);
1698         if (pudp_set_access_flags(vma, addr &    1004         if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1699                                   pud, _pud,  !! 1005                                 pud, _pud, flags & FOLL_WRITE))
1700                 update_mmu_cache_pud(vma, add    1006                 update_mmu_cache_pud(vma, addr, pud);
1701 }                                                1007 }
1702                                                  1008 
                                                   >> 1009 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                                                   >> 1010                 pud_t *pud, int flags, struct dev_pagemap **pgmap)
                                                   >> 1011 {
                                                   >> 1012         unsigned long pfn = pud_pfn(*pud);
                                                   >> 1013         struct mm_struct *mm = vma->vm_mm;
                                                   >> 1014         struct page *page;
                                                   >> 1015 
                                                   >> 1016         assert_spin_locked(pud_lockptr(mm, pud));
                                                   >> 1017 
                                                   >> 1018         if (flags & FOLL_WRITE && !pud_write(*pud))
                                                   >> 1019                 return NULL;
                                                   >> 1020 
                                                   >> 1021         if (pud_present(*pud) && pud_devmap(*pud))
                                                   >> 1022                 /* pass */;
                                                   >> 1023         else
                                                   >> 1024                 return NULL;
                                                   >> 1025 
                                                   >> 1026         if (flags & FOLL_TOUCH)
                                                   >> 1027                 touch_pud(vma, addr, pud, flags);
                                                   >> 1028 
                                                   >> 1029         /*
                                                   >> 1030          * device mapped pages can only be returned if the
                                                   >> 1031          * caller will manage the page reference count.
                                                   >> 1032          */
                                                   >> 1033         if (!(flags & FOLL_GET))
                                                   >> 1034                 return ERR_PTR(-EEXIST);
                                                   >> 1035 
                                                   >> 1036         pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
                                                   >> 1037         *pgmap = get_dev_pagemap(pfn, *pgmap);
                                                   >> 1038         if (!*pgmap)
                                                   >> 1039                 return ERR_PTR(-EFAULT);
                                                   >> 1040         page = pfn_to_page(pfn);
                                                   >> 1041         get_page(page);
                                                   >> 1042 
                                                   >> 1043         return page;
                                                   >> 1044 }
                                                   >> 1045 
1703 int copy_huge_pud(struct mm_struct *dst_mm, s    1046 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1704                   pud_t *dst_pud, pud_t *src_    1047                   pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1705                   struct vm_area_struct *vma)    1048                   struct vm_area_struct *vma)
1706 {                                                1049 {
1707         spinlock_t *dst_ptl, *src_ptl;           1050         spinlock_t *dst_ptl, *src_ptl;
1708         pud_t pud;                               1051         pud_t pud;
1709         int ret;                                 1052         int ret;
1710                                                  1053 
1711         dst_ptl = pud_lock(dst_mm, dst_pud);     1054         dst_ptl = pud_lock(dst_mm, dst_pud);
1712         src_ptl = pud_lockptr(src_mm, src_pud    1055         src_ptl = pud_lockptr(src_mm, src_pud);
1713         spin_lock_nested(src_ptl, SINGLE_DEPT    1056         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1714                                                  1057 
1715         ret = -EAGAIN;                           1058         ret = -EAGAIN;
1716         pud = *src_pud;                          1059         pud = *src_pud;
1717         if (unlikely(!pud_trans_huge(pud) &&     1060         if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1718                 goto out_unlock;                 1061                 goto out_unlock;
1719                                                  1062 
1720         /*                                       1063         /*
1721          * TODO: once we support anonymous pa !! 1064          * When page table lock is held, the huge zero pud should not be
1722          * folio_try_dup_anon_rmap_*() and sp !! 1065          * under splitting since we don't split the page itself, only pud to
                                                   >> 1066          * a page table.
1723          */                                      1067          */
1724         if (is_cow_mapping(vma->vm_flags) &&  !! 1068         if (is_huge_zero_pud(pud)) {
1725                 pudp_set_wrprotect(src_mm, ad !! 1069                 /* No huge zero pud yet */
1726                 pud = pud_wrprotect(pud);     << 
1727         }                                        1070         }
1728         pud = pud_mkold(pud);                 !! 1071 
                                                   >> 1072         pudp_set_wrprotect(src_mm, addr, src_pud);
                                                   >> 1073         pud = pud_mkold(pud_wrprotect(pud));
1729         set_pud_at(dst_mm, addr, dst_pud, pud    1074         set_pud_at(dst_mm, addr, dst_pud, pud);
1730                                                  1075 
1731         ret = 0;                                 1076         ret = 0;
1732 out_unlock:                                      1077 out_unlock:
1733         spin_unlock(src_ptl);                    1078         spin_unlock(src_ptl);
1734         spin_unlock(dst_ptl);                    1079         spin_unlock(dst_ptl);
1735         return ret;                              1080         return ret;
1736 }                                                1081 }
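/*
 * Editorial annotation (not part of the kernel source): anonymous THP
 * does not exist at the PUD level, so copy_huge_pud() only ever deals
 * with devmap/PFN style mappings. For CoW mappings it write-protects both
 * the parent's entry and the copy, and the copy is always installed with
 * the accessed bit cleared, mirroring the PTE and PMD copy paths.
 */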
1737                                                  1082 
1738 void huge_pud_set_accessed(struct vm_fault *v    1083 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1739 {                                                1084 {
                                                   >> 1085         pud_t entry;
                                                   >> 1086         unsigned long haddr;
1740         bool write = vmf->flags & FAULT_FLAG_    1087         bool write = vmf->flags & FAULT_FLAG_WRITE;
1741                                                  1088 
1742         vmf->ptl = pud_lock(vmf->vma->vm_mm,     1089         vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1743         if (unlikely(!pud_same(*vmf->pud, ori    1090         if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1744                 goto unlock;                     1091                 goto unlock;
1745                                                  1092 
1746         touch_pud(vmf->vma, vmf->address, vmf !! 1093         entry = pud_mkyoung(orig_pud);
                                                   >> 1094         if (write)
                                                   >> 1095                 entry = pud_mkdirty(entry);
                                                   >> 1096         haddr = vmf->address & HPAGE_PUD_MASK;
                                                   >> 1097         if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
                                                   >> 1098                 update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
                                                   >> 1099 
1747 unlock:                                          1100 unlock:
1748         spin_unlock(vmf->ptl);                   1101         spin_unlock(vmf->ptl);
1749 }                                                1102 }
1750 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    1103 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1751                                                  1104 
1752 void huge_pmd_set_accessed(struct vm_fault *v !! 1105 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
1753 {                                                1106 {
                                                   >> 1107         pmd_t entry;
                                                   >> 1108         unsigned long haddr;
1754         bool write = vmf->flags & FAULT_FLAG_    1109         bool write = vmf->flags & FAULT_FLAG_WRITE;
1755                                                  1110 
1756         vmf->ptl = pmd_lock(vmf->vma->vm_mm,     1111         vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1757         if (unlikely(!pmd_same(*vmf->pmd, vmf !! 1112         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1758                 goto unlock;                     1113                 goto unlock;
1759                                                  1114 
1760         touch_pmd(vmf->vma, vmf->address, vmf !! 1115         entry = pmd_mkyoung(orig_pmd);
                                                   >> 1116         if (write)
                                                   >> 1117                 entry = pmd_mkdirty(entry);
                                                   >> 1118         haddr = vmf->address & HPAGE_PMD_MASK;
                                                   >> 1119         if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
                                                   >> 1120                 update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
1761                                                  1121 
1762 unlock:                                          1122 unlock:
1763         spin_unlock(vmf->ptl);                   1123         spin_unlock(vmf->ptl);
1764 }                                                1124 }
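/*
 * Editorial annotation (not part of the kernel source):
 * huge_pmd_set_accessed() and huge_pud_set_accessed() above handle the
 * fault case where a present huge entry merely lacks the accessed (and,
 * for writes, dirty) bit, e.g. on hardware that does not set those bits
 * itself. They retake the page table lock, bail out if the entry changed
 * since the fault snapshot (pmd_same()/pud_same()), and otherwise
 * delegate the update to touch_pmd()/touch_pud().
 */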
1765                                                  1125 
1766 vm_fault_t do_huge_pmd_wp_page(struct vm_faul !! 1126 static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
                                                   >> 1127                         pmd_t orig_pmd, struct page *page)
1767 {                                                1128 {
1768         const bool unshare = vmf->flags & FAU << 
1769         struct vm_area_struct *vma = vmf->vma    1129         struct vm_area_struct *vma = vmf->vma;
1770         struct folio *folio;                  << 
1771         struct page *page;                    << 
1772         unsigned long haddr = vmf->address &     1130         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1773         pmd_t orig_pmd = vmf->orig_pmd;       !! 1131         struct mem_cgroup *memcg;
1774                                               !! 1132         pgtable_t pgtable;
1775         vmf->ptl = pmd_lockptr(vma->vm_mm, vm !! 1133         pmd_t _pmd;
1776         VM_BUG_ON_VMA(!vma->anon_vma, vma);   !! 1134         int i;
1777                                               !! 1135         vm_fault_t ret = 0;
1778         if (is_huge_zero_pmd(orig_pmd))       !! 1136         struct page **pages;
1779                 goto fallback;                !! 1137         unsigned long mmun_start;       /* For mmu_notifiers */
                                                   >> 1138         unsigned long mmun_end;         /* For mmu_notifiers */
                                                   >> 1139 
                                                   >> 1140         pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
                                                   >> 1141                               GFP_KERNEL);
                                                   >> 1142         if (unlikely(!pages)) {
                                                   >> 1143                 ret |= VM_FAULT_OOM;
                                                   >> 1144                 goto out;
                                                   >> 1145         }
1780                                                  1146 
1781         spin_lock(vmf->ptl);                  !! 1147         for (i = 0; i < HPAGE_PMD_NR; i++) {
                                                   >> 1148                 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
                                                   >> 1149                                                vmf->address, page_to_nid(page));
                                                   >> 1150                 if (unlikely(!pages[i] ||
                                                   >> 1151                              mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
                                                   >> 1152                                      GFP_KERNEL, &memcg, false))) {
                                                   >> 1153                         if (pages[i])
                                                   >> 1154                                 put_page(pages[i]);
                                                   >> 1155                         while (--i >= 0) {
                                                   >> 1156                                 memcg = (void *)page_private(pages[i]);
                                                   >> 1157                                 set_page_private(pages[i], 0);
                                                   >> 1158                                 mem_cgroup_cancel_charge(pages[i], memcg,
                                                   >> 1159                                                 false);
                                                   >> 1160                                 put_page(pages[i]);
                                                   >> 1161                         }
                                                   >> 1162                         kfree(pages);
                                                   >> 1163                         ret |= VM_FAULT_OOM;
                                                   >> 1164                         goto out;
                                                   >> 1165                 }
                                                   >> 1166                 set_page_private(pages[i], (unsigned long)memcg);
                                                   >> 1167         }
1782                                                  1168 
1783         if (unlikely(!pmd_same(*vmf->pmd, ori !! 1169         for (i = 0; i < HPAGE_PMD_NR; i++) {
1784                 spin_unlock(vmf->ptl);        !! 1170                 copy_user_highpage(pages[i], page + i,
1785                 return 0;                     !! 1171                                    haddr + PAGE_SIZE * i, vma);
                                                   >> 1172                 __SetPageUptodate(pages[i]);
                                                   >> 1173                 cond_resched();
1786         }                                        1174         }
1787                                                  1175 
1788         page = pmd_page(orig_pmd);            !! 1176         mmun_start = haddr;
1789         folio = page_folio(page);             !! 1177         mmun_end   = haddr + HPAGE_PMD_SIZE;
                                                   >> 1178         mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
                                                   >> 1179 
                                                   >> 1180         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
                                                   >> 1181         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
                                                   >> 1182                 goto out_free_pages;
1790         VM_BUG_ON_PAGE(!PageHead(page), page)    1183         VM_BUG_ON_PAGE(!PageHead(page), page);
1791                                                  1184 
1792         /* Early check when only holding the  !! 1185         /*
1793         if (PageAnonExclusive(page))          !! 1186          * Leave pmd empty until pte is filled note we must notify here as
1794                 goto reuse;                   !! 1187          * concurrent CPU thread might write to new page before the call to
                                                   >> 1188          * mmu_notifier_invalidate_range_end() happens which can lead to a
                                                   >> 1189          * device seeing memory write in different order than CPU.
                                                   >> 1190          *
                                                   >> 1191          * See Documentation/vm/mmu_notifier.rst
                                                   >> 1192          */
                                                   >> 1193         pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1795                                                  1194 
1796         if (!folio_trylock(folio)) {          !! 1195         pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
1797                 folio_get(folio);             !! 1196         pmd_populate(vma->vm_mm, &_pmd, pgtable);
1798                 spin_unlock(vmf->ptl);        << 
1799                 folio_lock(folio);            << 
1800                 spin_lock(vmf->ptl);          << 
1801                 if (unlikely(!pmd_same(*vmf-> << 
1802                         spin_unlock(vmf->ptl) << 
1803                         folio_unlock(folio);  << 
1804                         folio_put(folio);     << 
1805                         return 0;             << 
1806                 }                             << 
1807                 folio_put(folio);             << 
1808         }                                     << 
1809                                                  1197 
1810         /* Recheck after temporarily dropping !! 1198         for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1811         if (PageAnonExclusive(page)) {        !! 1199                 pte_t entry;
1812                 folio_unlock(folio);          !! 1200                 entry = mk_pte(pages[i], vma->vm_page_prot);
1813                 goto reuse;                   !! 1201                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                                                   >> 1202                 memcg = (void *)page_private(pages[i]);
                                                   >> 1203                 set_page_private(pages[i], 0);
                                                   >> 1204                 page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
                                                   >> 1205                 mem_cgroup_commit_charge(pages[i], memcg, false, false);
                                                   >> 1206                 lru_cache_add_active_or_unevictable(pages[i], vma);
                                                   >> 1207                 vmf->pte = pte_offset_map(&_pmd, haddr);
                                                   >> 1208                 VM_BUG_ON(!pte_none(*vmf->pte));
                                                   >> 1209                 set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
                                                   >> 1210                 pte_unmap(vmf->pte);
1814         }                                        1211         }
                                                   >> 1212         kfree(pages);
                                                   >> 1213 
                                                   >> 1214         smp_wmb(); /* make pte visible before pmd */
                                                   >> 1215         pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
                                                   >> 1216         page_remove_rmap(page, true);
                                                   >> 1217         spin_unlock(vmf->ptl);
1815                                                  1218 
1816         /*                                       1219         /*
1817          * See do_wp_page(): we can only reus !! 1220          * No need to double call mmu_notifier->invalidate_range() callback as
1818          * there are no additional references !! 1221          * the above pmdp_huge_clear_flush_notify() did already call it.
1819          * the LRU cache immediately after ad << 
1820          */                                      1222          */
1821         if (folio_ref_count(folio) >          !! 1223         mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
1822                         1 + folio_test_swapca !! 1224                                                 mmun_end);
1823                 goto unlock_fallback;         << 
1824         if (folio_test_swapcache(folio))      << 
1825                 folio_free_swap(folio);       << 
1826         if (folio_ref_count(folio) == 1) {    << 
1827                 pmd_t entry;                  << 
1828                                                  1225 
1829                 folio_move_anon_rmap(folio, v !! 1226         ret |= VM_FAULT_WRITE;
1830                 SetPageAnonExclusive(page);   !! 1227         put_page(page);
1831                 folio_unlock(folio);          !! 1228 
1832 reuse:                                        !! 1229 out:
1833                 if (unlikely(unshare)) {      !! 1230         return ret;
1834                         spin_unlock(vmf->ptl) !! 1231 
1835                         return 0;             !! 1232 out_free_pages:
                                                   >> 1233         spin_unlock(vmf->ptl);
                                                   >> 1234         mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
                                                   >> 1235         for (i = 0; i < HPAGE_PMD_NR; i++) {
                                                   >> 1236                 memcg = (void *)page_private(pages[i]);
                                                   >> 1237                 set_page_private(pages[i], 0);
                                                   >> 1238                 mem_cgroup_cancel_charge(pages[i], memcg, false);
                                                   >> 1239                 put_page(pages[i]);
                                                   >> 1240         }
                                                   >> 1241         kfree(pages);
                                                   >> 1242         goto out;
                                                   >> 1243 }
                                                   >> 1244 
                                                   >> 1245 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
                                                   >> 1246 {
                                                   >> 1247         struct vm_area_struct *vma = vmf->vma;
                                                   >> 1248         struct page *page = NULL, *new_page;
                                                   >> 1249         struct mem_cgroup *memcg;
                                                   >> 1250         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
                                                   >> 1251         unsigned long mmun_start;       /* For mmu_notifiers */
                                                   >> 1252         unsigned long mmun_end;         /* For mmu_notifiers */
                                                   >> 1253         gfp_t huge_gfp;                 /* for allocation and charge */
                                                   >> 1254         vm_fault_t ret = 0;
                                                   >> 1255 
                                                   >> 1256         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
                                                   >> 1257         VM_BUG_ON_VMA(!vma->anon_vma, vma);
                                                   >> 1258         if (is_huge_zero_pmd(orig_pmd))
                                                   >> 1259                 goto alloc;
                                                   >> 1260         spin_lock(vmf->ptl);
                                                   >> 1261         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
                                                   >> 1262                 goto out_unlock;
                                                   >> 1263 
                                                   >> 1264         page = pmd_page(orig_pmd);
                                                   >> 1265         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
                                                   >> 1266         /*
                                                   >> 1267          * We can only reuse the page if nobody else maps the huge page or it's
                                                   >> 1268          * part.
                                                   >> 1269          */
                                                   >> 1270         if (!trylock_page(page)) {
                                                   >> 1271                 get_page(page);
                                                   >> 1272                 spin_unlock(vmf->ptl);
                                                   >> 1273                 lock_page(page);
                                                   >> 1274                 spin_lock(vmf->ptl);
                                                   >> 1275                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                                                   >> 1276                         unlock_page(page);
                                                   >> 1277                         put_page(page);
                                                   >> 1278                         goto out_unlock;
1836                 }                                1279                 }
                                                   >> 1280                 put_page(page);
                                                   >> 1281         }
                                                   >> 1282         if (reuse_swap_page(page, NULL)) {
                                                   >> 1283                 pmd_t entry;
1837                 entry = pmd_mkyoung(orig_pmd)    1284                 entry = pmd_mkyoung(orig_pmd);
1838                 entry = maybe_pmd_mkwrite(pmd    1285                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1839                 if (pmdp_set_access_flags(vma !! 1286                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
1840                         update_mmu_cache_pmd(    1287                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1841                 spin_unlock(vmf->ptl);        !! 1288                 ret |= VM_FAULT_WRITE;
1842                 return 0;                     !! 1289                 unlock_page(page);
                                                   >> 1290                 goto out_unlock;
1843         }                                        1291         }
                                                   >> 1292         unlock_page(page);
                                                   >> 1293         get_page(page);
                                                   >> 1294         spin_unlock(vmf->ptl);
                                                   >> 1295 alloc:
                                                   >> 1296         if (transparent_hugepage_enabled(vma) &&
                                                   >> 1297             !transparent_hugepage_debug_cow()) {
                                                   >> 1298                 huge_gfp = alloc_hugepage_direct_gfpmask(vma);
                                                   >> 1299                 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
                                                   >> 1300         } else
                                                   >> 1301                 new_page = NULL;
1844                                                  1302 
1845 unlock_fallback:                              !! 1303         if (likely(new_page)) {
1846         folio_unlock(folio);                  !! 1304                 prep_transhuge_page(new_page);
                                                   >> 1305         } else {
                                                   >> 1306                 if (!page) {
                                                   >> 1307                         split_huge_pmd(vma, vmf->pmd, vmf->address);
                                                   >> 1308                         ret |= VM_FAULT_FALLBACK;
                                                   >> 1309                 } else {
                                                   >> 1310                         ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
                                                   >> 1311                         if (ret & VM_FAULT_OOM) {
                                                   >> 1312                                 split_huge_pmd(vma, vmf->pmd, vmf->address);
                                                   >> 1313                                 ret |= VM_FAULT_FALLBACK;
                                                   >> 1314                         }
                                                   >> 1315                         put_page(page);
                                                   >> 1316                 }
                                                   >> 1317                 count_vm_event(THP_FAULT_FALLBACK);
                                                   >> 1318                 goto out;
                                                   >> 1319         }
                                                   >> 1320 
                                                   >> 1321         if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
                                                   >> 1322                                         huge_gfp, &memcg, true))) {
                                                   >> 1323                 put_page(new_page);
                                                   >> 1324                 split_huge_pmd(vma, vmf->pmd, vmf->address);
                                                   >> 1325                 if (page)
                                                   >> 1326                         put_page(page);
                                                   >> 1327                 ret |= VM_FAULT_FALLBACK;
                                                   >> 1328                 count_vm_event(THP_FAULT_FALLBACK);
                                                   >> 1329                 goto out;
                                                   >> 1330         }
                                                   >> 1331 
                                                   >> 1332         count_vm_event(THP_FAULT_ALLOC);
                                                   >> 1333 
                                                   >> 1334         if (!page)
                                                   >> 1335                 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
                                                   >> 1336         else
                                                   >> 1337                 copy_user_huge_page(new_page, page, vmf->address,
                                                   >> 1338                                     vma, HPAGE_PMD_NR);
                                                   >> 1339         __SetPageUptodate(new_page);
                                                   >> 1340 
                                                   >> 1341         mmun_start = haddr;
                                                   >> 1342         mmun_end   = haddr + HPAGE_PMD_SIZE;
                                                   >> 1343         mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
                                                   >> 1344 
                                                   >> 1345         spin_lock(vmf->ptl);
                                                   >> 1346         if (page)
                                                   >> 1347                 put_page(page);
                                                   >> 1348         if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                                                   >> 1349                 spin_unlock(vmf->ptl);
                                                   >> 1350                 mem_cgroup_cancel_charge(new_page, memcg, true);
                                                   >> 1351                 put_page(new_page);
                                                   >> 1352                 goto out_mn;
                                                   >> 1353         } else {
                                                   >> 1354                 pmd_t entry;
                                                   >> 1355                 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
                                                   >> 1356                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                                                   >> 1357                 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
                                                   >> 1358                 page_add_new_anon_rmap(new_page, vma, haddr, true);
                                                   >> 1359                 mem_cgroup_commit_charge(new_page, memcg, false, true);
                                                   >> 1360                 lru_cache_add_active_or_unevictable(new_page, vma);
                                                   >> 1361                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
                                                   >> 1362                 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                                                   >> 1363                 if (!page) {
                                                   >> 1364                         add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                                                   >> 1365                 } else {
                                                   >> 1366                         VM_BUG_ON_PAGE(!PageHead(page), page);
                                                   >> 1367                         page_remove_rmap(page, true);
                                                   >> 1368                         put_page(page);
                                                   >> 1369                 }
                                                   >> 1370                 ret |= VM_FAULT_WRITE;
                                                   >> 1371         }
                                                   >> 1372         spin_unlock(vmf->ptl);
                                                   >> 1373 out_mn:
                                                   >> 1374         /*
                                                   >> 1375          * No need to double call mmu_notifier->invalidate_range() callback as
                                                   >> 1376          * the above pmdp_huge_clear_flush_notify() did already call it.
                                                   >> 1377          */
                                                   >> 1378         mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
                                                   >> 1379                                                mmun_end);
                                                   >> 1380 out:
                                                   >> 1381         return ret;
                                                   >> 1382 out_unlock:
1847         spin_unlock(vmf->ptl);                   1383         spin_unlock(vmf->ptl);
1848 fallback:                                     !! 1384         return ret;
1849         __split_huge_pmd(vma, vmf->pmd, vmf-> << 
1850         return VM_FAULT_FALLBACK;             << 
1851 }                                                1385 }
1852                                                  1386 
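The write-protect fault path whose tail is shown above is easiest to trigger from userspace: after fork(), parent and child share the anonymous huge page read-only, and the first write takes a huge-PMD COW fault (the new code reuses the page or splits and falls back; the old code on the right copies the huge page or falls back via do_huge_pmd_wp_page_fallback()). The sketch below is a hypothetical illustration, not part of huge_memory.c; it assumes an x86-64 2 MiB PMD and THP enabled in "always" or "madvise" mode.

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        size_t sz = 2UL << 20;                  /* assumed PMD size (x86-64) */
        char *buf = aligned_alloc(sz, sz);      /* PMD-aligned so a huge PMD can map it */

        if (!buf)
                return 1;
        madvise(buf, sz, MADV_HUGEPAGE);
        memset(buf, 1, sz);                     /* populate, ideally as a single huge PMD */
        if (fork() == 0) {
                buf[0] = 2;                     /* write fault; on a shared huge PMD this enters the path above */
                _exit(0);
        }
        wait(NULL);
        return 0;
}
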
1853 static inline bool can_change_pmd_writable(st !! 1387 /*
1854                                            un !! 1388  * FOLL_FORCE can write to even unwritable pmd's, but only
                                                   >> 1389  * after we've gone through a COW cycle and they are dirty.
                                                   >> 1390  */
                                                   >> 1391 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
1855 {                                                1392 {
1856         struct page *page;                    !! 1393         return pmd_write(pmd) ||
                                                   >> 1394                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
                                                   >> 1395 }
1857                                                  1396 
1858         if (WARN_ON_ONCE(!(vma->vm_flags & VM !! 1397 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1859                 return false;                 !! 1398                                    unsigned long addr,
                                                   >> 1399                                    pmd_t *pmd,
                                                   >> 1400                                    unsigned int flags)
                                                   >> 1401 {
                                                   >> 1402         struct mm_struct *mm = vma->vm_mm;
                                                   >> 1403         struct page *page = NULL;
1860                                                  1404 
1861         /* Don't touch entries that are not e !! 1405         assert_spin_locked(pmd_lockptr(mm, pmd));
1862         if (pmd_protnone(pmd))                << 
1863                 return false;                 << 
1864                                                  1406 
1865         /* Do we need write faults for softdi !! 1407         if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
1866         if (pmd_needs_soft_dirty_wp(vma, pmd) !! 1408                 goto out;
1867                 return false;                 << 
1868                                                  1409 
1869         /* Do we need write faults for uffd-w !! 1410         /* Avoid dumping huge zero page */
1870         if (userfaultfd_huge_pmd_wp(vma, pmd) !! 1411         if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1871                 return false;                 !! 1412                 return ERR_PTR(-EFAULT);
1872                                                  1413 
1873         if (!(vma->vm_flags & VM_SHARED)) {   !! 1414         /* Full NUMA hinting faults to serialise migration in fault paths */
1874                 /* See can_change_pte_writabl !! 1415         if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
1875                 page = vm_normal_page_pmd(vma !! 1416                 goto out;
1876                 return page && PageAnon(page) !! 1417 
1877         }                                     !! 1418         page = pmd_page(*pmd);
                                                   >> 1419         VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
                                                   >> 1420         if (flags & FOLL_TOUCH)
                                                   >> 1421                 touch_pmd(vma, addr, pmd, flags);
                                                   >> 1422         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                                                   >> 1423                 /*
                                                   >> 1424                  * We don't mlock() pte-mapped THPs. This way we can avoid
                                                   >> 1425                  * leaking mlocked pages into non-VM_LOCKED VMAs.
                                                   >> 1426                  *
                                                   >> 1427                  * For anon THP:
                                                   >> 1428                  *
                                                   >> 1429                  * In most cases the pmd is the only mapping of the page as we
                                                   >> 1430                  * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
                                                   >> 1431                  * writable private mappings in populate_vma_page_range().
                                                   >> 1432                  *
                                                   >> 1433                  * The only scenario when we have the page shared here is if we are
                                                   >> 1434                  * mlocking a read-only mapping shared over fork(). We skip
                                                   >> 1435                  * mlocking such pages.
                                                   >> 1436                  *
                                                   >> 1437                  * For file THP:
                                                   >> 1438                  *
                                                   >> 1439                  * We can expect PageDoubleMap() to be stable under page lock:
                                                   >> 1440                  * for file pages we set it in page_add_file_rmap(), which
                                                   >> 1441                  * requires page to be locked.
                                                   >> 1442                  */
                                                   >> 1443 
                                                   >> 1444                 if (PageAnon(page) && compound_mapcount(page) != 1)
                                                   >> 1445                         goto skip_mlock;
                                                   >> 1446                 if (PageDoubleMap(page) || !page->mapping)
                                                   >> 1447                         goto skip_mlock;
                                                   >> 1448                 if (!trylock_page(page))
                                                   >> 1449                         goto skip_mlock;
                                                   >> 1450                 lru_add_drain();
                                                   >> 1451                 if (page->mapping && !PageDoubleMap(page))
                                                   >> 1452                         mlock_vma_page(page);
                                                   >> 1453                 unlock_page(page);
                                                   >> 1454         }
                                                   >> 1455 skip_mlock:
                                                   >> 1456         page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
                                                   >> 1457         VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
                                                   >> 1458         if (flags & FOLL_GET)
                                                   >> 1459                 get_page(page);
1878                                                  1460 
1879         /* See can_change_pte_writable(). */  !! 1461 out:
1880         return pmd_dirty(pmd);                !! 1462         return page;
1881 }                                                1463 }
1882                                                  1464 
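The old can_follow_write_pmd() above encodes the rule stated in its comment: FOLL_FORCE may write through an unwritable PMD only once a COW cycle has left it dirty. A well-known userspace consumer of FOLL_FORCE is /proc/<pid>/mem, which debuggers use to poke read-only memory. The sketch below is a hypothetical illustration, not kernel code, and uses an ordinary 4 KiB page rather than a THP; the same forced-COW rule applies at both the pte and pmd level.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        int fd = open("/proc/self/mem", O_RDWR);

        if (p == MAP_FAILED || fd < 0)
                return 1;
        /* A direct store to p would SIGSEGV; pwrite() goes through GUP with
         * FOLL_FORCE, breaks COW, and the write lands in a private copy. */
        if (pwrite(fd, "x", 1, (off_t)(uintptr_t)p) != 1)
                perror("pwrite");
        printf("first byte is now: %c\n", p[0]);        /* 'x' on success */
        close(fd);
        return 0;
}
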
1883 /* NUMA hinting page fault entry point for tr    1465 /* NUMA hinting page fault entry point for trans huge pmds */
1884 vm_fault_t do_huge_pmd_numa_page(struct vm_fa !! 1466 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1885 {                                                1467 {
1886         struct vm_area_struct *vma = vmf->vma    1468         struct vm_area_struct *vma = vmf->vma;
1887         struct folio *folio;                  !! 1469         struct anon_vma *anon_vma = NULL;
                                                   >> 1470         struct page *page;
1888         unsigned long haddr = vmf->address &     1471         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1889         int nid = NUMA_NO_NODE;               !! 1472         int page_nid = -1, this_nid = numa_node_id();
1890         int target_nid, last_cpupid;          !! 1473         int target_nid, last_cpupid = -1;
1891         pmd_t pmd, old_pmd;                   !! 1474         bool page_locked;
1892         bool writable = false;                !! 1475         bool migrated = false;
                                                   >> 1476         bool was_writable;
1893         int flags = 0;                           1477         int flags = 0;
1894                                                  1478 
1895         vmf->ptl = pmd_lock(vma->vm_mm, vmf->    1479         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1896         old_pmd = pmdp_get(vmf->pmd);         !! 1480         if (unlikely(!pmd_same(pmd, *vmf->pmd)))
                                                   >> 1481                 goto out_unlock;
1897                                                  1482 
1898         if (unlikely(!pmd_same(old_pmd, vmf-> !! 1483         /*
                                                   >> 1484          * If there are potential migrations, wait for completion and retry
                                                   >> 1485          * without disrupting NUMA hinting information. Do not relock and
                                                   >> 1486          * check_same as the page may no longer be mapped.
                                                   >> 1487          */
                                                   >> 1488         if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
                                                   >> 1489                 page = pmd_page(*vmf->pmd);
                                                   >> 1490                 if (!get_page_unless_zero(page))
                                                   >> 1491                         goto out_unlock;
1899                 spin_unlock(vmf->ptl);           1492                 spin_unlock(vmf->ptl);
1900                 return 0;                     !! 1493                 wait_on_page_locked(page);
                                                   >> 1494                 put_page(page);
                                                   >> 1495                 goto out;
1901         }                                        1496         }
1902                                                  1497 
1903         pmd = pmd_modify(old_pmd, vma->vm_pag !! 1498         page = pmd_page(pmd);
                                                   >> 1499         BUG_ON(is_huge_zero_page(page));
                                                   >> 1500         page_nid = page_to_nid(page);
                                                   >> 1501         last_cpupid = page_cpupid_last(page);
                                                   >> 1502         count_vm_numa_event(NUMA_HINT_FAULTS);
                                                   >> 1503         if (page_nid == this_nid) {
                                                   >> 1504                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
                                                   >> 1505                 flags |= TNF_FAULT_LOCAL;
                                                   >> 1506         }
                                                   >> 1507 
                                                   >> 1508         /* See similar comment in do_numa_page for explanation */
                                                   >> 1509         if (!pmd_savedwrite(pmd))
                                                   >> 1510                 flags |= TNF_NO_GROUP;
                                                   >> 1511 
                                                   >> 1512         /*
                                                   >> 1513          * Acquire the page lock to serialise THP migrations but avoid dropping
                                                   >> 1514          * page_table_lock if at all possible
                                                   >> 1515          */
                                                   >> 1516         page_locked = trylock_page(page);
                                                   >> 1517         target_nid = mpol_misplaced(page, vma, haddr);
                                                   >> 1518         if (target_nid == -1) {
                                                   >> 1519                 /* If the page was locked, there are no parallel migrations */
                                                   >> 1520                 if (page_locked)
                                                   >> 1521                         goto clear_pmdnuma;
                                                   >> 1522         }
                                                   >> 1523 
                                                   >> 1524         /* Migration could have started since the pmd_trans_migrating check */
                                                   >> 1525         if (!page_locked) {
                                                   >> 1526                 page_nid = -1;
                                                   >> 1527                 if (!get_page_unless_zero(page))
                                                   >> 1528                         goto out_unlock;
                                                   >> 1529                 spin_unlock(vmf->ptl);
                                                   >> 1530                 wait_on_page_locked(page);
                                                   >> 1531                 put_page(page);
                                                   >> 1532                 goto out;
                                                   >> 1533         }
1904                                                  1534 
1905         /*                                       1535         /*
1906          * Detect now whether the PMD could b !! 1536          * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
1907          * is only valid while holding the PT !! 1537          * to serialise splits
1908          */                                      1538          */
1909         writable = pmd_write(pmd);            !! 1539         get_page(page);
1910         if (!writable && vma_wants_manual_pte !! 1540         spin_unlock(vmf->ptl);
1911             can_change_pmd_writable(vma, vmf- !! 1541         anon_vma = page_lock_anon_vma_read(page);
1912                 writable = true;              << 
1913                                                  1542 
1914         folio = vm_normal_folio_pmd(vma, hadd !! 1543         /* Confirm the PMD did not change while page_table_lock was released */
1915         if (!folio)                           !! 1544         spin_lock(vmf->ptl);
1916                 goto out_map;                 !! 1545         if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
                                                   >> 1546                 unlock_page(page);
                                                   >> 1547                 put_page(page);
                                                   >> 1548                 page_nid = -1;
                                                   >> 1549                 goto out_unlock;
                                                   >> 1550         }
1917                                                  1551 
1918         nid = folio_nid(folio);               !! 1552         /* Bail if we fail to protect against THP splits for any reason */
                                                   >> 1553         if (unlikely(!anon_vma)) {
                                                   >> 1554                 put_page(page);
                                                   >> 1555                 page_nid = -1;
                                                   >> 1556                 goto clear_pmdnuma;
                                                   >> 1557         }
1919                                                  1558 
1920         target_nid = numa_migrate_check(folio !! 1559         /*
1921                                         &last !! 1560          * Since we took the NUMA fault, we must have observed the !accessible
1922         if (target_nid == NUMA_NO_NODE)       !! 1561          * bit. Make sure all other CPUs agree with that, to avoid them
1923                 goto out_map;                 !! 1562          * modifying the page we're about to migrate.
1924         if (migrate_misplaced_folio_prepare(f !! 1563          *
1925                 flags |= TNF_MIGRATE_FAIL;    !! 1564          * Must be done under PTL such that we'll observe the relevant
1926                 goto out_map;                 !! 1565          * inc_tlb_flush_pending().
                                                   >> 1566          *
                                                   >> 1567          * We are not sure a pending tlb flush here is for a huge page
                                                   >> 1568          * mapping or not. Hence use the tlb range variant
                                                   >> 1569          */
                                                   >> 1570         if (mm_tlb_flush_pending(vma->vm_mm)) {
                                                   >> 1571                 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
                                                   >> 1572                 /*
                                                   >> 1573                  * change_huge_pmd() released the pmd lock before
                                                   >> 1574                  * invalidating the secondary MMUs sharing the primary
                                                   >> 1575                  * MMU pagetables (with ->invalidate_range()). The
                                                   >> 1576                  * mmu_notifier_invalidate_range_end() (which
                                                   >> 1577                  * internally calls ->invalidate_range()) in
                                                   >> 1578                  * change_pmd_range() will run after us, so we can't
                                                   >> 1579                  * rely on it here and we need an explicit invalidate.
                                                   >> 1580                  */
                                                   >> 1581                 mmu_notifier_invalidate_range(vma->vm_mm, haddr,
                                                   >> 1582                                               haddr + HPAGE_PMD_SIZE);
1927         }                                        1583         }
1928         /* The folio is isolated and isolatio !! 1584 
                                                   >> 1585         /*
                                                   >> 1586          * Migrate the THP to the requested node, returns with page unlocked
                                                   >> 1587          * and access rights restored.
                                                   >> 1588          */
1929         spin_unlock(vmf->ptl);                   1589         spin_unlock(vmf->ptl);
1930         writable = false;                     << 
1931                                                  1590 
1932         if (!migrate_misplaced_folio(folio, v !! 1591         migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
                                                   >> 1592                                 vmf->pmd, pmd, vmf->address, page, target_nid);
                                                   >> 1593         if (migrated) {
1933                 flags |= TNF_MIGRATED;           1594                 flags |= TNF_MIGRATED;
1934                 nid = target_nid;             !! 1595                 page_nid = target_nid;
1935                 task_numa_fault(last_cpupid,  !! 1596         } else
1936                 return 0;                     !! 1597                 flags |= TNF_MIGRATE_FAIL;
1937         }                                     << 
1938                                                  1598 
1939         flags |= TNF_MIGRATE_FAIL;            !! 1599         goto out;
1940         vmf->ptl = pmd_lock(vma->vm_mm, vmf-> !! 1600 clear_pmdnuma:
1941         if (unlikely(!pmd_same(pmdp_get(vmf-> !! 1601         BUG_ON(!PageLocked(page));
1942                 spin_unlock(vmf->ptl);        !! 1602         was_writable = pmd_savedwrite(pmd);
1943                 return 0;                     !! 1603         pmd = pmd_modify(pmd, vma->vm_page_prot);
1944         }                                     << 
1945 out_map:                                      << 
1946         /* Restore the PMD */                 << 
1947         pmd = pmd_modify(pmdp_get(vmf->pmd),  << 
1948         pmd = pmd_mkyoung(pmd);                  1604         pmd = pmd_mkyoung(pmd);
1949         if (writable)                         !! 1605         if (was_writable)
1950                 pmd = pmd_mkwrite(pmd, vma);  !! 1606                 pmd = pmd_mkwrite(pmd);
1951         set_pmd_at(vma->vm_mm, haddr, vmf->pm    1607         set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1952         update_mmu_cache_pmd(vma, vmf->addres    1608         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                                                   >> 1609         unlock_page(page);
                                                   >> 1610 out_unlock:
1953         spin_unlock(vmf->ptl);                   1611         spin_unlock(vmf->ptl);
1954                                                  1612 
1955         if (nid != NUMA_NO_NODE)              !! 1613 out:
1956                 task_numa_fault(last_cpupid,  !! 1614         if (anon_vma)
                                                   >> 1615                 page_unlock_anon_vma_read(anon_vma);
                                                   >> 1616 
                                                   >> 1617         if (page_nid != -1)
                                                   >> 1618                 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
                                                   >> 1619                                 flags);
                                                   >> 1620 
1957         return 0;                                1621         return 0;
1958 }                                                1622 }
1959                                                  1623 
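do_huge_pmd_numa_page() only runs when automatic NUMA balancing (sysctl kernel.numa_balancing) has made a THP's PMD protnone and a later access faults on it. The sketch below is a hypothetical setup, not kernel code: it merely creates the kind of PMD-aligned, THP-backed anonymous region whose accesses can later take these hinting faults; the 2 MiB size is an x86-64 assumption.

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t sz = 2UL << 20;                  /* assumed PMD size */
        char *buf = aligned_alloc(sz, sz);

        if (!buf)
                return 1;
        madvise(buf, sz, MADV_HUGEPAGE);        /* ask for a huge-PMD mapping */
        memset(buf, 0, sz);                     /* instantiate it */
        /* With kernel.numa_balancing=1, task_numa_work() periodically makes
         * this PMD inaccessible; the next touch then enters the handler above. */
        for (size_t i = 0; i < sz; i += 4096)
                buf[i]++;                       /* accesses that may take hinting faults */
        return 0;
}
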
1960 /*                                               1624 /*
1961  * Return true if we do MADV_FREE successfull    1625  * Return true if we do MADV_FREE successfully on entire pmd page.
1962  * Otherwise, return false.                      1626  * Otherwise, return false.
1963  */                                              1627  */
1964 bool madvise_free_huge_pmd(struct mmu_gather     1628 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1965                 pmd_t *pmd, unsigned long add    1629                 pmd_t *pmd, unsigned long addr, unsigned long next)
1966 {                                                1630 {
1967         spinlock_t *ptl;                         1631         spinlock_t *ptl;
1968         pmd_t orig_pmd;                          1632         pmd_t orig_pmd;
1969         struct folio *folio;                  !! 1633         struct page *page;
1970         struct mm_struct *mm = tlb->mm;          1634         struct mm_struct *mm = tlb->mm;
1971         bool ret = false;                        1635         bool ret = false;
1972                                                  1636 
1973         tlb_change_page_size(tlb, HPAGE_PMD_S !! 1637         tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
1974                                                  1638 
1975         ptl = pmd_trans_huge_lock(pmd, vma);     1639         ptl = pmd_trans_huge_lock(pmd, vma);
1976         if (!ptl)                                1640         if (!ptl)
1977                 goto out_unlocked;               1641                 goto out_unlocked;
1978                                                  1642 
1979         orig_pmd = *pmd;                         1643         orig_pmd = *pmd;
1980         if (is_huge_zero_pmd(orig_pmd))          1644         if (is_huge_zero_pmd(orig_pmd))
1981                 goto out;                        1645                 goto out;
1982                                                  1646 
1983         if (unlikely(!pmd_present(orig_pmd)))    1647         if (unlikely(!pmd_present(orig_pmd))) {
1984                 VM_BUG_ON(thp_migration_suppo    1648                 VM_BUG_ON(thp_migration_supported() &&
1985                                   !is_pmd_mig    1649                                   !is_pmd_migration_entry(orig_pmd));
1986                 goto out;                        1650                 goto out;
1987         }                                        1651         }
1988                                                  1652 
1989         folio = pmd_folio(orig_pmd);          !! 1653         page = pmd_page(orig_pmd);
1990         /*                                       1654         /*
1991          * If other processes are mapping thi !! 1655          * If other processes are mapping this page, we couldn't discard
1992          * the folio unless they all do MADV_ !! 1656          * the page unless they all do MADV_FREE so let's skip the page.
1993          */                                      1657          */
1994         if (folio_likely_mapped_shared(folio) !! 1658         if (page_mapcount(page) != 1)
1995                 goto out;                        1659                 goto out;
1996                                                  1660 
1997         if (!folio_trylock(folio))            !! 1661         if (!trylock_page(page))
1998                 goto out;                        1662                 goto out;
1999                                                  1663 
2000         /*                                       1664         /*
2001          * If user want to discard part-pages    1665          * If user want to discard part-pages of THP, split it so MADV_FREE
2002          * will deactivate only them.            1666          * will deactivate only them.
2003          */                                      1667          */
2004         if (next - addr != HPAGE_PMD_SIZE) {     1668         if (next - addr != HPAGE_PMD_SIZE) {
2005                 folio_get(folio);             !! 1669                 get_page(page);
2006                 spin_unlock(ptl);                1670                 spin_unlock(ptl);
2007                 split_folio(folio);           !! 1671                 split_huge_page(page);
2008                 folio_unlock(folio);          !! 1672                 unlock_page(page);
2009                 folio_put(folio);             !! 1673                 put_page(page);
2010                 goto out_unlocked;               1674                 goto out_unlocked;
2011         }                                        1675         }
2012                                                  1676 
2013         if (folio_test_dirty(folio))          !! 1677         if (PageDirty(page))
2014                 folio_clear_dirty(folio);     !! 1678                 ClearPageDirty(page);
2015         folio_unlock(folio);                  !! 1679         unlock_page(page);
2016                                                  1680 
2017         if (pmd_young(orig_pmd) || pmd_dirty(    1681         if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
2018                 pmdp_invalidate(vma, addr, pm    1682                 pmdp_invalidate(vma, addr, pmd);
2019                 orig_pmd = pmd_mkold(orig_pmd    1683                 orig_pmd = pmd_mkold(orig_pmd);
2020                 orig_pmd = pmd_mkclean(orig_p    1684                 orig_pmd = pmd_mkclean(orig_pmd);
2021                                                  1685 
2022                 set_pmd_at(mm, addr, pmd, ori    1686                 set_pmd_at(mm, addr, pmd, orig_pmd);
2023                 tlb_remove_pmd_tlb_entry(tlb,    1687                 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2024         }                                        1688         }
2025                                                  1689 
2026         folio_mark_lazyfree(folio);           !! 1690         mark_page_lazyfree(page);
2027         ret = true;                              1691         ret = true;
2028 out:                                             1692 out:
2029         spin_unlock(ptl);                        1693         spin_unlock(ptl);
2030 out_unlocked:                                    1694 out_unlocked:
2031         return ret;                              1695         return ret;
2032 }                                                1696 }
2033                                                  1697 
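madvise_free_huge_pmd() above is driven by madvise(MADV_FREE): advice covering the whole PMD marks the huge page lazy-free in place, while advice covering only part of it takes the split path shown above. The sketch below is a hypothetical caller, not kernel code; the 2 MiB size is an x86-64 assumption.

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t sz = 2UL << 20;
        char *buf = aligned_alloc(sz, sz);

        if (!buf)
                return 1;
        madvise(buf, sz, MADV_HUGEPAGE);
        memset(buf, 0xaa, sz);                  /* dirty the whole huge page */
        /* Whole-PMD advice: handled above without splitting; the page becomes
         * lazy-free and may be discarded under memory pressure. */
        madvise(buf, sz, MADV_FREE);
        /* Partial advice such as madvise(buf, sz / 2, MADV_FREE) would instead
         * hit the split path (split_folio()/split_huge_page()) shown above. */
        return 0;
}
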
2034 static inline void zap_deposited_table(struct    1698 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
2035 {                                                1699 {
2036         pgtable_t pgtable;                       1700         pgtable_t pgtable;
2037                                                  1701 
2038         pgtable = pgtable_trans_huge_withdraw    1702         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2039         pte_free(mm, pgtable);                   1703         pte_free(mm, pgtable);
2040         mm_dec_nr_ptes(mm);                      1704         mm_dec_nr_ptes(mm);
2041 }                                                1705 }
2042                                                  1706 
2043 int zap_huge_pmd(struct mmu_gather *tlb, stru    1707 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2044                  pmd_t *pmd, unsigned long ad    1708                  pmd_t *pmd, unsigned long addr)
2045 {                                                1709 {
2046         pmd_t orig_pmd;                          1710         pmd_t orig_pmd;
2047         spinlock_t *ptl;                         1711         spinlock_t *ptl;
2048                                                  1712 
2049         tlb_change_page_size(tlb, HPAGE_PMD_S !! 1713         tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
2050                                                  1714 
2051         ptl = __pmd_trans_huge_lock(pmd, vma)    1715         ptl = __pmd_trans_huge_lock(pmd, vma);
2052         if (!ptl)                                1716         if (!ptl)
2053                 return 0;                        1717                 return 0;
2054         /*                                       1718         /*
2055          * For architectures like ppc64 we lo    1719          * For architectures like ppc64 we look at deposited pgtable
2056          * when calling pmdp_huge_get_and_cle    1720          * when calling pmdp_huge_get_and_clear. So do the
2057          * pgtable_trans_huge_withdraw after     1721          * pgtable_trans_huge_withdraw after finishing pmdp related
2058          * operations.                           1722          * operations.
2059          */                                      1723          */
2060         orig_pmd = pmdp_huge_get_and_clear_fu !! 1724         orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
2061                                               !! 1725                         tlb->fullmm);
2062         arch_check_zapped_pmd(vma, orig_pmd); << 
2063         tlb_remove_pmd_tlb_entry(tlb, pmd, ad    1726         tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2064         if (vma_is_special_huge(vma)) {       !! 1727         if (vma_is_dax(vma)) {
2065                 if (arch_needs_pgtable_deposi    1728                 if (arch_needs_pgtable_deposit())
2066                         zap_deposited_table(t    1729                         zap_deposited_table(tlb->mm, pmd);
2067                 spin_unlock(ptl);                1730                 spin_unlock(ptl);
                                                   >> 1731                 if (is_huge_zero_pmd(orig_pmd))
                                                   >> 1732                         tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
2068         } else if (is_huge_zero_pmd(orig_pmd)    1733         } else if (is_huge_zero_pmd(orig_pmd)) {
2069                 zap_deposited_table(tlb->mm,     1734                 zap_deposited_table(tlb->mm, pmd);
2070                 spin_unlock(ptl);                1735                 spin_unlock(ptl);
                                                   >> 1736                 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
2071         } else {                                 1737         } else {
2072                 struct folio *folio = NULL;   !! 1738                 struct page *page = NULL;
2073                 int flush_needed = 1;            1739                 int flush_needed = 1;
2074                                                  1740 
2075                 if (pmd_present(orig_pmd)) {     1741                 if (pmd_present(orig_pmd)) {
2076                         struct page *page = p !! 1742                         page = pmd_page(orig_pmd);
2077                                               !! 1743                         page_remove_rmap(page, true);
2078                         folio = page_folio(pa !! 1744                         VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
2079                         folio_remove_rmap_pmd << 
2080                         WARN_ON_ONCE(folio_ma << 
2081                         VM_BUG_ON_PAGE(!PageH    1745                         VM_BUG_ON_PAGE(!PageHead(page), page);
2082                 } else if (thp_migration_supp    1746                 } else if (thp_migration_supported()) {
2083                         swp_entry_t entry;       1747                         swp_entry_t entry;
2084                                                  1748 
2085                         VM_BUG_ON(!is_pmd_mig    1749                         VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
2086                         entry = pmd_to_swp_en    1750                         entry = pmd_to_swp_entry(orig_pmd);
2087                         folio = pfn_swap_entr !! 1751                         page = pfn_to_page(swp_offset(entry));
2088                         flush_needed = 0;        1752                         flush_needed = 0;
2089                 } else                           1753                 } else
2090                         WARN_ONCE(1, "Non pre    1754                         WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
2091                                                  1755 
2092                 if (folio_test_anon(folio)) { !! 1756                 if (PageAnon(page)) {
2093                         zap_deposited_table(t    1757                         zap_deposited_table(tlb->mm, pmd);
2094                         add_mm_counter(tlb->m    1758                         add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
2095                 } else {                         1759                 } else {
2096                         if (arch_needs_pgtabl    1760                         if (arch_needs_pgtable_deposit())
2097                                 zap_deposited    1761                                 zap_deposited_table(tlb->mm, pmd);
2098                         add_mm_counter(tlb->m !! 1762                         add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
2099                                        -HPAGE << 
2100                 }                                1763                 }
2101                                                  1764 
2102                 spin_unlock(ptl);                1765                 spin_unlock(ptl);
2103                 if (flush_needed)                1766                 if (flush_needed)
2104                         tlb_remove_page_size( !! 1767                         tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
2105         }                                        1768         }
2106         return 1;                                1769         return 1;
2107 }                                                1770 }
2108                                                  1771 
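zap_huge_pmd() above is reached whenever a huge PMD is torn down, for example from munmap(), process exit, or madvise(MADV_DONTNEED), the same path that appears in the race diagram inside change_huge_pmd() further down. A hypothetical caller sketch, not kernel code, assuming an x86-64 2 MiB PMD:

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t sz = 2UL << 20;
        char *buf = aligned_alloc(sz, sz);

        if (!buf)
                return 1;
        madvise(buf, sz, MADV_HUGEPAGE);
        memset(buf, 0x5a, sz);
        madvise(buf, sz, MADV_DONTNEED);        /* zap_pmd_range() -> zap_huge_pmd() for a whole PMD */
        return buf[0];                          /* 0: the old contents are gone */
}
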
2109 #ifndef pmd_move_must_withdraw                   1772 #ifndef pmd_move_must_withdraw
2110 static inline int pmd_move_must_withdraw(spin    1773 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
2111                                          spin    1774                                          spinlock_t *old_pmd_ptl,
2112                                          stru    1775                                          struct vm_area_struct *vma)
2113 {                                                1776 {
2114         /*                                       1777         /*
2115          * With split pmd lock we also need t    1778          * With split pmd lock we also need to move preallocated
2116          * PTE page table if new_pmd is on di    1779          * PTE page table if new_pmd is on different PMD page table.
2117          *                                       1780          *
2118          * We also don't deposit and withdraw    1781          * We also don't deposit and withdraw tables for file pages.
2119          */                                      1782          */
2120         return (new_pmd_ptl != old_pmd_ptl) &    1783         return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
2121 }                                                1784 }
2122 #endif                                           1785 #endif
2123                                                  1786 
2124 static pmd_t move_soft_dirty_pmd(pmd_t pmd)      1787 static pmd_t move_soft_dirty_pmd(pmd_t pmd)
2125 {                                                1788 {
2126 #ifdef CONFIG_MEM_SOFT_DIRTY                     1789 #ifdef CONFIG_MEM_SOFT_DIRTY
2127         if (unlikely(is_pmd_migration_entry(p    1790         if (unlikely(is_pmd_migration_entry(pmd)))
2128                 pmd = pmd_swp_mksoft_dirty(pm    1791                 pmd = pmd_swp_mksoft_dirty(pmd);
2129         else if (pmd_present(pmd))               1792         else if (pmd_present(pmd))
2130                 pmd = pmd_mksoft_dirty(pmd);     1793                 pmd = pmd_mksoft_dirty(pmd);
2131 #endif                                           1794 #endif
2132         return pmd;                              1795         return pmd;
2133 }                                                1796 }
2134                                                  1797 
2135 bool move_huge_pmd(struct vm_area_struct *vma    1798 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
2136                   unsigned long new_addr, pmd !! 1799                   unsigned long new_addr, unsigned long old_end,
                                                   >> 1800                   pmd_t *old_pmd, pmd_t *new_pmd)
2137 {                                                1801 {
2138         spinlock_t *old_ptl, *new_ptl;           1802         spinlock_t *old_ptl, *new_ptl;
2139         pmd_t pmd;                               1803         pmd_t pmd;
2140         struct mm_struct *mm = vma->vm_mm;       1804         struct mm_struct *mm = vma->vm_mm;
2141         bool force_flush = false;                1805         bool force_flush = false;
2142                                                  1806 
                                                   >> 1807         if ((old_addr & ~HPAGE_PMD_MASK) ||
                                                   >> 1808             (new_addr & ~HPAGE_PMD_MASK) ||
                                                   >> 1809             old_end - old_addr < HPAGE_PMD_SIZE)
                                                   >> 1810                 return false;
                                                   >> 1811 
2143         /*                                       1812         /*
2144          * The destination pmd shouldn't be e    1813          * The destination pmd shouldn't be established, free_pgtables()
2145          * should have released it; but move_ !! 1814          * should have released it.
2146          * inserted a page table, if racing a << 
2147          */                                      1815          */
2148         if (!pmd_none(*new_pmd)) {            !! 1816         if (WARN_ON(!pmd_none(*new_pmd))) {
2149                 VM_BUG_ON(pmd_trans_huge(*new    1817                 VM_BUG_ON(pmd_trans_huge(*new_pmd));
2150                 return false;                    1818                 return false;
2151         }                                        1819         }
2152                                                  1820 
2153         /*                                       1821         /*
2154          * We don't have to worry about the o    1822          * We don't have to worry about the ordering of src and dst
2155          * ptlocks because exclusive mmap_loc !! 1823          * ptlocks because exclusive mmap_sem prevents deadlock.
2156          */                                      1824          */
2157         old_ptl = __pmd_trans_huge_lock(old_p    1825         old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
2158         if (old_ptl) {                           1826         if (old_ptl) {
2159                 new_ptl = pmd_lockptr(mm, new    1827                 new_ptl = pmd_lockptr(mm, new_pmd);
2160                 if (new_ptl != old_ptl)          1828                 if (new_ptl != old_ptl)
2161                         spin_lock_nested(new_    1829                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
2162                 pmd = pmdp_huge_get_and_clear    1830                 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
2163                 if (pmd_present(pmd))            1831                 if (pmd_present(pmd))
2164                         force_flush = true;      1832                         force_flush = true;
2165                 VM_BUG_ON(!pmd_none(*new_pmd)    1833                 VM_BUG_ON(!pmd_none(*new_pmd));
2166                                                  1834 
2167                 if (pmd_move_must_withdraw(ne    1835                 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
2168                         pgtable_t pgtable;       1836                         pgtable_t pgtable;
2169                         pgtable = pgtable_tra    1837                         pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
2170                         pgtable_trans_huge_de    1838                         pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
2171                 }                                1839                 }
2172                 pmd = move_soft_dirty_pmd(pmd    1840                 pmd = move_soft_dirty_pmd(pmd);
2173                 set_pmd_at(mm, new_addr, new_    1841                 set_pmd_at(mm, new_addr, new_pmd, pmd);
2174                 if (force_flush)                 1842                 if (force_flush)
2175                         flush_pmd_tlb_range(v !! 1843                         flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
2176                 if (new_ptl != old_ptl)          1844                 if (new_ptl != old_ptl)
2177                         spin_unlock(new_ptl);    1845                         spin_unlock(new_ptl);
2178                 spin_unlock(old_ptl);            1846                 spin_unlock(old_ptl);
2179                 return true;                     1847                 return true;
2180         }                                        1848         }
2181         return false;                            1849         return false;
2182 }                                                1850 }
2183                                                  1851 
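move_huge_pmd() above is exercised by mremap() when both the source and destination are PMD-aligned, so the whole huge PMD can be relocated without splitting (the old version on the right performs the alignment and size checks itself). The sketch below is a hypothetical userspace illustration, not kernel code; pmd_aligned_reserve() is a helper invented for the sketch, and 2 MiB is an x86-64 assumption.

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define PMD_SZ (2UL << 20)

/* Reserve 2*PMD_SZ of address space and return a PMD-aligned pointer inside it. */
static void *pmd_aligned_reserve(void)
{
        char *p = mmap(NULL, 2 * PMD_SZ, PROT_NONE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return NULL;
        return (void *)(((uintptr_t)p + PMD_SZ - 1) & ~(PMD_SZ - 1));
}

int main(void)
{
        char *src = pmd_aligned_reserve();
        char *dst = pmd_aligned_reserve();

        if (!src || !dst)
                return 1;
        mprotect(src, PMD_SZ, PROT_READ | PROT_WRITE);
        madvise(src, PMD_SZ, MADV_HUGEPAGE);
        memset(src, 1, PMD_SZ);                 /* populate, ideally as one huge PMD */
        /* Both addresses are PMD-aligned, so the kernel can relocate the whole
         * huge PMD instead of splitting it; MREMAP_FIXED replaces the PROT_NONE
         * reservation at dst. */
        return mremap(src, PMD_SZ, PMD_SZ, MREMAP_MAYMOVE | MREMAP_FIXED, dst)
               == MAP_FAILED;
}
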
2184 /*                                               1852 /*
2185  * Returns                                       1853  * Returns
2186  *  - 0 if PMD could not be locked               1854  *  - 0 if PMD could not be locked
2187  *  - 1 if PMD was locked but protections unc !! 1855  *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2188  *      or if prot_numa but THP migration is  !! 1856  *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
2189  *  - HPAGE_PMD_NR if protections changed and << 
2190  */                                              1857  */
2191 int change_huge_pmd(struct mmu_gather *tlb, s !! 1858 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2192                     pmd_t *pmd, unsigned long !! 1859                 unsigned long addr, pgprot_t newprot, int prot_numa)
2193                     unsigned long cp_flags)   << 
2194 {                                                1860 {
2195         struct mm_struct *mm = vma->vm_mm;       1861         struct mm_struct *mm = vma->vm_mm;
2196         spinlock_t *ptl;                         1862         spinlock_t *ptl;
2197         pmd_t oldpmd, entry;                  !! 1863         pmd_t entry;
2198         bool prot_numa = cp_flags & MM_CP_PRO !! 1864         bool preserve_write;
2199         bool uffd_wp = cp_flags & MM_CP_UFFD_ !! 1865         int ret;
2200         bool uffd_wp_resolve = cp_flags & MM_ << 
2201         int ret = 1;                          << 
2202                                               << 
2203         tlb_change_page_size(tlb, HPAGE_PMD_S << 
2204                                               << 
2205         if (prot_numa && !thp_migration_suppo << 
2206                 return 1;                     << 
2207                                                  1866 
2208         ptl = __pmd_trans_huge_lock(pmd, vma)    1867         ptl = __pmd_trans_huge_lock(pmd, vma);
2209         if (!ptl)                                1868         if (!ptl)
2210                 return 0;                        1869                 return 0;
2211                                                  1870 
                                                   >> 1871         preserve_write = prot_numa && pmd_write(*pmd);
                                                   >> 1872         ret = 1;
                                                   >> 1873 
2212 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          1874 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2213         if (is_swap_pmd(*pmd)) {                 1875         if (is_swap_pmd(*pmd)) {
2214                 swp_entry_t entry = pmd_to_sw    1876                 swp_entry_t entry = pmd_to_swp_entry(*pmd);
2215                 struct folio *folio = pfn_swa << 
2216                 pmd_t newpmd;                 << 
2217                                                  1877 
2218                 VM_BUG_ON(!is_pmd_migration_e    1878                 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
2219                 if (is_writable_migration_ent !! 1879                 if (is_write_migration_entry(entry)) {
                                                   >> 1880                         pmd_t newpmd;
2220                         /*                       1881                         /*
2221                          * A protection check    1882                          * A protection check is difficult so
2222                          * just be safe and d    1883                          * just be safe and disable write
2223                          */                      1884                          */
2224                         if (folio_test_anon(f !! 1885                         make_migration_entry_read(&entry);
2225                                 entry = make_ << 
2226                         else                  << 
2227                                 entry = make_ << 
2228                         newpmd = swp_entry_to    1886                         newpmd = swp_entry_to_pmd(entry);
2229                         if (pmd_swp_soft_dirt    1887                         if (pmd_swp_soft_dirty(*pmd))
2230                                 newpmd = pmd_    1888                                 newpmd = pmd_swp_mksoft_dirty(newpmd);
2231                 } else {                      << 
2232                         newpmd = *pmd;        << 
2233                 }                             << 
2234                                               << 
2235                 if (uffd_wp)                  << 
2236                         newpmd = pmd_swp_mkuf << 
2237                 else if (uffd_wp_resolve)     << 
2238                         newpmd = pmd_swp_clea << 
2239                 if (!pmd_same(*pmd, newpmd))  << 
2240                         set_pmd_at(mm, addr,     1889                         set_pmd_at(mm, addr, pmd, newpmd);
                                                   >> 1890                 }
2241                 goto unlock;                     1891                 goto unlock;
2242         }                                        1892         }
2243 #endif                                           1893 #endif
2244                                                  1894 
2245         if (prot_numa) {                      !! 1895         /*
2246                 struct folio *folio;          !! 1896          * Avoid trapping faults against the zero page. The read-only
2247                 bool toptier;                 !! 1897          * data is likely to be read-cached on the local CPU and
2248                 /*                            !! 1898          * local/remote hits to the zero page are not interesting.
2249                  * Avoid trapping faults agai !! 1899          */
2250                  * data is likely to be read- !! 1900         if (prot_numa && is_huge_zero_pmd(*pmd))
2251                  * local/remote hits to the z !! 1901                 goto unlock;
2252                  */                           << 
2253                 if (is_huge_zero_pmd(*pmd))   << 
2254                         goto unlock;          << 
2255                                               << 
2256                 if (pmd_protnone(*pmd))       << 
2257                         goto unlock;          << 
2258                                                  1902 
2259                 folio = pmd_folio(*pmd);      !! 1903         if (prot_numa && pmd_protnone(*pmd))
2260                 toptier = node_is_toptier(fol !! 1904                 goto unlock;
2261                 /*                            << 
2262                  * Skip scanning top tier nod << 
2263                  * balancing is disabled      << 
2264                  */                           << 
2265                 if (!(sysctl_numa_balancing_m << 
2266                     toptier)                  << 
2267                         goto unlock;          << 
2268                                                  1905 
2269                 if (folio_use_access_time(fol << 
2270                         folio_xchg_access_tim << 
2271                                               << 
2272         }                                     << 
2273         /*                                       1906         /*
2274          * In case prot_numa, we are under mm !! 1907          * In case prot_numa, we are under down_read(mmap_sem). It's critical
2275          * to not clear pmd intermittently to    1908          * to not clear pmd intermittently to avoid race with MADV_DONTNEED
2276          * which is also under mmap_read_lock !! 1909          * which is also under down_read(mmap_sem):
2277          *                                       1910          *
2278          *      CPU0:                            1911          *      CPU0:                           CPU1:
2279          *                              chang    1912          *                              change_huge_pmd(prot_numa=1)
2280          *                               pmdp    1913          *                               pmdp_huge_get_and_clear_notify()
2281          * madvise_dontneed()                    1914          * madvise_dontneed()
2282          *  zap_pmd_range()                      1915          *  zap_pmd_range()
2283          *   pmd_trans_huge(*pmd) == 0 (witho    1916          *   pmd_trans_huge(*pmd) == 0 (without ptl)
2284          *   // skip the pmd                     1917          *   // skip the pmd
2285          *                               set_    1918          *                               set_pmd_at();
2286          *                               // p    1919          *                               // pmd is re-established
2287          *                                       1920          *
2288          * The race makes MADV_DONTNEED miss     1921          * The race makes MADV_DONTNEED miss the huge pmd and not clear it
2289          * which may break userspace.            1922          * which may break userspace.
2290          *                                       1923          *
2291          * pmdp_invalidate_ad() is required t !! 1924          * pmdp_invalidate() is required to make sure we don't miss
2292          * dirty/young flags set by hardware.    1925          * dirty/young flags set by hardware.
2293          */                                      1926          */
2294         oldpmd = pmdp_invalidate_ad(vma, addr !! 1927         entry = pmdp_invalidate(vma, addr, pmd);
2295                                               << 
2296         entry = pmd_modify(oldpmd, newprot);  << 
2297         if (uffd_wp)                          << 
2298                 entry = pmd_mkuffd_wp(entry); << 
2299         else if (uffd_wp_resolve)             << 
2300                 /*                            << 
2301                  * Leave the write bit to be  << 
2302                  * handler, then things like  << 
2303                  * handled.                   << 
2304                  */                           << 
2305                 entry = pmd_clear_uffd_wp(ent << 
2306                                               << 
2307         /* See change_pte_range(). */         << 
2308         if ((cp_flags & MM_CP_TRY_CHANGE_WRIT << 
2309             can_change_pmd_writable(vma, addr << 
2310                 entry = pmd_mkwrite(entry, vm << 
2311                                                  1928 
                                                   >> 1929         entry = pmd_modify(entry, newprot);
                                                   >> 1930         if (preserve_write)
                                                   >> 1931                 entry = pmd_mk_savedwrite(entry);
2312         ret = HPAGE_PMD_NR;                      1932         ret = HPAGE_PMD_NR;
2313         set_pmd_at(mm, addr, pmd, entry);        1933         set_pmd_at(mm, addr, pmd, entry);
2314                                               !! 1934         BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
2315         if (huge_pmd_needs_flush(oldpmd, entr << 
2316                 tlb_flush_pmd_range(tlb, addr << 
2317 unlock:                                          1935 unlock:
2318         spin_unlock(ptl);                        1936         spin_unlock(ptl);
2319         return ret;                              1937         return ret;
2320 }                                                1938 }
2321                                                  1939 
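The tail of change_huge_pmd() above is what a plain mprotect() over a THP-backed anonymous range ends up in when the whole PMD can be changed in place. Below is a minimal user-space sketch of that scenario, assuming x86-64's 2 MiB PMD size and that transparent hugepages are enabled; MADV_HUGEPAGE is only a hint, so whether a huge PMD is actually installed is not guaranteed.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define SZ_2M (2UL << 20)

int main(void)
{
	/* Over-allocate and round up so the 2 MiB range is PMD aligned. */
	char *raw = mmap(NULL, 2 * SZ_2M, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *p;

	if (raw == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p = (char *)(((unsigned long)raw + SZ_2M - 1) & ~(SZ_2M - 1));

	madvise(p, SZ_2M, MADV_HUGEPAGE);	/* a hint, not a guarantee */
	memset(p, 0xaa, SZ_2M);			/* write-fault the whole range */

	/*
	 * A protection change covering the whole PMD range can be applied in
	 * place at PMD granularity instead of splitting the huge page.
	 */
	if (mprotect(p, SZ_2M, PROT_READ))
		perror("mprotect");

	munmap(raw, 2 * SZ_2M);
	return 0;
}
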
2322 /*                                               1940 /*
2323  * Returns:                                   << 
2324  *                                            << 
2325  * - 0: if pud leaf changed from under us     << 
2326  * - 1: if pud can be skipped                 << 
2327  * - HPAGE_PUD_NR: if pud was successfully pr << 
2328  */                                           << 
2329 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_ << 
2330 int change_huge_pud(struct mmu_gather *tlb, s << 
2331                     pud_t *pudp, unsigned lon << 
2332                     unsigned long cp_flags)   << 
2333 {                                             << 
2334         struct mm_struct *mm = vma->vm_mm;    << 
2335         pud_t oldpud, entry;                  << 
2336         spinlock_t *ptl;                      << 
2337                                               << 
2338         tlb_change_page_size(tlb, HPAGE_PUD_S << 
2339                                               << 
2340         /* NUMA balancing doesn't apply to da << 
2341         if (cp_flags & MM_CP_PROT_NUMA)       << 
2342                 return 1;                     << 
2343                                               << 
2344         /*                                    << 
2345          * Huge entries on userfault-wp only  << 
2346          * don't have anonymous PUDs yet.     << 
2347          */                                   << 
2348         if (WARN_ON_ONCE(cp_flags & MM_CP_UFF << 
2349                 return 1;                     << 
2350                                               << 
2351         ptl = __pud_trans_huge_lock(pudp, vma << 
2352         if (!ptl)                             << 
2353                 return 0;                     << 
2354                                               << 
2355         /*                                    << 
2356          * Can't clear PUD or it can race wit << 
2357          * change_huge_pmd().                 << 
2358          */                                   << 
2359         oldpud = pudp_invalidate(vma, addr, p << 
2360         entry = pud_modify(oldpud, newprot);  << 
2361         set_pud_at(mm, addr, pudp, entry);    << 
2362         tlb_flush_pud_range(tlb, addr, HPAGE_ << 
2363                                               << 
2364         spin_unlock(ptl);                     << 
2365         return HPAGE_PUD_NR;                  << 
2366 }                                             << 
2367 #endif                                        << 
2368                                               << 
2369 #ifdef CONFIG_USERFAULTFD                     << 
2370 /*                                            << 
2371  * The PT lock for src_pmd and dst_vma/src_vm << 
2372  * the caller, but it must return after relea << 
2373  * Just move the page from src_pmd to dst_pmd << 
2374  * Return zero if succeeded in moving the pag << 
2375  * repeated by the caller, or other errors in << 
2376  */                                           << 
2377 int move_pages_huge_pmd(struct mm_struct *mm, << 
2378                         struct vm_area_struct << 
2379                         unsigned long dst_add << 
2380 {                                             << 
2381         pmd_t _dst_pmd, src_pmdval;           << 
2382         struct page *src_page;                << 
2383         struct folio *src_folio;              << 
2384         struct anon_vma *src_anon_vma;        << 
2385         spinlock_t *src_ptl, *dst_ptl;        << 
2386         pgtable_t src_pgtable;                << 
2387         struct mmu_notifier_range range;      << 
2388         int err = 0;                          << 
2389                                               << 
2390         src_pmdval = *src_pmd;                << 
2391         src_ptl = pmd_lockptr(mm, src_pmd);   << 
2392                                               << 
2393         lockdep_assert_held(src_ptl);         << 
2394         vma_assert_locked(src_vma);           << 
2395         vma_assert_locked(dst_vma);           << 
2396                                               << 
2397         /* Sanity checks before the operation << 
2398         if (WARN_ON_ONCE(!pmd_none(dst_pmdval << 
2399             WARN_ON_ONCE(dst_addr & ~HPAGE_PM << 
2400                 spin_unlock(src_ptl);         << 
2401                 return -EINVAL;               << 
2402         }                                     << 
2403                                               << 
2404         if (!pmd_trans_huge(src_pmdval)) {    << 
2405                 spin_unlock(src_ptl);         << 
2406                 if (is_pmd_migration_entry(sr << 
2407                         pmd_migration_entry_w << 
2408                         return -EAGAIN;       << 
2409                 }                             << 
2410                 return -ENOENT;               << 
2411         }                                     << 
2412                                               << 
2413         src_page = pmd_page(src_pmdval);      << 
2414                                               << 
2415         if (!is_huge_zero_pmd(src_pmdval)) {  << 
2416                 if (unlikely(!PageAnonExclusi << 
2417                         spin_unlock(src_ptl); << 
2418                         return -EBUSY;        << 
2419                 }                             << 
2420                                               << 
2421                 src_folio = page_folio(src_pa << 
2422                 folio_get(src_folio);         << 
2423         } else                                << 
2424                 src_folio = NULL;             << 
2425                                               << 
2426         spin_unlock(src_ptl);                 << 
2427                                               << 
2428         flush_cache_range(src_vma, src_addr,  << 
2429         mmu_notifier_range_init(&range, MMU_N << 
2430                                 src_addr + HP << 
2431         mmu_notifier_invalidate_range_start(& << 
2432                                               << 
2433         if (src_folio) {                      << 
2434                 folio_lock(src_folio);        << 
2435                                               << 
2436                 /*                            << 
2437                  * split_huge_page walks the  << 
2438                  * lock. Serialize against it << 
2439                  * lock is not enough.        << 
2440                  */                           << 
2441                 src_anon_vma = folio_get_anon << 
2442                 if (!src_anon_vma) {          << 
2443                         err = -EAGAIN;        << 
2444                         goto unlock_folio;    << 
2445                 }                             << 
2446                 anon_vma_lock_write(src_anon_ << 
2447         } else                                << 
2448                 src_anon_vma = NULL;          << 
2449                                               << 
2450         dst_ptl = pmd_lockptr(mm, dst_pmd);   << 
2451         double_pt_lock(src_ptl, dst_ptl);     << 
2452         if (unlikely(!pmd_same(*src_pmd, src_ << 
2453                      !pmd_same(*dst_pmd, dst_ << 
2454                 err = -EAGAIN;                << 
2455                 goto unlock_ptls;             << 
2456         }                                     << 
2457         if (src_folio) {                      << 
2458                 if (folio_maybe_dma_pinned(sr << 
2459                     !PageAnonExclusive(&src_f << 
2460                         err = -EBUSY;         << 
2461                         goto unlock_ptls;     << 
2462                 }                             << 
2463                                               << 
2464                 if (WARN_ON_ONCE(!folio_test_ << 
2465                     WARN_ON_ONCE(!folio_test_ << 
2466                         err = -EBUSY;         << 
2467                         goto unlock_ptls;     << 
2468                 }                             << 
2469                                               << 
2470                 src_pmdval = pmdp_huge_clear_ << 
2471                 /* Folio got pinned from unde << 
2472                 if (folio_maybe_dma_pinned(sr << 
2473                         set_pmd_at(mm, src_ad << 
2474                         err = -EBUSY;         << 
2475                         goto unlock_ptls;     << 
2476                 }                             << 
2477                                               << 
2478                 folio_move_anon_rmap(src_foli << 
2479                 src_folio->index = linear_pag << 
2480                                               << 
2481                 _dst_pmd = mk_huge_pmd(&src_f << 
2482                 /* Follow mremap() behavior a << 
2483                 _dst_pmd = pmd_mkwrite(pmd_mk << 
2484         } else {                              << 
2485                 src_pmdval = pmdp_huge_clear_ << 
2486                 _dst_pmd = mk_huge_pmd(src_pa << 
2487         }                                     << 
2488         set_pmd_at(mm, dst_addr, dst_pmd, _ds << 
2489                                               << 
2490         src_pgtable = pgtable_trans_huge_with << 
2491         pgtable_trans_huge_deposit(mm, dst_pm << 
2492 unlock_ptls:                                  << 
2493         double_pt_unlock(src_ptl, dst_ptl);   << 
2494         if (src_anon_vma) {                   << 
2495                 anon_vma_unlock_write(src_ano << 
2496                 put_anon_vma(src_anon_vma);   << 
2497         }                                     << 
2498 unlock_folio:                                 << 
2499         /* unblock rmap walks */              << 
2500         if (src_folio)                        << 
2501                 folio_unlock(src_folio);      << 
2502         mmu_notifier_invalidate_range_end(&ra << 
2503         if (src_folio)                        << 
2504                 folio_put(src_folio);         << 
2505         return err;                           << 
2506 }                                             << 
2507 #endif /* CONFIG_USERFAULTFD */               << 
2508                                               << 
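move_pages_huge_pmd() above is the PMD-sized backend of the UFFDIO_MOVE ioctl. Below is a rough user-space sketch of driving that ioctl, assuming uapi headers and a kernel new enough to provide UFFD_FEATURE_MOVE (Linux 6.8+); if the source range is not THP-backed or the addresses are not 2 MiB aligned, the kernel falls back to moving individual pages, so hitting the huge-PMD path is opportunistic.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

#define LEN (2UL << 20)

int main(void)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
	struct uffdio_api api = { .api = UFFD_API, .features = UFFD_FEATURE_MOVE };
	char *src, *dst;

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api)) {
		perror("userfaultfd/UFFDIO_API");
		return 1;
	}

	src = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	dst = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src == MAP_FAILED || dst == MAP_FAILED)
		return 1;

	/* Only the (still unpopulated) destination range is registered. */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = LEN },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		perror("UFFDIO_REGISTER");
		return 1;
	}

	madvise(src, LEN, MADV_HUGEPAGE);
	memset(src, 0x5a, LEN);			/* populate the source pages */

	/* Move (not copy) the source pages into the registered destination. */
	struct uffdio_move mv = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = LEN,
		.mode = 0,
	};
	if (ioctl(uffd, UFFDIO_MOVE, &mv))
		perror("UFFDIO_MOVE");
	else
		printf("moved %lld bytes, dst[0] = %#x\n",
		       (long long)mv.move, (unsigned char)dst[0]);
	return 0;
}
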
2509 /*                                            << 
2510  * Returns page table lock pointer if a given    1941  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2511  *                                               1942  *
2512  * Note that if it returns page table lock po    1943  * Note that if it returns page table lock pointer, this routine returns without
2513  * unlocking page table lock. So callers must    1944  * unlocking page table lock. So callers must unlock it.
2514  */                                              1945  */
2515 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,    1946 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2516 {                                                1947 {
2517         spinlock_t *ptl;                         1948         spinlock_t *ptl;
2518         ptl = pmd_lock(vma->vm_mm, pmd);         1949         ptl = pmd_lock(vma->vm_mm, pmd);
2519         if (likely(is_swap_pmd(*pmd) || pmd_t    1950         if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
2520                         pmd_devmap(*pmd)))       1951                         pmd_devmap(*pmd)))
2521                 return ptl;                      1952                 return ptl;
2522         spin_unlock(ptl);                        1953         spin_unlock(ptl);
2523         return NULL;                             1954         return NULL;
2524 }                                                1955 }
2525                                                  1956 
2526 /*                                               1957 /*
2527  * Returns page table lock pointer if a given !! 1958  * Returns true if a given pud maps a thp, false otherwise.
2528  *                                               1959  *
2529  * Note that if it returns page table lock po !! 1960  * Note that if it returns true, this routine returns without unlocking page
2530  * unlocking page table lock. So callers must !! 1961  * table lock. So callers must unlock it.
2531  */                                              1962  */
2532 spinlock_t *__pud_trans_huge_lock(pud_t *pud,    1963 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2533 {                                                1964 {
2534         spinlock_t *ptl;                         1965         spinlock_t *ptl;
2535                                                  1966 
2536         ptl = pud_lock(vma->vm_mm, pud);         1967         ptl = pud_lock(vma->vm_mm, pud);
2537         if (likely(pud_trans_huge(*pud) || pu    1968         if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2538                 return ptl;                      1969                 return ptl;
2539         spin_unlock(ptl);                        1970         spin_unlock(ptl);
2540         return NULL;                             1971         return NULL;
2541 }                                                1972 }
2542                                                  1973 
2543 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    1974 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2544 int zap_huge_pud(struct mmu_gather *tlb, stru    1975 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2545                  pud_t *pud, unsigned long ad    1976                  pud_t *pud, unsigned long addr)
2546 {                                                1977 {
2547         spinlock_t *ptl;                      << 
2548         pud_t orig_pud;                          1978         pud_t orig_pud;
                                                   >> 1979         spinlock_t *ptl;
2549                                                  1980 
2550         ptl = __pud_trans_huge_lock(pud, vma)    1981         ptl = __pud_trans_huge_lock(pud, vma);
2551         if (!ptl)                                1982         if (!ptl)
2552                 return 0;                        1983                 return 0;
2553                                               !! 1984         /*
2554         orig_pud = pudp_huge_get_and_clear_fu !! 1985          * For architectures like ppc64 we look at deposited pgtable
2555         arch_check_zapped_pud(vma, orig_pud); !! 1986          * when calling pudp_huge_get_and_clear. So do the
                                                   >> 1987          * pgtable_trans_huge_withdraw after finishing pudp related
                                                   >> 1988          * operations.
                                                   >> 1989          */
                                                   >> 1990         orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
                                                   >> 1991                         tlb->fullmm);
2556         tlb_remove_pud_tlb_entry(tlb, pud, ad    1992         tlb_remove_pud_tlb_entry(tlb, pud, addr);
2557         if (vma_is_special_huge(vma)) {       !! 1993         if (vma_is_dax(vma)) {
2558                 spin_unlock(ptl);                1994                 spin_unlock(ptl);
2559                 /* No zero page support yet *    1995                 /* No zero page support yet */
2560         } else {                                 1996         } else {
2561                 /* No support for anonymous P    1997                 /* No support for anonymous PUD pages yet */
2562                 BUG();                           1998                 BUG();
2563         }                                        1999         }
2564         return 1;                                2000         return 1;
2565 }                                                2001 }
2566                                                  2002 
2567 static void __split_huge_pud_locked(struct vm    2003 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2568                 unsigned long haddr)             2004                 unsigned long haddr)
2569 {                                                2005 {
2570         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);      2006         VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2571         VM_BUG_ON_VMA(vma->vm_start > haddr,     2007         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2572         VM_BUG_ON_VMA(vma->vm_end < haddr + H    2008         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2573         VM_BUG_ON(!pud_trans_huge(*pud) && !p    2009         VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2574                                                  2010 
2575         count_vm_event(THP_SPLIT_PUD);           2011         count_vm_event(THP_SPLIT_PUD);
2576                                                  2012 
2577         pudp_huge_clear_flush(vma, haddr, pud !! 2013         pudp_huge_clear_flush_notify(vma, haddr, pud);
2578 }                                                2014 }
2579                                                  2015 
2580 void __split_huge_pud(struct vm_area_struct *    2016 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2581                 unsigned long address)           2017                 unsigned long address)
2582 {                                                2018 {
2583         spinlock_t *ptl;                         2019         spinlock_t *ptl;
2584         struct mmu_notifier_range range;      !! 2020         struct mm_struct *mm = vma->vm_mm;
                                                   >> 2021         unsigned long haddr = address & HPAGE_PUD_MASK;
2585                                                  2022 
2586         mmu_notifier_range_init(&range, MMU_N !! 2023         mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
2587                                 address & HPA !! 2024         ptl = pud_lock(mm, pud);
2588                                 (address & HP << 
2589         mmu_notifier_invalidate_range_start(& << 
2590         ptl = pud_lock(vma->vm_mm, pud);      << 
2591         if (unlikely(!pud_trans_huge(*pud) &&    2025         if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2592                 goto out;                        2026                 goto out;
2593         __split_huge_pud_locked(vma, pud, ran !! 2027         __split_huge_pud_locked(vma, pud, haddr);
2594                                                  2028 
2595 out:                                             2029 out:
2596         spin_unlock(ptl);                        2030         spin_unlock(ptl);
2597         mmu_notifier_invalidate_range_end(&ra !! 2031         /*
2598 }                                             !! 2032          * No need to double call mmu_notifier->invalidate_range() callback as
2599 #else                                         !! 2033          * the above pudp_huge_clear_flush_notify() did already call it.
2600 void __split_huge_pud(struct vm_area_struct * !! 2034          */
2601                 unsigned long address)        !! 2035         mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
2602 {                                             !! 2036                                                HPAGE_PUD_SIZE);
2603 }                                                2037 }
2604 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA    2038 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2605                                                  2039 
2606 static void __split_huge_zero_page_pmd(struct    2040 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2607                 unsigned long haddr, pmd_t *p    2041                 unsigned long haddr, pmd_t *pmd)
2608 {                                                2042 {
2609         struct mm_struct *mm = vma->vm_mm;       2043         struct mm_struct *mm = vma->vm_mm;
2610         pgtable_t pgtable;                       2044         pgtable_t pgtable;
2611         pmd_t _pmd, old_pmd;                  !! 2045         pmd_t _pmd;
2612         unsigned long addr;                   << 
2613         pte_t *pte;                           << 
2614         int i;                                   2046         int i;
2615                                                  2047 
2616         /*                                       2048         /*
2617          * Leave pmd empty until pte is fille    2049          * Leave pmd empty until pte is filled; note that it is fine to delay
2618          * notification until mmu_notifier_in    2050          * notification until mmu_notifier_invalidate_range_end() as we are
2619          * replacing a zero pmd write protect    2051          * replacing a zero pmd write protected page with a zero pte write
2620          * protected page.                       2052          * protected page.
2621          *                                       2053          *
2622          * See Documentation/mm/mmu_notifier. !! 2054          * See Documentation/vm/mmu_notifier.rst
2623          */                                      2055          */
2624         old_pmd = pmdp_huge_clear_flush(vma,  !! 2056         pmdp_huge_clear_flush(vma, haddr, pmd);
2625                                                  2057 
2626         pgtable = pgtable_trans_huge_withdraw    2058         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2627         pmd_populate(mm, &_pmd, pgtable);        2059         pmd_populate(mm, &_pmd, pgtable);
2628                                                  2060 
2629         pte = pte_offset_map(&_pmd, haddr);   !! 2061         for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2630         VM_BUG_ON(!pte);                      !! 2062                 pte_t *pte, entry;
2631         for (i = 0, addr = haddr; i < HPAGE_P !! 2063                 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2632                 pte_t entry;                  << 
2633                                               << 
2634                 entry = pfn_pte(my_zero_pfn(a << 
2635                 entry = pte_mkspecial(entry);    2064                 entry = pte_mkspecial(entry);
2636                 if (pmd_uffd_wp(old_pmd))     !! 2065                 pte = pte_offset_map(&_pmd, haddr);
2637                         entry = pte_mkuffd_wp !! 2066                 VM_BUG_ON(!pte_none(*pte));
2638                 VM_BUG_ON(!pte_none(ptep_get( !! 2067                 set_pte_at(mm, haddr, pte, entry);
2639                 set_pte_at(mm, addr, pte, ent !! 2068                 pte_unmap(pte);
2640                 pte++;                        << 
2641         }                                        2069         }
2642         pte_unmap(pte - 1);                   << 
2643         smp_wmb(); /* make pte visible before    2070         smp_wmb(); /* make pte visible before pmd */
2644         pmd_populate(mm, pmd, pgtable);          2071         pmd_populate(mm, pmd, pgtable);
2645 }                                                2072 }
2646                                                  2073 
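__split_huge_zero_page_pmd() above handles the case where the PMD being split maps the huge zero page rather than a real THP. The sketch below shows how user space can set that case up, assuming THP and /sys/kernel/mm/transparent_hugepage/use_zero_page are enabled; nothing here guarantees the huge zero page is used, it only creates the conditions under which it can be.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define SZ_2M (2UL << 20)

int main(void)
{
	char *raw = mmap(NULL, 2 * SZ_2M, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *p;
	volatile char c;

	if (raw == MAP_FAILED)
		return 1;
	p = (char *)(((unsigned long)raw + SZ_2M - 1) & ~(SZ_2M - 1));
	madvise(p, SZ_2M, MADV_HUGEPAGE);

	/*
	 * A read fault on never-written anonymous memory can be served by
	 * mapping the huge zero page instead of allocating real memory.
	 */
	c = p[0];
	(void)c;

	/*
	 * Changing the protection of a 4 KiB sub-range cannot keep the PMD
	 * mapping; a huge zero-page PMD covering it is split into 512
	 * zero-page PTEs first.
	 */
	if (mprotect(p + 4096, 4096, PROT_READ))
		perror("mprotect");

	munmap(raw, 2 * SZ_2M);
	return 0;
}
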
2647 static void __split_huge_pmd_locked(struct vm    2074 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2648                 unsigned long haddr, bool fre    2075                 unsigned long haddr, bool freeze)
2649 {                                                2076 {
2650         struct mm_struct *mm = vma->vm_mm;       2077         struct mm_struct *mm = vma->vm_mm;
2651         struct folio *folio;                  << 
2652         struct page *page;                       2078         struct page *page;
2653         pgtable_t pgtable;                       2079         pgtable_t pgtable;
2654         pmd_t old_pmd, _pmd;                     2080         pmd_t old_pmd, _pmd;
2655         bool young, write, soft_dirty, pmd_mi !! 2081         bool young, write, soft_dirty, pmd_migration = false;
2656         bool anon_exclusive = false, dirty =  << 
2657         unsigned long addr;                      2082         unsigned long addr;
2658         pte_t *pte;                           << 
2659         int i;                                   2083         int i;
2660                                                  2084 
2661         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);      2085         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2662         VM_BUG_ON_VMA(vma->vm_start > haddr,     2086         VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2663         VM_BUG_ON_VMA(vma->vm_end < haddr + H    2087         VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2664         VM_BUG_ON(!is_pmd_migration_entry(*pm    2088         VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2665                                 && !pmd_devma    2089                                 && !pmd_devmap(*pmd));
2666                                                  2090 
2667         count_vm_event(THP_SPLIT_PMD);           2091         count_vm_event(THP_SPLIT_PMD);
2668                                                  2092 
2669         if (!vma_is_anonymous(vma)) {            2093         if (!vma_is_anonymous(vma)) {
2670                 old_pmd = pmdp_huge_clear_flu !! 2094                 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2671                 /*                               2095                 /*
2672                  * We are going to unmap this    2096                  * We are going to unmap this huge page. So
2673                  * just go ahead and zap it      2097                  * just go ahead and zap it
2674                  */                              2098                  */
2675                 if (arch_needs_pgtable_deposi    2099                 if (arch_needs_pgtable_deposit())
2676                         zap_deposited_table(m    2100                         zap_deposited_table(mm, pmd);
2677                 if (vma_is_special_huge(vma)) !! 2101                 if (vma_is_dax(vma))
2678                         return;                  2102                         return;
2679                 if (unlikely(is_pmd_migration !! 2103                 page = pmd_page(_pmd);
2680                         swp_entry_t entry;    !! 2104                 if (!PageDirty(page) && pmd_dirty(_pmd))
2681                                               !! 2105                         set_page_dirty(page);
2682                         entry = pmd_to_swp_en !! 2106                 if (!PageReferenced(page) && pmd_young(_pmd))
2683                         folio = pfn_swap_entr !! 2107                         SetPageReferenced(page);
2684                 } else {                      !! 2108                 page_remove_rmap(page, true);
2685                         page = pmd_page(old_p !! 2109                 put_page(page);
2686                         folio = page_folio(pa !! 2110                 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
2687                         if (!folio_test_dirty << 
2688                                 folio_mark_di << 
2689                         if (!folio_test_refer << 
2690                                 folio_set_ref << 
2691                         folio_remove_rmap_pmd << 
2692                         folio_put(folio);     << 
2693                 }                             << 
2694                 add_mm_counter(mm, mm_counter << 
2695                 return;                          2111                 return;
2696         }                                     !! 2112         } else if (is_huge_zero_pmd(*pmd)) {
2697                                               << 
2698         if (is_huge_zero_pmd(*pmd)) {         << 
2699                 /*                               2113                 /*
2700                  * FIXME: Do we want to inval    2114                  * FIXME: Do we want to invalidate secondary mmu by calling
2701                  * mmu_notifier_arch_invalida !! 2115                  * mmu_notifier_invalidate_range() see comments below inside
2702                  * inside __split_huge_pmd()  !! 2116                  * __split_huge_pmd() ?
2703                  *                               2117                  *
2704                  * We are going from a zero h    2118                  * We are going from a zero huge page write protected to zero
2705                  * small page also write prot    2119                  * small page also write protected so it does not seem useful
2706                  * to invalidate secondary mm    2120                  * to invalidate secondary mmu at this time.
2707                  */                              2121                  */
2708                 return __split_huge_zero_page    2122                 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2709         }                                        2123         }
2710                                                  2124 
2711         pmd_migration = is_pmd_migration_entr !! 2125         /*
                                                   >> 2126          * Up to this point the pmd is present and huge and userland has the
                                                   >> 2127          * whole access to the hugepage during the split (which happens in
                                                   >> 2128          * place). If we overwrite the pmd with the not-huge version pointing
                                                   >> 2129          * to the pte here (which of course we could if all CPUs were bug
                                                   >> 2130          * free), userland could trigger a small page size TLB miss on the
                                                   >> 2131          * small sized TLB while the hugepage TLB entry is still established in
                                                   >> 2132          * the huge TLB. Some CPU doesn't like that.
                                                   >> 2133          * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
                                                   >> 2134          * 383 on page 93. Intel should be safe but it also warns that it's
                                                   >> 2135          * only safe if the permission and cache attributes of the two entries
                                                   >> 2136          * loaded in the two TLB is identical (which should be the case here).
                                                   >> 2137          * But it is generally safer to never allow small and huge TLB entries
                                                   >> 2138          * for the same virtual address to be loaded simultaneously. So instead
                                                   >> 2139          * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
                                                   >> 2140          * current pmd notpresent (atomically because here the pmd_trans_huge
                                                   >> 2141          * must remain set at all times on the pmd until the split is complete
                                                   >> 2142          * for this pmd), then we flush the SMP TLB and finally we write the
                                                   >> 2143          * non-huge version of the pmd entry with pmd_populate.
                                                   >> 2144          */
                                                   >> 2145         old_pmd = pmdp_invalidate(vma, haddr, pmd);
                                                   >> 2146 
                                                   >> 2147         pmd_migration = is_pmd_migration_entry(old_pmd);
2712         if (unlikely(pmd_migration)) {           2148         if (unlikely(pmd_migration)) {
2713                 swp_entry_t entry;               2149                 swp_entry_t entry;
2714                                                  2150 
2715                 old_pmd = *pmd;               << 
2716                 entry = pmd_to_swp_entry(old_    2151                 entry = pmd_to_swp_entry(old_pmd);
2717                 page = pfn_swap_entry_to_page !! 2152                 page = pfn_to_page(swp_offset(entry));
2718                 write = is_writable_migration !! 2153                 write = is_write_migration_entry(entry);
2719                 if (PageAnon(page))           !! 2154                 young = false;
2720                         anon_exclusive = is_r << 
2721                 young = is_migration_entry_yo << 
2722                 dirty = is_migration_entry_di << 
2723                 soft_dirty = pmd_swp_soft_dir    2155                 soft_dirty = pmd_swp_soft_dirty(old_pmd);
2724                 uffd_wp = pmd_swp_uffd_wp(old << 
2725         } else {                                 2156         } else {
2726                 /*                            << 
2727                  * Up to this point the pmd i << 
2728                  * the whole access to the hu << 
2729                  * happens in place). If we o << 
2730                  * version pointing to the pt << 
2731                  * all CPUs were bug free), u << 
2732                  * size TLB miss on the small << 
2733                  * entry is still established << 
2734                  * like that. See             << 
2735                  * http://support.amd.com/Tec << 
2736                  * 383 on page 105. Intel sho << 
2737                  * it's only safe if the perm << 
2738                  * two entries loaded in the  << 
2739                  * be the case here). But it  << 
2740                  * small and huge TLB entries << 
2741                  * loaded simultaneously. So  << 
2742                  * flush_pmd_tlb_range();" we << 
2743                  * notpresent (atomically bec << 
2744                  * remain set at all times on << 
2745                  * complete for this pmd), th << 
2746                  * we write the non-huge vers << 
2747                  * pmd_populate.              << 
2748                  */                           << 
2749                 old_pmd = pmdp_invalidate(vma << 
2750                 page = pmd_page(old_pmd);        2157                 page = pmd_page(old_pmd);
2751                 folio = page_folio(page);     !! 2158                 if (pmd_dirty(old_pmd))
2752                 if (pmd_dirty(old_pmd)) {     !! 2159                         SetPageDirty(page);
2753                         dirty = true;         << 
2754                         folio_set_dirty(folio << 
2755                 }                             << 
2756                 write = pmd_write(old_pmd);      2160                 write = pmd_write(old_pmd);
2757                 young = pmd_young(old_pmd);      2161                 young = pmd_young(old_pmd);
2758                 soft_dirty = pmd_soft_dirty(o    2162                 soft_dirty = pmd_soft_dirty(old_pmd);
2759                 uffd_wp = pmd_uffd_wp(old_pmd << 
2760                                               << 
2761                 VM_WARN_ON_FOLIO(!folio_ref_c << 
2762                 VM_WARN_ON_FOLIO(!folio_test_ << 
2763                                               << 
2764                 /*                            << 
2765                  * Without "freeze", we'll si << 
2766                  * PageAnonExclusive() flag f << 
2767                  * each subpage -- no need to << 
2768                  *                            << 
2769                  * With "freeze" we want to r << 
2770                  * migration entries right aw << 
2771                  * managed to clear PageAnonE << 
2772                  * set_pmd_migration_entry(). << 
2773                  *                            << 
2774                  * In case we cannot clear Pa << 
2775                  * only and let try_to_migrat << 
2776                  *                            << 
2777                  * See folio_try_share_anon_r << 
2778                  */                           << 
2779                 anon_exclusive = PageAnonExcl << 
2780                 if (freeze && anon_exclusive  << 
2781                     folio_try_share_anon_rmap << 
2782                         freeze = false;       << 
2783                 if (!freeze) {                << 
2784                         rmap_t rmap_flags = R << 
2785                                               << 
2786                         folio_ref_add(folio,  << 
2787                         if (anon_exclusive)   << 
2788                                 rmap_flags |= << 
2789                         folio_add_anon_rmap_p << 
2790                                               << 
2791                 }                             << 
2792         }                                        2163         }
                                                   >> 2164         VM_BUG_ON_PAGE(!page_count(page), page);
                                                   >> 2165         page_ref_add(page, HPAGE_PMD_NR - 1);
2793                                                  2166 
2794         /*                                       2167         /*
2795          * Withdraw the table only after we m    2168          * Withdraw the table only after we mark the pmd entry invalid.
2796          * This's critical for some architect    2169          * This's critical for some architectures (Power).
2797          */                                      2170          */
2798         pgtable = pgtable_trans_huge_withdraw    2171         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2799         pmd_populate(mm, &_pmd, pgtable);        2172         pmd_populate(mm, &_pmd, pgtable);
2800                                                  2173 
2801         pte = pte_offset_map(&_pmd, haddr);   !! 2174         for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2802         VM_BUG_ON(!pte);                      !! 2175                 pte_t entry, *pte;
2803                                               !! 2176                 /*
2804         /*                                    !! 2177                  * Note that NUMA hinting access restrictions are not
2805          * Note that NUMA hinting access rest !! 2178                  * transferred to avoid any possibility of altering
2806          * avoid any possibility of altering  !! 2179                  * permissions across VMAs.
2807          */                                   !! 2180                  */
2808         if (freeze || pmd_migration) {        !! 2181                 if (freeze || pmd_migration) {
2809                 for (i = 0, addr = haddr; i < << 
2810                         pte_t entry;          << 
2811                         swp_entry_t swp_entry    2182                         swp_entry_t swp_entry;
2812                                               !! 2183                         swp_entry = make_migration_entry(page + i, write);
2813                         if (write)            << 
2814                                 swp_entry = m << 
2815                                               << 
2816                         else if (anon_exclusi << 
2817                                 swp_entry = m << 
2818                                               << 
2819                         else                  << 
2820                                 swp_entry = m << 
2821                                               << 
2822                         if (young)            << 
2823                                 swp_entry = m << 
2824                         if (dirty)            << 
2825                                 swp_entry = m << 
2826                         entry = swp_entry_to_    2184                         entry = swp_entry_to_pte(swp_entry);
2827                         if (soft_dirty)          2185                         if (soft_dirty)
2828                                 entry = pte_s    2186                                 entry = pte_swp_mksoft_dirty(entry);
2829                         if (uffd_wp)          !! 2187                 } else {
2830                                 entry = pte_s !! 2188                         entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
2831                                               !! 2189                         entry = maybe_mkwrite(entry, vma);
2832                         VM_WARN_ON(!pte_none( !! 2190                         if (!write)
2833                         set_pte_at(mm, addr,  !! 2191                                 entry = pte_wrprotect(entry);
                                                   >> 2192                         if (!young)
                                                   >> 2193                                 entry = pte_mkold(entry);
                                                   >> 2194                         if (soft_dirty)
                                                   >> 2195                                 entry = pte_mksoft_dirty(entry);
2834                 }                                2196                 }
2835         } else {                              !! 2197                 pte = pte_offset_map(&_pmd, addr);
2836                 pte_t entry;                  !! 2198                 BUG_ON(!pte_none(*pte));
2837                                               !! 2199                 set_pte_at(mm, addr, pte, entry);
2838                 entry = mk_pte(page, READ_ONC !! 2200                 atomic_inc(&page[i]._mapcount);
2839                 if (write)                    !! 2201                 pte_unmap(pte);
2840                         entry = pte_mkwrite(e !! 2202         }
2841                 if (!young)                   << 
2842                         entry = pte_mkold(ent << 
2843                 /* NOTE: this may set soft-di << 
2844                 if (dirty)                    << 
2845                         entry = pte_mkdirty(e << 
2846                 if (soft_dirty)               << 
2847                         entry = pte_mksoft_di << 
2848                 if (uffd_wp)                  << 
2849                         entry = pte_mkuffd_wp << 
2850                                                  2203 
                                                   >> 2204         /*
                                                   >> 2205          * Set PG_double_map before dropping compound_mapcount to avoid
                                                   >> 2206          * false-negative page_mapped().
                                                   >> 2207          */
                                                   >> 2208         if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
2851                 for (i = 0; i < HPAGE_PMD_NR;    2209                 for (i = 0; i < HPAGE_PMD_NR; i++)
2852                         VM_WARN_ON(!pte_none( !! 2210                         atomic_inc(&page[i]._mapcount);
2853                                               << 
2854                 set_ptes(mm, haddr, pte, entr << 
2855         }                                        2211         }
2856         pte_unmap(pte);                       << 
2857                                                  2212 
2858         if (!pmd_migration)                   !! 2213         if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
2859                 folio_remove_rmap_pmd(folio,  !! 2214                 /* Last compound_mapcount is gone. */
2860         if (freeze)                           !! 2215                 __dec_node_page_state(page, NR_ANON_THPS);
2861                 put_page(page);               !! 2216                 if (TestClearPageDoubleMap(page)) {
                                                   >> 2217                         /* No need in mapcount reference anymore */
                                                   >> 2218                         for (i = 0; i < HPAGE_PMD_NR; i++)
                                                   >> 2219                                 atomic_dec(&page[i]._mapcount);
                                                   >> 2220                 }
                                                   >> 2221         }
2862                                                  2222 
2863         smp_wmb(); /* make pte visible before    2223         smp_wmb(); /* make pte visible before pmd */
2864         pmd_populate(mm, pmd, pgtable);          2224         pmd_populate(mm, pmd, pgtable);
2865 }                                             << 
2866                                                  2225 
2867 void split_huge_pmd_locked(struct vm_area_str !! 2226         if (freeze) {
2868                            pmd_t *pmd, bool f !! 2227                 for (i = 0; i < HPAGE_PMD_NR; i++) {
2869 {                                             !! 2228                         page_remove_rmap(page + i, false);
2870         VM_WARN_ON_ONCE(folio && !folio_test_ !! 2229                         put_page(page + i);
2871         VM_WARN_ON_ONCE(!IS_ALIGNED(address,  !! 2230                 }
2872         VM_WARN_ON_ONCE(folio && !folio_test_ << 
2873         VM_BUG_ON(freeze && !folio);          << 
2874                                               << 
2875         /*                                    << 
2876          * When the caller requests to set up << 
2877          * require a folio to check the PMD a << 
2878          * is a risk of replacing the wrong f << 
2879          */                                   << 
2880         if (pmd_trans_huge(*pmd) || pmd_devma << 
2881             is_pmd_migration_entry(*pmd)) {   << 
2882                 if (folio && folio != pmd_fol << 
2883                         return;               << 
2884                 __split_huge_pmd_locked(vma,  << 
2885         }                                        2231         }
2886 }                                                2232 }
2887                                                  2233 
2888 void __split_huge_pmd(struct vm_area_struct *    2234 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2889                 unsigned long address, bool f !! 2235                 unsigned long address, bool freeze, struct page *page)
2890 {                                                2236 {
2891         spinlock_t *ptl;                         2237         spinlock_t *ptl;
2892         struct mmu_notifier_range range;      !! 2238         struct mm_struct *mm = vma->vm_mm;
                                                   >> 2239         unsigned long haddr = address & HPAGE_PMD_MASK;
2893                                                  2240 
2894         mmu_notifier_range_init(&range, MMU_N !! 2241         mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
2895                                 address & HPA !! 2242         ptl = pmd_lock(mm, pmd);
2896                                 (address & HP !! 2243 
2897         mmu_notifier_invalidate_range_start(& !! 2244         /*
2898         ptl = pmd_lock(vma->vm_mm, pmd);      !! 2245          * If caller asks to setup a migration entries, we need a page to check
2899         split_huge_pmd_locked(vma, range.star !! 2246          * pmd against. Otherwise we can end up replacing wrong page.
                                                   >> 2247          */
                                                   >> 2248         VM_BUG_ON(freeze && !page);
                                                   >> 2249         if (page && page != pmd_page(*pmd))
                                                   >> 2250                 goto out;
                                                   >> 2251 
                                                   >> 2252         if (pmd_trans_huge(*pmd)) {
                                                   >> 2253                 page = pmd_page(*pmd);
                                                   >> 2254                 if (PageMlocked(page))
                                                   >> 2255                         clear_page_mlock(page);
                                                   >> 2256         } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
                                                   >> 2257                 goto out;
                                                   >> 2258         __split_huge_pmd_locked(vma, pmd, haddr, freeze);
                                                   >> 2259 out:
2900         spin_unlock(ptl);                        2260         spin_unlock(ptl);
2901         mmu_notifier_invalidate_range_end(&ra !! 2261         /*
                                                   >> 2262          * No need to double call mmu_notifier->invalidate_range() callback.
                                                   >> 2263          * There are 3 cases to consider inside __split_huge_pmd_locked():
                                                   >> 2264          *  1) pmdp_huge_clear_flush_notify() calls invalidate_range(), obviously
                                                   >> 2265          *  2) __split_huge_zero_page_pmd() read only zero page and any write
                                                   >> 2266          *    fault will trigger a flush_notify before pointing to a new page
                                                   >> 2267          *    (it is fine if the secondary mmu keeps pointing to the old zero
                                                   >> 2268          *    page in the meantime)
                                                   >> 2269          *  3) Split a huge pmd into pte pointing to the same page. No need
                                                   >> 2270          *     to invalidate secondary tlb entry they are all still valid.
                                                   >> 2271          *     any further changes to individual pte will notify. So no need
                                                   >> 2272          *     to call mmu_notifier->invalidate_range()
                                                   >> 2273          */
                                                   >> 2274         mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
                                                   >> 2275                                                HPAGE_PMD_SIZE);
2902 }                                                2276 }
2903                                                  2277 
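__split_huge_pmd() above is easy to reach from user space by zapping only part of a THP: MADV_DONTNEED over a 4 KiB sub-range cannot be served at PMD granularity, so the huge PMD is split first. A sketch follows, assuming a 2 MiB PMD size and THP enabled; the AnonHugePages counter in /proc/self/smaps_rollup is one way to watch the huge mapping appear and then drop after the split.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define SZ_2M (2UL << 20)

static void show_anon_huge(const char *when)
{
	char line[256];
	unsigned long kb;
	FILE *f = fopen("/proc/self/smaps_rollup", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "AnonHugePages: %lu kB", &kb) == 1)
			printf("%s: AnonHugePages %lu kB\n", when, kb);
	fclose(f);
}

int main(void)
{
	char *raw = mmap(NULL, 2 * SZ_2M, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *p;

	if (raw == MAP_FAILED)
		return 1;
	p = (char *)(((unsigned long)raw + SZ_2M - 1) & ~(SZ_2M - 1));

	madvise(p, SZ_2M, MADV_HUGEPAGE);
	memset(p, 1, SZ_2M);
	show_anon_huge("after populate");

	/* Zapping 4 KiB in the middle forces the huge PMD to be split. */
	madvise(p + SZ_2M / 2, 4096, MADV_DONTNEED);
	show_anon_huge("after partial MADV_DONTNEED");

	munmap(raw, 2 * SZ_2M);
	return 0;
}
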
2904 void split_huge_pmd_address(struct vm_area_st    2278 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2905                 bool freeze, struct folio *fo !! 2279                 bool freeze, struct page *page)
2906 {                                                2280 {
2907         pmd_t *pmd = mm_find_pmd(vma->vm_mm,  !! 2281         pgd_t *pgd;
                                                   >> 2282         p4d_t *p4d;
                                                   >> 2283         pud_t *pud;
                                                   >> 2284         pmd_t *pmd;
2908                                                  2285 
2909         if (!pmd)                             !! 2286         pgd = pgd_offset(vma->vm_mm, address);
                                                   >> 2287         if (!pgd_present(*pgd))
2910                 return;                          2288                 return;
2911                                                  2289 
2912         __split_huge_pmd(vma, pmd, address, f !! 2290         p4d = p4d_offset(pgd, address);
2913 }                                             !! 2291         if (!p4d_present(*p4d))
                                                   >> 2292                 return;
2914                                                  2293 
2915 static inline void split_huge_pmd_if_needed(s !! 2294         pud = pud_offset(p4d, address);
2916 {                                             !! 2295         if (!pud_present(*pud))
2917         /*                                    !! 2296                 return;
2918          * If the new address isn't hpage ali !! 2297 
2919          * contain an hugepage: check if we n !! 2298         pmd = pmd_offset(pud, address);
2920          */                                   !! 2299 
2921         if (!IS_ALIGNED(address, HPAGE_PMD_SI !! 2300         __split_huge_pmd(vma, pmd, address, freeze, page);
2922             range_in_vma(vma, ALIGN_DOWN(addr << 
2923                          ALIGN(address, HPAGE << 
2924                 split_huge_pmd_address(vma, a << 
2925 }                                                2301 }
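
Both versions of this helper gate the split on the same address arithmetic: only an address that is not PMD-aligned, but whose surrounding PMD-sized block lies entirely inside the VMA, can still be covered by a huge pmd. Below is a minimal userspace sketch of that check, assuming a 2 MiB PMD size and made-up VMA bounds (none of these names are the kernel's).

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_SIZE		(2UL << 20)	/* assume a 2 MiB PMD for this sketch */
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

/* Model of the test: split only if addr is unaligned but the aligned
 * PMD-sized block around it fits entirely inside [vm_start, vm_end). */
static bool needs_split(unsigned long addr, unsigned long vm_start,
			unsigned long vm_end)
{
	if (!(addr & (HPAGE_PMD_SIZE - 1)))
		return false;			/* already PMD aligned */
	return ALIGN_DOWN(addr, HPAGE_PMD_SIZE) >= vm_start &&
	       ALIGN_UP(addr, HPAGE_PMD_SIZE) <= vm_end;
}

int main(void)
{
	/* hypothetical VMA covering [4 MiB, 16 MiB) */
	unsigned long vm_start = 4UL << 20, vm_end = 16UL << 20;

	printf("%d\n", needs_split((6UL << 20) + 4096, vm_start, vm_end)); /* 1 */
	printf("%d\n", needs_split(6UL << 20, vm_start, vm_end));          /* 0 */
	return 0;
}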
2926                                                  2302 
2927 void vma_adjust_trans_huge(struct vm_area_str    2303 void vma_adjust_trans_huge(struct vm_area_struct *vma,
2928                              unsigned long st    2304                              unsigned long start,
2929                              unsigned long en    2305                              unsigned long end,
2930                              long adjust_next    2306                              long adjust_next)
2931 {                                                2307 {
2932         /* Check if we need to split start fi << 
2933         split_huge_pmd_if_needed(vma, start); << 
2934                                               << 
2935         /* Check if we need to split end next << 
2936         split_huge_pmd_if_needed(vma, end);   << 
2937                                               << 
2938         /*                                       2308         /*
2939          * If we're also updating the next vm !! 2309          * If the new start address isn't hpage aligned and it could
2940          * check if we need to split it.      !! 2310          * previously contain an hugepage: check if we need to split
                                                   >> 2311          * an huge pmd.
2941          */                                      2312          */
2942         if (adjust_next > 0) {                !! 2313         if (start & ~HPAGE_PMD_MASK &&
2943                 struct vm_area_struct *next = !! 2314             (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2944                 unsigned long nstart = next-> !! 2315             (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2945                 nstart += adjust_next;        !! 2316                 split_huge_pmd_address(vma, start, false, NULL);
2946                 split_huge_pmd_if_needed(next << 
2947         }                                     << 
2948 }                                             << 
2949                                               << 
2950 static void unmap_folio(struct folio *folio)  << 
2951 {                                             << 
2952         enum ttu_flags ttu_flags = TTU_RMAP_L << 
2953                 TTU_BATCH_FLUSH;              << 
2954                                               << 
2955         VM_BUG_ON_FOLIO(!folio_test_large(fol << 
2956                                               << 
2957         if (folio_test_pmd_mappable(folio))   << 
2958                 ttu_flags |= TTU_SPLIT_HUGE_P << 
2959                                                  2317 
2960         /*                                       2318         /*
2961          * Anon pages need migration entries  !! 2319          * If the new end address isn't hpage aligned and it could
2962          * pages can simply be left unmapped, !! 2320          * previously contain an hugepage: check if we need to split
2963          * If that is ever changed (perhaps f !! 2321          * an huge pmd.
2964          */                                      2322          */
2965         if (folio_test_anon(folio))           !! 2323         if (end & ~HPAGE_PMD_MASK &&
2966                 try_to_migrate(folio, ttu_fla !! 2324             (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2967         else                                  !! 2325             (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2968                 try_to_unmap(folio, ttu_flags !! 2326                 split_huge_pmd_address(vma, end, false, NULL);
2969                                               << 
2970         try_to_unmap_flush();                 << 
2971 }                                             << 
2972                                               << 
2973 static bool __discard_anon_folio_pmd_locked(s << 
2974                                             u << 
2975                                             s << 
2976 {                                             << 
2977         struct mm_struct *mm = vma->vm_mm;    << 
2978         int ref_count, map_count;             << 
2979         pmd_t orig_pmd = *pmdp;               << 
2980                                               << 
2981         if (folio_test_dirty(folio) || pmd_di << 
2982                 return false;                 << 
2983                                               << 
2984         orig_pmd = pmdp_huge_clear_flush(vma, << 
2985                                               << 
2986         /*                                    << 
2987          * Syncing against concurrent GUP-fas << 
2988          * - clear PMD; barrier; read refcoun << 
2989          * - inc refcount; barrier; read PMD  << 
2990          */                                   << 
2991         smp_mb();                             << 
2992                                               << 
2993         ref_count = folio_ref_count(folio);   << 
2994         map_count = folio_mapcount(folio);    << 
2995                                               << 
2996         /*                                    << 
2997          * Order reads for folio refcount and << 
2998          * (see comments in __remove_mapping( << 
2999          */                                   << 
3000         smp_rmb();                            << 
3001                                                  2327 
3002         /*                                       2328         /*
3003          * If the folio or its PMD is redirti !! 2329          * If we're also updating the vma->vm_next->vm_start, if the new
3004          * are unexpected references, we will !! 2330          * vm_next->vm_start isn't page aligned and it could previously
3005          * and remap it.                      !! 2331          * contain an hugepage: check if we need to split an huge pmd.
3006          *                                    << 
3007          * The only folio refs must be one fr << 
3008          */                                      2332          */
3009         if (folio_test_dirty(folio) || pmd_di !! 2333         if (adjust_next > 0) {
3010             ref_count != map_count + 1) {     !! 2334                 struct vm_area_struct *next = vma->vm_next;
3011                 set_pmd_at(mm, addr, pmdp, or !! 2335                 unsigned long nstart = next->vm_start;
3012                 return false;                 !! 2336                 nstart += adjust_next << PAGE_SHIFT;
                                                   >> 2337                 if (nstart & ~HPAGE_PMD_MASK &&
                                                   >> 2338                     (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
                                                   >> 2339                     (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
                                                   >> 2340                         split_huge_pmd_address(next, nstart, false, NULL);
3013         }                                        2341         }
3014                                               << 
3015         folio_remove_rmap_pmd(folio, pmd_page << 
3016         zap_deposited_table(mm, pmdp);        << 
3017         add_mm_counter(mm, MM_ANONPAGES, -HPA << 
3018         if (vma->vm_flags & VM_LOCKED)        << 
3019                 mlock_drain_local();          << 
3020         folio_put(folio);                     << 
3021                                               << 
3022         return true;                          << 
3023 }                                                2342 }
3024                                                  2343 
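
The comment in __discard_anon_folio_pmd_locked() describes a symmetric ordering contract with GUP-fast: the zapping side clears the PMD, issues a full barrier, then reads the refcount, while GUP-fast raises the refcount, issues a barrier, then re-reads the PMD. The following is a compile-only userspace model of that contract using C11 atomics; every name here is illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdbool.h>

/* Compile-only model: "pmd" stands for the page-table entry and
 * "refcount" for the folio reference count. */
static _Atomic long pmd = 1;		/* 1 = present, 0 = cleared */
static _Atomic long refcount = 1;	/* the unmap path's expected reference */

/* Zapping side: clear the entry, full barrier, then read the refcount. */
static bool try_discard(void)
{
	atomic_store(&pmd, 0);
	atomic_thread_fence(memory_order_seq_cst);	/* pairs with gup_fast_pin() */
	return atomic_load(&refcount) == 1;		/* no concurrent pin seen */
}

/* GUP-fast side: take a reference, full barrier, then re-read the entry. */
static bool gup_fast_pin(void)
{
	atomic_fetch_add(&refcount, 1);
	atomic_thread_fence(memory_order_seq_cst);	/* pairs with try_discard() */
	if (!atomic_load(&pmd)) {			/* entry gone: back off */
		atomic_fetch_sub(&refcount, 1);
		return false;
	}
	return true;
}

Whatever the interleaving, at least one side observes the other: either try_discard() sees the extra reference (and the kernel code then restores the PMD and bails out), or gup_fast_pin() sees the cleared entry and drops its pin.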
3025 bool unmap_huge_pmd_locked(struct vm_area_str !! 2344 static void unmap_page(struct page *page)
3026                            pmd_t *pmdp, struc << 
3027 {                                                2345 {
3028         VM_WARN_ON_FOLIO(!folio_test_pmd_mapp !! 2346         enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
3029         VM_WARN_ON_FOLIO(!folio_test_locked(f !! 2347                 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
3030         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPA !! 2348         bool unmap_success;
3031                                               << 
3032         if (folio_test_anon(folio) && !folio_ << 
3033                 return __discard_anon_folio_p << 
3034                                                  2349 
3035         return false;                         !! 2350         VM_BUG_ON_PAGE(!PageHead(page), page);
3036 }                                             << 
3037                                                  2351 
3038 static void remap_page(struct folio *folio, u !! 2352         if (PageAnon(page))
3039 {                                             !! 2353                 ttu_flags |= TTU_SPLIT_FREEZE;
3040         int i = 0;                            << 
3041                                                  2354 
3042         /* If unmap_folio() uses try_to_migra !! 2355         unmap_success = try_to_unmap(page, ttu_flags);
3043         if (!folio_test_anon(folio))          !! 2356         VM_BUG_ON_PAGE(!unmap_success, page);
3044                 return;                       << 
3045         for (;;) {                            << 
3046                 remove_migration_ptes(folio,  << 
3047                 i += folio_nr_pages(folio);   << 
3048                 if (i >= nr)                  << 
3049                         break;                << 
3050                 folio = folio_next(folio);    << 
3051         }                                     << 
3052 }                                                2357 }
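
In the 6.12 code, remap_page() no longer assumes order-0 results: it walks whatever folios now cover the original range, advancing by each visited folio's size. A small standalone sketch of that walk, with made-up sizes:

#include <stdio.h>

int main(void)
{
	unsigned int nr = 16;		/* original folio had 16 pages */
	unsigned int new_order = 2;	/* split into order-2 (4-page) folios */
	unsigned int i = 0;

	for (;;) {
		printf("visit folio covering pages [%u, %u)\n",
		       i, i + (1u << new_order));
		i += 1u << new_order;	/* advance by this folio's page count */
		if (i >= nr)
			break;
	}
	return 0;
}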
3053                                                  2358 
3054 static void lru_add_page_tail(struct folio *f !! 2359 static void remap_page(struct page *page)
3055                 struct lruvec *lruvec, struct << 
3056 {                                                2360 {
3057         VM_BUG_ON_FOLIO(!folio_test_large(fol !! 2361         int i;
3058         VM_BUG_ON_FOLIO(PageLRU(tail), folio) !! 2362         if (PageTransHuge(page)) {
3059         lockdep_assert_held(&lruvec->lru_lock !! 2363                 remove_migration_ptes(page, page, true);
3060                                               << 
3061         if (list) {                           << 
3062                 /* page reclaim is reclaiming << 
3063                 VM_WARN_ON(folio_test_lru(fol << 
3064                 get_page(tail);               << 
3065                 list_add_tail(&tail->lru, lis << 
3066         } else {                                 2364         } else {
3067                 /* head is still on lru (and  !! 2365                 for (i = 0; i < HPAGE_PMD_NR; i++)
3068                 VM_WARN_ON(!folio_test_lru(fo !! 2366                         remove_migration_ptes(page + i, page + i, true);
3069                 if (folio_test_unevictable(fo << 
3070                         tail->mlock_count = 0 << 
3071                 else                          << 
3072                         list_add_tail(&tail-> << 
3073                 SetPageLRU(tail);             << 
3074         }                                        2367         }
3075 }                                                2368 }
3076                                                  2369 
3077 static void __split_huge_page_tail(struct fol !! 2370 static void __split_huge_page_tail(struct page *head, int tail,
3078                 struct lruvec *lruvec, struct !! 2371                 struct lruvec *lruvec, struct list_head *list)
3079                 unsigned int new_order)       << 
3080 {                                                2372 {
3081         struct page *head = &folio->page;     << 
3082         struct page *page_tail = head + tail;    2373         struct page *page_tail = head + tail;
3083         /*                                    << 
3084          * Careful: new_folio is not a "real" << 
3085          * Don't pass it around before clear_ << 
3086          */                                   << 
3087         struct folio *new_folio = (struct fol << 
3088                                                  2374 
3089         VM_BUG_ON_PAGE(atomic_read(&page_tail    2375         VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
3090                                                  2376 
3091         /*                                       2377         /*
3092          * Clone page flags before unfreezing    2378          * Clone page flags before unfreezing refcount.
3093          *                                       2379          *
3094          * After successful get_page_unless_z    2380          * After successful get_page_unless_zero() might follow flags change,
3095          * for example lock_page() which set  !! 2381          * for example lock_page() which set PG_waiters.
3096          *                                    << 
3097          * Note that for mapped sub-pages of  << 
3098          * PG_anon_exclusive has been cleared << 
3099          * the migration entry instead from w << 
3100          * We can still have PG_anon_exclusiv << 
3101          * unreferenced sub-pages of an anony << 
3102          * PG_anon_exclusive (-> PG_mappedtod << 
3103          */                                      2382          */
3104         page_tail->flags &= ~PAGE_FLAGS_CHECK    2383         page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3105         page_tail->flags |= (head->flags &       2384         page_tail->flags |= (head->flags &
3106                         ((1L << PG_referenced    2385                         ((1L << PG_referenced) |
3107                          (1L << PG_swapbacked    2386                          (1L << PG_swapbacked) |
3108                          (1L << PG_swapcache)    2387                          (1L << PG_swapcache) |
3109                          (1L << PG_mlocked) |    2388                          (1L << PG_mlocked) |
3110                          (1L << PG_uptodate)     2389                          (1L << PG_uptodate) |
3111                          (1L << PG_active) |     2390                          (1L << PG_active) |
3112                          (1L << PG_workingset    2391                          (1L << PG_workingset) |
3113                          (1L << PG_locked) |     2392                          (1L << PG_locked) |
3114                          (1L << PG_unevictabl    2393                          (1L << PG_unevictable) |
3115 #ifdef CONFIG_ARCH_USES_PG_ARCH_2             !! 2394                          (1L << PG_dirty)));
3116                          (1L << PG_arch_2) |  << 
3117 #endif                                        << 
3118 #ifdef CONFIG_ARCH_USES_PG_ARCH_3             << 
3119                          (1L << PG_arch_3) |  << 
3120 #endif                                        << 
3121                          (1L << PG_dirty) |   << 
3122                          LRU_GEN_MASK | LRU_R << 
3123                                                  2395 
3124         /* ->mapping in first and second tail !! 2396         /* ->mapping in first tail page is compound_mapcount */
3125         VM_BUG_ON_PAGE(tail > 2 && page_tail-    2397         VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
3126                         page_tail);              2398                         page_tail);
3127         page_tail->mapping = head->mapping;      2399         page_tail->mapping = head->mapping;
3128         page_tail->index = head->index + tail    2400         page_tail->index = head->index + tail;
3129                                                  2401 
3130         /*                                    << 
3131          * page->private should not be set in << 
3132          * if private is unexpectedly set.    << 
3133          */                                   << 
3134         if (unlikely(page_tail->private)) {   << 
3135                 VM_WARN_ON_ONCE_PAGE(true, pa << 
3136                 page_tail->private = 0;       << 
3137         }                                     << 
3138         if (folio_test_swapcache(folio))      << 
3139                 new_folio->swap.val = folio-> << 
3140                                               << 
3141         /* Page flags must be visible before     2402         /* Page flags must be visible before we make the page non-compound. */
3142         smp_wmb();                               2403         smp_wmb();
3143                                                  2404 
3144         /*                                       2405         /*
3145          * Clear PageTail before unfreezing p    2406          * Clear PageTail before unfreezing page refcount.
3146          *                                       2407          *
3147          * After successful get_page_unless_z    2408          * After successful get_page_unless_zero() might follow put_page()
3148          * which needs correct compound_head(    2409          * which needs correct compound_head().
3149          */                                      2410          */
3150         clear_compound_head(page_tail);          2411         clear_compound_head(page_tail);
3151         if (new_order) {                      << 
3152                 prep_compound_page(page_tail, << 
3153                 folio_set_large_rmappable(new << 
3154         }                                     << 
3155                                                  2412 
3156         /* Finally unfreeze refcount. Additio    2413         /* Finally unfreeze refcount. Additional reference from page cache. */
3157         page_ref_unfreeze(page_tail,          !! 2414         page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
3158                 1 + ((!folio_test_anon(folio) !! 2415                                           PageSwapCache(head)));
3159                              folio_nr_pages(n << 
3160                                               << 
3161         if (folio_test_young(folio))          << 
3162                 folio_set_young(new_folio);   << 
3163         if (folio_test_idle(folio))           << 
3164                 folio_set_idle(new_folio);    << 
3165                                                  2416 
3166         folio_xchg_last_cpupid(new_folio, fol !! 2417         if (page_is_young(head))
                                                   >> 2418                 set_page_young(page_tail);
                                                   >> 2419         if (page_is_idle(head))
                                                   >> 2420                 set_page_idle(page_tail);
                                                   >> 2421 
                                                   >> 2422         page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
3167                                                  2423 
3168         /*                                       2424         /*
3169          * always add to the tail because som    2425          * always add to the tail because some iterators expect new
3170          * pages to show after the currently     2426          * pages to show after the currently processed elements - e.g.
3171          * migrate_pages                         2427          * migrate_pages
3172          */                                      2428          */
3173         lru_add_page_tail(folio, page_tail, l !! 2429         lru_add_page_tail(head, page_tail, lruvec, list);
3174 }                                                2430 }
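
The unfreeze value at the end of __split_huge_page_tail() is the reference count each new tail folio is expected to start with: one baseline reference plus, when the folio sits in the page cache or swap cache, one reference per subpage of the new folio. A tiny model of that expression (the helper name and numbers are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Model of the page_ref_unfreeze() argument for a new tail folio. */
static int tail_unfreeze_refs(bool anon, bool swapcache, unsigned int new_order)
{
	unsigned int nr_new = 1u << new_order;	/* pages in the new folio */

	return 1 + ((!anon || swapcache) ? nr_new : 0);
}

int main(void)
{
	printf("%d\n", tail_unfreeze_refs(true, false, 0));	/* anon, order-0: 1 */
	printf("%d\n", tail_unfreeze_refs(false, false, 2));	/* file, order-2: 5 */
	return 0;
}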
3175                                                  2431 
3176 static void __split_huge_page(struct page *pa    2432 static void __split_huge_page(struct page *page, struct list_head *list,
3177                 pgoff_t end, unsigned int new !! 2433                 pgoff_t end, unsigned long flags)
3178 {                                                2434 {
3179         struct folio *folio = page_folio(page !! 2435         struct page *head = compound_head(page);
3180         struct page *head = &folio->page;     !! 2436         struct zone *zone = page_zone(head);
3181         struct lruvec *lruvec;                   2437         struct lruvec *lruvec;
3182         struct address_space *swap_cache = NU !! 2438         int i;
3183         unsigned long offset = 0;             << 
3184         int i, nr_dropped = 0;                << 
3185         unsigned int new_nr = 1 << new_order; << 
3186         int order = folio_order(folio);       << 
3187         unsigned int nr = 1 << order;         << 
3188                                               << 
3189         /* complete memcg works before add pa << 
3190         split_page_memcg(head, order, new_ord << 
3191                                               << 
3192         if (folio_test_anon(folio) && folio_t << 
3193                 offset = swap_cache_index(fol << 
3194                 swap_cache = swap_address_spa << 
3195                 xa_lock(&swap_cache->i_pages) << 
3196         }                                     << 
3197                                                  2439 
3198         /* lock lru list/PageCompound, ref fr !! 2440         lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
3199         lruvec = folio_lruvec_lock(folio);    << 
3200                                                  2441 
3201         ClearPageHasHWPoisoned(head);         !! 2442         /* complete memcg works before add pages to LRU */
                                                   >> 2443         mem_cgroup_split_huge_fixup(head);
3202                                                  2444 
3203         for (i = nr - new_nr; i >= new_nr; i  !! 2445         for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
3204                 __split_huge_page_tail(folio, !! 2446                 __split_huge_page_tail(head, i, lruvec, list);
3205                 /* Some pages can be beyond E !! 2447                 /* Some pages can be beyond i_size: drop them from page cache */
3206                 if (head[i].index >= end) {      2448                 if (head[i].index >= end) {
3207                         struct folio *tail =  !! 2449                         ClearPageDirty(head + i);
3208                                               !! 2450                         __delete_from_page_cache(head + i, NULL);
3209                         if (shmem_mapping(fol !! 2451                         if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
3210                                 nr_dropped++; !! 2452                                 shmem_uncharge(head->mapping->host, 1);
3211                         else if (folio_test_c !! 2453                         put_page(head + i);
3212                                 folio_account << 
3213                                         inode << 
3214                         __filemap_remove_foli << 
3215                         folio_put(tail);      << 
3216                 } else if (!PageAnon(page)) { << 
3217                         __xa_store(&folio->ma << 
3218                                         head  << 
3219                 } else if (swap_cache) {      << 
3220                         __xa_store(&swap_cach << 
3221                                         head  << 
3222                 }                                2454                 }
3223         }                                        2455         }
3224                                                  2456 
3225         if (!new_order)                       !! 2457         ClearPageCompound(head);
3226                 ClearPageCompound(head);      << 
3227         else {                                << 
3228                 struct folio *new_folio = (st << 
3229                                               << 
3230                 folio_set_order(new_folio, ne << 
3231         }                                     << 
3232         unlock_page_lruvec(lruvec);           << 
3233         /* Caller disabled irqs, so they are  << 
3234                                               << 
3235         split_page_owner(head, order, new_ord << 
3236         pgalloc_tag_split(folio, order, new_o << 
3237                                               << 
3238         /* See comment in __split_huge_page_t    2458         /* See comment in __split_huge_page_tail() */
3239         if (folio_test_anon(folio)) {         !! 2459         if (PageAnon(head)) {
3240                 /* Additional pin to swap cac    2460                 /* Additional pin to swap cache */
3241                 if (folio_test_swapcache(foli !! 2461                 if (PageSwapCache(head))
3242                         folio_ref_add(folio,  !! 2462                         page_ref_add(head, 2);
3243                         xa_unlock(&swap_cache !! 2463                 else
3244                 } else {                      !! 2464                         page_ref_inc(head);
3245                         folio_ref_inc(folio); << 
3246                 }                             << 
3247         } else {                                 2465         } else {
3248                 /* Additional pin to page cac    2466                 /* Additional pin to page cache */
3249                 folio_ref_add(folio, 1 + new_ !! 2467                 page_ref_add(head, 2);
3250                 xa_unlock(&folio->mapping->i_ !! 2468                 xa_unlock(&head->mapping->i_pages);
3251         }                                        2469         }
3252         local_irq_enable();                   << 
3253                                                  2470 
3254         if (nr_dropped)                       !! 2471         spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
3255                 shmem_uncharge(folio->mapping << 
3256         remap_page(folio, nr, PageAnon(head)  << 
3257                                                  2472 
3258         /*                                    !! 2473         remap_page(head);
3259          * set page to its compound_head when << 
3260          * we can skip unlocking it below, si << 
3261          * the compound_head of the page and  << 
3262          */                                   << 
3263         if (new_order)                        << 
3264                 page = compound_head(page);   << 
3265                                                  2474 
3266         for (i = 0; i < nr; i += new_nr) {    !! 2475         for (i = 0; i < HPAGE_PMD_NR; i++) {
3267                 struct page *subpage = head +    2476                 struct page *subpage = head + i;
3268                 struct folio *new_folio = pag << 
3269                 if (subpage == page)             2477                 if (subpage == page)
3270                         continue;                2478                         continue;
3271                 folio_unlock(new_folio);      !! 2479                 unlock_page(subpage);
3272                                                  2480 
3273                 /*                               2481                 /*
3274                  * Subpages may be freed if t    2482                  * Subpages may be freed if there wasn't any mapping
3275                  * like if add_to_swap() is r    2483                  * like if add_to_swap() is running on a lru page that
3276                  * had its mapping zapped. An    2484                  * had its mapping zapped. And freeing these pages
3277                  * requires taking the lru_lo    2485                  * requires taking the lru_lock so we do the put_page
3278                  * of the tail pages after th    2486                  * of the tail pages after the split is complete.
3279                  */                              2487                  */
3280                 free_page_and_swap_cache(subp !! 2488                 put_page(subpage);
3281         }                                        2489         }
3282 }                                                2490 }
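
During the split, tail pages whose index lands at or past end (computed earlier as DIV_ROUND_UP(i_size, PAGE_SIZE)) are dropped from the page cache instead of being kept. A standalone arithmetic sketch with made-up numbers:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* hypothetical: a 16-page (64 KiB) folio at file index 0,
	 * file size 37000 bytes */
	unsigned long i_size = 37000;
	unsigned long head_index = 0, nr = 16;
	unsigned long end = DIV_ROUND_UP(i_size, PAGE_SIZE);	/* 10 */

	for (unsigned long i = 0; i < nr; i++)
		if (head_index + i >= end)
			printf("tail %lu is beyond EOF: drop from page cache\n", i);
	return 0;
}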
3283                                                  2491 
                                                   >> 2492 int total_mapcount(struct page *page)
                                                   >> 2493 {
                                                   >> 2494         int i, compound, ret;
                                                   >> 2495 
                                                   >> 2496         VM_BUG_ON_PAGE(PageTail(page), page);
                                                   >> 2497 
                                                   >> 2498         if (likely(!PageCompound(page)))
                                                   >> 2499                 return atomic_read(&page->_mapcount) + 1;
                                                   >> 2500 
                                                   >> 2501         compound = compound_mapcount(page);
                                                   >> 2502         if (PageHuge(page))
                                                   >> 2503                 return compound;
                                                   >> 2504         ret = compound;
                                                   >> 2505         for (i = 0; i < HPAGE_PMD_NR; i++)
                                                   >> 2506                 ret += atomic_read(&page[i]._mapcount) + 1;
                                                   >> 2507          /* File pages have compound_mapcount included in _mapcount */
                                                   >> 2508         if (!PageAnon(page))
                                                   >> 2509                 return ret - compound * HPAGE_PMD_NR;
                                                   >> 2510         if (PageDoubleMap(page))
                                                   >> 2511                 ret -= HPAGE_PMD_NR;
                                                   >> 2512         return ret;
                                                   >> 2513 }
                                                   >> 2514 
                                                   >> 2515 /*
                                                   >> 2516  * This calculates accurately how many mappings a transparent hugepage
                                                   >> 2517  * has (unlike page_mapcount() which isn't fully accurate). This full
                                                   >> 2518  * accuracy is primarily needed to know if copy-on-write faults can
                                                   >> 2519  * reuse the page and change the mapping to read-write instead of
                                                   >> 2520  * copying them. At the same time this returns the total_mapcount too.
                                                   >> 2521  *
                                                   >> 2522  * The function returns the highest mapcount any one of the subpages
                                                   >> 2523  * has. If the return value is one, even if different processes are
                                                   >> 2524  * mapping different subpages of the transparent hugepage, they can
                                                   >> 2525  * all reuse it, because each process is reusing a different subpage.
                                                   >> 2526  *
                                                   >> 2527  * The total_mapcount is instead counting all virtual mappings of the
                                                   >> 2528  * subpages. If the total_mapcount is equal to "one", it tells the
                                                   >> 2529  * caller all mappings belong to the same "mm" and in turn the
                                                   >> 2530  * anon_vma of the transparent hugepage can become the vma->anon_vma
                                                   >> 2531  * local one as no other process may be mapping any of the subpages.
                                                   >> 2532  *
                                                   >> 2533  * It would be more accurate to replace page_mapcount() with
                                                   >> 2534  * page_trans_huge_mapcount(), however we only use
                                                   >> 2535  * page_trans_huge_mapcount() in the copy-on-write faults where we
                                                   >> 2536  * need full accuracy to avoid breaking page pinning, because
                                                   >> 2537  * page_trans_huge_mapcount() is slower than page_mapcount().
                                                   >> 2538  */
                                                   >> 2539 int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
                                                   >> 2540 {
                                                   >> 2541         int i, ret, _total_mapcount, mapcount;
                                                   >> 2542 
                                                   >> 2543         /* hugetlbfs shouldn't call it */
                                                   >> 2544         VM_BUG_ON_PAGE(PageHuge(page), page);
                                                   >> 2545 
                                                   >> 2546         if (likely(!PageTransCompound(page))) {
                                                   >> 2547                 mapcount = atomic_read(&page->_mapcount) + 1;
                                                   >> 2548                 if (total_mapcount)
                                                   >> 2549                         *total_mapcount = mapcount;
                                                   >> 2550                 return mapcount;
                                                   >> 2551         }
                                                   >> 2552 
                                                   >> 2553         page = compound_head(page);
                                                   >> 2554 
                                                   >> 2555         _total_mapcount = ret = 0;
                                                   >> 2556         for (i = 0; i < HPAGE_PMD_NR; i++) {
                                                   >> 2557                 mapcount = atomic_read(&page[i]._mapcount) + 1;
                                                   >> 2558                 ret = max(ret, mapcount);
                                                   >> 2559                 _total_mapcount += mapcount;
                                                   >> 2560         }
                                                   >> 2561         if (PageDoubleMap(page)) {
                                                   >> 2562                 ret -= 1;
                                                   >> 2563                 _total_mapcount -= HPAGE_PMD_NR;
                                                   >> 2564         }
                                                   >> 2565         mapcount = compound_mapcount(page);
                                                   >> 2566         ret += mapcount;
                                                   >> 2567         _total_mapcount += mapcount;
                                                   >> 2568         if (total_mapcount)
                                                   >> 2569                 *total_mapcount = _total_mapcount;
                                                   >> 2570         return ret;
                                                   >> 2571 }
                                                   >> 2572 
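
The long comment above page_trans_huge_mapcount() distinguishes the highest per-subpage mapcount (what matters for COW reuse) from the total number of mappings. A worked example matching the scenario it describes, with two processes PTE-mapping disjoint halves of one THP (all values are illustrative):

#include <stdio.h>

#define HPAGE_PMD_NR	512

int main(void)
{
	int subpage_mapcount[HPAGE_PMD_NR];
	int total = 0, highest = 0;

	/* each subpage is mapped by exactly one of the two processes */
	for (int i = 0; i < HPAGE_PMD_NR; i++)
		subpage_mapcount[i] = 1;

	for (int i = 0; i < HPAGE_PMD_NR; i++) {
		total += subpage_mapcount[i];
		if (subpage_mapcount[i] > highest)
			highest = subpage_mapcount[i];
	}
	/* highest=1: each process may reuse its own subpages on COW;
	 * total=512: the mappings clearly do not all belong to one mm. */
	printf("highest=%d total=%d\n", highest, total);
	return 0;
}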
3284 /* Racy check whether the huge page can be sp    2573 /* Racy check whether the huge page can be split */
3285 bool can_split_folio(struct folio *folio, int !! 2574 bool can_split_huge_page(struct page *page, int *pextra_pins)
3286 {                                                2575 {
3287         int extra_pins;                          2576         int extra_pins;
3288                                                  2577 
3289         /* Additional pins from page cache */    2578         /* Additional pins from page cache */
3290         if (folio_test_anon(folio))           !! 2579         if (PageAnon(page))
3291                 extra_pins = folio_test_swapc !! 2580                 extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
3292                                 folio_nr_page << 
3293         else                                     2581         else
3294                 extra_pins = folio_nr_pages(f !! 2582                 extra_pins = HPAGE_PMD_NR;
3295         if (pextra_pins)                         2583         if (pextra_pins)
3296                 *pextra_pins = extra_pins;       2584                 *pextra_pins = extra_pins;
3297         return folio_mapcount(folio) == folio !! 2585         return total_mapcount(page) == page_count(page) - extra_pins - 1;
3298                                         calle << 
3299 }                                                2586 }
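
can_split_folio() treats the folio as splittable only when every reference can be accounted for: the mappings (assuming each mapping also holds one reference, which is what the check implies), the page/swap-cache pins (extra_pins), the caller's own pins, and nothing else. A minimal model of that bookkeeping, with illustrative names and numbers:

#include <stdbool.h>
#include <stdio.h>

/* Any reference not explained by a mapping, the cache, or the caller is
 * an unexpected pin (e.g. GUP), and the split is refused. */
static bool can_split(int refcount, int mapcount, int extra_pins, int caller_pins)
{
	return mapcount == refcount - extra_pins - caller_pins;
}

int main(void)
{
	/* hypothetical anon folio of 512 pages, fully PTE-mapped, not in
	 * swap cache (extra_pins = 0), caller holds one pin */
	printf("%d\n", can_split(513, 512, 0, 1));	/* 1: splittable */
	printf("%d\n", can_split(514, 512, 0, 1));	/* 0: extra pin present */
	return 0;
}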
3300                                                  2587 
3301 /*                                               2588 /*
3302  * This function splits a large folio into sm !! 2589  * This function splits huge page into normal pages. @page can point to any
3303  * @page can point to any page of the large f !! 2590  * subpage of huge page to split. Split doesn't change the position of @page.
3304  * does not change the position of @page.     << 
3305  *                                            << 
3306  * Prerequisites:                             << 
3307  *                                            << 
3308  * 1) The caller must hold a reference on the << 
3309  *    as the large folio.                     << 
3310  *                                            << 
3311  * 2) The large folio must be locked.         << 
3312  *                                               2591  *
3313  *    GUP pins, will result in the folio not  !! 2592  * The caller must hold the only pin on @page, otherwise the split fails with -EBUSY.
3314  *    GUP pins, will result in the folio not  !! 2593  * The huge page must be locked.
3315  *    will receive an -EAGAIN.                << 
3316  *                                            << 
3317  * 4) @new_order > 1, usually. Splitting to o << 
3318  *    supported for non-file-backed folios, b << 
3319  *    is used by partially mapped folios, is  << 
3320  *    folio only has subpages 0 and 1. File-b << 
3321  *    since they do not use _deferred_list.   << 
3322  *                                            << 
3323  * After splitting, the caller's folio refere << 
3324  * resulting in a raised refcount of @page af << 
3325  * be freed if they are not mapped.           << 
3326  *                                               2594  *
3327  * If @list is null, tail pages will be added    2595  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
3328  *                                               2596  *
3329  * Pages in @new_order will inherit the mappi !! 2597  * Both head page and tail pages will inherit mapping, flags, and so on from
3330  * huge page.                                 !! 2598  * the hugepage.
3331  *                                            << 
3332  * Returns 0 if the huge page was split succe << 
3333  *                                            << 
3334  * Returns -EAGAIN if the folio has unexpecte << 
3335  * the folio was concurrently removed from th << 
3336  *                                            << 
3337  * Returns -EBUSY when trying to split the hu << 
3338  * under writeback, if fs-specific folio meta << 
3339  * released, or if some unexpected race happe << 
3340  * truncation).                               << 
3341  *                                               2599  *
3342  * Callers should ensure that the order respe !! 2600  * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
3343  * min-order if one is set for non-anonymous  !! 2601  * they are not mapped.
3344  *                                               2602  *
3345  * Returns -EINVAL when trying to split to an !! 2603  * Returns 0 if the hugepage is split successfully.
3346  * with the folio. Splitting to order 0 is co !! 2604  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
                                                   >> 2605  * us.
3347  */                                              2606  */
3348 int split_huge_page_to_list_to_order(struct p !! 2607 int split_huge_page_to_list(struct page *page, struct list_head *list)
3349                                      unsigned << 
3350 {                                                2608 {
3351         struct folio *folio = page_folio(page !! 2609         struct page *head = compound_head(page);
3352         struct deferred_split *ds_queue = get !! 2610         struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
3353         /* reset xarray order to new order af << 
3354         XA_STATE_ORDER(xas, &folio->mapping-> << 
3355         bool is_anon = folio_test_anon(folio) << 
3356         struct address_space *mapping = NULL; << 
3357         struct anon_vma *anon_vma = NULL;        2611         struct anon_vma *anon_vma = NULL;
3358         int order = folio_order(folio);       !! 2612         struct address_space *mapping = NULL;
3359         int extra_pins, ret;                  !! 2613         int count, mapcount, extra_pins, ret;
                                                   >> 2614         bool mlocked;
                                                   >> 2615         unsigned long flags;
3360         pgoff_t end;                             2616         pgoff_t end;
3361         bool is_hzp;                          << 
3362                                               << 
3363         VM_BUG_ON_FOLIO(!folio_test_locked(fo << 
3364         VM_BUG_ON_FOLIO(!folio_test_large(fol << 
3365                                                  2617 
3366         if (new_order >= folio_order(folio))  !! 2618         VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
3367                 return -EINVAL;               !! 2619         VM_BUG_ON_PAGE(!PageLocked(page), page);
3368                                               !! 2620         VM_BUG_ON_PAGE(!PageCompound(page), page);
3369         if (is_anon) {                        << 
3370                 /* order-1 is not supported f << 
3371                 if (new_order == 1) {         << 
3372                         VM_WARN_ONCE(1, "Cann << 
3373                         return -EINVAL;       << 
3374                 }                             << 
3375         } else if (new_order) {               << 
3376                 /* Split shmem folio to non-z << 
3377                 if (shmem_mapping(folio->mapp << 
3378                         VM_WARN_ONCE(1,       << 
3379                                 "Cannot split << 
3380                         return -EINVAL;       << 
3381                 }                             << 
3382                 /*                            << 
3383                  * No split if the file syste << 
3384                  * Note that we might still h << 
3385                  * CONFIG_READ_ONLY_THP_FOR_F << 
3386                  * does not actually support  << 
3387                  */                           << 
3388                 if (IS_ENABLED(CONFIG_READ_ON << 
3389                     !mapping_large_folio_supp << 
3390                         VM_WARN_ONCE(1,       << 
3391                                 "Cannot split << 
3392                         return -EINVAL;       << 
3393                 }                             << 
3394         }                                     << 
3395                                                  2621 
3396         /* Only swapping a whole PMD-mapped f !! 2622         if (PageWriteback(page))
3397         if (folio_test_swapcache(folio) && ne << 
3398                 return -EINVAL;               << 
3399                                               << 
3400         is_hzp = is_huge_zero_folio(folio);   << 
3401         if (is_hzp) {                         << 
3402                 pr_warn_ratelimited("Called s << 
3403                 return -EBUSY;                << 
3404         }                                     << 
3405                                               << 
3406         if (folio_test_writeback(folio))      << 
3407                 return -EBUSY;                   2623                 return -EBUSY;
3408                                                  2624 
3409         if (is_anon) {                        !! 2625         if (PageAnon(head)) {
3410                 /*                               2626                 /*
3411                  * The caller does not necess !! 2627                  * The caller does not necessarily hold an mmap_sem that would
3412                  * prevent the anon_vma disap    2628                  * prevent the anon_vma disappearing so we first we take a
3413                  * reference to it and then l    2629                  * reference to it and then lock the anon_vma for write. This
3414                  * is similar to folio_lock_a !! 2630                  * is similar to page_lock_anon_vma_read except the write lock
3415                  * is taken to serialise agai    2631                  * is taken to serialise against parallel split or collapse
3416                  * operations.                   2632                  * operations.
3417                  */                              2633                  */
3418                 anon_vma = folio_get_anon_vma !! 2634                 anon_vma = page_get_anon_vma(head);
3419                 if (!anon_vma) {                 2635                 if (!anon_vma) {
3420                         ret = -EBUSY;            2636                         ret = -EBUSY;
3421                         goto out;                2637                         goto out;
3422                 }                                2638                 }
3423                 end = -1;                        2639                 end = -1;
3424                 mapping = NULL;                  2640                 mapping = NULL;
3425                 anon_vma_lock_write(anon_vma)    2641                 anon_vma_lock_write(anon_vma);
3426         } else {                                 2642         } else {
3427                 unsigned int min_order;       !! 2643                 mapping = head->mapping;
3428                 gfp_t gfp;                    << 
3429                                               << 
3430                 mapping = folio->mapping;     << 
3431                                                  2644 
3432                 /* Truncated ? */                2645                 /* Truncated ? */
3433                 if (!mapping) {                  2646                 if (!mapping) {
3434                         ret = -EBUSY;            2647                         ret = -EBUSY;
3435                         goto out;                2648                         goto out;
3436                 }                                2649                 }
3437                                                  2650 
3438                 min_order = mapping_min_folio << 
3439                 if (new_order < min_order) {  << 
3440                         VM_WARN_ONCE(1, "Cann << 
3441                                      min_orde << 
3442                         ret = -EINVAL;        << 
3443                         goto out;             << 
3444                 }                             << 
3445                                               << 
3446                 gfp = current_gfp_context(map << 
3447                                               << 
3448                                               << 
3449                 if (!filemap_release_folio(fo << 
3450                         ret = -EBUSY;         << 
3451                         goto out;             << 
3452                 }                             << 
3453                                               << 
3454                 xas_split_alloc(&xas, folio,  << 
3455                 if (xas_error(&xas)) {        << 
3456                         ret = xas_error(&xas) << 
3457                         goto out;             << 
3458                 }                             << 
3459                                               << 
3460                 anon_vma = NULL;                 2651                 anon_vma = NULL;
3461                 i_mmap_lock_read(mapping);       2652                 i_mmap_lock_read(mapping);
3462                                                  2653 
3463                 /*                               2654                 /*
3464                  *__split_huge_page() may nee    2655                  *__split_huge_page() may need to trim off pages beyond EOF:
3465                  * but on 32-bit, i_size_read    2656                  * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
3466                  * which cannot be nested ins    2657                  * which cannot be nested inside the page tree lock. So note
3467                  * end now: i_size itself may    2658                  * end now: i_size itself may be changed at any moment, but
3468                  * folio lock is good enough  !! 2659                  * head page lock is good enough to serialize the trimming.
3469                  */                              2660                  */
3470                 end = DIV_ROUND_UP(i_size_rea    2661                 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3471                 if (shmem_mapping(mapping))   << 
3472                         end = shmem_fallocend << 
3473         }                                        2662         }
3474                                                  2663 
3475         /*                                       2664         /*
3476          * Racy check if we can split the pag !! 2665          * Racy check if we can split the page, before unmap_page() will
3477          * split PMDs                            2666          * split PMDs
3478          */                                      2667          */
3479         if (!can_split_folio(folio, 1, &extra !! 2668         if (!can_split_huge_page(head, &extra_pins)) {
3480                 ret = -EAGAIN;                !! 2669                 ret = -EBUSY;
3481                 goto out_unlock;                 2670                 goto out_unlock;
3482         }                                        2671         }
3483                                                  2672 
3484         unmap_folio(folio);                   !! 2673         mlocked = PageMlocked(page);
                                                   >> 2674         unmap_page(head);
                                                   >> 2675         VM_BUG_ON_PAGE(compound_mapcount(head), head);
                                                   >> 2676 
                                                   >> 2677         /* Make sure the page is not on per-CPU pagevec as it takes pin */
                                                   >> 2678         if (mlocked)
                                                   >> 2679                 lru_add_drain();
                                                   >> 2680 
                                                   >> 2681         /* prevent PageLRU to go away from under us, and freeze lru stats */
                                                   >> 2682         spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
3485                                                  2683 
3486         /* block interrupt reentry in xa_lock << 
3487         local_irq_disable();                  << 
3488         if (mapping) {                           2684         if (mapping) {
                                                   >> 2685                 XA_STATE(xas, &mapping->i_pages, page_index(head));
                                                   >> 2686 
3489                 /*                               2687                 /*
3490                  * Check if the folio is pres !! 2688                  * Check if the head page is present in page cache.
3491                  * We assume all tail are pre !! 2689                  * We assume all tail are present too, if head is there.
3492                  */                              2690                  */
3493                 xas_lock(&xas);               !! 2691                 xa_lock(&mapping->i_pages);
3494                 xas_reset(&xas);              !! 2692                 if (xas_load(&xas) != head)
3495                 if (xas_load(&xas) != folio)  << 
3496                         goto fail;               2693                         goto fail;
3497         }                                        2694         }
3498                                                  2695 
3499         /* Prevent deferred_split_scan() touc    2696         /* Prevent deferred_split_scan() touching ->_refcount */
3500         spin_lock(&ds_queue->split_queue_lock !! 2697         spin_lock(&pgdata->split_queue_lock);
3501         if (folio_ref_freeze(folio, 1 + extra !! 2698         count = page_count(head);
3502                 if (folio_order(folio) > 1 && !! 2699         mapcount = total_mapcount(head);
3503                     !list_empty(&folio->_defe !! 2700         if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
3504                         ds_queue->split_queue !! 2701                 if (!list_empty(page_deferred_list(head))) {
3505                         if (folio_test_partia !! 2702                         pgdata->split_queue_len--;
3506                                 __folio_clear !! 2703                         list_del(page_deferred_list(head));
3507                                 mod_mthp_stat << 
3508                                               << 
3509                         }                     << 
3510                         /*                    << 
3511                          * Reinitialize page_ << 
3512                          * page from the spli << 
3513                          * split will see lis << 
3514                          * page_deferred_list << 
3515                          */                   << 
3516                         list_del_init(&folio- << 
3517                 }                             << 
3518                 spin_unlock(&ds_queue->split_ << 
3519                 if (mapping) {                << 
3520                         int nr = folio_nr_pag << 
3521                                               << 
3522                         xas_split(&xas, folio << 
3523                         if (folio_test_pmd_ma << 
3524                             new_order < HPAGE << 
3525                                 if (folio_tes << 
3526                                         __lru << 
3527                                               << 
3528                                 } else {      << 
3529                                         __lru << 
3530                                               << 
3531                                         filem << 
3532                                 }             << 
3533                         }                     << 
3534                 }                                2704                 }
                                                   >> 2705                 if (mapping)
                                                   >> 2706                         __dec_node_page_state(page, NR_SHMEM_THPS);
                                                   >> 2707                 spin_unlock(&pgdata->split_queue_lock);
                                                   >> 2708                 __split_huge_page(page, list, end, flags);
                                                   >> 2709                 if (PageSwapCache(head)) {
                                                   >> 2710                         swp_entry_t entry = { .val = page_private(head) };
3535                                                  2711 
3536                 if (is_anon) {                !! 2712                         ret = split_swap_cluster(entry);
3537                         mod_mthp_stat(order,  !! 2713                 } else
3538                         mod_mthp_stat(new_ord !! 2714                         ret = 0;
3539                 }                             << 
3540                 __split_huge_page(page, list, << 
3541                 ret = 0;                      << 
3542         } else {                                 2715         } else {
3543                 spin_unlock(&ds_queue->split_ !! 2716                 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
3544 fail:                                         !! 2717                         pr_alert("total_mapcount: %u, page_count(): %u\n",
3545                 if (mapping)                  !! 2718                                         mapcount, count);
3546                         xas_unlock(&xas);     !! 2719                         if (PageTail(page))
3547                 local_irq_enable();           !! 2720                                 dump_page(head, NULL);
3548                 remap_page(folio, folio_nr_pa !! 2721                         dump_page(page, "total_mapcount(head) > 0");
3549                 ret = -EAGAIN;                !! 2722                         BUG();
                                                   >> 2723                 }
                                                   >> 2724                 spin_unlock(&pgdata->split_queue_lock);
                                                   >> 2725 fail:           if (mapping)
                                                   >> 2726                         xa_unlock(&mapping->i_pages);
                                                   >> 2727                 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
                                                   >> 2728                 remap_page(head);
                                                   >> 2729                 ret = -EBUSY;
3550         }                                        2730         }
3551                                                  2731 
3552 out_unlock:                                      2732 out_unlock:
3553         if (anon_vma) {                          2733         if (anon_vma) {
3554                 anon_vma_unlock_write(anon_vm    2734                 anon_vma_unlock_write(anon_vma);
3555                 put_anon_vma(anon_vma);          2735                 put_anon_vma(anon_vma);
3556         }                                        2736         }
3557         if (mapping)                             2737         if (mapping)
3558                 i_mmap_unlock_read(mapping);     2738                 i_mmap_unlock_read(mapping);
3559 out:                                             2739 out:
3560         xas_destroy(&xas);                    !! 2740         count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3561         if (order == HPAGE_PMD_ORDER)         << 
3562                 count_vm_event(!ret ? THP_SPL << 
3563         count_mthp_stat(order, !ret ? MTHP_ST << 
3564         return ret;                              2741         return ret;
3565 }                                                2742 }
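The split path above freezes the folio's reference count before tearing the folio down, so a caller must hold both the folio lock and a reference of its own for the whole operation. A minimal caller sketch, assuming the split_folio() wrapper from <linux/huge_mm.h> (it forwards to split_folio_to_list() just below); the error values mirror the function above, -EAGAIN when the refcount freeze fails and -EBUSY for folios that cannot be split at all:

static int try_split_one_folio(struct folio *folio)
{
	int ret;

	/* The caller is assumed to already hold a folio reference. */
	if (!folio_trylock(folio))
		return -EAGAIN;
	ret = split_folio(folio);	/* 0 on success */
	folio_unlock(folio);
	return ret;
}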
3566                                                  2743 
3567 int min_order_for_split(struct folio *folio)  !! 2744 void free_transhuge_page(struct page *page)
3568 {                                                2745 {
3569         if (folio_test_anon(folio))           !! 2746         struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
3570                 return 0;                     << 
3571                                               << 
3572         if (!folio->mapping) {                << 
3573                 if (folio_test_pmd_mappable(f << 
3574                         count_vm_event(THP_SP << 
3575                 return -EBUSY;                << 
3576         }                                     << 
3577                                               << 
3578         return mapping_min_folio_order(folio- << 
3579 }                                             << 
3580                                               << 
3581 int split_folio_to_list(struct folio *folio,  << 
3582 {                                             << 
3583         int ret = min_order_for_split(folio); << 
3584                                               << 
3585         if (ret < 0)                          << 
3586                 return ret;                   << 
3587                                               << 
3588         return split_huge_page_to_list_to_ord << 
3589 }                                             << 
3590                                               << 
3591 /*                                            << 
3592  * __folio_unqueue_deferred_split() is not to << 
3593  * the folio_unqueue_deferred_split() inline  << 
3594  * limits its calls to those folios which may << 
3595  * queueing THP splits, and that list is (rac << 
3596  *                                            << 
3597  * It is unsafe to call folio_unqueue_deferre << 
3598  * zero: because even when split_queue_lock i << 
3599  * might be in use on deferred_split_scan()'s << 
3600  *                                            << 
3601  * If memory cgroups are enabled, split_queue << 
3602  * therefore important to unqueue deferred sp << 
3603  */                                           << 
3604 bool __folio_unqueue_deferred_split(struct fo << 
3605 {                                             << 
3606         struct deferred_split *ds_queue;      << 
3607         unsigned long flags;                     2747         unsigned long flags;
3608         bool unqueued = false;                << 
3609                                               << 
3610         WARN_ON_ONCE(folio_ref_count(folio)); << 
3611         WARN_ON_ONCE(!mem_cgroup_disabled() & << 
3612                                                  2748 
3613         ds_queue = get_deferred_split_queue(f !! 2749         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
3614         spin_lock_irqsave(&ds_queue->split_qu !! 2750         if (!list_empty(page_deferred_list(page))) {
3615         if (!list_empty(&folio->_deferred_lis !! 2751                 pgdata->split_queue_len--;
3616                 ds_queue->split_queue_len--;  !! 2752                 list_del(page_deferred_list(page));
3617                 if (folio_test_partially_mapp << 
3618                         __folio_clear_partial << 
3619                         mod_mthp_stat(folio_o << 
3620                                       MTHP_ST << 
3621                 }                             << 
3622                 list_del_init(&folio->_deferr << 
3623                 unqueued = true;              << 
3624         }                                        2753         }
3625         spin_unlock_irqrestore(&ds_queue->spl !! 2754         spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3626                                               !! 2755         free_compound_page(page);
3627         return unqueued;        /* useful for << 
3628 }                                                2756 }
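The comment above this function mentions an inline folio_unqueue_deferred_split() wrapper that screens out folios which can never be on a split queue before the lock here is taken. Roughly, as a sketch of that mm/internal.h helper (the exact lockless checks are an assumption):

static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
	/* Only large rmappable folios ever sit on a deferred split queue. */
	if (!folio_test_large(folio) || !folio_test_large_rmappable(folio))
		return false;

	/* Racy but safe: an empty list means there is nothing to unqueue. */
	if (data_race(list_empty(&folio->_deferred_list)))
		return false;

	return __folio_unqueue_deferred_split(folio);
}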
3629                                                  2757 
3630 /* partially_mapped=false won't clear PG_part !! 2758 void deferred_split_huge_page(struct page *page)
3631 void deferred_split_folio(struct folio *folio << 
3632 {                                                2759 {
3633         struct deferred_split *ds_queue = get !! 2760         struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
3634 #ifdef CONFIG_MEMCG                           << 
3635         struct mem_cgroup *memcg = folio_memc << 
3636 #endif                                        << 
3637         unsigned long flags;                     2761         unsigned long flags;
3638                                                  2762 
3639         /*                                    !! 2763         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3640          * Order 1 folios have no space for a << 
3641          * won't waste much memory by not add << 
3642          */                                   << 
3643         if (folio_order(folio) <= 1)          << 
3644                 return;                       << 
3645                                               << 
3646         if (!partially_mapped && !split_under << 
3647                 return;                       << 
3648                                                  2764 
3649         /*                                    !! 2765         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
3650          * Exclude swapcache: originally to a !! 2766         if (list_empty(page_deferred_list(page))) {
3651          * queue. Nowadays that is fully prev !! 2767                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
3652          * but if page reclaim is already han !! 2768                 list_add_tail(page_deferred_list(page), &pgdata->split_queue);
3653          * unnecessary to handle it again in  !! 2769                 pgdata->split_queue_len++;
3654          * swapcache here may still be a usef << 
3655          */                                   << 
3656         if (folio_test_swapcache(folio))      << 
3657                 return;                       << 
3658                                               << 
3659         spin_lock_irqsave(&ds_queue->split_qu << 
3660         if (partially_mapped) {               << 
3661                 if (!folio_test_partially_map << 
3662                         __folio_set_partially << 
3663                         if (folio_test_pmd_ma << 
3664                                 count_vm_even << 
3665                         count_mthp_stat(folio << 
3666                         mod_mthp_stat(folio_o << 
3667                                               << 
3668                 }                             << 
3669         } else {                              << 
3670                 /* partially mapped folios ca << 
3671                 VM_WARN_ON_FOLIO(folio_test_p << 
3672         }                                     << 
3673         if (list_empty(&folio->_deferred_list << 
3674                 list_add_tail(&folio->_deferr << 
3675                 ds_queue->split_queue_len++;  << 
3676 #ifdef CONFIG_MEMCG                           << 
3677                 if (memcg)                    << 
3678                         set_shrinker_bit(memc << 
3679                                          defe << 
3680 #endif                                        << 
3681         }                                        2770         }
3682         spin_unlock_irqrestore(&ds_queue->spl !! 2771         spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3683 }                                                2772 }
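deferred_split_folio() is called with partially_mapped=true from the rmap unmap path once a large anon folio loses some but not all of its mappings, and with partially_mapped=false from the underused-THP path that only queues the folio for the shrinker. A simplified, hedged sketch of the first kind of caller (not the actual rmap code):

static void queue_if_partially_mapped(struct folio *folio)
{
	/* Still mapped somewhere, but no longer fully: let the shrinker split it. */
	if (folio_test_anon(folio) && folio_mapped(folio))
		deferred_split_folio(folio, true);
}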
3684                                                  2773 
3685 static unsigned long deferred_split_count(str    2774 static unsigned long deferred_split_count(struct shrinker *shrink,
3686                 struct shrink_control *sc)       2775                 struct shrink_control *sc)
3687 {                                                2776 {
3688         struct pglist_data *pgdata = NODE_DAT    2777         struct pglist_data *pgdata = NODE_DATA(sc->nid);
3689         struct deferred_split *ds_queue = &pg !! 2778         return READ_ONCE(pgdata->split_queue_len);
3690                                               << 
3691 #ifdef CONFIG_MEMCG                           << 
3692         if (sc->memcg)                        << 
3693                 ds_queue = &sc->memcg->deferr << 
3694 #endif                                        << 
3695         return READ_ONCE(ds_queue->split_queu << 
3696 }                                             << 
3697                                               << 
3698 static bool thp_underused(struct folio *folio << 
3699 {                                             << 
3700         int num_zero_pages = 0, num_filled_pa << 
3701         void *kaddr;                          << 
3702         int i;                                << 
3703                                               << 
3704         if (khugepaged_max_ptes_none == HPAGE << 
3705                 return false;                 << 
3706                                               << 
3707         for (i = 0; i < folio_nr_pages(folio) << 
3708                 kaddr = kmap_local_folio(foli << 
3709                 if (!memchr_inv(kaddr, 0, PAG << 
3710                         num_zero_pages++;     << 
3711                         if (num_zero_pages >  << 
3712                                 kunmap_local( << 
3713                                 return true;  << 
3714                         }                     << 
3715                 } else {                      << 
3716                         /*                    << 
3717                          * Another path for e << 
3718                          * of non-zero filled << 
3719                          */                   << 
3720                         num_filled_pages++;   << 
3721                         if (num_filled_pages  << 
3722                                 kunmap_local( << 
3723                                 return false; << 
3724                         }                     << 
3725                 }                             << 
3726                 kunmap_local(kaddr);          << 
3727         }                                     << 
3728         return false;                         << 
3729 }                                                2779 }
3730                                                  2780 
3731 static unsigned long deferred_split_scan(stru    2781 static unsigned long deferred_split_scan(struct shrinker *shrink,
3732                 struct shrink_control *sc)       2782                 struct shrink_control *sc)
3733 {                                                2783 {
3734         struct pglist_data *pgdata = NODE_DAT    2784         struct pglist_data *pgdata = NODE_DATA(sc->nid);
3735         struct deferred_split *ds_queue = &pg << 
3736         unsigned long flags;                     2785         unsigned long flags;
3737         LIST_HEAD(list);                      !! 2786         LIST_HEAD(list), *pos, *next;
3738         struct folio *folio, *next, *prev = N !! 2787         struct page *page;
3739         int split = 0, removed = 0;           !! 2788         int split = 0;
3740                                               << 
3741 #ifdef CONFIG_MEMCG                           << 
3742         if (sc->memcg)                        << 
3743                 ds_queue = &sc->memcg->deferr << 
3744 #endif                                        << 
3745                                                  2789 
3746         spin_lock_irqsave(&ds_queue->split_qu !! 2790         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
3747         /* Take pin on all head pages to avoi    2791         /* Take pin on all head pages to avoid freeing them under us */
3748         list_for_each_entry_safe(folio, next, !! 2792         list_for_each_safe(pos, next, &pgdata->split_queue) {
3749                                               !! 2793                 page = list_entry((void *)pos, struct page, mapping);
3750                 if (folio_try_get(folio)) {   !! 2794                 page = compound_head(page);
3751                         list_move(&folio->_de !! 2795                 if (get_page_unless_zero(page)) {
                                                   >> 2796                         list_move(page_deferred_list(page), &list);
3752                 } else {                         2797                 } else {
3753                         /* We lost race with  !! 2798                         /* We lost race with put_compound_page() */
3754                         if (folio_test_partia !! 2799                         list_del_init(page_deferred_list(page));
3755                                 __folio_clear !! 2800                         pgdata->split_queue_len--;
3756                                 mod_mthp_stat << 
3757                                               << 
3758                         }                     << 
3759                         list_del_init(&folio- << 
3760                         ds_queue->split_queue << 
3761                 }                                2801                 }
3762                 if (!--sc->nr_to_scan)           2802                 if (!--sc->nr_to_scan)
3763                         break;                   2803                         break;
3764         }                                        2804         }
3765         spin_unlock_irqrestore(&ds_queue->spl !! 2805         spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3766                                                  2806 
3767         list_for_each_entry_safe(folio, next, !! 2807         list_for_each_safe(pos, next, &list) {
3768                 bool did_split = false;       !! 2808                 page = list_entry((void *)pos, struct page, mapping);
3769                 bool underused = false;       !! 2809                 if (!trylock_page(page))
3770                                               << 
3771                 if (!folio_test_partially_map << 
3772                         underused = thp_under << 
3773                         if (!underused)       << 
3774                                 goto next;    << 
3775                 }                             << 
3776                 if (!folio_trylock(folio))    << 
3777                         goto next;               2810                         goto next;
3778                 if (!split_folio(folio)) {    !! 2811                 /* split_huge_page() removes page from list on success */
3779                         did_split = true;     !! 2812                 if (!split_huge_page(page))
3780                         if (underused)        << 
3781                                 count_vm_even << 
3782                         split++;                 2813                         split++;
3783                 }                             !! 2814                 unlock_page(page);
3784                 folio_unlock(folio);          << 
3785 next:                                            2815 next:
3786                 /*                            !! 2816                 put_page(page);
3787                  * split_folio() removes foli << 
3788                  * Only add back to the queue << 
3789                  * If thp_underused returns f << 
3790                  * in the case it was underus << 
3791                  * don't add it back to split << 
3792                  */                           << 
3793                 if (!did_split && !folio_test << 
3794                         list_del_init(&folio- << 
3795                         removed++;            << 
3796                 } else {                      << 
3797                         /*                    << 
3798                          * That unlocked list << 
3799                          * unless its folio i << 
3800                          * left on the list ( << 
3801                          * by one safe folio  << 
3802                          */                   << 
3803                         swap(folio, prev);    << 
3804                 }                             << 
3805                 if (folio)                    << 
3806                         folio_put(folio);     << 
3807         }                                        2817         }
3808                                                  2818 
3809         spin_lock_irqsave(&ds_queue->split_qu !! 2819         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
3810         list_splice_tail(&list, &ds_queue->sp !! 2820         list_splice_tail(&list, &pgdata->split_queue);
3811         ds_queue->split_queue_len -= removed; !! 2821         spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3812         spin_unlock_irqrestore(&ds_queue->spl << 
3813                                               << 
3814         if (prev)                             << 
3815                 folio_put(prev);              << 
3816                                                  2822 
3817         /*                                       2823         /*
3818          * Stop shrinker if we didn't split a    2824          * Stop shrinker if we didn't split any page, but the queue is empty.
3819          * This can happen if pages were free    2825          * This can happen if pages were freed under us.
3820          */                                      2826          */
3821         if (!split && list_empty(&ds_queue->s !! 2827         if (!split && list_empty(&pgdata->split_queue))
3822                 return SHRINK_STOP;              2828                 return SHRINK_STOP;
3823         return split;                            2829         return split;
3824 }                                                2830 }
3825                                                  2831 
                                                   >> 2832 static struct shrinker deferred_split_shrinker = {
                                                   >> 2833         .count_objects = deferred_split_count,
                                                   >> 2834         .scan_objects = deferred_split_scan,
                                                   >> 2835         .seeks = DEFAULT_SEEKS,
                                                   >> 2836         .flags = SHRINKER_NUMA_AWARE,
                                                   >> 2837 };
                                                   >> 2838 
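The static struct shrinker in the 4.20 column above no longer exists in 6.12; there the deferred-split shrinker is created with the dynamic shrinker API during hugepage initialisation, outside this hunk. A sketch of that registration, with the flag combination and the "thp-deferred_split" name taken as assumptions:

static struct shrinker *deferred_split_shrinker;

static int __init deferred_split_shrinker_init(void)
{
	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE,
						 "thp-deferred_split");
	if (!deferred_split_shrinker)
		return -ENOMEM;

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);
	return 0;
}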
3826 #ifdef CONFIG_DEBUG_FS                           2839 #ifdef CONFIG_DEBUG_FS
3827 static void split_huge_pages_all(void)        !! 2840 static int split_huge_pages_set(void *data, u64 val)
3828 {                                                2841 {
3829         struct zone *zone;                       2842         struct zone *zone;
3830         struct page *page;                       2843         struct page *page;
3831         struct folio *folio;                  << 
3832         unsigned long pfn, max_zone_pfn;         2844         unsigned long pfn, max_zone_pfn;
3833         unsigned long total = 0, split = 0;      2845         unsigned long total = 0, split = 0;
3834                                                  2846 
3835         pr_debug("Split all THPs\n");         !! 2847         if (val != 1)
3836         for_each_zone(zone) {                 !! 2848                 return -EINVAL;
3837                 if (!managed_zone(zone))      !! 2849 
3838                         continue;             !! 2850         for_each_populated_zone(zone) {
3839                 max_zone_pfn = zone_end_pfn(z    2851                 max_zone_pfn = zone_end_pfn(zone);
3840                 for (pfn = zone->zone_start_p    2852                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
3841                         int nr_pages;         !! 2853                         if (!pfn_valid(pfn))
3842                                               << 
3843                         page = pfn_to_online_ << 
3844                         if (!page || PageTail << 
3845                                 continue;     << 
3846                         folio = page_folio(pa << 
3847                         if (!folio_try_get(fo << 
3848                                 continue;        2854                                 continue;
3849                                                  2855 
3850                         if (unlikely(page_fol !! 2856                         page = pfn_to_page(pfn);
3851                                 goto next;    !! 2857                         if (!get_page_unless_zero(page))
                                                   >> 2858                                 continue;
3852                                                  2859 
3853                         if (zone != folio_zon !! 2860                         if (zone != page_zone(page))
3854                                 goto next;       2861                                 goto next;
3855                                                  2862 
3856                         if (!folio_test_large !! 2863                         if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
3857                                 || folio_test << 
3858                                 || !folio_tes << 
3859                                 goto next;       2864                                 goto next;
3860                                                  2865 
3861                         total++;                 2866                         total++;
3862                         folio_lock(folio);    !! 2867                         lock_page(page);
3863                         nr_pages = folio_nr_p !! 2868                         if (!split_huge_page(page))
3864                         if (!split_folio(foli << 
3865                                 split++;         2869                                 split++;
3866                         pfn += nr_pages - 1;  !! 2870                         unlock_page(page);
3867                         folio_unlock(folio);  << 
3868 next:                                            2871 next:
3869                         folio_put(folio);     !! 2872                         put_page(page);
3870                         cond_resched();       << 
3871                 }                             << 
3872         }                                     << 
3873                                               << 
3874         pr_debug("%lu of %lu THP split\n", sp << 
3875 }                                             << 
3876                                               << 
3877 static inline bool vma_not_suitable_for_thp_s << 
3878 {                                             << 
3879         return vma_is_special_huge(vma) || (v << 
3880                     is_vm_hugetlb_page(vma);  << 
3881 }                                             << 
3882                                               << 
3883 static int split_huge_pages_pid(int pid, unsi << 
3884                                 unsigned long << 
3885 {                                             << 
3886         int ret = 0;                          << 
3887         struct task_struct *task;             << 
3888         struct mm_struct *mm;                 << 
3889         unsigned long total = 0, split = 0;   << 
3890         unsigned long addr;                   << 
3891                                               << 
3892         vaddr_start &= PAGE_MASK;             << 
3893         vaddr_end &= PAGE_MASK;               << 
3894                                               << 
3895         task = find_get_task_by_vpid(pid);    << 
3896         if (!task) {                          << 
3897                 ret = -ESRCH;                 << 
3898                 goto out;                     << 
3899         }                                     << 
3900                                               << 
3901         /* Find the mm_struct */              << 
3902         mm = get_task_mm(task);               << 
3903         put_task_struct(task);                << 
3904                                               << 
3905         if (!mm) {                            << 
3906                 ret = -EINVAL;                << 
3907                 goto out;                     << 
3908         }                                     << 
3909                                               << 
3910         pr_debug("Split huge pages in pid: %d << 
3911                  pid, vaddr_start, vaddr_end) << 
3912                                               << 
3913         mmap_read_lock(mm);                   << 
3914         /*                                    << 
3915          * always increase addr by PAGE_SIZE, << 
3916          * table filled with PTE-mapped THPs, << 
3917          */                                   << 
3918         for (addr = vaddr_start; addr < vaddr << 
3919                 struct vm_area_struct *vma =  << 
3920                 struct folio_walk fw;         << 
3921                 struct folio *folio;          << 
3922                 struct address_space *mapping << 
3923                 unsigned int target_order = n << 
3924                                               << 
3925                 if (!vma)                     << 
3926                         break;                << 
3927                                               << 
3928                 /* skip special VMA and huget << 
3929                 if (vma_not_suitable_for_thp_ << 
3930                         addr = vma->vm_end;   << 
3931                         continue;             << 
3932                 }                             << 
3933                                               << 
3934                 folio = folio_walk_start(&fw, << 
3935                 if (!folio)                   << 
3936                         continue;             << 
3937                                               << 
3938                 if (!is_transparent_hugepage( << 
3939                         goto next;            << 
3940                                               << 
3941                 if (!folio_test_anon(folio))  << 
3942                         mapping = folio->mapp << 
3943                         target_order = max(ne << 
3944                                            ma << 
3945                 }                                2873                 }
3946                                               << 
3947                 if (target_order >= folio_ord << 
3948                         goto next;            << 
3949                                               << 
3950                 total++;                      << 
3951                 /*                            << 
3952                  * For folios with private, s << 
3953                  * will try to drop it before << 
3954                  * can be split or not. So sk << 
3955                  */                           << 
3956                 if (!folio_test_private(folio << 
3957                     !can_split_folio(folio, 0 << 
3958                         goto next;            << 
3959                                               << 
3960                 if (!folio_trylock(folio))    << 
3961                         goto next;            << 
3962                 folio_get(folio);             << 
3963                 folio_walk_end(&fw, vma);     << 
3964                                               << 
3965                 if (!folio_test_anon(folio) & << 
3966                         goto unlock;          << 
3967                                               << 
3968                 if (!split_folio_to_order(fol << 
3969                         split++;              << 
3970                                               << 
3971 unlock:                                       << 
3972                                               << 
3973                 folio_unlock(folio);          << 
3974                 folio_put(folio);             << 
3975                                               << 
3976                 cond_resched();               << 
3977                 continue;                     << 
3978 next:                                         << 
3979                 folio_walk_end(&fw, vma);     << 
3980                 cond_resched();               << 
3981         }                                     << 
3982         mmap_read_unlock(mm);                 << 
3983         mmput(mm);                            << 
3984                                               << 
3985         pr_debug("%lu of %lu THP split\n", sp << 
3986                                               << 
3987 out:                                          << 
3988         return ret;                           << 
3989 }                                             << 
3990                                               << 
3991 static int split_huge_pages_in_file(const cha << 
3992                                 pgoff_t off_e << 
3993 {                                             << 
3994         struct filename *file;                << 
3995         struct file *candidate;               << 
3996         struct address_space *mapping;        << 
3997         int ret = -EINVAL;                    << 
3998         pgoff_t index;                        << 
3999         int nr_pages = 1;                     << 
4000         unsigned long total = 0, split = 0;   << 
4001         unsigned int min_order;               << 
4002         unsigned int target_order;            << 
4003                                               << 
4004         file = getname_kernel(file_path);     << 
4005         if (IS_ERR(file))                     << 
4006                 return ret;                   << 
4007                                               << 
4008         candidate = file_open_name(file, O_RD << 
4009         if (IS_ERR(candidate))                << 
4010                 goto out;                     << 
4011                                               << 
4012         pr_debug("split file-backed THPs in f << 
4013                  file_path, off_start, off_en << 
4014                                               << 
4015         mapping = candidate->f_mapping;       << 
4016         min_order = mapping_min_folio_order(m << 
4017         target_order = max(new_order, min_ord << 
4018                                               << 
4019         for (index = off_start; index < off_e << 
4020                 struct folio *folio = filemap << 
4021                                               << 
4022                 nr_pages = 1;                 << 
4023                 if (IS_ERR(folio))            << 
4024                         continue;             << 
4025                                               << 
4026                 if (!folio_test_large(folio)) << 
4027                         goto next;            << 
4028                                               << 
4029                 total++;                      << 
4030                 nr_pages = folio_nr_pages(fol << 
4031                                               << 
4032                 if (target_order >= folio_ord << 
4033                         goto next;            << 
4034                                               << 
4035                 if (!folio_trylock(folio))    << 
4036                         goto next;            << 
4037                                               << 
4038                 if (folio->mapping != mapping << 
4039                         goto unlock;          << 
4040                                               << 
4041                 if (!split_folio_to_order(fol << 
4042                         split++;              << 
4043                                               << 
4044 unlock:                                       << 
4045                 folio_unlock(folio);          << 
4046 next:                                         << 
4047                 folio_put(folio);             << 
4048                 cond_resched();               << 
4049         }                                        2874         }
4050                                                  2875 
4051         filp_close(candidate, NULL);          !! 2876         pr_info("%lu of %lu THP split\n", split, total);
4052         ret = 0;                              << 
4053                                                  2877 
4054         pr_debug("%lu of %lu file-backed THP  !! 2878         return 0;
4055 out:                                          << 
4056         putname(file);                        << 
4057         return ret;                           << 
4058 }                                                2879 }
                                                   >> 2880 DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
                                                   >> 2881                 "%llu\n");
4059                                                  2882 
4060 #define MAX_INPUT_BUF_SZ 255                  !! 2883 static int __init split_huge_pages_debugfs(void)
4061                                               << 
4062 static ssize_t split_huge_pages_write(struct  << 
4063                                 size_t count, << 
4064 {                                                2884 {
4065         static DEFINE_MUTEX(split_debug_mutex !! 2885         void *ret;
4066         ssize_t ret;                          << 
4067         /*                                    << 
4068          * hold pid, start_vaddr, end_vaddr,  << 
4069          * file_path, off_start, off_end, new << 
4070          */                                   << 
4071         char input_buf[MAX_INPUT_BUF_SZ];     << 
4072         int pid;                              << 
4073         unsigned long vaddr_start, vaddr_end; << 
4074         unsigned int new_order = 0;           << 
4075                                               << 
4076         ret = mutex_lock_interruptible(&split << 
4077         if (ret)                              << 
4078                 return ret;                   << 
4079                                               << 
4080         ret = -EFAULT;                        << 
4081                                               << 
4082         memset(input_buf, 0, MAX_INPUT_BUF_SZ << 
4083         if (copy_from_user(input_buf, buf, mi << 
4084                 goto out;                     << 
4085                                               << 
4086         input_buf[MAX_INPUT_BUF_SZ - 1] = '\0 << 
4087                                               << 
4088         if (input_buf[0] == '/') {            << 
4089                 char *tok;                    << 
4090                 char *buf = input_buf;        << 
4091                 char file_path[MAX_INPUT_BUF_ << 
4092                 pgoff_t off_start = 0, off_en << 
4093                 size_t input_len = strlen(inp << 
4094                                               << 
4095                 tok = strsep(&buf, ",");      << 
4096                 if (tok) {                    << 
4097                         strcpy(file_path, tok << 
4098                 } else {                      << 
4099                         ret = -EINVAL;        << 
4100                         goto out;             << 
4101                 }                             << 
4102                                               << 
4103                 ret = sscanf(buf, "0x%lx,0x%l << 
4104                 if (ret != 2 && ret != 3) {   << 
4105                         ret = -EINVAL;        << 
4106                         goto out;             << 
4107                 }                             << 
4108                 ret = split_huge_pages_in_fil << 
4109                 if (!ret)                     << 
4110                         ret = input_len;      << 
4111                                               << 
4112                 goto out;                     << 
4113         }                                     << 
4114                                                  2886 
4115         ret = sscanf(input_buf, "%d,0x%lx,0x% !! 2887         ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
4116         if (ret == 1 && pid == 1) {           !! 2888                         &split_huge_pages_fops);
4117                 split_huge_pages_all();       << 
4118                 ret = strlen(input_buf);      << 
4119                 goto out;                     << 
4120         } else if (ret != 3 && ret != 4) {    << 
4121                 ret = -EINVAL;                << 
4122                 goto out;                     << 
4123         }                                     << 
4124                                               << 
4125         ret = split_huge_pages_pid(pid, vaddr << 
4126         if (!ret)                                2889         if (!ret)
4127                 ret = strlen(input_buf);      !! 2890                 pr_warn("Failed to create split_huge_pages in debugfs");
4128 out:                                          << 
4129         mutex_unlock(&split_debug_mutex);     << 
4130         return ret;                           << 
4131                                               << 
4132 }                                             << 
4133                                               << 
4134 static const struct file_operations split_hug << 
4135         .owner   = THIS_MODULE,               << 
4136         .write   = split_huge_pages_write,    << 
4137 };                                            << 
4138                                               << 
4139 static int __init split_huge_pages_debugfs(vo << 
4140 {                                             << 
4141         debugfs_create_file("split_huge_pages << 
4142                             &split_huge_pages << 
4143         return 0;                                2891         return 0;
4144 }                                                2892 }
4145 late_initcall(split_huge_pages_debugfs);         2893 late_initcall(split_huge_pages_debugfs);
4146 #endif                                           2894 #endif
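For reference, split_huge_pages_write() above accepts three input forms: a bare "1" to split every THP in the system, "<pid>,0x<vaddr_start>,0x<vaddr_end>[,<new_order>]" to split THPs mapped in one task's address range, and "/path/to/file,0x<off_start>,0x<off_end>[,<new_order>]" for file-backed folios. A small userspace sketch exercising it (that debugfs is mounted at /sys/kernel/debug is an assumption):

/* cc -o split_thps split_thps.c && sudo ./split_thps */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *cmd = "1";	/* or e.g. "1234,0x7f0000000000,0x7f0000200000" */
	int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) != (ssize_t)strlen(cmd)) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}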
4147                                                  2895 
4148 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION          2896 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
4149 int set_pmd_migration_entry(struct page_vma_m !! 2897 void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
4150                 struct page *page)               2898                 struct page *page)
4151 {                                                2899 {
4152         struct folio *folio = page_folio(page << 
4153         struct vm_area_struct *vma = pvmw->vm    2900         struct vm_area_struct *vma = pvmw->vma;
4154         struct mm_struct *mm = vma->vm_mm;       2901         struct mm_struct *mm = vma->vm_mm;
4155         unsigned long address = pvmw->address    2902         unsigned long address = pvmw->address;
4156         bool anon_exclusive;                  << 
4157         pmd_t pmdval;                            2903         pmd_t pmdval;
4158         swp_entry_t entry;                       2904         swp_entry_t entry;
4159         pmd_t pmdswp;                            2905         pmd_t pmdswp;
4160                                                  2906 
4161         if (!(pvmw->pmd && !pvmw->pte))          2907         if (!(pvmw->pmd && !pvmw->pte))
4162                 return 0;                     !! 2908                 return;
4163                                                  2909 
4164         flush_cache_range(vma, address, addre    2910         flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
4165         pmdval = pmdp_invalidate(vma, address !! 2911         pmdval = *pvmw->pmd;
4166                                               !! 2912         pmdp_invalidate(vma, address, pvmw->pmd);
4167         /* See folio_try_share_anon_rmap_pmd( << 
4168         anon_exclusive = folio_test_anon(foli << 
4169         if (anon_exclusive && folio_try_share << 
4170                 set_pmd_at(mm, address, pvmw- << 
4171                 return -EBUSY;                << 
4172         }                                     << 
4173                                               << 
4174         if (pmd_dirty(pmdval))                << 
4175                 folio_mark_dirty(folio);      << 
4176         if (pmd_write(pmdval))                << 
4177                 entry = make_writable_migrati << 
4178         else if (anon_exclusive)              << 
4179                 entry = make_readable_exclusi << 
4180         else                                  << 
4181                 entry = make_readable_migrati << 
4182         if (pmd_young(pmdval))                << 
4183                 entry = make_migration_entry_ << 
4184         if (pmd_dirty(pmdval))                   2913         if (pmd_dirty(pmdval))
4185                 entry = make_migration_entry_ !! 2914                 set_page_dirty(page);
                                                   >> 2915         entry = make_migration_entry(page, pmd_write(pmdval));
4186         pmdswp = swp_entry_to_pmd(entry);        2916         pmdswp = swp_entry_to_pmd(entry);
4187         if (pmd_soft_dirty(pmdval))              2917         if (pmd_soft_dirty(pmdval))
4188                 pmdswp = pmd_swp_mksoft_dirty    2918                 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
4189         if (pmd_uffd_wp(pmdval))              << 
4190                 pmdswp = pmd_swp_mkuffd_wp(pm << 
4191         set_pmd_at(mm, address, pvmw->pmd, pm    2919         set_pmd_at(mm, address, pvmw->pmd, pmdswp);
4192         folio_remove_rmap_pmd(folio, page, vm !! 2920         page_remove_rmap(page, true);
4193         folio_put(folio);                     !! 2921         put_page(page);
4194         trace_set_migration_pmd(address, pmd_ << 
4195                                               << 
4196         return 0;                             << 
4197 }                                                2922 }
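As a compact view of the encoding done above: the live PMD is invalidated, a migration swp_entry_t is built from the page frame number plus the write/young/dirty state, and the result is written back as a non-present PMD. A sketch of just the encode step using the same helpers (illustrative only, nothing calls it, and it skips the anon-exclusive and uffd-wp handling shown above):

static pmd_t make_migration_pmd_sketch(struct page *page, pmd_t old)
{
	swp_entry_t entry;
	pmd_t pmdswp;

	if (pmd_write(old))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	if (pmd_young(old))
		entry = make_migration_entry_young(entry);
	if (pmd_dirty(old))
		entry = make_migration_entry_dirty(entry);

	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(old))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	return pmdswp;
}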
4198                                                  2923 
4199 void remove_migration_pmd(struct page_vma_map    2924 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
4200 {                                                2925 {
4201         struct folio *folio = page_folio(new) << 
4202         struct vm_area_struct *vma = pvmw->vm    2926         struct vm_area_struct *vma = pvmw->vma;
4203         struct mm_struct *mm = vma->vm_mm;       2927         struct mm_struct *mm = vma->vm_mm;
4204         unsigned long address = pvmw->address    2928         unsigned long address = pvmw->address;
4205         unsigned long haddr = address & HPAGE !! 2929         unsigned long mmun_start = address & HPAGE_PMD_MASK;
4206         pmd_t pmde;                              2930         pmd_t pmde;
4207         swp_entry_t entry;                       2931         swp_entry_t entry;
4208                                                  2932 
4209         if (!(pvmw->pmd && !pvmw->pte))          2933         if (!(pvmw->pmd && !pvmw->pte))
4210                 return;                          2934                 return;
4211                                                  2935 
4212         entry = pmd_to_swp_entry(*pvmw->pmd);    2936         entry = pmd_to_swp_entry(*pvmw->pmd);
4213         folio_get(folio);                     !! 2937         get_page(new);
4214         pmde = mk_huge_pmd(new, READ_ONCE(vma !! 2938         pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
4215         if (pmd_swp_soft_dirty(*pvmw->pmd))      2939         if (pmd_swp_soft_dirty(*pvmw->pmd))
4216                 pmde = pmd_mksoft_dirty(pmde)    2940                 pmde = pmd_mksoft_dirty(pmde);
4217         if (is_writable_migration_entry(entry !! 2941         if (is_write_migration_entry(entry))
4218                 pmde = pmd_mkwrite(pmde, vma) !! 2942                 pmde = maybe_pmd_mkwrite(pmde, vma);
4219         if (pmd_swp_uffd_wp(*pvmw->pmd))      << 
4220                 pmde = pmd_mkuffd_wp(pmde);   << 
4221         if (!is_migration_entry_young(entry)) << 
4222                 pmde = pmd_mkold(pmde);       << 
4223         /* NOTE: this may contain setting sof << 
4224         if (folio_test_dirty(folio) && is_mig << 
4225                 pmde = pmd_mkdirty(pmde);     << 
4226                                               << 
4227         if (folio_test_anon(folio)) {         << 
4228                 rmap_t rmap_flags = RMAP_NONE << 
4229                                               << 
4230                 if (!is_readable_migration_en << 
4231                         rmap_flags |= RMAP_EX << 
4232                                                  2943 
4233                 folio_add_anon_rmap_pmd(folio !! 2944         flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
4234         } else {                              !! 2945         if (PageAnon(new))
4235                 folio_add_file_rmap_pmd(folio !! 2946                 page_add_anon_rmap(new, vma, mmun_start, true);
4236         }                                     !! 2947         else
4237         VM_BUG_ON(pmd_write(pmde) && folio_te !! 2948                 page_add_file_rmap(new, true);
4238         set_pmd_at(mm, haddr, pvmw->pmd, pmde !! 2949         set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
4239                                               !! 2950         if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
4240         /* No need to invalidate - it was non !! 2951                 mlock_vma_page(new);
4241         update_mmu_cache_pmd(vma, address, pv    2952         update_mmu_cache_pmd(vma, address, pvmw->pmd);
4242         trace_remove_migration_pmd(address, p << 
4243 }                                                2953 }
4244 #endif                                           2954 #endif
4245                                                  2955 
