// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
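/*
 * Illustrative userspace sketch (not part of this file): when the global
 * policy is "madvise" rather than "always", an application opts a mapping
 * in with madvise(MADV_HUGEPAGE).  The call is advisory and may fail or be
 * ignored; the mapping keeps working on small pages either way.
 *
 *	#include <stddef.h>
 *	#include <sys/mman.h>
 *
 *	static void *alloc_thp_friendly(size_t len)
 *	{
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return NULL;
 *		(void)madvise(p, len, MADV_HUGEPAGE);
 *		return p;
 *	}
 */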
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);
static bool split_underused_thp = true;

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags,
					 unsigned long tva_flags,
					 unsigned long orders)
{
	bool smaps = tva_flags & TVA_SMAPS;
	bool in_pf = tva_flags & TVA_IN_PF;
	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
	unsigned long supported_orders;

	/* Check the intersection of requested and supported orders. */
	if (vma_is_anonymous(vma))
		supported_orders = THP_ORDERS_ALL_ANON;
	else if (vma_is_special_huge(vma))
		supported_orders = THP_ORDERS_ALL_SPECIAL;
	else
		supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;

	orders &= supported_orders;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_allowable_huge_orders(file_inode(vma->vm_file),
						   vma, vma->vm_pgoff, 0,
						   !enforce_sysfs);

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if (((!in_pf || smaps)) && file_thp_enabled(vma))
			return orders;
		return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}

static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	/* Ensure zero folio won't have large_rmappable flag set. */
	folio_clear_large_rmappable(zero_folio);
	preempt_disable();
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_folio);
}
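/*
 * Usage sketch (illustrative; the real callers live elsewhere in mm/): the
 * huge zero folio is reference counted per mm.  A caller takes one reference
 * for the lifetime of the mm and drops it on teardown, so repeated read
 * faults can share the same folio:
 *
 *	struct folio *zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
 *
 *	if (!zero_folio)
 *		return VM_FAULT_FALLBACK;	// fall back to small pages
 *	// ... map zero_folio read-only at the faulting PMD ...
 *	// later, once for the whole mm on teardown:
 *	mm_put_huge_zero_folio(mm);
 */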

void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
		BUG_ON(zero_folio == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static ssize_t split_underused_thp_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", split_underused_thp);
}

static ssize_t split_underused_thp_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	int err = kstrtobool(buf, &split_underused_thp);

	if (err < 0)
		return err;

	return count;
}

static struct kobj_attribute split_underused_thp_attr = __ATTR(
	shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
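/*
 * Illustrative userspace sketch (not part of this file): the attributes
 * defined above appear as plain text files under
 * /sys/kernel/mm/transparent_hugepage/.  For example, hpage_pmd_size lets an
 * application discover the PMD THP size at run time instead of hard-coding
 * 2MB:
 *
 *	#include <stdio.h>
 *
 *	static unsigned long read_hpage_pmd_size(void)
 *	{
 *		unsigned long size = 0;
 *		FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
 *
 *		if (f) {
 *			if (fscanf(f, "%lu", &size) != 1)
 *				size = 0;
 *			fclose(f);
 *		}
 *		return size;	// bytes, e.g. 2097152 on x86-64 with 4K pages
 *	}
 */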

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	&split_underused_thp_attr.attr,
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);

static ssize_t anon_enabled_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_anon_orders_always))
		output = "[always] inherit madvise never";
	else if (test_bit(order, &huge_anon_orders_inherit))
		output = "always [inherit] madvise never";
	else if (test_bit(order, &huge_anon_orders_madvise))
		output = "always inherit [madvise] never";
	else
		output = "always inherit madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t anon_enabled_store(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_always);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_inherit);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		set_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err;

		err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute anon_enabled_attr =
	__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);

static struct attribute *anon_ctrl_attrs[] = {
	&anon_enabled_attr.attr,
	NULL,
};

static const struct attribute_group anon_ctrl_attr_grp = {
	.attrs = anon_ctrl_attrs,
};

static struct attribute *file_ctrl_attrs[] = {
#ifdef CONFIG_SHMEM
	&thpsize_shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group file_ctrl_attr_grp = {
	.attrs = file_ctrl_attrs,
};

static struct attribute *any_ctrl_attrs[] = {
	NULL,
};

static const struct attribute_group any_ctrl_attr_grp = {
	.attrs = any_ctrl_attrs,
};

static const struct kobj_type thpsize_ktype = {
	.release = &thpsize_release,
	.sysfs_ops = &kobj_sysfs_ops,
};

DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}

#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
static ssize_t _name##_show(struct kobject *kobj,			\
			struct kobj_attribute *attr, char *buf)		\
{									\
	int order = to_thpsize(kobj)->order;				\
									\
	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
}									\
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
#endif
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);

static struct attribute *anon_stats_attrs[] = {
	&anon_fault_alloc_attr.attr,
	&anon_fault_fallback_attr.attr,
	&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_deferred_attr.attr,
	&nr_anon_attr.attr,
	&nr_anon_partially_mapped_attr.attr,
	NULL,
};

static struct attribute_group anon_stats_attr_grp = {
	.name = "stats",
	.attrs = anon_stats_attrs,
};

static struct attribute *file_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&shmem_alloc_attr.attr,
	&shmem_fallback_attr.attr,
	&shmem_fallback_charge_attr.attr,
#endif
	NULL,
};

static struct attribute_group file_stats_attr_grp = {
	.name = "stats",
	.attrs = file_stats_attrs,
};

static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_attr.attr,
	&split_failed_attr.attr,
	NULL,
};

static struct attribute_group any_stats_attr_grp = {
	.name = "stats",
	.attrs = any_stats_attrs,
};
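/*
 * Illustrative userspace sketch (not part of this file): each per-size
 * counter defined above is exported as
 * /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/<name>,
 * where <size> is (PAGE_SIZE << order) / 1024 (e.g. "2048" for the PMD
 * order on x86-64 with 4K pages):
 *
 *	#include <stdio.h>
 *
 *	static long read_mthp_stat(const char *size_kb, const char *name)
 *	{
 *		char path[256];
 *		long val = -1;
 *		FILE *f;
 *
 *		snprintf(path, sizeof(path),
 *			 "/sys/kernel/mm/transparent_hugepage/hugepages-%skB/stats/%s",
 *			 size_kb, name);
 *		f = fopen(path, "r");
 *		if (f) {
 *			if (fscanf(f, "%ld", &val) != 1)
 *				val = -1;
 *			fclose(f);
 *		}
 *		return val;	// e.g. read_mthp_stat("2048", "anon_fault_alloc")
 *	}
 */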

static int sysfs_add_group(struct kobject *kobj,
			   const struct attribute_group *grp)
{
	int ret = -ENOENT;

	/*
	 * If the group is named, try to merge first, assuming the subdirectory
	 * was already created. This avoids the warning emitted by
	 * sysfs_create_group() if the directory already exists.
	 */
	if (grp->name)
		ret = sysfs_merge_group(kobj, grp);
	if (ret)
		ret = sysfs_create_group(kobj, grp);

	return ret;
}

static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
	struct thpsize *thpsize;
	int ret = -ENOMEM;

	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
	if (!thpsize)
		goto err;

	thpsize->order = order;

	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
				   "hugepages-%lukB", size);
	if (ret) {
		kfree(thpsize);
		goto err;
	}

	ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
	if (ret)
		goto err_put;

	ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
	if (ret)
		goto err_put;

	if (BIT(order) & THP_ORDERS_ALL_ANON) {
		ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
		if (ret)
			goto err_put;

		ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
		if (ret)
			goto err_put;
	}

	if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
		ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
		if (ret)
			goto err_put;

		ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
		if (ret)
			goto err_put;
	}

	return thpsize;
err_put:
	kobject_put(&thpsize->kobj);
err:
	return ERR_PTR(ret);
}

static void thpsize_release(struct kobject *kobj)
{
	kfree(to_thpsize(kobj));
}

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;
	struct thpsize *thpsize;
	unsigned long orders;
	int order;

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
	 * constant so we have to do this here.
	 */
	if (!anon_orders_configured)
		huge_anon_orders_inherit = BIT(PMD_ORDER);

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
	order = highest_order(orders);
	while (orders) {
		thpsize = thpsize_create(order, *hugepage_kobj);
		if (IS_ERR(thpsize)) {
			pr_err("failed to create thpsize for order %d\n", order);
			err = PTR_ERR(thpsize);
			goto remove_all;
		}
		list_add(&thpsize->node, &thpsize_list);
		order = next_order(&orders, order);
	}

	return 0;

remove_all:
	hugepage_exit_sysfs(*hugepage_kobj);
	return err;
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	struct thpsize *thpsize, *tmp;

	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
		list_del(&thpsize->node);
		kobject_put(&thpsize->kobj);
	}

	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
		shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}

static void __init thp_shrinker_exit(void)
{
	shrinker_free(huge_zero_page_shrinker);
	shrinker_free(deferred_split_shrinker);
}

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = thp_shrinker_init();
	if (err)
		goto err_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save.  The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	thp_shrinker_exit();
err_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

static inline int get_order_from_str(const char *size_str)
{
	unsigned long size;
	char *endptr;
	int order;

	size = memparse(size_str, &endptr);

	if (!is_power_of_2(size))
		goto err;
	order = get_order(size);
	if (BIT(order) & ~THP_ORDERS_ALL_ANON)
		goto err;

	return order;
err:
	pr_err("invalid size %s in thp_anon boot parameter\n", size_str);
	return -EINVAL;
}

static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
	char *token, *range, *policy, *subtoken;
	unsigned long always, inherit, madvise;
	char *start_size, *end_size;
	int start, end, nr;
	char *p;

	if (!str || strlen(str) + 1 > PAGE_SIZE)
		goto err;
	strcpy(str_dup, str);

	always = huge_anon_orders_always;
	madvise = huge_anon_orders_madvise;
	inherit = huge_anon_orders_inherit;
	p = str_dup;
	while ((token = strsep(&p, ";")) != NULL) {
		range = strsep(&token, ":");
		policy = token;

		if (!policy)
			goto err;

		while ((subtoken = strsep(&range, ",")) != NULL) {
			if (strchr(subtoken, '-')) {
				start_size = strsep(&subtoken, "-");
				end_size = subtoken;

				start = get_order_from_str(start_size);
				end = get_order_from_str(end_size);
			} else {
				start = end = get_order_from_str(subtoken);
			}

			if (start < 0 || end < 0)
				goto err;

			nr = end - start + 1;
			if (!strcmp(policy, "always")) {
				bitmap_set(&always, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
			} else if (!strcmp(policy, "madvise")) {
				bitmap_set(&madvise, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "inherit")) {
				bitmap_set(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "never")) {
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else {
				pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
				goto err;
			}
		}
	}

	huge_anon_orders_always = always;
	huge_anon_orders_madvise = madvise;
	huge_anon_orders_inherit = inherit;
	anon_orders_configured = true;
	return 1;

err:
	pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
	return 0;
}
__setup("thp_anon=", setup_thp_anon);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}

#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
	       folio_test_large_rmappable(folio);
}

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret, off_sub;

	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
		return 0;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
					   off >> PAGE_SHIFT, flags, vm_flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}

unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(page);
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	folio_throttle_swaprate(folio, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	folio_zero_user(folio, vmf->address);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		deferred_split_folio(folio, false);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct folio *zero_folio)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret;

	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
		return VM_FAULT_FALLBACK;
	ret = vmf_anon_prepare(vmf);
	if (ret)
		return ret;
	khugepaged_enter_vma(vma, vma->vm_flags);

	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct folio *zero_folio;
		vm_fault_t ret;

		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
		if (unlikely(!zero_folio)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_folio(pgtable, vma->vm_mm, vma,
						    haddr, vmf->pmd, zero_folio);
				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = vma_thp_gfp_mask(vma);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}
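/*
 * Illustrative userspace sketch (not part of this file): the fault path above
 * only runs once memory is actually touched.  A test program typically maps a
 * sufficiently large region, touches every base page, and then checks
 * AnonHugePages for that range in /proc/self/smaps to see whether PMD THPs
 * were used or the fault fell back to small pages:
 *
 *	#include <stddef.h>
 *
 *	static void touch_range(char *p, size_t len, size_t page_size)
 *	{
 *		size_t off;
 *
 *		for (off = 0; off < len; off += page_size)
 *			p[off] = 1;	// write faults drive the allocation above
 *	}
 */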
733 prep_transhuge_page(page); >> 734 return __do_huge_pmd_anonymous_page(vmf, page, gfp); 1335 } 735 } 1336 736 1337 static void insert_pfn_pmd(struct vm_area_str 737 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 1338 pmd_t *pmd, pfn_t pfn, pgprot 738 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, 1339 pgtable_t pgtable) 739 pgtable_t pgtable) 1340 { 740 { 1341 struct mm_struct *mm = vma->vm_mm; 741 struct mm_struct *mm = vma->vm_mm; 1342 pmd_t entry; 742 pmd_t entry; 1343 spinlock_t *ptl; 743 spinlock_t *ptl; 1344 744 1345 ptl = pmd_lock(mm, pmd); 745 ptl = pmd_lock(mm, pmd); 1346 if (!pmd_none(*pmd)) { << 1347 if (write) { << 1348 if (pmd_pfn(*pmd) != << 1349 WARN_ON_ONCE( << 1350 goto out_unlo << 1351 } << 1352 entry = pmd_mkyoung(* << 1353 entry = maybe_pmd_mkw << 1354 if (pmdp_set_access_f << 1355 update_mmu_ca << 1356 } << 1357 << 1358 goto out_unlock; << 1359 } << 1360 << 1361 entry = pmd_mkhuge(pfn_t_pmd(pfn, pro 746 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); 1362 if (pfn_t_devmap(pfn)) 747 if (pfn_t_devmap(pfn)) 1363 entry = pmd_mkdevmap(entry); 748 entry = pmd_mkdevmap(entry); 1364 else << 1365 entry = pmd_mkspecial(entry); << 1366 if (write) { 749 if (write) { 1367 entry = pmd_mkyoung(pmd_mkdir 750 entry = pmd_mkyoung(pmd_mkdirty(entry)); 1368 entry = maybe_pmd_mkwrite(ent 751 entry = maybe_pmd_mkwrite(entry, vma); 1369 } 752 } 1370 753 1371 if (pgtable) { 754 if (pgtable) { 1372 pgtable_trans_huge_deposit(mm 755 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1373 mm_inc_nr_ptes(mm); 756 mm_inc_nr_ptes(mm); 1374 pgtable = NULL; << 1375 } 757 } 1376 758 1377 set_pmd_at(mm, addr, pmd, entry); 759 set_pmd_at(mm, addr, pmd, entry); 1378 update_mmu_cache_pmd(vma, addr, pmd); 760 update_mmu_cache_pmd(vma, addr, pmd); 1379 << 1380 out_unlock: << 1381 spin_unlock(ptl); 761 spin_unlock(ptl); 1382 if (pgtable) << 1383 pte_free(mm, pgtable); << 1384 } 762 } 1385 763 1386 /** !! 764 vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 1387 * vmf_insert_pfn_pmd - insert a pmd size pfn !! 765 pmd_t *pmd, pfn_t pfn, bool write) 1388 * @vmf: Structure describing the fault << 1389 * @pfn: pfn to insert << 1390 * @write: whether it's a write fault << 1391 * << 1392 * Insert a pmd size pfn. See vmf_insert_pfn( << 1393 * << 1394 * Return: vm_fault_t value. << 1395 */ << 1396 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault << 1397 { 766 { 1398 unsigned long addr = vmf->address & P << 1399 struct vm_area_struct *vma = vmf->vma << 1400 pgprot_t pgprot = vma->vm_page_prot; 767 pgprot_t pgprot = vma->vm_page_prot; 1401 pgtable_t pgtable = NULL; 768 pgtable_t pgtable = NULL; 1402 << 1403 /* 769 /* 1404 * If we had pmd_special, we could av 770 * If we had pmd_special, we could avoid all these restrictions, 1405 * but we need to be consistent with 771 * but we need to be consistent with PTEs and architectures that 1406 * can't support a 'special' bit. 772 * can't support a 'special' bit. 
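/*
 * Illustrative sketch (not part of this file): the zero-page branch above
 * handles the case where the first touch of an untouched anonymous PMD
 * range is a read. Instead of allocating, it maps the shared huge zero
 * page read-only; a later write then takes the write-protect path rather
 * than this one. Assumes THP is allowed for the mapping (e.g. via
 * MADV_HUGEPAGE as in the earlier sketch), the mapping is PMD-aligned,
 * and /sys/kernel/mm/transparent_hugepage/use_zero_page is 1.
 */
#include <sys/mman.h>

#define SZ_2M	(2UL << 20)

int main(void)
{
	char *p = mmap(NULL, SZ_2M, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char c;

	if (p == MAP_FAILED)
		return 1;
	madvise(p, SZ_2M, MADV_HUGEPAGE);

	/* Read-only first touch: backed by the shared huge zero page,
	 * no THP allocation happens. */
	c = *(volatile char *)p;

	/* First write: leaves the zero page behind and goes through the
	 * write-protect/COW handling further down in this file. */
	p[0] = c + 1;
	return 0;
}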
1407 */ 773 */ 1408 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V 774 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 1409 !pfn_t_devmap(pfn)); 775 !pfn_t_devmap(pfn)); 1410 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM 776 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1411 777 (VM_PFNMAP|VM_MIXEDMAP)); 1412 BUG_ON((vma->vm_flags & VM_PFNMAP) && 778 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1413 779 1414 if (addr < vma->vm_start || addr >= v 780 if (addr < vma->vm_start || addr >= vma->vm_end) 1415 return VM_FAULT_SIGBUS; 781 return VM_FAULT_SIGBUS; 1416 782 1417 if (arch_needs_pgtable_deposit()) { 783 if (arch_needs_pgtable_deposit()) { 1418 pgtable = pte_alloc_one(vma-> !! 784 pgtable = pte_alloc_one(vma->vm_mm, addr); 1419 if (!pgtable) 785 if (!pgtable) 1420 return VM_FAULT_OOM; 786 return VM_FAULT_OOM; 1421 } 787 } 1422 788 1423 track_pfn_insert(vma, &pgprot, pfn); 789 track_pfn_insert(vma, &pgprot, pfn); 1424 790 1425 insert_pfn_pmd(vma, addr, vmf->pmd, p !! 791 insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable); 1426 return VM_FAULT_NOPAGE; 792 return VM_FAULT_NOPAGE; 1427 } 793 } 1428 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 794 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 1429 795 1430 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_ 796 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1431 static pud_t maybe_pud_mkwrite(pud_t pud, str 797 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) 1432 { 798 { 1433 if (likely(vma->vm_flags & VM_WRITE)) 799 if (likely(vma->vm_flags & VM_WRITE)) 1434 pud = pud_mkwrite(pud); 800 pud = pud_mkwrite(pud); 1435 return pud; 801 return pud; 1436 } 802 } 1437 803 1438 static void insert_pfn_pud(struct vm_area_str 804 static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 1439 pud_t *pud, pfn_t pfn, bool w !! 805 pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) 1440 { 806 { 1441 struct mm_struct *mm = vma->vm_mm; 807 struct mm_struct *mm = vma->vm_mm; 1442 pgprot_t prot = vma->vm_page_prot; << 1443 pud_t entry; 808 pud_t entry; 1444 spinlock_t *ptl; 809 spinlock_t *ptl; 1445 810 1446 ptl = pud_lock(mm, pud); 811 ptl = pud_lock(mm, pud); 1447 if (!pud_none(*pud)) { << 1448 if (write) { << 1449 if (WARN_ON_ONCE(pud_ << 1450 goto out_unlo << 1451 entry = pud_mkyoung(* << 1452 entry = maybe_pud_mkw << 1453 if (pudp_set_access_f << 1454 update_mmu_ca << 1455 } << 1456 goto out_unlock; << 1457 } << 1458 << 1459 entry = pud_mkhuge(pfn_t_pud(pfn, pro 812 entry = pud_mkhuge(pfn_t_pud(pfn, prot)); 1460 if (pfn_t_devmap(pfn)) 813 if (pfn_t_devmap(pfn)) 1461 entry = pud_mkdevmap(entry); 814 entry = pud_mkdevmap(entry); 1462 else << 1463 entry = pud_mkspecial(entry); << 1464 if (write) { 815 if (write) { 1465 entry = pud_mkyoung(pud_mkdir 816 entry = pud_mkyoung(pud_mkdirty(entry)); 1466 entry = maybe_pud_mkwrite(ent 817 entry = maybe_pud_mkwrite(entry, vma); 1467 } 818 } 1468 set_pud_at(mm, addr, pud, entry); 819 set_pud_at(mm, addr, pud, entry); 1469 update_mmu_cache_pud(vma, addr, pud); 820 update_mmu_cache_pud(vma, addr, pud); 1470 << 1471 out_unlock: << 1472 spin_unlock(ptl); 821 spin_unlock(ptl); 1473 } 822 } 1474 823 1475 /** !! 824 vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 1476 * vmf_insert_pfn_pud - insert a pud size pfn !! 825 pud_t *pud, pfn_t pfn, bool write) 1477 * @vmf: Structure describing the fault << 1478 * @pfn: pfn to insert << 1479 * @write: whether it's a write fault << 1480 * << 1481 * Insert a pud size pfn. 
See vmf_insert_pfn( << 1482 * << 1483 * Return: vm_fault_t value. << 1484 */ << 1485 vm_fault_t vmf_insert_pfn_pud(struct vm_fault << 1486 { 826 { 1487 unsigned long addr = vmf->address & P << 1488 struct vm_area_struct *vma = vmf->vma << 1489 pgprot_t pgprot = vma->vm_page_prot; 827 pgprot_t pgprot = vma->vm_page_prot; 1490 << 1491 /* 828 /* 1492 * If we had pud_special, we could av 829 * If we had pud_special, we could avoid all these restrictions, 1493 * but we need to be consistent with 830 * but we need to be consistent with PTEs and architectures that 1494 * can't support a 'special' bit. 831 * can't support a 'special' bit. 1495 */ 832 */ 1496 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|V 833 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 1497 !pfn_t_devmap(pfn)); 834 !pfn_t_devmap(pfn)); 1498 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM 835 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1499 836 (VM_PFNMAP|VM_MIXEDMAP)); 1500 BUG_ON((vma->vm_flags & VM_PFNMAP) && 837 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1501 838 1502 if (addr < vma->vm_start || addr >= v 839 if (addr < vma->vm_start || addr >= vma->vm_end) 1503 return VM_FAULT_SIGBUS; 840 return VM_FAULT_SIGBUS; 1504 841 1505 track_pfn_insert(vma, &pgprot, pfn); 842 track_pfn_insert(vma, &pgprot, pfn); 1506 843 1507 insert_pfn_pud(vma, addr, vmf->pud, p !! 844 insert_pfn_pud(vma, addr, pud, pfn, pgprot, write); 1508 return VM_FAULT_NOPAGE; 845 return VM_FAULT_NOPAGE; 1509 } 846 } 1510 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); 847 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); 1511 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA 848 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1512 849 1513 void touch_pmd(struct vm_area_struct *vma, un !! 850 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 1514 pmd_t *pmd, bool write) !! 851 pmd_t *pmd, int flags) 1515 { 852 { 1516 pmd_t _pmd; 853 pmd_t _pmd; 1517 854 1518 _pmd = pmd_mkyoung(*pmd); 855 _pmd = pmd_mkyoung(*pmd); 1519 if (write) !! 856 if (flags & FOLL_WRITE) 1520 _pmd = pmd_mkdirty(_pmd); 857 _pmd = pmd_mkdirty(_pmd); 1521 if (pmdp_set_access_flags(vma, addr & 858 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 1522 pmd, _pmd, !! 859 pmd, _pmd, flags & FOLL_WRITE)) 1523 update_mmu_cache_pmd(vma, add 860 update_mmu_cache_pmd(vma, addr, pmd); 1524 } 861 } 1525 862 1526 struct page *follow_devmap_pmd(struct vm_area 863 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, 1527 pmd_t *pmd, int flags, struct 864 pmd_t *pmd, int flags, struct dev_pagemap **pgmap) 1528 { 865 { 1529 unsigned long pfn = pmd_pfn(*pmd); 866 unsigned long pfn = pmd_pfn(*pmd); 1530 struct mm_struct *mm = vma->vm_mm; 867 struct mm_struct *mm = vma->vm_mm; 1531 struct page *page; 868 struct page *page; 1532 int ret; << 1533 869 1534 assert_spin_locked(pmd_lockptr(mm, pm 870 assert_spin_locked(pmd_lockptr(mm, pmd)); 1535 871 >> 872 /* >> 873 * When we COW a devmap PMD entry, we split it into PTEs, so we should >> 874 * not be in this function with `flags & FOLL_COW` set. 
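/*
 * Illustrative sketch (not part of this file): vmf_insert_pfn_pmd() above
 * is the entry point a DAX-style ->huge_fault handler uses to map a
 * PMD-sized chunk of device memory in one go. The sketch uses the
 * (vma, addr, pmd, pfn, write) signature shown in the right-hand column
 * and the enum page_entry_size ->huge_fault prototype of that era;
 * mydrv_huge_fault() and mydrv_lookup_phys() are made-up names standing in
 * for a driver's own code, not existing kernel functions.
 */
#include <linux/mm.h>
#include <linux/pfn_t.h>
#include <linux/huge_mm.h>

static vm_fault_t mydrv_huge_fault(struct vm_fault *vmf,
				   enum page_entry_size pe_size)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	phys_addr_t phys;
	pfn_t pfn;

	if (pe_size != PE_SIZE_PMD)
		return VM_FAULT_FALLBACK;

	/* Hypothetical driver helper: translate the PMD-aligned faulting
	 * offset to the physical address of 2MB of device memory. */
	phys = mydrv_lookup_phys(vmf->vma, pmd_addr);
	pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

	return vmf_insert_pfn_pmd(vmf->vma, pmd_addr, vmf->pmd, pfn, write);
}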
>> 875 */ >> 876 WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); >> 877 1536 if (flags & FOLL_WRITE && !pmd_write( 878 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1537 return NULL; 879 return NULL; 1538 880 1539 if (pmd_present(*pmd) && pmd_devmap(* 881 if (pmd_present(*pmd) && pmd_devmap(*pmd)) 1540 /* pass */; 882 /* pass */; 1541 else 883 else 1542 return NULL; 884 return NULL; 1543 885 1544 if (flags & FOLL_TOUCH) 886 if (flags & FOLL_TOUCH) 1545 touch_pmd(vma, addr, pmd, fla !! 887 touch_pmd(vma, addr, pmd, flags); 1546 888 1547 /* 889 /* 1548 * device mapped pages can only be re 890 * device mapped pages can only be returned if the 1549 * caller will manage the page refere 891 * caller will manage the page reference count. 1550 */ 892 */ 1551 if (!(flags & (FOLL_GET | FOLL_PIN))) !! 893 if (!(flags & FOLL_GET)) 1552 return ERR_PTR(-EEXIST); 894 return ERR_PTR(-EEXIST); 1553 895 1554 pfn += (addr & ~PMD_MASK) >> PAGE_SHI 896 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; 1555 *pgmap = get_dev_pagemap(pfn, *pgmap) 897 *pgmap = get_dev_pagemap(pfn, *pgmap); 1556 if (!*pgmap) 898 if (!*pgmap) 1557 return ERR_PTR(-EFAULT); 899 return ERR_PTR(-EFAULT); 1558 page = pfn_to_page(pfn); 900 page = pfn_to_page(pfn); 1559 ret = try_grab_folio(page_folio(page) !! 901 get_page(page); 1560 if (ret) << 1561 page = ERR_PTR(ret); << 1562 902 1563 return page; 903 return page; 1564 } 904 } 1565 905 1566 int copy_huge_pmd(struct mm_struct *dst_mm, s 906 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1567 pmd_t *dst_pmd, pmd_t *src_ 907 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 1568 struct vm_area_struct *dst_ !! 908 struct vm_area_struct *vma) 1569 { 909 { 1570 spinlock_t *dst_ptl, *src_ptl; 910 spinlock_t *dst_ptl, *src_ptl; 1571 struct page *src_page; 911 struct page *src_page; 1572 struct folio *src_folio; << 1573 pmd_t pmd; 912 pmd_t pmd; 1574 pgtable_t pgtable = NULL; 913 pgtable_t pgtable = NULL; 1575 int ret = -ENOMEM; 914 int ret = -ENOMEM; 1576 915 1577 pmd = pmdp_get_lockless(src_pmd); << 1578 if (unlikely(pmd_present(pmd) && pmd_ << 1579 dst_ptl = pmd_lock(dst_mm, ds << 1580 src_ptl = pmd_lockptr(src_mm, << 1581 spin_lock_nested(src_ptl, SIN << 1582 /* << 1583 * No need to recheck the pmd << 1584 * mmap lock held here. << 1585 * << 1586 * Meanwhile, making sure it' << 1587 * mapping, otherwise it mean << 1588 * applied special bit, or we << 1589 * able to wrongly write to t << 1590 */ << 1591 VM_WARN_ON_ONCE(is_cow_mappin << 1592 goto set_pmd; << 1593 } << 1594 << 1595 /* Skip if can be re-fill on fault */ 916 /* Skip if can be re-fill on fault */ 1596 if (!vma_is_anonymous(dst_vma)) !! 917 if (!vma_is_anonymous(vma)) 1597 return 0; 918 return 0; 1598 919 1599 pgtable = pte_alloc_one(dst_mm); !! 
920 pgtable = pte_alloc_one(dst_mm, addr); 1600 if (unlikely(!pgtable)) 921 if (unlikely(!pgtable)) 1601 goto out; 922 goto out; 1602 923 1603 dst_ptl = pmd_lock(dst_mm, dst_pmd); 924 dst_ptl = pmd_lock(dst_mm, dst_pmd); 1604 src_ptl = pmd_lockptr(src_mm, src_pmd 925 src_ptl = pmd_lockptr(src_mm, src_pmd); 1605 spin_lock_nested(src_ptl, SINGLE_DEPT 926 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1606 927 1607 ret = -EAGAIN; 928 ret = -EAGAIN; 1608 pmd = *src_pmd; 929 pmd = *src_pmd; 1609 930 1610 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 931 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1611 if (unlikely(is_swap_pmd(pmd))) { 932 if (unlikely(is_swap_pmd(pmd))) { 1612 swp_entry_t entry = pmd_to_sw 933 swp_entry_t entry = pmd_to_swp_entry(pmd); 1613 934 1614 VM_BUG_ON(!is_pmd_migration_e 935 VM_BUG_ON(!is_pmd_migration_entry(pmd)); 1615 if (!is_readable_migration_en !! 936 if (is_write_migration_entry(entry)) { 1616 entry = make_readable !! 937 make_migration_entry_read(&entry); 1617 << 1618 pmd = swp_entry_to_pm 938 pmd = swp_entry_to_pmd(entry); 1619 if (pmd_swp_soft_dirt 939 if (pmd_swp_soft_dirty(*src_pmd)) 1620 pmd = pmd_swp 940 pmd = pmd_swp_mksoft_dirty(pmd); 1621 if (pmd_swp_uffd_wp(* << 1622 pmd = pmd_swp << 1623 set_pmd_at(src_mm, ad 941 set_pmd_at(src_mm, addr, src_pmd, pmd); 1624 } 942 } 1625 add_mm_counter(dst_mm, MM_ANO 943 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1626 mm_inc_nr_ptes(dst_mm); 944 mm_inc_nr_ptes(dst_mm); 1627 pgtable_trans_huge_deposit(ds 945 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 1628 if (!userfaultfd_wp(dst_vma)) << 1629 pmd = pmd_swp_clear_u << 1630 set_pmd_at(dst_mm, addr, dst_ 946 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1631 ret = 0; 947 ret = 0; 1632 goto out_unlock; 948 goto out_unlock; 1633 } 949 } 1634 #endif 950 #endif 1635 951 1636 if (unlikely(!pmd_trans_huge(pmd))) { 952 if (unlikely(!pmd_trans_huge(pmd))) { 1637 pte_free(dst_mm, pgtable); 953 pte_free(dst_mm, pgtable); 1638 goto out_unlock; 954 goto out_unlock; 1639 } 955 } 1640 /* 956 /* 1641 * When page table lock is held, the 957 * When page table lock is held, the huge zero pmd should not be 1642 * under splitting since we don't spl 958 * under splitting since we don't split the page itself, only pmd to 1643 * a page table. 959 * a page table. 1644 */ 960 */ 1645 if (is_huge_zero_pmd(pmd)) { 961 if (is_huge_zero_pmd(pmd)) { >> 962 struct page *zero_page; 1646 /* 963 /* 1647 * mm_get_huge_zero_folio() w !! 964 * get_huge_zero_page() will never allocate a new page here, 1648 * folio here, since we alrea !! 965 * since we already have a zero page to copy. It just takes a 1649 * copy. It just takes a refe !! 966 * reference. 1650 */ 967 */ 1651 mm_get_huge_zero_folio(dst_mm !! 968 zero_page = mm_get_huge_zero_page(dst_mm); 1652 goto out_zero_page; !! 969 set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, >> 970 zero_page); >> 971 ret = 0; >> 972 goto out_unlock; 1653 } 973 } 1654 974 1655 src_page = pmd_page(pmd); 975 src_page = pmd_page(pmd); 1656 VM_BUG_ON_PAGE(!PageHead(src_page), s 976 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); 1657 src_folio = page_folio(src_page); !! 977 get_page(src_page); 1658 !! 
978 page_dup_rmap(src_page, true); 1659 folio_get(src_folio); << 1660 if (unlikely(folio_try_dup_anon_rmap_ << 1661 /* Page maybe pinned: split a << 1662 folio_put(src_folio); << 1663 pte_free(dst_mm, pgtable); << 1664 spin_unlock(src_ptl); << 1665 spin_unlock(dst_ptl); << 1666 __split_huge_pmd(src_vma, src << 1667 return -EAGAIN; << 1668 } << 1669 add_mm_counter(dst_mm, MM_ANONPAGES, 979 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1670 out_zero_page: << 1671 mm_inc_nr_ptes(dst_mm); 980 mm_inc_nr_ptes(dst_mm); 1672 pgtable_trans_huge_deposit(dst_mm, ds 981 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); >> 982 1673 pmdp_set_wrprotect(src_mm, addr, src_ 983 pmdp_set_wrprotect(src_mm, addr, src_pmd); 1674 if (!userfaultfd_wp(dst_vma)) !! 984 pmd = pmd_mkold(pmd_wrprotect(pmd)); 1675 pmd = pmd_clear_uffd_wp(pmd); << 1676 pmd = pmd_wrprotect(pmd); << 1677 set_pmd: << 1678 pmd = pmd_mkold(pmd); << 1679 set_pmd_at(dst_mm, addr, dst_pmd, pmd 985 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1680 986 1681 ret = 0; 987 ret = 0; 1682 out_unlock: 988 out_unlock: 1683 spin_unlock(src_ptl); 989 spin_unlock(src_ptl); 1684 spin_unlock(dst_ptl); 990 spin_unlock(dst_ptl); 1685 out: 991 out: 1686 return ret; 992 return ret; 1687 } 993 } 1688 994 1689 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_ 995 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1690 void touch_pud(struct vm_area_struct *vma, un !! 996 static void touch_pud(struct vm_area_struct *vma, unsigned long addr, 1691 pud_t *pud, bool write) !! 997 pud_t *pud, int flags) 1692 { 998 { 1693 pud_t _pud; 999 pud_t _pud; 1694 1000 1695 _pud = pud_mkyoung(*pud); 1001 _pud = pud_mkyoung(*pud); 1696 if (write) !! 1002 if (flags & FOLL_WRITE) 1697 _pud = pud_mkdirty(_pud); 1003 _pud = pud_mkdirty(_pud); 1698 if (pudp_set_access_flags(vma, addr & 1004 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, 1699 pud, _pud, !! 1005 pud, _pud, flags & FOLL_WRITE)) 1700 update_mmu_cache_pud(vma, add 1006 update_mmu_cache_pud(vma, addr, pud); 1701 } 1007 } 1702 1008 >> 1009 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, >> 1010 pud_t *pud, int flags, struct dev_pagemap **pgmap) >> 1011 { >> 1012 unsigned long pfn = pud_pfn(*pud); >> 1013 struct mm_struct *mm = vma->vm_mm; >> 1014 struct page *page; >> 1015 >> 1016 assert_spin_locked(pud_lockptr(mm, pud)); >> 1017 >> 1018 if (flags & FOLL_WRITE && !pud_write(*pud)) >> 1019 return NULL; >> 1020 >> 1021 if (pud_present(*pud) && pud_devmap(*pud)) >> 1022 /* pass */; >> 1023 else >> 1024 return NULL; >> 1025 >> 1026 if (flags & FOLL_TOUCH) >> 1027 touch_pud(vma, addr, pud, flags); >> 1028 >> 1029 /* >> 1030 * device mapped pages can only be returned if the >> 1031 * caller will manage the page reference count. 
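/*
 * Illustrative sketch (not part of this file): copy_huge_pmd() above is
 * the fork() path for a PMD-mapped THP. Rather than copying 2MB at fork
 * time, parent and child end up with the same page mapped write-protected,
 * so whichever side writes first takes the huge-PMD write-protect fault
 * handled later in this file. fork_and_write() is a made-up name, and buf
 * is assumed to already be THP-backed (e.g. from the earlier MADV_HUGEPAGE
 * sketch).
 */
#include <sys/wait.h>
#include <unistd.h>

static void fork_and_write(char *buf)
{
	pid_t pid = fork();	/* copy_huge_pmd() runs for the huge PMD */

	if (pid == 0) {
		/* Child write: huge-PMD write-protect fault. The older code
		 * on the right copies into a fresh huge page (or per-4K
		 * pages); the newer code on the left reuses the page when
		 * this is the only reference, otherwise it splits the PMD
		 * and falls back to the 4K COW path. */
		buf[0] = 1;
		_exit(0);
	}
	buf[0] = 2;		/* the parent side takes the same kind of fault */
	waitpid(pid, NULL, 0);
}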
>> 1032 */ >> 1033 if (!(flags & FOLL_GET)) >> 1034 return ERR_PTR(-EEXIST); >> 1035 >> 1036 pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; >> 1037 *pgmap = get_dev_pagemap(pfn, *pgmap); >> 1038 if (!*pgmap) >> 1039 return ERR_PTR(-EFAULT); >> 1040 page = pfn_to_page(pfn); >> 1041 get_page(page); >> 1042 >> 1043 return page; >> 1044 } >> 1045 1703 int copy_huge_pud(struct mm_struct *dst_mm, s 1046 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1704 pud_t *dst_pud, pud_t *src_ 1047 pud_t *dst_pud, pud_t *src_pud, unsigned long addr, 1705 struct vm_area_struct *vma) 1048 struct vm_area_struct *vma) 1706 { 1049 { 1707 spinlock_t *dst_ptl, *src_ptl; 1050 spinlock_t *dst_ptl, *src_ptl; 1708 pud_t pud; 1051 pud_t pud; 1709 int ret; 1052 int ret; 1710 1053 1711 dst_ptl = pud_lock(dst_mm, dst_pud); 1054 dst_ptl = pud_lock(dst_mm, dst_pud); 1712 src_ptl = pud_lockptr(src_mm, src_pud 1055 src_ptl = pud_lockptr(src_mm, src_pud); 1713 spin_lock_nested(src_ptl, SINGLE_DEPT 1056 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 1714 1057 1715 ret = -EAGAIN; 1058 ret = -EAGAIN; 1716 pud = *src_pud; 1059 pud = *src_pud; 1717 if (unlikely(!pud_trans_huge(pud) && 1060 if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) 1718 goto out_unlock; 1061 goto out_unlock; 1719 1062 1720 /* 1063 /* 1721 * TODO: once we support anonymous pa !! 1064 * When page table lock is held, the huge zero pud should not be 1722 * folio_try_dup_anon_rmap_*() and sp !! 1065 * under splitting since we don't split the page itself, only pud to >> 1066 * a page table. 1723 */ 1067 */ 1724 if (is_cow_mapping(vma->vm_flags) && !! 1068 if (is_huge_zero_pud(pud)) { 1725 pudp_set_wrprotect(src_mm, ad !! 1069 /* No huge zero pud yet */ 1726 pud = pud_wrprotect(pud); << 1727 } 1070 } 1728 pud = pud_mkold(pud); !! 1071 >> 1072 pudp_set_wrprotect(src_mm, addr, src_pud); >> 1073 pud = pud_mkold(pud_wrprotect(pud)); 1729 set_pud_at(dst_mm, addr, dst_pud, pud 1074 set_pud_at(dst_mm, addr, dst_pud, pud); 1730 1075 1731 ret = 0; 1076 ret = 0; 1732 out_unlock: 1077 out_unlock: 1733 spin_unlock(src_ptl); 1078 spin_unlock(src_ptl); 1734 spin_unlock(dst_ptl); 1079 spin_unlock(dst_ptl); 1735 return ret; 1080 return ret; 1736 } 1081 } 1737 1082 1738 void huge_pud_set_accessed(struct vm_fault *v 1083 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) 1739 { 1084 { >> 1085 pud_t entry; >> 1086 unsigned long haddr; 1740 bool write = vmf->flags & FAULT_FLAG_ 1087 bool write = vmf->flags & FAULT_FLAG_WRITE; 1741 1088 1742 vmf->ptl = pud_lock(vmf->vma->vm_mm, 1089 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); 1743 if (unlikely(!pud_same(*vmf->pud, ori 1090 if (unlikely(!pud_same(*vmf->pud, orig_pud))) 1744 goto unlock; 1091 goto unlock; 1745 1092 1746 touch_pud(vmf->vma, vmf->address, vmf !! 1093 entry = pud_mkyoung(orig_pud); >> 1094 if (write) >> 1095 entry = pud_mkdirty(entry); >> 1096 haddr = vmf->address & HPAGE_PUD_MASK; >> 1097 if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write)) >> 1098 update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud); >> 1099 1747 unlock: 1100 unlock: 1748 spin_unlock(vmf->ptl); 1101 spin_unlock(vmf->ptl); 1749 } 1102 } 1750 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA 1103 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 1751 1104 1752 void huge_pmd_set_accessed(struct vm_fault *v !! 
1105 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) 1753 { 1106 { >> 1107 pmd_t entry; >> 1108 unsigned long haddr; 1754 bool write = vmf->flags & FAULT_FLAG_ 1109 bool write = vmf->flags & FAULT_FLAG_WRITE; 1755 1110 1756 vmf->ptl = pmd_lock(vmf->vma->vm_mm, 1111 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1757 if (unlikely(!pmd_same(*vmf->pmd, vmf !! 1112 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) 1758 goto unlock; 1113 goto unlock; 1759 1114 1760 touch_pmd(vmf->vma, vmf->address, vmf !! 1115 entry = pmd_mkyoung(orig_pmd); >> 1116 if (write) >> 1117 entry = pmd_mkdirty(entry); >> 1118 haddr = vmf->address & HPAGE_PMD_MASK; >> 1119 if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write)) >> 1120 update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); 1761 1121 1762 unlock: 1122 unlock: 1763 spin_unlock(vmf->ptl); 1123 spin_unlock(vmf->ptl); 1764 } 1124 } 1765 1125 1766 vm_fault_t do_huge_pmd_wp_page(struct vm_faul !! 1126 static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, >> 1127 pmd_t orig_pmd, struct page *page) 1767 { 1128 { 1768 const bool unshare = vmf->flags & FAU << 1769 struct vm_area_struct *vma = vmf->vma 1129 struct vm_area_struct *vma = vmf->vma; 1770 struct folio *folio; << 1771 struct page *page; << 1772 unsigned long haddr = vmf->address & 1130 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1773 pmd_t orig_pmd = vmf->orig_pmd; !! 1131 struct mem_cgroup *memcg; 1774 !! 1132 pgtable_t pgtable; 1775 vmf->ptl = pmd_lockptr(vma->vm_mm, vm !! 1133 pmd_t _pmd; 1776 VM_BUG_ON_VMA(!vma->anon_vma, vma); !! 1134 int i; 1777 !! 1135 vm_fault_t ret = 0; 1778 if (is_huge_zero_pmd(orig_pmd)) !! 1136 struct page **pages; 1779 goto fallback; !! 1137 unsigned long mmun_start; /* For mmu_notifiers */ >> 1138 unsigned long mmun_end; /* For mmu_notifiers */ >> 1139 >> 1140 pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), >> 1141 GFP_KERNEL); >> 1142 if (unlikely(!pages)) { >> 1143 ret |= VM_FAULT_OOM; >> 1144 goto out; >> 1145 } 1780 1146 1781 spin_lock(vmf->ptl); !! 1147 for (i = 0; i < HPAGE_PMD_NR; i++) { >> 1148 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, >> 1149 vmf->address, page_to_nid(page)); >> 1150 if (unlikely(!pages[i] || >> 1151 mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, >> 1152 GFP_KERNEL, &memcg, false))) { >> 1153 if (pages[i]) >> 1154 put_page(pages[i]); >> 1155 while (--i >= 0) { >> 1156 memcg = (void *)page_private(pages[i]); >> 1157 set_page_private(pages[i], 0); >> 1158 mem_cgroup_cancel_charge(pages[i], memcg, >> 1159 false); >> 1160 put_page(pages[i]); >> 1161 } >> 1162 kfree(pages); >> 1163 ret |= VM_FAULT_OOM; >> 1164 goto out; >> 1165 } >> 1166 set_page_private(pages[i], (unsigned long)memcg); >> 1167 } 1782 1168 1783 if (unlikely(!pmd_same(*vmf->pmd, ori !! 1169 for (i = 0; i < HPAGE_PMD_NR; i++) { 1784 spin_unlock(vmf->ptl); !! 1170 copy_user_highpage(pages[i], page + i, 1785 return 0; !! 1171 haddr + PAGE_SIZE * i, vma); >> 1172 __SetPageUptodate(pages[i]); >> 1173 cond_resched(); 1786 } 1174 } 1787 1175 1788 page = pmd_page(orig_pmd); !! 1176 mmun_start = haddr; 1789 folio = page_folio(page); !! 
1177 mmun_end = haddr + HPAGE_PMD_SIZE; >> 1178 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); >> 1179 >> 1180 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >> 1181 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) >> 1182 goto out_free_pages; 1790 VM_BUG_ON_PAGE(!PageHead(page), page) 1183 VM_BUG_ON_PAGE(!PageHead(page), page); 1791 1184 1792 /* Early check when only holding the !! 1185 /* 1793 if (PageAnonExclusive(page)) !! 1186 * Leave pmd empty until pte is filled note we must notify here as 1794 goto reuse; !! 1187 * concurrent CPU thread might write to new page before the call to >> 1188 * mmu_notifier_invalidate_range_end() happens which can lead to a >> 1189 * device seeing memory write in different order than CPU. >> 1190 * >> 1191 * See Documentation/vm/mmu_notifier.rst >> 1192 */ >> 1193 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); 1795 1194 1796 if (!folio_trylock(folio)) { !! 1195 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); 1797 folio_get(folio); !! 1196 pmd_populate(vma->vm_mm, &_pmd, pgtable); 1798 spin_unlock(vmf->ptl); << 1799 folio_lock(folio); << 1800 spin_lock(vmf->ptl); << 1801 if (unlikely(!pmd_same(*vmf-> << 1802 spin_unlock(vmf->ptl) << 1803 folio_unlock(folio); << 1804 folio_put(folio); << 1805 return 0; << 1806 } << 1807 folio_put(folio); << 1808 } << 1809 1197 1810 /* Recheck after temporarily dropping !! 1198 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1811 if (PageAnonExclusive(page)) { !! 1199 pte_t entry; 1812 folio_unlock(folio); !! 1200 entry = mk_pte(pages[i], vma->vm_page_prot); 1813 goto reuse; !! 1201 entry = maybe_mkwrite(pte_mkdirty(entry), vma); >> 1202 memcg = (void *)page_private(pages[i]); >> 1203 set_page_private(pages[i], 0); >> 1204 page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); >> 1205 mem_cgroup_commit_charge(pages[i], memcg, false, false); >> 1206 lru_cache_add_active_or_unevictable(pages[i], vma); >> 1207 vmf->pte = pte_offset_map(&_pmd, haddr); >> 1208 VM_BUG_ON(!pte_none(*vmf->pte)); >> 1209 set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); >> 1210 pte_unmap(vmf->pte); 1814 } 1211 } >> 1212 kfree(pages); >> 1213 >> 1214 smp_wmb(); /* make pte visible before pmd */ >> 1215 pmd_populate(vma->vm_mm, vmf->pmd, pgtable); >> 1216 page_remove_rmap(page, true); >> 1217 spin_unlock(vmf->ptl); 1815 1218 1816 /* 1219 /* 1817 * See do_wp_page(): we can only reus !! 1220 * No need to double call mmu_notifier->invalidate_range() callback as 1818 * there are no additional references !! 1221 * the above pmdp_huge_clear_flush_notify() did already call it. 1819 * the LRU cache immediately after ad << 1820 */ 1222 */ 1821 if (folio_ref_count(folio) > !! 1223 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, 1822 1 + folio_test_swapca !! 1224 mmun_end); 1823 goto unlock_fallback; << 1824 if (folio_test_swapcache(folio)) << 1825 folio_free_swap(folio); << 1826 if (folio_ref_count(folio) == 1) { << 1827 pmd_t entry; << 1828 1225 1829 folio_move_anon_rmap(folio, v !! 1226 ret |= VM_FAULT_WRITE; 1830 SetPageAnonExclusive(page); !! 1227 put_page(page); 1831 folio_unlock(folio); !! 1228 1832 reuse: !! 1229 out: 1833 if (unlikely(unshare)) { !! 1230 return ret; 1834 spin_unlock(vmf->ptl) !! 1231 1835 return 0; !! 
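/*
 * Summary of the huge-PMD write-protect fault handling interleaved above
 * and below: both versions recheck the PMD under the page-table lock and
 * then decide whether the faulting huge page can simply be made writable
 * again ("reuse") -- the newer code on the left checks that this mapping
 * holds the only reference to the folio, the older code on the right uses
 * reuse_swap_page(). When reuse is not possible, the newer code splits the
 * huge PMD and returns VM_FAULT_FALLBACK so the fault is retried at 4K
 * granularity, while the older code tries to allocate a new huge page (or
 * an array of 4K pages in do_huge_pmd_wp_page_fallback()) and copy the
 * data into it before remapping.
 */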
1232 out_free_pages: >> 1233 spin_unlock(vmf->ptl); >> 1234 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); >> 1235 for (i = 0; i < HPAGE_PMD_NR; i++) { >> 1236 memcg = (void *)page_private(pages[i]); >> 1237 set_page_private(pages[i], 0); >> 1238 mem_cgroup_cancel_charge(pages[i], memcg, false); >> 1239 put_page(pages[i]); >> 1240 } >> 1241 kfree(pages); >> 1242 goto out; >> 1243 } >> 1244 >> 1245 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) >> 1246 { >> 1247 struct vm_area_struct *vma = vmf->vma; >> 1248 struct page *page = NULL, *new_page; >> 1249 struct mem_cgroup *memcg; >> 1250 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> 1251 unsigned long mmun_start; /* For mmu_notifiers */ >> 1252 unsigned long mmun_end; /* For mmu_notifiers */ >> 1253 gfp_t huge_gfp; /* for allocation and charge */ >> 1254 vm_fault_t ret = 0; >> 1255 >> 1256 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); >> 1257 VM_BUG_ON_VMA(!vma->anon_vma, vma); >> 1258 if (is_huge_zero_pmd(orig_pmd)) >> 1259 goto alloc; >> 1260 spin_lock(vmf->ptl); >> 1261 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) >> 1262 goto out_unlock; >> 1263 >> 1264 page = pmd_page(orig_pmd); >> 1265 VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); >> 1266 /* >> 1267 * We can only reuse the page if nobody else maps the huge page or it's >> 1268 * part. >> 1269 */ >> 1270 if (!trylock_page(page)) { >> 1271 get_page(page); >> 1272 spin_unlock(vmf->ptl); >> 1273 lock_page(page); >> 1274 spin_lock(vmf->ptl); >> 1275 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { >> 1276 unlock_page(page); >> 1277 put_page(page); >> 1278 goto out_unlock; 1836 } 1279 } >> 1280 put_page(page); >> 1281 } >> 1282 if (reuse_swap_page(page, NULL)) { >> 1283 pmd_t entry; 1837 entry = pmd_mkyoung(orig_pmd) 1284 entry = pmd_mkyoung(orig_pmd); 1838 entry = maybe_pmd_mkwrite(pmd 1285 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1839 if (pmdp_set_access_flags(vma !! 1286 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) 1840 update_mmu_cache_pmd( 1287 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1841 spin_unlock(vmf->ptl); !! 1288 ret |= VM_FAULT_WRITE; 1842 return 0; !! 1289 unlock_page(page); >> 1290 goto out_unlock; 1843 } 1291 } >> 1292 unlock_page(page); >> 1293 get_page(page); >> 1294 spin_unlock(vmf->ptl); >> 1295 alloc: >> 1296 if (transparent_hugepage_enabled(vma) && >> 1297 !transparent_hugepage_debug_cow()) { >> 1298 huge_gfp = alloc_hugepage_direct_gfpmask(vma); >> 1299 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); >> 1300 } else >> 1301 new_page = NULL; 1844 1302 1845 unlock_fallback: !! 1303 if (likely(new_page)) { 1846 folio_unlock(folio); !! 
1304 prep_transhuge_page(new_page); >> 1305 } else { >> 1306 if (!page) { >> 1307 split_huge_pmd(vma, vmf->pmd, vmf->address); >> 1308 ret |= VM_FAULT_FALLBACK; >> 1309 } else { >> 1310 ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); >> 1311 if (ret & VM_FAULT_OOM) { >> 1312 split_huge_pmd(vma, vmf->pmd, vmf->address); >> 1313 ret |= VM_FAULT_FALLBACK; >> 1314 } >> 1315 put_page(page); >> 1316 } >> 1317 count_vm_event(THP_FAULT_FALLBACK); >> 1318 goto out; >> 1319 } >> 1320 >> 1321 if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, >> 1322 huge_gfp, &memcg, true))) { >> 1323 put_page(new_page); >> 1324 split_huge_pmd(vma, vmf->pmd, vmf->address); >> 1325 if (page) >> 1326 put_page(page); >> 1327 ret |= VM_FAULT_FALLBACK; >> 1328 count_vm_event(THP_FAULT_FALLBACK); >> 1329 goto out; >> 1330 } >> 1331 >> 1332 count_vm_event(THP_FAULT_ALLOC); >> 1333 >> 1334 if (!page) >> 1335 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); >> 1336 else >> 1337 copy_user_huge_page(new_page, page, vmf->address, >> 1338 vma, HPAGE_PMD_NR); >> 1339 __SetPageUptodate(new_page); >> 1340 >> 1341 mmun_start = haddr; >> 1342 mmun_end = haddr + HPAGE_PMD_SIZE; >> 1343 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); >> 1344 >> 1345 spin_lock(vmf->ptl); >> 1346 if (page) >> 1347 put_page(page); >> 1348 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { >> 1349 spin_unlock(vmf->ptl); >> 1350 mem_cgroup_cancel_charge(new_page, memcg, true); >> 1351 put_page(new_page); >> 1352 goto out_mn; >> 1353 } else { >> 1354 pmd_t entry; >> 1355 entry = mk_huge_pmd(new_page, vma->vm_page_prot); >> 1356 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >> 1357 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); >> 1358 page_add_new_anon_rmap(new_page, vma, haddr, true); >> 1359 mem_cgroup_commit_charge(new_page, memcg, false, true); >> 1360 lru_cache_add_active_or_unevictable(new_page, vma); >> 1361 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >> 1362 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >> 1363 if (!page) { >> 1364 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >> 1365 } else { >> 1366 VM_BUG_ON_PAGE(!PageHead(page), page); >> 1367 page_remove_rmap(page, true); >> 1368 put_page(page); >> 1369 } >> 1370 ret |= VM_FAULT_WRITE; >> 1371 } >> 1372 spin_unlock(vmf->ptl); >> 1373 out_mn: >> 1374 /* >> 1375 * No need to double call mmu_notifier->invalidate_range() callback as >> 1376 * the above pmdp_huge_clear_flush_notify() did already call it. >> 1377 */ >> 1378 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, >> 1379 mmun_end); >> 1380 out: >> 1381 return ret; >> 1382 out_unlock: 1847 spin_unlock(vmf->ptl); 1383 spin_unlock(vmf->ptl); 1848 fallback: !! 1384 return ret; 1849 __split_huge_pmd(vma, vmf->pmd, vmf-> << 1850 return VM_FAULT_FALLBACK; << 1851 } 1385 } 1852 1386 1853 static inline bool can_change_pmd_writable(st !! 1387 /* 1854 un !! 1388 * FOLL_FORCE can write to even unwritable pmd's, but only >> 1389 * after we've gone through a COW cycle and they are dirty. >> 1390 */ >> 1391 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) 1855 { 1392 { 1856 struct page *page; !! 1393 return pmd_write(pmd) || >> 1394 ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); >> 1395 } 1857 1396 1858 if (WARN_ON_ONCE(!(vma->vm_flags & VM !! 1397 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 1859 return false; !! 
1398 unsigned long addr, >> 1399 pmd_t *pmd, >> 1400 unsigned int flags) >> 1401 { >> 1402 struct mm_struct *mm = vma->vm_mm; >> 1403 struct page *page = NULL; 1860 1404 1861 /* Don't touch entries that are not e !! 1405 assert_spin_locked(pmd_lockptr(mm, pmd)); 1862 if (pmd_protnone(pmd)) << 1863 return false; << 1864 1406 1865 /* Do we need write faults for softdi !! 1407 if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) 1866 if (pmd_needs_soft_dirty_wp(vma, pmd) !! 1408 goto out; 1867 return false; << 1868 1409 1869 /* Do we need write faults for uffd-w !! 1410 /* Avoid dumping huge zero page */ 1870 if (userfaultfd_huge_pmd_wp(vma, pmd) !! 1411 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) 1871 return false; !! 1412 return ERR_PTR(-EFAULT); 1872 1413 1873 if (!(vma->vm_flags & VM_SHARED)) { !! 1414 /* Full NUMA hinting faults to serialise migration in fault paths */ 1874 /* See can_change_pte_writabl !! 1415 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) 1875 page = vm_normal_page_pmd(vma !! 1416 goto out; 1876 return page && PageAnon(page) !! 1417 1877 } !! 1418 page = pmd_page(*pmd); >> 1419 VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); >> 1420 if (flags & FOLL_TOUCH) >> 1421 touch_pmd(vma, addr, pmd, flags); >> 1422 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { >> 1423 /* >> 1424 * We don't mlock() pte-mapped THPs. This way we can avoid >> 1425 * leaking mlocked pages into non-VM_LOCKED VMAs. >> 1426 * >> 1427 * For anon THP: >> 1428 * >> 1429 * In most cases the pmd is the only mapping of the page as we >> 1430 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for >> 1431 * writable private mappings in populate_vma_page_range(). >> 1432 * >> 1433 * The only scenario when we have the page shared here is if we >> 1434 * mlocking read-only mapping shared over fork(). We skip >> 1435 * mlocking such pages. >> 1436 * >> 1437 * For file THP: >> 1438 * >> 1439 * We can expect PageDoubleMap() to be stable under page lock: >> 1440 * for file pages we set it in page_add_file_rmap(), which >> 1441 * requires page to be locked. >> 1442 */ >> 1443 >> 1444 if (PageAnon(page) && compound_mapcount(page) != 1) >> 1445 goto skip_mlock; >> 1446 if (PageDoubleMap(page) || !page->mapping) >> 1447 goto skip_mlock; >> 1448 if (!trylock_page(page)) >> 1449 goto skip_mlock; >> 1450 lru_add_drain(); >> 1451 if (page->mapping && !PageDoubleMap(page)) >> 1452 mlock_vma_page(page); >> 1453 unlock_page(page); >> 1454 } >> 1455 skip_mlock: >> 1456 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; >> 1457 VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); >> 1458 if (flags & FOLL_GET) >> 1459 get_page(page); 1878 1460 1879 /* See can_change_pte_writable(). */ !! 1461 out: 1880 return pmd_dirty(pmd); !! 1462 return page; 1881 } 1463 } 1882 1464 1883 /* NUMA hinting page fault entry point for tr 1465 /* NUMA hinting page fault entry point for trans huge pmds */ 1884 vm_fault_t do_huge_pmd_numa_page(struct vm_fa !! 1466 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) 1885 { 1467 { 1886 struct vm_area_struct *vma = vmf->vma 1468 struct vm_area_struct *vma = vmf->vma; 1887 struct folio *folio; !! 1469 struct anon_vma *anon_vma = NULL; >> 1470 struct page *page; 1888 unsigned long haddr = vmf->address & 1471 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1889 int nid = NUMA_NO_NODE; !! 1472 int page_nid = -1, this_nid = numa_node_id(); 1890 int target_nid, last_cpupid; !! 
1473 int target_nid, last_cpupid = -1; 1891 pmd_t pmd, old_pmd; !! 1474 bool page_locked; 1892 bool writable = false; !! 1475 bool migrated = false; >> 1476 bool was_writable; 1893 int flags = 0; 1477 int flags = 0; 1894 1478 1895 vmf->ptl = pmd_lock(vma->vm_mm, vmf-> 1479 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1896 old_pmd = pmdp_get(vmf->pmd); !! 1480 if (unlikely(!pmd_same(pmd, *vmf->pmd))) >> 1481 goto out_unlock; 1897 1482 1898 if (unlikely(!pmd_same(old_pmd, vmf-> !! 1483 /* >> 1484 * If there are potential migrations, wait for completion and retry >> 1485 * without disrupting NUMA hinting information. Do not relock and >> 1486 * check_same as the page may no longer be mapped. >> 1487 */ >> 1488 if (unlikely(pmd_trans_migrating(*vmf->pmd))) { >> 1489 page = pmd_page(*vmf->pmd); >> 1490 if (!get_page_unless_zero(page)) >> 1491 goto out_unlock; 1899 spin_unlock(vmf->ptl); 1492 spin_unlock(vmf->ptl); 1900 return 0; !! 1493 wait_on_page_locked(page); >> 1494 put_page(page); >> 1495 goto out; 1901 } 1496 } 1902 1497 1903 pmd = pmd_modify(old_pmd, vma->vm_pag !! 1498 page = pmd_page(pmd); >> 1499 BUG_ON(is_huge_zero_page(page)); >> 1500 page_nid = page_to_nid(page); >> 1501 last_cpupid = page_cpupid_last(page); >> 1502 count_vm_numa_event(NUMA_HINT_FAULTS); >> 1503 if (page_nid == this_nid) { >> 1504 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); >> 1505 flags |= TNF_FAULT_LOCAL; >> 1506 } >> 1507 >> 1508 /* See similar comment in do_numa_page for explanation */ >> 1509 if (!pmd_savedwrite(pmd)) >> 1510 flags |= TNF_NO_GROUP; >> 1511 >> 1512 /* >> 1513 * Acquire the page lock to serialise THP migrations but avoid dropping >> 1514 * page_table_lock if at all possible >> 1515 */ >> 1516 page_locked = trylock_page(page); >> 1517 target_nid = mpol_misplaced(page, vma, haddr); >> 1518 if (target_nid == -1) { >> 1519 /* If the page was locked, there are no parallel migrations */ >> 1520 if (page_locked) >> 1521 goto clear_pmdnuma; >> 1522 } >> 1523 >> 1524 /* Migration could have started since the pmd_trans_migrating check */ >> 1525 if (!page_locked) { >> 1526 page_nid = -1; >> 1527 if (!get_page_unless_zero(page)) >> 1528 goto out_unlock; >> 1529 spin_unlock(vmf->ptl); >> 1530 wait_on_page_locked(page); >> 1531 put_page(page); >> 1532 goto out; >> 1533 } 1904 1534 1905 /* 1535 /* 1906 * Detect now whether the PMD could b !! 1536 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma 1907 * is only valid while holding the PT !! 1537 * to serialises splits 1908 */ 1538 */ 1909 writable = pmd_write(pmd); !! 1539 get_page(page); 1910 if (!writable && vma_wants_manual_pte !! 1540 spin_unlock(vmf->ptl); 1911 can_change_pmd_writable(vma, vmf- !! 1541 anon_vma = page_lock_anon_vma_read(page); 1912 writable = true; << 1913 1542 1914 folio = vm_normal_folio_pmd(vma, hadd !! 1543 /* Confirm the PMD did not change while page_table_lock was released */ 1915 if (!folio) !! 1544 spin_lock(vmf->ptl); 1916 goto out_map; !! 1545 if (unlikely(!pmd_same(pmd, *vmf->pmd))) { >> 1546 unlock_page(page); >> 1547 put_page(page); >> 1548 page_nid = -1; >> 1549 goto out_unlock; >> 1550 } 1917 1551 1918 nid = folio_nid(folio); !! 1552 /* Bail if we fail to protect against THP splits for any reason */ >> 1553 if (unlikely(!anon_vma)) { >> 1554 put_page(page); >> 1555 page_nid = -1; >> 1556 goto clear_pmdnuma; >> 1557 } 1919 1558 1920 target_nid = numa_migrate_check(folio !! 1559 /* 1921 &last !! 
1560 * Since we took the NUMA fault, we must have observed the !accessible 1922 if (target_nid == NUMA_NO_NODE) !! 1561 * bit. Make sure all other CPUs agree with that, to avoid them 1923 goto out_map; !! 1562 * modifying the page we're about to migrate. 1924 if (migrate_misplaced_folio_prepare(f !! 1563 * 1925 flags |= TNF_MIGRATE_FAIL; !! 1564 * Must be done under PTL such that we'll observe the relevant 1926 goto out_map; !! 1565 * inc_tlb_flush_pending(). >> 1566 * >> 1567 * We are not sure a pending tlb flush here is for a huge page >> 1568 * mapping or not. Hence use the tlb range variant >> 1569 */ >> 1570 if (mm_tlb_flush_pending(vma->vm_mm)) { >> 1571 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); >> 1572 /* >> 1573 * change_huge_pmd() released the pmd lock before >> 1574 * invalidating the secondary MMUs sharing the primary >> 1575 * MMU pagetables (with ->invalidate_range()). The >> 1576 * mmu_notifier_invalidate_range_end() (which >> 1577 * internally calls ->invalidate_range()) in >> 1578 * change_pmd_range() will run after us, so we can't >> 1579 * rely on it here and we need an explicit invalidate. >> 1580 */ >> 1581 mmu_notifier_invalidate_range(vma->vm_mm, haddr, >> 1582 haddr + HPAGE_PMD_SIZE); 1927 } 1583 } 1928 /* The folio is isolated and isolatio !! 1584 >> 1585 /* >> 1586 * Migrate the THP to the requested node, returns with page unlocked >> 1587 * and access rights restored. >> 1588 */ 1929 spin_unlock(vmf->ptl); 1589 spin_unlock(vmf->ptl); 1930 writable = false; << 1931 1590 1932 if (!migrate_misplaced_folio(folio, v !! 1591 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, >> 1592 vmf->pmd, pmd, vmf->address, page, target_nid); >> 1593 if (migrated) { 1933 flags |= TNF_MIGRATED; 1594 flags |= TNF_MIGRATED; 1934 nid = target_nid; !! 1595 page_nid = target_nid; 1935 task_numa_fault(last_cpupid, !! 1596 } else 1936 return 0; !! 1597 flags |= TNF_MIGRATE_FAIL; 1937 } << 1938 1598 1939 flags |= TNF_MIGRATE_FAIL; !! 1599 goto out; 1940 vmf->ptl = pmd_lock(vma->vm_mm, vmf-> !! 1600 clear_pmdnuma: 1941 if (unlikely(!pmd_same(pmdp_get(vmf-> !! 1601 BUG_ON(!PageLocked(page)); 1942 spin_unlock(vmf->ptl); !! 1602 was_writable = pmd_savedwrite(pmd); 1943 return 0; !! 1603 pmd = pmd_modify(pmd, vma->vm_page_prot); 1944 } << 1945 out_map: << 1946 /* Restore the PMD */ << 1947 pmd = pmd_modify(pmdp_get(vmf->pmd), << 1948 pmd = pmd_mkyoung(pmd); 1604 pmd = pmd_mkyoung(pmd); 1949 if (writable) !! 1605 if (was_writable) 1950 pmd = pmd_mkwrite(pmd, vma); !! 1606 pmd = pmd_mkwrite(pmd); 1951 set_pmd_at(vma->vm_mm, haddr, vmf->pm 1607 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); 1952 update_mmu_cache_pmd(vma, vmf->addres 1608 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >> 1609 unlock_page(page); >> 1610 out_unlock: 1953 spin_unlock(vmf->ptl); 1611 spin_unlock(vmf->ptl); 1954 1612 1955 if (nid != NUMA_NO_NODE) !! 1613 out: 1956 task_numa_fault(last_cpupid, !! 1614 if (anon_vma) >> 1615 page_unlock_anon_vma_read(anon_vma); >> 1616 >> 1617 if (page_nid != -1) >> 1618 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, >> 1619 flags); >> 1620 1957 return 0; 1621 return 0; 1958 } 1622 } 1959 1623 1960 /* 1624 /* 1961 * Return true if we do MADV_FREE successfull 1625 * Return true if we do MADV_FREE successfully on entire pmd page. 1962 * Otherwise, return false. 1626 * Otherwise, return false. 
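/*
 * Summary of the NUMA hinting fault path above: with the
 * kernel.numa_balancing sysctl enabled, change_huge_pmd() periodically
 * makes huge PMDs inaccessible, so the next touch lands in
 * do_huge_pmd_numa_page(). That handler checks whether the THP sits on the
 * wrong node (mpol_misplaced() in the older code, numa_migrate_check() in
 * the newer code) and, if so, tries to migrate the whole 2MB of memory to
 * the accessing node; when migration is not attempted or fails, it simply
 * restores normal protections on the PMD. Either way task_numa_fault()
 * records the access for the scheduler's placement decisions.
 */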
1963 */ 1627 */ 1964 bool madvise_free_huge_pmd(struct mmu_gather 1628 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1965 pmd_t *pmd, unsigned long add 1629 pmd_t *pmd, unsigned long addr, unsigned long next) 1966 { 1630 { 1967 spinlock_t *ptl; 1631 spinlock_t *ptl; 1968 pmd_t orig_pmd; 1632 pmd_t orig_pmd; 1969 struct folio *folio; !! 1633 struct page *page; 1970 struct mm_struct *mm = tlb->mm; 1634 struct mm_struct *mm = tlb->mm; 1971 bool ret = false; 1635 bool ret = false; 1972 1636 1973 tlb_change_page_size(tlb, HPAGE_PMD_S !! 1637 tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); 1974 1638 1975 ptl = pmd_trans_huge_lock(pmd, vma); 1639 ptl = pmd_trans_huge_lock(pmd, vma); 1976 if (!ptl) 1640 if (!ptl) 1977 goto out_unlocked; 1641 goto out_unlocked; 1978 1642 1979 orig_pmd = *pmd; 1643 orig_pmd = *pmd; 1980 if (is_huge_zero_pmd(orig_pmd)) 1644 if (is_huge_zero_pmd(orig_pmd)) 1981 goto out; 1645 goto out; 1982 1646 1983 if (unlikely(!pmd_present(orig_pmd))) 1647 if (unlikely(!pmd_present(orig_pmd))) { 1984 VM_BUG_ON(thp_migration_suppo 1648 VM_BUG_ON(thp_migration_supported() && 1985 !is_pmd_mig 1649 !is_pmd_migration_entry(orig_pmd)); 1986 goto out; 1650 goto out; 1987 } 1651 } 1988 1652 1989 folio = pmd_folio(orig_pmd); !! 1653 page = pmd_page(orig_pmd); 1990 /* 1654 /* 1991 * If other processes are mapping thi !! 1655 * If other processes are mapping this page, we couldn't discard 1992 * the folio unless they all do MADV_ !! 1656 * the page unless they all do MADV_FREE so let's skip the page. 1993 */ 1657 */ 1994 if (folio_likely_mapped_shared(folio) !! 1658 if (page_mapcount(page) != 1) 1995 goto out; 1659 goto out; 1996 1660 1997 if (!folio_trylock(folio)) !! 1661 if (!trylock_page(page)) 1998 goto out; 1662 goto out; 1999 1663 2000 /* 1664 /* 2001 * If user want to discard part-pages 1665 * If user want to discard part-pages of THP, split it so MADV_FREE 2002 * will deactivate only them. 1666 * will deactivate only them. 2003 */ 1667 */ 2004 if (next - addr != HPAGE_PMD_SIZE) { 1668 if (next - addr != HPAGE_PMD_SIZE) { 2005 folio_get(folio); !! 1669 get_page(page); 2006 spin_unlock(ptl); 1670 spin_unlock(ptl); 2007 split_folio(folio); !! 1671 split_huge_page(page); 2008 folio_unlock(folio); !! 1672 unlock_page(page); 2009 folio_put(folio); !! 1673 put_page(page); 2010 goto out_unlocked; 1674 goto out_unlocked; 2011 } 1675 } 2012 1676 2013 if (folio_test_dirty(folio)) !! 1677 if (PageDirty(page)) 2014 folio_clear_dirty(folio); !! 1678 ClearPageDirty(page); 2015 folio_unlock(folio); !! 1679 unlock_page(page); 2016 1680 2017 if (pmd_young(orig_pmd) || pmd_dirty( 1681 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { 2018 pmdp_invalidate(vma, addr, pm 1682 pmdp_invalidate(vma, addr, pmd); 2019 orig_pmd = pmd_mkold(orig_pmd 1683 orig_pmd = pmd_mkold(orig_pmd); 2020 orig_pmd = pmd_mkclean(orig_p 1684 orig_pmd = pmd_mkclean(orig_pmd); 2021 1685 2022 set_pmd_at(mm, addr, pmd, ori 1686 set_pmd_at(mm, addr, pmd, orig_pmd); 2023 tlb_remove_pmd_tlb_entry(tlb, 1687 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 2024 } 1688 } 2025 1689 2026 folio_mark_lazyfree(folio); !! 
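/*
 * Illustrative sketch (not part of this file): madvise_free_huge_pmd()
 * above implements MADV_FREE for PMD-mapped THPs. A call covering the
 * whole 2MB clears the dirty/young bits and marks the page lazy-free so
 * reclaim may discard it later; a call covering only part of the PMD
 * splits the huge page first so only the covered subpages are affected.
 * drop_thp_lazily() is a made-up name and buf is assumed to be a
 * 2MB-aligned, THP-backed anonymous region.
 */
#include <sys/mman.h>

#define SZ_2M	(2UL << 20)

static void drop_thp_lazily(char *buf)
{
	/* Whole PMD: handled above without splitting the huge page. */
	madvise(buf, SZ_2M, MADV_FREE);

	/* Partial range: the THP is split so MADV_FREE applies only to
	 * the covered 4K pages. */
	madvise(buf, SZ_2M / 2, MADV_FREE);
}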
1690 mark_page_lazyfree(page); 2027 ret = true; 1691 ret = true; 2028 out: 1692 out: 2029 spin_unlock(ptl); 1693 spin_unlock(ptl); 2030 out_unlocked: 1694 out_unlocked: 2031 return ret; 1695 return ret; 2032 } 1696 } 2033 1697 2034 static inline void zap_deposited_table(struct 1698 static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) 2035 { 1699 { 2036 pgtable_t pgtable; 1700 pgtable_t pgtable; 2037 1701 2038 pgtable = pgtable_trans_huge_withdraw 1702 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2039 pte_free(mm, pgtable); 1703 pte_free(mm, pgtable); 2040 mm_dec_nr_ptes(mm); 1704 mm_dec_nr_ptes(mm); 2041 } 1705 } 2042 1706 2043 int zap_huge_pmd(struct mmu_gather *tlb, stru 1707 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 2044 pmd_t *pmd, unsigned long ad 1708 pmd_t *pmd, unsigned long addr) 2045 { 1709 { 2046 pmd_t orig_pmd; 1710 pmd_t orig_pmd; 2047 spinlock_t *ptl; 1711 spinlock_t *ptl; 2048 1712 2049 tlb_change_page_size(tlb, HPAGE_PMD_S !! 1713 tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); 2050 1714 2051 ptl = __pmd_trans_huge_lock(pmd, vma) 1715 ptl = __pmd_trans_huge_lock(pmd, vma); 2052 if (!ptl) 1716 if (!ptl) 2053 return 0; 1717 return 0; 2054 /* 1718 /* 2055 * For architectures like ppc64 we lo 1719 * For architectures like ppc64 we look at deposited pgtable 2056 * when calling pmdp_huge_get_and_cle 1720 * when calling pmdp_huge_get_and_clear. So do the 2057 * pgtable_trans_huge_withdraw after 1721 * pgtable_trans_huge_withdraw after finishing pmdp related 2058 * operations. 1722 * operations. 2059 */ 1723 */ 2060 orig_pmd = pmdp_huge_get_and_clear_fu !! 1724 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, 2061 !! 1725 tlb->fullmm); 2062 arch_check_zapped_pmd(vma, orig_pmd); << 2063 tlb_remove_pmd_tlb_entry(tlb, pmd, ad 1726 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 2064 if (vma_is_special_huge(vma)) { !! 1727 if (vma_is_dax(vma)) { 2065 if (arch_needs_pgtable_deposi 1728 if (arch_needs_pgtable_deposit()) 2066 zap_deposited_table(t 1729 zap_deposited_table(tlb->mm, pmd); 2067 spin_unlock(ptl); 1730 spin_unlock(ptl); >> 1731 if (is_huge_zero_pmd(orig_pmd)) >> 1732 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); 2068 } else if (is_huge_zero_pmd(orig_pmd) 1733 } else if (is_huge_zero_pmd(orig_pmd)) { 2069 zap_deposited_table(tlb->mm, 1734 zap_deposited_table(tlb->mm, pmd); 2070 spin_unlock(ptl); 1735 spin_unlock(ptl); >> 1736 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); 2071 } else { 1737 } else { 2072 struct folio *folio = NULL; !! 1738 struct page *page = NULL; 2073 int flush_needed = 1; 1739 int flush_needed = 1; 2074 1740 2075 if (pmd_present(orig_pmd)) { 1741 if (pmd_present(orig_pmd)) { 2076 struct page *page = p !! 1742 page = pmd_page(orig_pmd); 2077 !! 1743 page_remove_rmap(page, true); 2078 folio = page_folio(pa !! 1744 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 2079 folio_remove_rmap_pmd << 2080 WARN_ON_ONCE(folio_ma << 2081 VM_BUG_ON_PAGE(!PageH 1745 VM_BUG_ON_PAGE(!PageHead(page), page); 2082 } else if (thp_migration_supp 1746 } else if (thp_migration_supported()) { 2083 swp_entry_t entry; 1747 swp_entry_t entry; 2084 1748 2085 VM_BUG_ON(!is_pmd_mig 1749 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); 2086 entry = pmd_to_swp_en 1750 entry = pmd_to_swp_entry(orig_pmd); 2087 folio = pfn_swap_entr !! 
1751 page = pfn_to_page(swp_offset(entry)); 2088 flush_needed = 0; 1752 flush_needed = 0; 2089 } else 1753 } else 2090 WARN_ONCE(1, "Non pre 1754 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); 2091 1755 2092 if (folio_test_anon(folio)) { !! 1756 if (PageAnon(page)) { 2093 zap_deposited_table(t 1757 zap_deposited_table(tlb->mm, pmd); 2094 add_mm_counter(tlb->m 1758 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 2095 } else { 1759 } else { 2096 if (arch_needs_pgtabl 1760 if (arch_needs_pgtable_deposit()) 2097 zap_deposited 1761 zap_deposited_table(tlb->mm, pmd); 2098 add_mm_counter(tlb->m !! 1762 add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); 2099 -HPAGE << 2100 } 1763 } 2101 1764 2102 spin_unlock(ptl); 1765 spin_unlock(ptl); 2103 if (flush_needed) 1766 if (flush_needed) 2104 tlb_remove_page_size( !! 1767 tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); 2105 } 1768 } 2106 return 1; 1769 return 1; 2107 } 1770 } 2108 1771 2109 #ifndef pmd_move_must_withdraw 1772 #ifndef pmd_move_must_withdraw 2110 static inline int pmd_move_must_withdraw(spin 1773 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, 2111 spin 1774 spinlock_t *old_pmd_ptl, 2112 stru 1775 struct vm_area_struct *vma) 2113 { 1776 { 2114 /* 1777 /* 2115 * With split pmd lock we also need t 1778 * With split pmd lock we also need to move preallocated 2116 * PTE page table if new_pmd is on di 1779 * PTE page table if new_pmd is on different PMD page table. 2117 * 1780 * 2118 * We also don't deposit and withdraw 1781 * We also don't deposit and withdraw tables for file pages. 2119 */ 1782 */ 2120 return (new_pmd_ptl != old_pmd_ptl) & 1783 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); 2121 } 1784 } 2122 #endif 1785 #endif 2123 1786 2124 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 1787 static pmd_t move_soft_dirty_pmd(pmd_t pmd) 2125 { 1788 { 2126 #ifdef CONFIG_MEM_SOFT_DIRTY 1789 #ifdef CONFIG_MEM_SOFT_DIRTY 2127 if (unlikely(is_pmd_migration_entry(p 1790 if (unlikely(is_pmd_migration_entry(pmd))) 2128 pmd = pmd_swp_mksoft_dirty(pm 1791 pmd = pmd_swp_mksoft_dirty(pmd); 2129 else if (pmd_present(pmd)) 1792 else if (pmd_present(pmd)) 2130 pmd = pmd_mksoft_dirty(pmd); 1793 pmd = pmd_mksoft_dirty(pmd); 2131 #endif 1794 #endif 2132 return pmd; 1795 return pmd; 2133 } 1796 } 2134 1797 2135 bool move_huge_pmd(struct vm_area_struct *vma 1798 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 2136 unsigned long new_addr, pmd !! 1799 unsigned long new_addr, unsigned long old_end, >> 1800 pmd_t *old_pmd, pmd_t *new_pmd) 2137 { 1801 { 2138 spinlock_t *old_ptl, *new_ptl; 1802 spinlock_t *old_ptl, *new_ptl; 2139 pmd_t pmd; 1803 pmd_t pmd; 2140 struct mm_struct *mm = vma->vm_mm; 1804 struct mm_struct *mm = vma->vm_mm; 2141 bool force_flush = false; 1805 bool force_flush = false; 2142 1806 >> 1807 if ((old_addr & ~HPAGE_PMD_MASK) || >> 1808 (new_addr & ~HPAGE_PMD_MASK) || >> 1809 old_end - old_addr < HPAGE_PMD_SIZE) >> 1810 return false; >> 1811 2143 /* 1812 /* 2144 * The destination pmd shouldn't be e 1813 * The destination pmd shouldn't be established, free_pgtables() 2145 * should have released it; but move_ !! 1814 * should have release it. 2146 * inserted a page table, if racing a << 2147 */ 1815 */ 2148 if (!pmd_none(*new_pmd)) { !! 
1816 if (WARN_ON(!pmd_none(*new_pmd))) { 2149 VM_BUG_ON(pmd_trans_huge(*new 1817 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 2150 return false; 1818 return false; 2151 } 1819 } 2152 1820 2153 /* 1821 /* 2154 * We don't have to worry about the o 1822 * We don't have to worry about the ordering of src and dst 2155 * ptlocks because exclusive mmap_loc !! 1823 * ptlocks because exclusive mmap_sem prevents deadlock. 2156 */ 1824 */ 2157 old_ptl = __pmd_trans_huge_lock(old_p 1825 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 2158 if (old_ptl) { 1826 if (old_ptl) { 2159 new_ptl = pmd_lockptr(mm, new 1827 new_ptl = pmd_lockptr(mm, new_pmd); 2160 if (new_ptl != old_ptl) 1828 if (new_ptl != old_ptl) 2161 spin_lock_nested(new_ 1829 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 2162 pmd = pmdp_huge_get_and_clear 1830 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 2163 if (pmd_present(pmd)) 1831 if (pmd_present(pmd)) 2164 force_flush = true; 1832 force_flush = true; 2165 VM_BUG_ON(!pmd_none(*new_pmd) 1833 VM_BUG_ON(!pmd_none(*new_pmd)); 2166 1834 2167 if (pmd_move_must_withdraw(ne 1835 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 2168 pgtable_t pgtable; 1836 pgtable_t pgtable; 2169 pgtable = pgtable_tra 1837 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 2170 pgtable_trans_huge_de 1838 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 2171 } 1839 } 2172 pmd = move_soft_dirty_pmd(pmd 1840 pmd = move_soft_dirty_pmd(pmd); 2173 set_pmd_at(mm, new_addr, new_ 1841 set_pmd_at(mm, new_addr, new_pmd, pmd); 2174 if (force_flush) 1842 if (force_flush) 2175 flush_pmd_tlb_range(v !! 1843 flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 2176 if (new_ptl != old_ptl) 1844 if (new_ptl != old_ptl) 2177 spin_unlock(new_ptl); 1845 spin_unlock(new_ptl); 2178 spin_unlock(old_ptl); 1846 spin_unlock(old_ptl); 2179 return true; 1847 return true; 2180 } 1848 } 2181 return false; 1849 return false; 2182 } 1850 } 2183 1851 2184 /* 1852 /* 2185 * Returns 1853 * Returns 2186 * - 0 if PMD could not be locked 1854 * - 0 if PMD could not be locked 2187 * - 1 if PMD was locked but protections unc !! 1855 * - 1 if PMD was locked but protections unchange and TLB flush unnecessary 2188 * or if prot_numa but THP migration is !! 1856 * - HPAGE_PMD_NR is protections changed and TLB flush necessary 2189 * - HPAGE_PMD_NR if protections changed and << 2190 */ 1857 */ 2191 int change_huge_pmd(struct mmu_gather *tlb, s !! 1858 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2192 pmd_t *pmd, unsigned long !! 1859 unsigned long addr, pgprot_t newprot, int prot_numa) 2193 unsigned long cp_flags) << 2194 { 1860 { 2195 struct mm_struct *mm = vma->vm_mm; 1861 struct mm_struct *mm = vma->vm_mm; 2196 spinlock_t *ptl; 1862 spinlock_t *ptl; 2197 pmd_t oldpmd, entry; !! 1863 pmd_t entry; 2198 bool prot_numa = cp_flags & MM_CP_PRO !! 1864 bool preserve_write; 2199 bool uffd_wp = cp_flags & MM_CP_UFFD_ !! 
1865 int ret; 2200 bool uffd_wp_resolve = cp_flags & MM_ << 2201 int ret = 1; << 2202 << 2203 tlb_change_page_size(tlb, HPAGE_PMD_S << 2204 << 2205 if (prot_numa && !thp_migration_suppo << 2206 return 1; << 2207 1866 2208 ptl = __pmd_trans_huge_lock(pmd, vma) 1867 ptl = __pmd_trans_huge_lock(pmd, vma); 2209 if (!ptl) 1868 if (!ptl) 2210 return 0; 1869 return 0; 2211 1870 >> 1871 preserve_write = prot_numa && pmd_write(*pmd); >> 1872 ret = 1; >> 1873 2212 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1874 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 2213 if (is_swap_pmd(*pmd)) { 1875 if (is_swap_pmd(*pmd)) { 2214 swp_entry_t entry = pmd_to_sw 1876 swp_entry_t entry = pmd_to_swp_entry(*pmd); 2215 struct folio *folio = pfn_swa << 2216 pmd_t newpmd; << 2217 1877 2218 VM_BUG_ON(!is_pmd_migration_e 1878 VM_BUG_ON(!is_pmd_migration_entry(*pmd)); 2219 if (is_writable_migration_ent !! 1879 if (is_write_migration_entry(entry)) { >> 1880 pmd_t newpmd; 2220 /* 1881 /* 2221 * A protection check 1882 * A protection check is difficult so 2222 * just be safe and d 1883 * just be safe and disable write 2223 */ 1884 */ 2224 if (folio_test_anon(f !! 1885 make_migration_entry_read(&entry); 2225 entry = make_ << 2226 else << 2227 entry = make_ << 2228 newpmd = swp_entry_to 1886 newpmd = swp_entry_to_pmd(entry); 2229 if (pmd_swp_soft_dirt 1887 if (pmd_swp_soft_dirty(*pmd)) 2230 newpmd = pmd_ 1888 newpmd = pmd_swp_mksoft_dirty(newpmd); 2231 } else { << 2232 newpmd = *pmd; << 2233 } << 2234 << 2235 if (uffd_wp) << 2236 newpmd = pmd_swp_mkuf << 2237 else if (uffd_wp_resolve) << 2238 newpmd = pmd_swp_clea << 2239 if (!pmd_same(*pmd, newpmd)) << 2240 set_pmd_at(mm, addr, 1889 set_pmd_at(mm, addr, pmd, newpmd); >> 1890 } 2241 goto unlock; 1891 goto unlock; 2242 } 1892 } 2243 #endif 1893 #endif 2244 1894 2245 if (prot_numa) { !! 1895 /* 2246 struct folio *folio; !! 1896 * Avoid trapping faults against the zero page. The read-only 2247 bool toptier; !! 1897 * data is likely to be read-cached on the local CPU and 2248 /* !! 1898 * local/remote hits to the zero page are not interesting. 2249 * Avoid trapping faults agai !! 1899 */ 2250 * data is likely to be read- !! 1900 if (prot_numa && is_huge_zero_pmd(*pmd)) 2251 * local/remote hits to the z !! 1901 goto unlock; 2252 */ << 2253 if (is_huge_zero_pmd(*pmd)) << 2254 goto unlock; << 2255 << 2256 if (pmd_protnone(*pmd)) << 2257 goto unlock; << 2258 1902 2259 folio = pmd_folio(*pmd); !! 1903 if (prot_numa && pmd_protnone(*pmd)) 2260 toptier = node_is_toptier(fol !! 1904 goto unlock; 2261 /* << 2262 * Skip scanning top tier nod << 2263 * balancing is disabled << 2264 */ << 2265 if (!(sysctl_numa_balancing_m << 2266 toptier) << 2267 goto unlock; << 2268 1905 2269 if (folio_use_access_time(fol << 2270 folio_xchg_access_tim << 2271 << 2272 } << 2273 /* 1906 /* 2274 * In case prot_numa, we are under mm !! 1907 * In case prot_numa, we are under down_read(mmap_sem). It's critical 2275 * to not clear pmd intermittently to 1908 * to not clear pmd intermittently to avoid race with MADV_DONTNEED 2276 * which is also under mmap_read_lock !! 
1909 * which is also under down_read(mmap_sem): 2277 * 1910 * 2278 * CPU0: 1911 * CPU0: CPU1: 2279 * chang 1912 * change_huge_pmd(prot_numa=1) 2280 * pmdp 1913 * pmdp_huge_get_and_clear_notify() 2281 * madvise_dontneed() 1914 * madvise_dontneed() 2282 * zap_pmd_range() 1915 * zap_pmd_range() 2283 * pmd_trans_huge(*pmd) == 0 (witho 1916 * pmd_trans_huge(*pmd) == 0 (without ptl) 2284 * // skip the pmd 1917 * // skip the pmd 2285 * set_ 1918 * set_pmd_at(); 2286 * // p 1919 * // pmd is re-established 2287 * 1920 * 2288 * The race makes MADV_DONTNEED miss 1921 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it 2289 * which may break userspace. 1922 * which may break userspace. 2290 * 1923 * 2291 * pmdp_invalidate_ad() is required t !! 1924 * pmdp_invalidate() is required to make sure we don't miss 2292 * dirty/young flags set by hardware. 1925 * dirty/young flags set by hardware. 2293 */ 1926 */ 2294 oldpmd = pmdp_invalidate_ad(vma, addr !! 1927 entry = pmdp_invalidate(vma, addr, pmd); 2295 << 2296 entry = pmd_modify(oldpmd, newprot); << 2297 if (uffd_wp) << 2298 entry = pmd_mkuffd_wp(entry); << 2299 else if (uffd_wp_resolve) << 2300 /* << 2301 * Leave the write bit to be << 2302 * handler, then things like << 2303 * handled. << 2304 */ << 2305 entry = pmd_clear_uffd_wp(ent << 2306 << 2307 /* See change_pte_range(). */ << 2308 if ((cp_flags & MM_CP_TRY_CHANGE_WRIT << 2309 can_change_pmd_writable(vma, addr << 2310 entry = pmd_mkwrite(entry, vm << 2311 1928 >> 1929 entry = pmd_modify(entry, newprot); >> 1930 if (preserve_write) >> 1931 entry = pmd_mk_savedwrite(entry); 2312 ret = HPAGE_PMD_NR; 1932 ret = HPAGE_PMD_NR; 2313 set_pmd_at(mm, addr, pmd, entry); 1933 set_pmd_at(mm, addr, pmd, entry); 2314 !! 1934 BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); 2315 if (huge_pmd_needs_flush(oldpmd, entr << 2316 tlb_flush_pmd_range(tlb, addr << 2317 unlock: 1935 unlock: 2318 spin_unlock(ptl); 1936 spin_unlock(ptl); 2319 return ret; 1937 return ret; 2320 } 1938 } 2321 1939 2322 /* 1940 /* 2323 * Returns: << 2324 * << 2325 * - 0: if pud leaf changed from under us << 2326 * - 1: if pud can be skipped << 2327 * - HPAGE_PUD_NR: if pud was successfully pr << 2328 */ << 2329 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_ << 2330 int change_huge_pud(struct mmu_gather *tlb, s << 2331 pud_t *pudp, unsigned lon << 2332 unsigned long cp_flags) << 2333 { << 2334 struct mm_struct *mm = vma->vm_mm; << 2335 pud_t oldpud, entry; << 2336 spinlock_t *ptl; << 2337 << 2338 tlb_change_page_size(tlb, HPAGE_PUD_S << 2339 << 2340 /* NUMA balancing doesn't apply to da << 2341 if (cp_flags & MM_CP_PROT_NUMA) << 2342 return 1; << 2343 << 2344 /* << 2345 * Huge entries on userfault-wp only << 2346 * don't have anonymous PUDs yet. << 2347 */ << 2348 if (WARN_ON_ONCE(cp_flags & MM_CP_UFF << 2349 return 1; << 2350 << 2351 ptl = __pud_trans_huge_lock(pudp, vma << 2352 if (!ptl) << 2353 return 0; << 2354 << 2355 /* << 2356 * Can't clear PUD or it can race wit << 2357 * change_huge_pmd(). 
<< 2358 */ << 2359 oldpud = pudp_invalidate(vma, addr, p << 2360 entry = pud_modify(oldpud, newprot); << 2361 set_pud_at(mm, addr, pudp, entry); << 2362 tlb_flush_pud_range(tlb, addr, HPAGE_ << 2363 << 2364 spin_unlock(ptl); << 2365 return HPAGE_PUD_NR; << 2366 } << 2367 #endif << 2368 << 2369 #ifdef CONFIG_USERFAULTFD << 2370 /* << 2371 * The PT lock for src_pmd and dst_vma/src_vm << 2372 * the caller, but it must return after relea << 2373 * Just move the page from src_pmd to dst_pmd << 2374 * Return zero if succeeded in moving the pag << 2375 * repeated by the caller, or other errors in << 2376 */ << 2377 int move_pages_huge_pmd(struct mm_struct *mm, << 2378 struct vm_area_struct << 2379 unsigned long dst_add << 2380 { << 2381 pmd_t _dst_pmd, src_pmdval; << 2382 struct page *src_page; << 2383 struct folio *src_folio; << 2384 struct anon_vma *src_anon_vma; << 2385 spinlock_t *src_ptl, *dst_ptl; << 2386 pgtable_t src_pgtable; << 2387 struct mmu_notifier_range range; << 2388 int err = 0; << 2389 << 2390 src_pmdval = *src_pmd; << 2391 src_ptl = pmd_lockptr(mm, src_pmd); << 2392 << 2393 lockdep_assert_held(src_ptl); << 2394 vma_assert_locked(src_vma); << 2395 vma_assert_locked(dst_vma); << 2396 << 2397 /* Sanity checks before the operation << 2398 if (WARN_ON_ONCE(!pmd_none(dst_pmdval << 2399 WARN_ON_ONCE(dst_addr & ~HPAGE_PM << 2400 spin_unlock(src_ptl); << 2401 return -EINVAL; << 2402 } << 2403 << 2404 if (!pmd_trans_huge(src_pmdval)) { << 2405 spin_unlock(src_ptl); << 2406 if (is_pmd_migration_entry(sr << 2407 pmd_migration_entry_w << 2408 return -EAGAIN; << 2409 } << 2410 return -ENOENT; << 2411 } << 2412 << 2413 src_page = pmd_page(src_pmdval); << 2414 << 2415 if (!is_huge_zero_pmd(src_pmdval)) { << 2416 if (unlikely(!PageAnonExclusi << 2417 spin_unlock(src_ptl); << 2418 return -EBUSY; << 2419 } << 2420 << 2421 src_folio = page_folio(src_pa << 2422 folio_get(src_folio); << 2423 } else << 2424 src_folio = NULL; << 2425 << 2426 spin_unlock(src_ptl); << 2427 << 2428 flush_cache_range(src_vma, src_addr, << 2429 mmu_notifier_range_init(&range, MMU_N << 2430 src_addr + HP << 2431 mmu_notifier_invalidate_range_start(& << 2432 << 2433 if (src_folio) { << 2434 folio_lock(src_folio); << 2435 << 2436 /* << 2437 * split_huge_page walks the << 2438 * lock. Serialize against it << 2439 * lock is not enough. 
<< 2440 */ << 2441 src_anon_vma = folio_get_anon << 2442 if (!src_anon_vma) { << 2443 err = -EAGAIN; << 2444 goto unlock_folio; << 2445 } << 2446 anon_vma_lock_write(src_anon_ << 2447 } else << 2448 src_anon_vma = NULL; << 2449 << 2450 dst_ptl = pmd_lockptr(mm, dst_pmd); << 2451 double_pt_lock(src_ptl, dst_ptl); << 2452 if (unlikely(!pmd_same(*src_pmd, src_ << 2453 !pmd_same(*dst_pmd, dst_ << 2454 err = -EAGAIN; << 2455 goto unlock_ptls; << 2456 } << 2457 if (src_folio) { << 2458 if (folio_maybe_dma_pinned(sr << 2459 !PageAnonExclusive(&src_f << 2460 err = -EBUSY; << 2461 goto unlock_ptls; << 2462 } << 2463 << 2464 if (WARN_ON_ONCE(!folio_test_ << 2465 WARN_ON_ONCE(!folio_test_ << 2466 err = -EBUSY; << 2467 goto unlock_ptls; << 2468 } << 2469 << 2470 src_pmdval = pmdp_huge_clear_ << 2471 /* Folio got pinned from unde << 2472 if (folio_maybe_dma_pinned(sr << 2473 set_pmd_at(mm, src_ad << 2474 err = -EBUSY; << 2475 goto unlock_ptls; << 2476 } << 2477 << 2478 folio_move_anon_rmap(src_foli << 2479 src_folio->index = linear_pag << 2480 << 2481 _dst_pmd = mk_huge_pmd(&src_f << 2482 /* Follow mremap() behavior a << 2483 _dst_pmd = pmd_mkwrite(pmd_mk << 2484 } else { << 2485 src_pmdval = pmdp_huge_clear_ << 2486 _dst_pmd = mk_huge_pmd(src_pa << 2487 } << 2488 set_pmd_at(mm, dst_addr, dst_pmd, _ds << 2489 << 2490 src_pgtable = pgtable_trans_huge_with << 2491 pgtable_trans_huge_deposit(mm, dst_pm << 2492 unlock_ptls: << 2493 double_pt_unlock(src_ptl, dst_ptl); << 2494 if (src_anon_vma) { << 2495 anon_vma_unlock_write(src_ano << 2496 put_anon_vma(src_anon_vma); << 2497 } << 2498 unlock_folio: << 2499 /* unblock rmap walks */ << 2500 if (src_folio) << 2501 folio_unlock(src_folio); << 2502 mmu_notifier_invalidate_range_end(&ra << 2503 if (src_folio) << 2504 folio_put(src_folio); << 2505 return err; << 2506 } << 2507 #endif /* CONFIG_USERFAULTFD */ << 2508 << 2509 /* << 2510 * Returns page table lock pointer if a given 1941 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 2511 * 1942 * 2512 * Note that if it returns page table lock po 1943 * Note that if it returns page table lock pointer, this routine returns without 2513 * unlocking page table lock. So callers must 1944 * unlocking page table lock. So callers must unlock it. 2514 */ 1945 */ 2515 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, 1946 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 2516 { 1947 { 2517 spinlock_t *ptl; 1948 spinlock_t *ptl; 2518 ptl = pmd_lock(vma->vm_mm, pmd); 1949 ptl = pmd_lock(vma->vm_mm, pmd); 2519 if (likely(is_swap_pmd(*pmd) || pmd_t 1950 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || 2520 pmd_devmap(*pmd))) 1951 pmd_devmap(*pmd))) 2521 return ptl; 1952 return ptl; 2522 spin_unlock(ptl); 1953 spin_unlock(ptl); 2523 return NULL; 1954 return NULL; 2524 } 1955 } 2525 1956 2526 /* 1957 /* 2527 * Returns page table lock pointer if a given !! 1958 * Returns true if a given pud maps a thp, false otherwise. 2528 * 1959 * 2529 * Note that if it returns page table lock po !! 1960 * Note that if it returns true, this routine returns without unlocking page 2530 * unlocking page table lock. So callers must !! 1961 * table lock. So callers must unlock it. 
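 *
 * A minimal usage sketch (illustrative only), mirroring how zap_huge_pud()
 * below consumes this helper:
 *
 *	ptl = __pud_trans_huge_lock(pud, vma);
 *	if (!ptl)
 *		return 0;
 *	... operate on the huge pud while ptl is held ...
 *	spin_unlock(ptl);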
2531 */ 1962 */ 2532 spinlock_t *__pud_trans_huge_lock(pud_t *pud, 1963 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2533 { 1964 { 2534 spinlock_t *ptl; 1965 spinlock_t *ptl; 2535 1966 2536 ptl = pud_lock(vma->vm_mm, pud); 1967 ptl = pud_lock(vma->vm_mm, pud); 2537 if (likely(pud_trans_huge(*pud) || pu 1968 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) 2538 return ptl; 1969 return ptl; 2539 spin_unlock(ptl); 1970 spin_unlock(ptl); 2540 return NULL; 1971 return NULL; 2541 } 1972 } 2542 1973 2543 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_ 1974 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2544 int zap_huge_pud(struct mmu_gather *tlb, stru 1975 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2545 pud_t *pud, unsigned long ad 1976 pud_t *pud, unsigned long addr) 2546 { 1977 { 2547 spinlock_t *ptl; << 2548 pud_t orig_pud; 1978 pud_t orig_pud; >> 1979 spinlock_t *ptl; 2549 1980 2550 ptl = __pud_trans_huge_lock(pud, vma) 1981 ptl = __pud_trans_huge_lock(pud, vma); 2551 if (!ptl) 1982 if (!ptl) 2552 return 0; 1983 return 0; 2553 !! 1984 /* 2554 orig_pud = pudp_huge_get_and_clear_fu !! 1985 * For architectures like ppc64 we look at deposited pgtable 2555 arch_check_zapped_pud(vma, orig_pud); !! 1986 * when calling pudp_huge_get_and_clear. So do the >> 1987 * pgtable_trans_huge_withdraw after finishing pudp related >> 1988 * operations. >> 1989 */ >> 1990 orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, >> 1991 tlb->fullmm); 2556 tlb_remove_pud_tlb_entry(tlb, pud, ad 1992 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2557 if (vma_is_special_huge(vma)) { !! 1993 if (vma_is_dax(vma)) { 2558 spin_unlock(ptl); 1994 spin_unlock(ptl); 2559 /* No zero page support yet * 1995 /* No zero page support yet */ 2560 } else { 1996 } else { 2561 /* No support for anonymous P 1997 /* No support for anonymous PUD pages yet */ 2562 BUG(); 1998 BUG(); 2563 } 1999 } 2564 return 1; 2000 return 1; 2565 } 2001 } 2566 2002 2567 static void __split_huge_pud_locked(struct vm 2003 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 2568 unsigned long haddr) 2004 unsigned long haddr) 2569 { 2005 { 2570 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2006 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2571 VM_BUG_ON_VMA(vma->vm_start > haddr, 2007 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2572 VM_BUG_ON_VMA(vma->vm_end < haddr + H 2008 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 2573 VM_BUG_ON(!pud_trans_huge(*pud) && !p 2009 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); 2574 2010 2575 count_vm_event(THP_SPLIT_PUD); 2011 count_vm_event(THP_SPLIT_PUD); 2576 2012 2577 pudp_huge_clear_flush(vma, haddr, pud !! 2013 pudp_huge_clear_flush_notify(vma, haddr, pud); 2578 } 2014 } 2579 2015 2580 void __split_huge_pud(struct vm_area_struct * 2016 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2581 unsigned long address) 2017 unsigned long address) 2582 { 2018 { 2583 spinlock_t *ptl; 2019 spinlock_t *ptl; 2584 struct mmu_notifier_range range; !! 2020 struct mm_struct *mm = vma->vm_mm; >> 2021 unsigned long haddr = address & HPAGE_PUD_MASK; 2585 2022 2586 mmu_notifier_range_init(&range, MMU_N !! 2023 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); 2587 address & HPA !! 
2024 ptl = pud_lock(mm, pud); 2588 (address & HP << 2589 mmu_notifier_invalidate_range_start(& << 2590 ptl = pud_lock(vma->vm_mm, pud); << 2591 if (unlikely(!pud_trans_huge(*pud) && 2025 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2592 goto out; 2026 goto out; 2593 __split_huge_pud_locked(vma, pud, ran !! 2027 __split_huge_pud_locked(vma, pud, haddr); 2594 2028 2595 out: 2029 out: 2596 spin_unlock(ptl); 2030 spin_unlock(ptl); 2597 mmu_notifier_invalidate_range_end(&ra !! 2031 /* 2598 } !! 2032 * No need to double call mmu_notifier->invalidate_range() callback as 2599 #else !! 2033 * the above pudp_huge_clear_flush_notify() did already call it. 2600 void __split_huge_pud(struct vm_area_struct * !! 2034 */ 2601 unsigned long address) !! 2035 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + 2602 { !! 2036 HPAGE_PUD_SIZE); 2603 } 2037 } 2604 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPA 2038 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2605 2039 2606 static void __split_huge_zero_page_pmd(struct 2040 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2607 unsigned long haddr, pmd_t *p 2041 unsigned long haddr, pmd_t *pmd) 2608 { 2042 { 2609 struct mm_struct *mm = vma->vm_mm; 2043 struct mm_struct *mm = vma->vm_mm; 2610 pgtable_t pgtable; 2044 pgtable_t pgtable; 2611 pmd_t _pmd, old_pmd; !! 2045 pmd_t _pmd; 2612 unsigned long addr; << 2613 pte_t *pte; << 2614 int i; 2046 int i; 2615 2047 2616 /* 2048 /* 2617 * Leave pmd empty until pte is fille 2049 * Leave pmd empty until pte is filled note that it is fine to delay 2618 * notification until mmu_notifier_in 2050 * notification until mmu_notifier_invalidate_range_end() as we are 2619 * replacing a zero pmd write protect 2051 * replacing a zero pmd write protected page with a zero pte write 2620 * protected page. 2052 * protected page. 2621 * 2053 * 2622 * See Documentation/mm/mmu_notifier. !! 2054 * See Documentation/vm/mmu_notifier.rst 2623 */ 2055 */ 2624 old_pmd = pmdp_huge_clear_flush(vma, !! 2056 pmdp_huge_clear_flush(vma, haddr, pmd); 2625 2057 2626 pgtable = pgtable_trans_huge_withdraw 2058 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2627 pmd_populate(mm, &_pmd, pgtable); 2059 pmd_populate(mm, &_pmd, pgtable); 2628 2060 2629 pte = pte_offset_map(&_pmd, haddr); !! 2061 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2630 VM_BUG_ON(!pte); !! 2062 pte_t *pte, entry; 2631 for (i = 0, addr = haddr; i < HPAGE_P !! 2063 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2632 pte_t entry; << 2633 << 2634 entry = pfn_pte(my_zero_pfn(a << 2635 entry = pte_mkspecial(entry); 2064 entry = pte_mkspecial(entry); 2636 if (pmd_uffd_wp(old_pmd)) !! 2065 pte = pte_offset_map(&_pmd, haddr); 2637 entry = pte_mkuffd_wp !! 2066 VM_BUG_ON(!pte_none(*pte)); 2638 VM_BUG_ON(!pte_none(ptep_get( !! 2067 set_pte_at(mm, haddr, pte, entry); 2639 set_pte_at(mm, addr, pte, ent !! 
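		/*
		 * Each pass of this loop installs one pte_mkspecial(),
		 * write-protected PTE mapping the shared zero page at the
		 * corresponding PAGE_SIZE slot, so once all HPAGE_PMD_NR
		 * entries are in place the former huge zero-page PMD is
		 * backed by an equivalent set of small zero-page PTEs:
		 * userspace still reads zeroes and writes still fault,
		 * exactly as before.
		 */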
2068 pte_unmap(pte); 2640 pte++; << 2641 } 2069 } 2642 pte_unmap(pte - 1); << 2643 smp_wmb(); /* make pte visible before 2070 smp_wmb(); /* make pte visible before pmd */ 2644 pmd_populate(mm, pmd, pgtable); 2071 pmd_populate(mm, pmd, pgtable); 2645 } 2072 } 2646 2073 2647 static void __split_huge_pmd_locked(struct vm 2074 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2648 unsigned long haddr, bool fre 2075 unsigned long haddr, bool freeze) 2649 { 2076 { 2650 struct mm_struct *mm = vma->vm_mm; 2077 struct mm_struct *mm = vma->vm_mm; 2651 struct folio *folio; << 2652 struct page *page; 2078 struct page *page; 2653 pgtable_t pgtable; 2079 pgtable_t pgtable; 2654 pmd_t old_pmd, _pmd; 2080 pmd_t old_pmd, _pmd; 2655 bool young, write, soft_dirty, pmd_mi !! 2081 bool young, write, soft_dirty, pmd_migration = false; 2656 bool anon_exclusive = false, dirty = << 2657 unsigned long addr; 2082 unsigned long addr; 2658 pte_t *pte; << 2659 int i; 2083 int i; 2660 2084 2661 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2085 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2662 VM_BUG_ON_VMA(vma->vm_start > haddr, 2086 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2663 VM_BUG_ON_VMA(vma->vm_end < haddr + H 2087 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2664 VM_BUG_ON(!is_pmd_migration_entry(*pm 2088 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) 2665 && !pmd_devma 2089 && !pmd_devmap(*pmd)); 2666 2090 2667 count_vm_event(THP_SPLIT_PMD); 2091 count_vm_event(THP_SPLIT_PMD); 2668 2092 2669 if (!vma_is_anonymous(vma)) { 2093 if (!vma_is_anonymous(vma)) { 2670 old_pmd = pmdp_huge_clear_flu !! 2094 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2671 /* 2095 /* 2672 * We are going to unmap this 2096 * We are going to unmap this huge page. So 2673 * just go ahead and zap it 2097 * just go ahead and zap it 2674 */ 2098 */ 2675 if (arch_needs_pgtable_deposi 2099 if (arch_needs_pgtable_deposit()) 2676 zap_deposited_table(m 2100 zap_deposited_table(mm, pmd); 2677 if (vma_is_special_huge(vma)) !! 2101 if (vma_is_dax(vma)) 2678 return; 2102 return; 2679 if (unlikely(is_pmd_migration !! 2103 page = pmd_page(_pmd); 2680 swp_entry_t entry; !! 2104 if (!PageDirty(page) && pmd_dirty(_pmd)) 2681 !! 2105 set_page_dirty(page); 2682 entry = pmd_to_swp_en !! 2106 if (!PageReferenced(page) && pmd_young(_pmd)) 2683 folio = pfn_swap_entr !! 2107 SetPageReferenced(page); 2684 } else { !! 2108 page_remove_rmap(page, true); 2685 page = pmd_page(old_p !! 2109 put_page(page); 2686 folio = page_folio(pa !! 2110 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); 2687 if (!folio_test_dirty << 2688 folio_mark_di << 2689 if (!folio_test_refer << 2690 folio_set_ref << 2691 folio_remove_rmap_pmd << 2692 folio_put(folio); << 2693 } << 2694 add_mm_counter(mm, mm_counter << 2695 return; 2111 return; 2696 } !! 2112 } else if (is_huge_zero_pmd(*pmd)) { 2697 << 2698 if (is_huge_zero_pmd(*pmd)) { << 2699 /* 2113 /* 2700 * FIXME: Do we want to inval 2114 * FIXME: Do we want to invalidate secondary mmu by calling 2701 * mmu_notifier_arch_invalida !! 2115 * mmu_notifier_invalidate_range() see comments below inside 2702 * inside __split_huge_pmd() !! 2116 * __split_huge_pmd() ? 2703 * 2117 * 2704 * We are going from a zero h 2118 * We are going from a zero huge page write protected to zero 2705 * small page also write prot 2119 * small page also write protected so it does not seems useful 2706 * to invalidate secondary mm 2120 * to invalidate secondary mmu at this time. 
2707 */ 2121 */ 2708 return __split_huge_zero_page 2122 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2709 } 2123 } 2710 2124 2711 pmd_migration = is_pmd_migration_entr !! 2125 /* >> 2126 * Up to this point the pmd is present and huge and userland has the >> 2127 * whole access to the hugepage during the split (which happens in >> 2128 * place). If we overwrite the pmd with the not-huge version pointing >> 2129 * to the pte here (which of course we could if all CPUs were bug >> 2130 * free), userland could trigger a small page size TLB miss on the >> 2131 * small sized TLB while the hugepage TLB entry is still established in >> 2132 * the huge TLB. Some CPU doesn't like that. >> 2133 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum >> 2134 * 383 on page 93. Intel should be safe but is also warns that it's >> 2135 * only safe if the permission and cache attributes of the two entries >> 2136 * loaded in the two TLB is identical (which should be the case here). >> 2137 * But it is generally safer to never allow small and huge TLB entries >> 2138 * for the same virtual address to be loaded simultaneously. So instead >> 2139 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the >> 2140 * current pmd notpresent (atomically because here the pmd_trans_huge >> 2141 * must remain set at all times on the pmd until the split is complete >> 2142 * for this pmd), then we flush the SMP TLB and finally we write the >> 2143 * non-huge version of the pmd entry with pmd_populate. >> 2144 */ >> 2145 old_pmd = pmdp_invalidate(vma, haddr, pmd); >> 2146 >> 2147 pmd_migration = is_pmd_migration_entry(old_pmd); 2712 if (unlikely(pmd_migration)) { 2148 if (unlikely(pmd_migration)) { 2713 swp_entry_t entry; 2149 swp_entry_t entry; 2714 2150 2715 old_pmd = *pmd; << 2716 entry = pmd_to_swp_entry(old_ 2151 entry = pmd_to_swp_entry(old_pmd); 2717 page = pfn_swap_entry_to_page !! 2152 page = pfn_to_page(swp_offset(entry)); 2718 write = is_writable_migration !! 2153 write = is_write_migration_entry(entry); 2719 if (PageAnon(page)) !! 2154 young = false; 2720 anon_exclusive = is_r << 2721 young = is_migration_entry_yo << 2722 dirty = is_migration_entry_di << 2723 soft_dirty = pmd_swp_soft_dir 2155 soft_dirty = pmd_swp_soft_dirty(old_pmd); 2724 uffd_wp = pmd_swp_uffd_wp(old << 2725 } else { 2156 } else { 2726 /* << 2727 * Up to this point the pmd i << 2728 * the whole access to the hu << 2729 * happens in place). If we o << 2730 * version pointing to the pt << 2731 * all CPUs were bug free), u << 2732 * size TLB miss on the small << 2733 * entry is still established << 2734 * like that. See << 2735 * http://support.amd.com/Tec << 2736 * 383 on page 105. Intel sho << 2737 * it's only safe if the perm << 2738 * two entries loaded in the << 2739 * be the case here). But it << 2740 * small and huge TLB entries << 2741 * loaded simultaneously. So << 2742 * flush_pmd_tlb_range();" we << 2743 * notpresent (atomically bec << 2744 * remain set at all times on << 2745 * complete for this pmd), th << 2746 * we write the non-huge vers << 2747 * pmd_populate. << 2748 */ << 2749 old_pmd = pmdp_invalidate(vma << 2750 page = pmd_page(old_pmd); 2157 page = pmd_page(old_pmd); 2751 folio = page_folio(page); !! 2158 if (pmd_dirty(old_pmd)) 2752 if (pmd_dirty(old_pmd)) { !! 
2159 SetPageDirty(page); 2753 dirty = true; << 2754 folio_set_dirty(folio << 2755 } << 2756 write = pmd_write(old_pmd); 2160 write = pmd_write(old_pmd); 2757 young = pmd_young(old_pmd); 2161 young = pmd_young(old_pmd); 2758 soft_dirty = pmd_soft_dirty(o 2162 soft_dirty = pmd_soft_dirty(old_pmd); 2759 uffd_wp = pmd_uffd_wp(old_pmd << 2760 << 2761 VM_WARN_ON_FOLIO(!folio_ref_c << 2762 VM_WARN_ON_FOLIO(!folio_test_ << 2763 << 2764 /* << 2765 * Without "freeze", we'll si << 2766 * PageAnonExclusive() flag f << 2767 * each subpage -- no need to << 2768 * << 2769 * With "freeze" we want to r << 2770 * migration entries right aw << 2771 * managed to clear PageAnonE << 2772 * set_pmd_migration_entry(). << 2773 * << 2774 * In case we cannot clear Pa << 2775 * only and let try_to_migrat << 2776 * << 2777 * See folio_try_share_anon_r << 2778 */ << 2779 anon_exclusive = PageAnonExcl << 2780 if (freeze && anon_exclusive << 2781 folio_try_share_anon_rmap << 2782 freeze = false; << 2783 if (!freeze) { << 2784 rmap_t rmap_flags = R << 2785 << 2786 folio_ref_add(folio, << 2787 if (anon_exclusive) << 2788 rmap_flags |= << 2789 folio_add_anon_rmap_p << 2790 << 2791 } << 2792 } 2163 } >> 2164 VM_BUG_ON_PAGE(!page_count(page), page); >> 2165 page_ref_add(page, HPAGE_PMD_NR - 1); 2793 2166 2794 /* 2167 /* 2795 * Withdraw the table only after we m 2168 * Withdraw the table only after we mark the pmd entry invalid. 2796 * This's critical for some architect 2169 * This's critical for some architectures (Power). 2797 */ 2170 */ 2798 pgtable = pgtable_trans_huge_withdraw 2171 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2799 pmd_populate(mm, &_pmd, pgtable); 2172 pmd_populate(mm, &_pmd, pgtable); 2800 2173 2801 pte = pte_offset_map(&_pmd, haddr); !! 2174 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2802 VM_BUG_ON(!pte); !! 2175 pte_t entry, *pte; 2803 !! 2176 /* 2804 /* !! 2177 * Note that NUMA hinting access restrictions are not 2805 * Note that NUMA hinting access rest !! 2178 * transferred to avoid any possibility of altering 2806 * avoid any possibility of altering !! 2179 * permissions across VMAs. 2807 */ !! 2180 */ 2808 if (freeze || pmd_migration) { !! 2181 if (freeze || pmd_migration) { 2809 for (i = 0, addr = haddr; i < << 2810 pte_t entry; << 2811 swp_entry_t swp_entry 2182 swp_entry_t swp_entry; 2812 !! 2183 swp_entry = make_migration_entry(page + i, write); 2813 if (write) << 2814 swp_entry = m << 2815 << 2816 else if (anon_exclusi << 2817 swp_entry = m << 2818 << 2819 else << 2820 swp_entry = m << 2821 << 2822 if (young) << 2823 swp_entry = m << 2824 if (dirty) << 2825 swp_entry = m << 2826 entry = swp_entry_to_ 2184 entry = swp_entry_to_pte(swp_entry); 2827 if (soft_dirty) 2185 if (soft_dirty) 2828 entry = pte_s 2186 entry = pte_swp_mksoft_dirty(entry); 2829 if (uffd_wp) !! 2187 } else { 2830 entry = pte_s !! 2188 entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); 2831 !! 2189 entry = maybe_mkwrite(entry, vma); 2832 VM_WARN_ON(!pte_none( !! 2190 if (!write) 2833 set_pte_at(mm, addr, !! 2191 entry = pte_wrprotect(entry); >> 2192 if (!young) >> 2193 entry = pte_mkold(entry); >> 2194 if (soft_dirty) >> 2195 entry = pte_mksoft_dirty(entry); 2834 } 2196 } 2835 } else { !! 2197 pte = pte_offset_map(&_pmd, addr); 2836 pte_t entry; !! 2198 BUG_ON(!pte_none(*pte)); 2837 !! 2199 set_pte_at(mm, addr, pte, entry); 2838 entry = mk_pte(page, READ_ONC !! 2200 atomic_inc(&page[i]._mapcount); 2839 if (write) !! 2201 pte_unmap(pte); 2840 entry = pte_mkwrite(e !! 
2202 } 2841 if (!young) << 2842 entry = pte_mkold(ent << 2843 /* NOTE: this may set soft-di << 2844 if (dirty) << 2845 entry = pte_mkdirty(e << 2846 if (soft_dirty) << 2847 entry = pte_mksoft_di << 2848 if (uffd_wp) << 2849 entry = pte_mkuffd_wp << 2850 2203 >> 2204 /* >> 2205 * Set PG_double_map before dropping compound_mapcount to avoid >> 2206 * false-negative page_mapped(). >> 2207 */ >> 2208 if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { 2851 for (i = 0; i < HPAGE_PMD_NR; 2209 for (i = 0; i < HPAGE_PMD_NR; i++) 2852 VM_WARN_ON(!pte_none( !! 2210 atomic_inc(&page[i]._mapcount); 2853 << 2854 set_ptes(mm, haddr, pte, entr << 2855 } 2211 } 2856 pte_unmap(pte); << 2857 2212 2858 if (!pmd_migration) !! 2213 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { 2859 folio_remove_rmap_pmd(folio, !! 2214 /* Last compound_mapcount is gone. */ 2860 if (freeze) !! 2215 __dec_node_page_state(page, NR_ANON_THPS); 2861 put_page(page); !! 2216 if (TestClearPageDoubleMap(page)) { >> 2217 /* No need in mapcount reference anymore */ >> 2218 for (i = 0; i < HPAGE_PMD_NR; i++) >> 2219 atomic_dec(&page[i]._mapcount); >> 2220 } >> 2221 } 2862 2222 2863 smp_wmb(); /* make pte visible before 2223 smp_wmb(); /* make pte visible before pmd */ 2864 pmd_populate(mm, pmd, pgtable); 2224 pmd_populate(mm, pmd, pgtable); 2865 } << 2866 2225 2867 void split_huge_pmd_locked(struct vm_area_str !! 2226 if (freeze) { 2868 pmd_t *pmd, bool f !! 2227 for (i = 0; i < HPAGE_PMD_NR; i++) { 2869 { !! 2228 page_remove_rmap(page + i, false); 2870 VM_WARN_ON_ONCE(folio && !folio_test_ !! 2229 put_page(page + i); 2871 VM_WARN_ON_ONCE(!IS_ALIGNED(address, !! 2230 } 2872 VM_WARN_ON_ONCE(folio && !folio_test_ << 2873 VM_BUG_ON(freeze && !folio); << 2874 << 2875 /* << 2876 * When the caller requests to set up << 2877 * require a folio to check the PMD a << 2878 * is a risk of replacing the wrong f << 2879 */ << 2880 if (pmd_trans_huge(*pmd) || pmd_devma << 2881 is_pmd_migration_entry(*pmd)) { << 2882 if (folio && folio != pmd_fol << 2883 return; << 2884 __split_huge_pmd_locked(vma, << 2885 } 2231 } 2886 } 2232 } 2887 2233 2888 void __split_huge_pmd(struct vm_area_struct * 2234 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2889 unsigned long address, bool f !! 2235 unsigned long address, bool freeze, struct page *page) 2890 { 2236 { 2891 spinlock_t *ptl; 2237 spinlock_t *ptl; 2892 struct mmu_notifier_range range; !! 2238 struct mm_struct *mm = vma->vm_mm; >> 2239 unsigned long haddr = address & HPAGE_PMD_MASK; 2893 2240 2894 mmu_notifier_range_init(&range, MMU_N !! 2241 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); 2895 address & HPA !! 2242 ptl = pmd_lock(mm, pmd); 2896 (address & HP !! 2243 2897 mmu_notifier_invalidate_range_start(& !! 2244 /* 2898 ptl = pmd_lock(vma->vm_mm, pmd); !! 2245 * If caller asks to setup a migration entries, we need a page to check 2899 split_huge_pmd_locked(vma, range.star !! 2246 * pmd against. Otherwise we can end up replacing wrong page. >> 2247 */ >> 2248 VM_BUG_ON(freeze && !page); >> 2249 if (page && page != pmd_page(*pmd)) >> 2250 goto out; >> 2251 >> 2252 if (pmd_trans_huge(*pmd)) { >> 2253 page = pmd_page(*pmd); >> 2254 if (PageMlocked(page)) >> 2255 clear_page_mlock(page); >> 2256 } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) >> 2257 goto out; >> 2258 __split_huge_pmd_locked(vma, pmd, haddr, freeze); >> 2259 out: 2900 spin_unlock(ptl); 2260 spin_unlock(ptl); 2901 mmu_notifier_invalidate_range_end(&ra !! 
2261 /* >> 2262 * No need to double call mmu_notifier->invalidate_range() callback. >> 2263 * They are 3 cases to consider inside __split_huge_pmd_locked(): >> 2264 * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious >> 2265 * 2) __split_huge_zero_page_pmd() read only zero page and any write >> 2266 * fault will trigger a flush_notify before pointing to a new page >> 2267 * (it is fine if the secondary mmu keeps pointing to the old zero >> 2268 * page in the meantime) >> 2269 * 3) Split a huge pmd into pte pointing to the same page. No need >> 2270 * to invalidate secondary tlb entry they are all still valid. >> 2271 * any further changes to individual pte will notify. So no need >> 2272 * to call mmu_notifier->invalidate_range() >> 2273 */ >> 2274 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + >> 2275 HPAGE_PMD_SIZE); 2902 } 2276 } 2903 2277 2904 void split_huge_pmd_address(struct vm_area_st 2278 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2905 bool freeze, struct folio *fo !! 2279 bool freeze, struct page *page) 2906 { 2280 { 2907 pmd_t *pmd = mm_find_pmd(vma->vm_mm, !! 2281 pgd_t *pgd; >> 2282 p4d_t *p4d; >> 2283 pud_t *pud; >> 2284 pmd_t *pmd; 2908 2285 2909 if (!pmd) !! 2286 pgd = pgd_offset(vma->vm_mm, address); >> 2287 if (!pgd_present(*pgd)) 2910 return; 2288 return; 2911 2289 2912 __split_huge_pmd(vma, pmd, address, f !! 2290 p4d = p4d_offset(pgd, address); 2913 } !! 2291 if (!p4d_present(*p4d)) >> 2292 return; 2914 2293 2915 static inline void split_huge_pmd_if_needed(s !! 2294 pud = pud_offset(p4d, address); 2916 { !! 2295 if (!pud_present(*pud)) 2917 /* !! 2296 return; 2918 * If the new address isn't hpage ali !! 2297 2919 * contain an hugepage: check if we n !! 2298 pmd = pmd_offset(pud, address); 2920 */ !! 2299 2921 if (!IS_ALIGNED(address, HPAGE_PMD_SI !! 2300 __split_huge_pmd(vma, pmd, address, freeze, page); 2922 range_in_vma(vma, ALIGN_DOWN(addr << 2923 ALIGN(address, HPAGE << 2924 split_huge_pmd_address(vma, a << 2925 } 2301 } 2926 2302 2927 void vma_adjust_trans_huge(struct vm_area_str 2303 void vma_adjust_trans_huge(struct vm_area_struct *vma, 2928 unsigned long st 2304 unsigned long start, 2929 unsigned long en 2305 unsigned long end, 2930 long adjust_next 2306 long adjust_next) 2931 { 2307 { 2932 /* Check if we need to split start fi << 2933 split_huge_pmd_if_needed(vma, start); << 2934 << 2935 /* Check if we need to split end next << 2936 split_huge_pmd_if_needed(vma, end); << 2937 << 2938 /* 2308 /* 2939 * If we're also updating the next vm !! 2309 * If the new start address isn't hpage aligned and it could 2940 * check if we need to split it. !! 2310 * previously contain an hugepage: check if we need to split >> 2311 * an huge pmd. 2941 */ 2312 */ 2942 if (adjust_next > 0) { !! 2313 if (start & ~HPAGE_PMD_MASK && 2943 struct vm_area_struct *next = !! 2314 (start & HPAGE_PMD_MASK) >= vma->vm_start && 2944 unsigned long nstart = next-> !! 2315 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2945 nstart += adjust_next; !! 2316 split_huge_pmd_address(vma, start, false, NULL); 2946 split_huge_pmd_if_needed(next << 2947 } << 2948 } << 2949 << 2950 static void unmap_folio(struct folio *folio) << 2951 { << 2952 enum ttu_flags ttu_flags = TTU_RMAP_L << 2953 TTU_BATCH_FLUSH; << 2954 << 2955 VM_BUG_ON_FOLIO(!folio_test_large(fol << 2956 << 2957 if (folio_test_pmd_mappable(folio)) << 2958 ttu_flags |= TTU_SPLIT_HUGE_P << 2959 2317 2960 /* 2318 /* 2961 * Anon pages need migration entries !! 
2319 * If the new end address isn't hpage aligned and it could 2962 * pages can simply be left unmapped, !! 2320 * previously contain an hugepage: check if we need to split 2963 * If that is ever changed (perhaps f !! 2321 * an huge pmd. 2964 */ 2322 */ 2965 if (folio_test_anon(folio)) !! 2323 if (end & ~HPAGE_PMD_MASK && 2966 try_to_migrate(folio, ttu_fla !! 2324 (end & HPAGE_PMD_MASK) >= vma->vm_start && 2967 else !! 2325 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2968 try_to_unmap(folio, ttu_flags !! 2326 split_huge_pmd_address(vma, end, false, NULL); 2969 << 2970 try_to_unmap_flush(); << 2971 } << 2972 << 2973 static bool __discard_anon_folio_pmd_locked(s << 2974 u << 2975 s << 2976 { << 2977 struct mm_struct *mm = vma->vm_mm; << 2978 int ref_count, map_count; << 2979 pmd_t orig_pmd = *pmdp; << 2980 << 2981 if (folio_test_dirty(folio) || pmd_di << 2982 return false; << 2983 << 2984 orig_pmd = pmdp_huge_clear_flush(vma, << 2985 << 2986 /* << 2987 * Syncing against concurrent GUP-fas << 2988 * - clear PMD; barrier; read refcoun << 2989 * - inc refcount; barrier; read PMD << 2990 */ << 2991 smp_mb(); << 2992 << 2993 ref_count = folio_ref_count(folio); << 2994 map_count = folio_mapcount(folio); << 2995 << 2996 /* << 2997 * Order reads for folio refcount and << 2998 * (see comments in __remove_mapping( << 2999 */ << 3000 smp_rmb(); << 3001 2327 3002 /* 2328 /* 3003 * If the folio or its PMD is redirti !! 2329 * If we're also updating the vma->vm_next->vm_start, if the new 3004 * are unexpected references, we will !! 2330 * vm_next->vm_start isn't page aligned and it could previously 3005 * and remap it. !! 2331 * contain an hugepage: check if we need to split an huge pmd. 3006 * << 3007 * The only folio refs must be one fr << 3008 */ 2332 */ 3009 if (folio_test_dirty(folio) || pmd_di !! 2333 if (adjust_next > 0) { 3010 ref_count != map_count + 1) { !! 2334 struct vm_area_struct *next = vma->vm_next; 3011 set_pmd_at(mm, addr, pmdp, or !! 2335 unsigned long nstart = next->vm_start; 3012 return false; !! 2336 nstart += adjust_next << PAGE_SHIFT; >> 2337 if (nstart & ~HPAGE_PMD_MASK && >> 2338 (nstart & HPAGE_PMD_MASK) >= next->vm_start && >> 2339 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) >> 2340 split_huge_pmd_address(next, nstart, false, NULL); 3013 } 2341 } 3014 << 3015 folio_remove_rmap_pmd(folio, pmd_page << 3016 zap_deposited_table(mm, pmdp); << 3017 add_mm_counter(mm, MM_ANONPAGES, -HPA << 3018 if (vma->vm_flags & VM_LOCKED) << 3019 mlock_drain_local(); << 3020 folio_put(folio); << 3021 << 3022 return true; << 3023 } 2342 } 3024 2343 3025 bool unmap_huge_pmd_locked(struct vm_area_str !! 2344 static void unmap_page(struct page *page) 3026 pmd_t *pmdp, struc << 3027 { 2345 { 3028 VM_WARN_ON_FOLIO(!folio_test_pmd_mapp !! 2346 enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | 3029 VM_WARN_ON_FOLIO(!folio_test_locked(f !! 2347 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; 3030 VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPA !! 2348 bool unmap_success; 3031 << 3032 if (folio_test_anon(folio) && !folio_ << 3033 return __discard_anon_folio_p << 3034 2349 3035 return false; !! 2350 VM_BUG_ON_PAGE(!PageHead(page), page); 3036 } << 3037 2351 3038 static void remap_page(struct folio *folio, u !! 2352 if (PageAnon(page)) 3039 { !! 2353 ttu_flags |= TTU_SPLIT_FREEZE; 3040 int i = 0; << 3041 2354 3042 /* If unmap_folio() uses try_to_migra !! 2355 unmap_success = try_to_unmap(page, ttu_flags); 3043 if (!folio_test_anon(folio)) !! 
2356 VM_BUG_ON_PAGE(!unmap_success, page); 3044 return; << 3045 for (;;) { << 3046 remove_migration_ptes(folio, << 3047 i += folio_nr_pages(folio); << 3048 if (i >= nr) << 3049 break; << 3050 folio = folio_next(folio); << 3051 } << 3052 } 2357 } 3053 2358 3054 static void lru_add_page_tail(struct folio *f !! 2359 static void remap_page(struct page *page) 3055 struct lruvec *lruvec, struct << 3056 { 2360 { 3057 VM_BUG_ON_FOLIO(!folio_test_large(fol !! 2361 int i; 3058 VM_BUG_ON_FOLIO(PageLRU(tail), folio) !! 2362 if (PageTransHuge(page)) { 3059 lockdep_assert_held(&lruvec->lru_lock !! 2363 remove_migration_ptes(page, page, true); 3060 << 3061 if (list) { << 3062 /* page reclaim is reclaiming << 3063 VM_WARN_ON(folio_test_lru(fol << 3064 get_page(tail); << 3065 list_add_tail(&tail->lru, lis << 3066 } else { 2364 } else { 3067 /* head is still on lru (and !! 2365 for (i = 0; i < HPAGE_PMD_NR; i++) 3068 VM_WARN_ON(!folio_test_lru(fo !! 2366 remove_migration_ptes(page + i, page + i, true); 3069 if (folio_test_unevictable(fo << 3070 tail->mlock_count = 0 << 3071 else << 3072 list_add_tail(&tail-> << 3073 SetPageLRU(tail); << 3074 } 2367 } 3075 } 2368 } 3076 2369 3077 static void __split_huge_page_tail(struct fol !! 2370 static void __split_huge_page_tail(struct page *head, int tail, 3078 struct lruvec *lruvec, struct !! 2371 struct lruvec *lruvec, struct list_head *list) 3079 unsigned int new_order) << 3080 { 2372 { 3081 struct page *head = &folio->page; << 3082 struct page *page_tail = head + tail; 2373 struct page *page_tail = head + tail; 3083 /* << 3084 * Careful: new_folio is not a "real" << 3085 * Don't pass it around before clear_ << 3086 */ << 3087 struct folio *new_folio = (struct fol << 3088 2374 3089 VM_BUG_ON_PAGE(atomic_read(&page_tail 2375 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 3090 2376 3091 /* 2377 /* 3092 * Clone page flags before unfreezing 2378 * Clone page flags before unfreezing refcount. 3093 * 2379 * 3094 * After successful get_page_unless_z 2380 * After successful get_page_unless_zero() might follow flags change, 3095 * for example lock_page() which set !! 2381 * for exmaple lock_page() which set PG_waiters. 3096 * << 3097 * Note that for mapped sub-pages of << 3098 * PG_anon_exclusive has been cleared << 3099 * the migration entry instead from w << 3100 * We can still have PG_anon_exclusiv << 3101 * unreferenced sub-pages of an anony << 3102 * PG_anon_exclusive (-> PG_mappedtod << 3103 */ 2382 */ 3104 page_tail->flags &= ~PAGE_FLAGS_CHECK 2383 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 3105 page_tail->flags |= (head->flags & 2384 page_tail->flags |= (head->flags & 3106 ((1L << PG_referenced 2385 ((1L << PG_referenced) | 3107 (1L << PG_swapbacked 2386 (1L << PG_swapbacked) | 3108 (1L << PG_swapcache) 2387 (1L << PG_swapcache) | 3109 (1L << PG_mlocked) | 2388 (1L << PG_mlocked) | 3110 (1L << PG_uptodate) 2389 (1L << PG_uptodate) | 3111 (1L << PG_active) | 2390 (1L << PG_active) | 3112 (1L << PG_workingset 2391 (1L << PG_workingset) | 3113 (1L << PG_locked) | 2392 (1L << PG_locked) | 3114 (1L << PG_unevictabl 2393 (1L << PG_unevictable) | 3115 #ifdef CONFIG_ARCH_USES_PG_ARCH_2 !! 2394 (1L << PG_dirty))); 3116 (1L << PG_arch_2) | << 3117 #endif << 3118 #ifdef CONFIG_ARCH_USES_PG_ARCH_3 << 3119 (1L << PG_arch_3) | << 3120 #endif << 3121 (1L << PG_dirty) | << 3122 LRU_GEN_MASK | LRU_R << 3123 2395 3124 /* ->mapping in first and second tail !! 
2396 /* ->mapping in first tail page is compound_mapcount */ 3125 VM_BUG_ON_PAGE(tail > 2 && page_tail- 2397 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 3126 page_tail); 2398 page_tail); 3127 page_tail->mapping = head->mapping; 2399 page_tail->mapping = head->mapping; 3128 page_tail->index = head->index + tail 2400 page_tail->index = head->index + tail; 3129 2401 3130 /* << 3131 * page->private should not be set in << 3132 * if private is unexpectedly set. << 3133 */ << 3134 if (unlikely(page_tail->private)) { << 3135 VM_WARN_ON_ONCE_PAGE(true, pa << 3136 page_tail->private = 0; << 3137 } << 3138 if (folio_test_swapcache(folio)) << 3139 new_folio->swap.val = folio-> << 3140 << 3141 /* Page flags must be visible before 2402 /* Page flags must be visible before we make the page non-compound. */ 3142 smp_wmb(); 2403 smp_wmb(); 3143 2404 3144 /* 2405 /* 3145 * Clear PageTail before unfreezing p 2406 * Clear PageTail before unfreezing page refcount. 3146 * 2407 * 3147 * After successful get_page_unless_z 2408 * After successful get_page_unless_zero() might follow put_page() 3148 * which needs correct compound_head( 2409 * which needs correct compound_head(). 3149 */ 2410 */ 3150 clear_compound_head(page_tail); 2411 clear_compound_head(page_tail); 3151 if (new_order) { << 3152 prep_compound_page(page_tail, << 3153 folio_set_large_rmappable(new << 3154 } << 3155 2412 3156 /* Finally unfreeze refcount. Additio 2413 /* Finally unfreeze refcount. Additional reference from page cache. */ 3157 page_ref_unfreeze(page_tail, !! 2414 page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || 3158 1 + ((!folio_test_anon(folio) !! 2415 PageSwapCache(head))); 3159 folio_nr_pages(n << 3160 << 3161 if (folio_test_young(folio)) << 3162 folio_set_young(new_folio); << 3163 if (folio_test_idle(folio)) << 3164 folio_set_idle(new_folio); << 3165 2416 3166 folio_xchg_last_cpupid(new_folio, fol !! 2417 if (page_is_young(head)) >> 2418 set_page_young(page_tail); >> 2419 if (page_is_idle(head)) >> 2420 set_page_idle(page_tail); >> 2421 >> 2422 page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); 3167 2423 3168 /* 2424 /* 3169 * always add to the tail because som 2425 * always add to the tail because some iterators expect new 3170 * pages to show after the currently 2426 * pages to show after the currently processed elements - e.g. 3171 * migrate_pages 2427 * migrate_pages 3172 */ 2428 */ 3173 lru_add_page_tail(folio, page_tail, l !! 2429 lru_add_page_tail(head, page_tail, lruvec, list); 3174 } 2430 } 3175 2431 3176 static void __split_huge_page(struct page *pa 2432 static void __split_huge_page(struct page *page, struct list_head *list, 3177 pgoff_t end, unsigned int new !! 2433 pgoff_t end, unsigned long flags) 3178 { 2434 { 3179 struct folio *folio = page_folio(page !! 2435 struct page *head = compound_head(page); 3180 struct page *head = &folio->page; !! 2436 struct zone *zone = page_zone(head); 3181 struct lruvec *lruvec; 2437 struct lruvec *lruvec; 3182 struct address_space *swap_cache = NU !! 
2438 int i; 3183 unsigned long offset = 0; << 3184 int i, nr_dropped = 0; << 3185 unsigned int new_nr = 1 << new_order; << 3186 int order = folio_order(folio); << 3187 unsigned int nr = 1 << order; << 3188 << 3189 /* complete memcg works before add pa << 3190 split_page_memcg(head, order, new_ord << 3191 << 3192 if (folio_test_anon(folio) && folio_t << 3193 offset = swap_cache_index(fol << 3194 swap_cache = swap_address_spa << 3195 xa_lock(&swap_cache->i_pages) << 3196 } << 3197 2439 3198 /* lock lru list/PageCompound, ref fr !! 2440 lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); 3199 lruvec = folio_lruvec_lock(folio); << 3200 2441 3201 ClearPageHasHWPoisoned(head); !! 2442 /* complete memcg works before add pages to LRU */ >> 2443 mem_cgroup_split_huge_fixup(head); 3202 2444 3203 for (i = nr - new_nr; i >= new_nr; i !! 2445 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 3204 __split_huge_page_tail(folio, !! 2446 __split_huge_page_tail(head, i, lruvec, list); 3205 /* Some pages can be beyond E !! 2447 /* Some pages can be beyond i_size: drop them from page cache */ 3206 if (head[i].index >= end) { 2448 if (head[i].index >= end) { 3207 struct folio *tail = !! 2449 ClearPageDirty(head + i); 3208 !! 2450 __delete_from_page_cache(head + i, NULL); 3209 if (shmem_mapping(fol !! 2451 if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) 3210 nr_dropped++; !! 2452 shmem_uncharge(head->mapping->host, 1); 3211 else if (folio_test_c !! 2453 put_page(head + i); 3212 folio_account << 3213 inode << 3214 __filemap_remove_foli << 3215 folio_put(tail); << 3216 } else if (!PageAnon(page)) { << 3217 __xa_store(&folio->ma << 3218 head << 3219 } else if (swap_cache) { << 3220 __xa_store(&swap_cach << 3221 head << 3222 } 2454 } 3223 } 2455 } 3224 2456 3225 if (!new_order) !! 2457 ClearPageCompound(head); 3226 ClearPageCompound(head); << 3227 else { << 3228 struct folio *new_folio = (st << 3229 << 3230 folio_set_order(new_folio, ne << 3231 } << 3232 unlock_page_lruvec(lruvec); << 3233 /* Caller disabled irqs, so they are << 3234 << 3235 split_page_owner(head, order, new_ord << 3236 pgalloc_tag_split(folio, order, new_o << 3237 << 3238 /* See comment in __split_huge_page_t 2458 /* See comment in __split_huge_page_tail() */ 3239 if (folio_test_anon(folio)) { !! 2459 if (PageAnon(head)) { 3240 /* Additional pin to swap cac 2460 /* Additional pin to swap cache */ 3241 if (folio_test_swapcache(foli !! 2461 if (PageSwapCache(head)) 3242 folio_ref_add(folio, !! 2462 page_ref_add(head, 2); 3243 xa_unlock(&swap_cache !! 2463 else 3244 } else { !! 2464 page_ref_inc(head); 3245 folio_ref_inc(folio); << 3246 } << 3247 } else { 2465 } else { 3248 /* Additional pin to page cac 2466 /* Additional pin to page cache */ 3249 folio_ref_add(folio, 1 + new_ !! 2467 page_ref_add(head, 2); 3250 xa_unlock(&folio->mapping->i_ !! 2468 xa_unlock(&head->mapping->i_pages); 3251 } 2469 } 3252 local_irq_enable(); << 3253 2470 3254 if (nr_dropped) !! 2471 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); 3255 shmem_uncharge(folio->mapping << 3256 remap_page(folio, nr, PageAnon(head) << 3257 2472 3258 /* !! 2473 remap_page(head); 3259 * set page to its compound_head when << 3260 * we can skip unlocking it below, si << 3261 * the compound_head of the page and << 3262 */ << 3263 if (new_order) << 3264 page = compound_head(page); << 3265 2474 3266 for (i = 0; i < nr; i += new_nr) { !! 
2475 for (i = 0; i < HPAGE_PMD_NR; i++) { 3267 struct page *subpage = head + 2476 struct page *subpage = head + i; 3268 struct folio *new_folio = pag << 3269 if (subpage == page) 2477 if (subpage == page) 3270 continue; 2478 continue; 3271 folio_unlock(new_folio); !! 2479 unlock_page(subpage); 3272 2480 3273 /* 2481 /* 3274 * Subpages may be freed if t 2482 * Subpages may be freed if there wasn't any mapping 3275 * like if add_to_swap() is r 2483 * like if add_to_swap() is running on a lru page that 3276 * had its mapping zapped. An 2484 * had its mapping zapped. And freeing these pages 3277 * requires taking the lru_lo 2485 * requires taking the lru_lock so we do the put_page 3278 * of the tail pages after th 2486 * of the tail pages after the split is complete. 3279 */ 2487 */ 3280 free_page_and_swap_cache(subp !! 2488 put_page(subpage); 3281 } 2489 } 3282 } 2490 } 3283 2491 >> 2492 int total_mapcount(struct page *page) >> 2493 { >> 2494 int i, compound, ret; >> 2495 >> 2496 VM_BUG_ON_PAGE(PageTail(page), page); >> 2497 >> 2498 if (likely(!PageCompound(page))) >> 2499 return atomic_read(&page->_mapcount) + 1; >> 2500 >> 2501 compound = compound_mapcount(page); >> 2502 if (PageHuge(page)) >> 2503 return compound; >> 2504 ret = compound; >> 2505 for (i = 0; i < HPAGE_PMD_NR; i++) >> 2506 ret += atomic_read(&page[i]._mapcount) + 1; >> 2507 /* File pages has compound_mapcount included in _mapcount */ >> 2508 if (!PageAnon(page)) >> 2509 return ret - compound * HPAGE_PMD_NR; >> 2510 if (PageDoubleMap(page)) >> 2511 ret -= HPAGE_PMD_NR; >> 2512 return ret; >> 2513 } >> 2514 >> 2515 /* >> 2516 * This calculates accurately how many mappings a transparent hugepage >> 2517 * has (unlike page_mapcount() which isn't fully accurate). This full >> 2518 * accuracy is primarily needed to know if copy-on-write faults can >> 2519 * reuse the page and change the mapping to read-write instead of >> 2520 * copying them. At the same time this returns the total_mapcount too. >> 2521 * >> 2522 * The function returns the highest mapcount any one of the subpages >> 2523 * has. If the return value is one, even if different processes are >> 2524 * mapping different subpages of the transparent hugepage, they can >> 2525 * all reuse it, because each process is reusing a different subpage. >> 2526 * >> 2527 * The total_mapcount is instead counting all virtual mappings of the >> 2528 * subpages. If the total_mapcount is equal to "one", it tells the >> 2529 * caller all mappings belong to the same "mm" and in turn the >> 2530 * anon_vma of the transparent hugepage can become the vma->anon_vma >> 2531 * local one as no other process may be mapping any of the subpages. >> 2532 * >> 2533 * It would be more accurate to replace page_mapcount() with >> 2534 * page_trans_huge_mapcount(), however we only use >> 2535 * page_trans_huge_mapcount() in the copy-on-write faults where we >> 2536 * need full accuracy to avoid breaking page pinning, because >> 2537 * page_trans_huge_mapcount() is slower than page_mapcount(). 
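 *
 * A concrete instance of the above (illustrative only): if two processes
 * each PTE-map a different subpage of the same THP and no PMD mapping
 * remains, every subpage's own mapcount is 1, so this function returns 1
 * while *total_mapcount is reported as 2; each process may therefore
 * reuse its own subpage on a copy-on-write fault without copying.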
>> 2538 */ >> 2539 int page_trans_huge_mapcount(struct page *page, int *total_mapcount) >> 2540 { >> 2541 int i, ret, _total_mapcount, mapcount; >> 2542 >> 2543 /* hugetlbfs shouldn't call it */ >> 2544 VM_BUG_ON_PAGE(PageHuge(page), page); >> 2545 >> 2546 if (likely(!PageTransCompound(page))) { >> 2547 mapcount = atomic_read(&page->_mapcount) + 1; >> 2548 if (total_mapcount) >> 2549 *total_mapcount = mapcount; >> 2550 return mapcount; >> 2551 } >> 2552 >> 2553 page = compound_head(page); >> 2554 >> 2555 _total_mapcount = ret = 0; >> 2556 for (i = 0; i < HPAGE_PMD_NR; i++) { >> 2557 mapcount = atomic_read(&page[i]._mapcount) + 1; >> 2558 ret = max(ret, mapcount); >> 2559 _total_mapcount += mapcount; >> 2560 } >> 2561 if (PageDoubleMap(page)) { >> 2562 ret -= 1; >> 2563 _total_mapcount -= HPAGE_PMD_NR; >> 2564 } >> 2565 mapcount = compound_mapcount(page); >> 2566 ret += mapcount; >> 2567 _total_mapcount += mapcount; >> 2568 if (total_mapcount) >> 2569 *total_mapcount = _total_mapcount; >> 2570 return ret; >> 2571 } >> 2572 3284 /* Racy check whether the huge page can be sp 2573 /* Racy check whether the huge page can be split */ 3285 bool can_split_folio(struct folio *folio, int !! 2574 bool can_split_huge_page(struct page *page, int *pextra_pins) 3286 { 2575 { 3287 int extra_pins; 2576 int extra_pins; 3288 2577 3289 /* Additional pins from page cache */ 2578 /* Additional pins from page cache */ 3290 if (folio_test_anon(folio)) !! 2579 if (PageAnon(page)) 3291 extra_pins = folio_test_swapc !! 2580 extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; 3292 folio_nr_page << 3293 else 2581 else 3294 extra_pins = folio_nr_pages(f !! 2582 extra_pins = HPAGE_PMD_NR; 3295 if (pextra_pins) 2583 if (pextra_pins) 3296 *pextra_pins = extra_pins; 2584 *pextra_pins = extra_pins; 3297 return folio_mapcount(folio) == folio !! 2585 return total_mapcount(page) == page_count(page) - extra_pins - 1; 3298 calle << 3299 } 2586 } 3300 2587 3301 /* 2588 /* 3302 * This function splits a large folio into sm !! 2589 * This function splits huge page into normal pages. @page can point to any 3303 * @page can point to any page of the large f !! 2590 * subpage of huge page to split. Split doesn't change the position of @page. 3304 * does not change the position of @page. << 3305 * << 3306 * Prerequisites: << 3307 * << 3308 * 1) The caller must hold a reference on the << 3309 * as the large folio. << 3310 * << 3311 * 2) The large folio must be locked. << 3312 * 2591 * 3313 * 3) The folio must not be pinned. Any unexp !! 2592 * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. 3314 * GUP pins, will result in the folio not !! 2593 * The huge page must be locked. 3315 * will receive an -EAGAIN. << 3316 * << 3317 * 4) @new_order > 1, usually. Splitting to o << 3318 * supported for non-file-backed folios, b << 3319 * is used by partially mapped folios, is << 3320 * folio only has subpages 0 and 1. File-b << 3321 * since they do not use _deferred_list. << 3322 * << 3323 * After splitting, the caller's folio refere << 3324 * resulting in a raised refcount of @page af << 3325 * be freed if they are not mapped. << 3326 * 2594 * 3327 * If @list is null, tail pages will be added 2595 * If @list is null, tail pages will be added to LRU list, otherwise, to @list. 3328 * 2596 * 3329 * Pages in @new_order will inherit the mappi !! 2597 * Both head page and tail pages will inherit mapping, flags, and so on from 3330 * huge page. !! 2598 * the hugepage. 
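 *
 * A minimal calling sketch (illustrative only), using the split_huge_page()
 * wrapper that passes a NULL @list: the caller already holds a reference on
 * @page and has it locked, then does
 *
 *	if (!split_huge_page(page))
 *		... split succeeded: @page is now a small page, still
 *		locked and still holding the caller's reference ...
 *	else
 *		... split failed: @page is still part of the huge page ...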
3331 * << 3332 * Returns 0 if the huge page was split succe << 3333 * << 3334 * Returns -EAGAIN if the folio has unexpecte << 3335 * the folio was concurrently removed from th << 3336 * << 3337 * Returns -EBUSY when trying to split the hu << 3338 * under writeback, if fs-specific folio meta << 3339 * released, or if some unexpected race happe << 3340 * truncation). << 3341 * 2599 * 3342 * Callers should ensure that the order respe !! 2600 * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if 3343 * min-order if one is set for non-anonymous !! 2601 * they are not mapped. 3344 * 2602 * 3345 * Returns -EINVAL when trying to split to an !! 2603 * Returns 0 if the hugepage is split successfully. 3346 * with the folio. Splitting to order 0 is co !! 2604 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under >> 2605 * us. 3347 */ 2606 */ 3348 int split_huge_page_to_list_to_order(struct p !! 2607 int split_huge_page_to_list(struct page *page, struct list_head *list) 3349 unsigned << 3350 { 2608 { 3351 struct folio *folio = page_folio(page !! 2609 struct page *head = compound_head(page); 3352 struct deferred_split *ds_queue = get !! 2610 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 3353 /* reset xarray order to new order af << 3354 XA_STATE_ORDER(xas, &folio->mapping-> << 3355 bool is_anon = folio_test_anon(folio) << 3356 struct address_space *mapping = NULL; << 3357 struct anon_vma *anon_vma = NULL; 2611 struct anon_vma *anon_vma = NULL; 3358 int order = folio_order(folio); !! 2612 struct address_space *mapping = NULL; 3359 int extra_pins, ret; !! 2613 int count, mapcount, extra_pins, ret; >> 2614 bool mlocked; >> 2615 unsigned long flags; 3360 pgoff_t end; 2616 pgoff_t end; 3361 bool is_hzp; << 3362 << 3363 VM_BUG_ON_FOLIO(!folio_test_locked(fo << 3364 VM_BUG_ON_FOLIO(!folio_test_large(fol << 3365 2617 3366 if (new_order >= folio_order(folio)) !! 2618 VM_BUG_ON_PAGE(is_huge_zero_page(page), page); 3367 return -EINVAL; !! 2619 VM_BUG_ON_PAGE(!PageLocked(page), page); 3368 !! 2620 VM_BUG_ON_PAGE(!PageCompound(page), page); 3369 if (is_anon) { << 3370 /* order-1 is not supported f << 3371 if (new_order == 1) { << 3372 VM_WARN_ONCE(1, "Cann << 3373 return -EINVAL; << 3374 } << 3375 } else if (new_order) { << 3376 /* Split shmem folio to non-z << 3377 if (shmem_mapping(folio->mapp << 3378 VM_WARN_ONCE(1, << 3379 "Cannot split << 3380 return -EINVAL; << 3381 } << 3382 /* << 3383 * No split if the file syste << 3384 * Note that we might still h << 3385 * CONFIG_READ_ONLY_THP_FOR_F << 3386 * does not actually support << 3387 */ << 3388 if (IS_ENABLED(CONFIG_READ_ON << 3389 !mapping_large_folio_supp << 3390 VM_WARN_ONCE(1, << 3391 "Cannot split << 3392 return -EINVAL; << 3393 } << 3394 } << 3395 2621 3396 /* Only swapping a whole PMD-mapped f !! 2622 if (PageWriteback(page)) 3397 if (folio_test_swapcache(folio) && ne << 3398 return -EINVAL; << 3399 << 3400 is_hzp = is_huge_zero_folio(folio); << 3401 if (is_hzp) { << 3402 pr_warn_ratelimited("Called s << 3403 return -EBUSY; << 3404 } << 3405 << 3406 if (folio_test_writeback(folio)) << 3407 return -EBUSY; 2623 return -EBUSY; 3408 2624 3409 if (is_anon) { !! 2625 if (PageAnon(head)) { 3410 /* 2626 /* 3411 * The caller does not necess !! 2627 * The caller does not necessarily hold an mmap_sem that would 3412 * prevent the anon_vma disap 2628 * prevent the anon_vma disappearing so we first we take a 3413 * reference to it and then l 2629 * reference to it and then lock the anon_vma for write. 
This 3414 * is similar to folio_lock_a !! 2630 * is similar to page_lock_anon_vma_read except the write lock 3415 * is taken to serialise agai 2631 * is taken to serialise against parallel split or collapse 3416 * operations. 2632 * operations. 3417 */ 2633 */ 3418 anon_vma = folio_get_anon_vma !! 2634 anon_vma = page_get_anon_vma(head); 3419 if (!anon_vma) { 2635 if (!anon_vma) { 3420 ret = -EBUSY; 2636 ret = -EBUSY; 3421 goto out; 2637 goto out; 3422 } 2638 } 3423 end = -1; 2639 end = -1; 3424 mapping = NULL; 2640 mapping = NULL; 3425 anon_vma_lock_write(anon_vma) 2641 anon_vma_lock_write(anon_vma); 3426 } else { 2642 } else { 3427 unsigned int min_order; !! 2643 mapping = head->mapping; 3428 gfp_t gfp; << 3429 << 3430 mapping = folio->mapping; << 3431 2644 3432 /* Truncated ? */ 2645 /* Truncated ? */ 3433 if (!mapping) { 2646 if (!mapping) { 3434 ret = -EBUSY; 2647 ret = -EBUSY; 3435 goto out; 2648 goto out; 3436 } 2649 } 3437 2650 3438 min_order = mapping_min_folio << 3439 if (new_order < min_order) { << 3440 VM_WARN_ONCE(1, "Cann << 3441 min_orde << 3442 ret = -EINVAL; << 3443 goto out; << 3444 } << 3445 << 3446 gfp = current_gfp_context(map << 3447 << 3448 << 3449 if (!filemap_release_folio(fo << 3450 ret = -EBUSY; << 3451 goto out; << 3452 } << 3453 << 3454 xas_split_alloc(&xas, folio, << 3455 if (xas_error(&xas)) { << 3456 ret = xas_error(&xas) << 3457 goto out; << 3458 } << 3459 << 3460 anon_vma = NULL; 2651 anon_vma = NULL; 3461 i_mmap_lock_read(mapping); 2652 i_mmap_lock_read(mapping); 3462 2653 3463 /* 2654 /* 3464 *__split_huge_page() may nee 2655 *__split_huge_page() may need to trim off pages beyond EOF: 3465 * but on 32-bit, i_size_read 2656 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, 3466 * which cannot be nested ins 2657 * which cannot be nested inside the page tree lock. So note 3467 * end now: i_size itself may 2658 * end now: i_size itself may be changed at any moment, but 3468 * folio lock is good enough !! 2659 * head page lock is good enough to serialize the trimming. 3469 */ 2660 */ 3470 end = DIV_ROUND_UP(i_size_rea 2661 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 3471 if (shmem_mapping(mapping)) << 3472 end = shmem_fallocend << 3473 } 2662 } 3474 2663 3475 /* 2664 /* 3476 * Racy check if we can split the pag !! 2665 * Racy check if we can split the page, before unmap_page() will 3477 * split PMDs 2666 * split PMDs 3478 */ 2667 */ 3479 if (!can_split_folio(folio, 1, &extra !! 2668 if (!can_split_huge_page(head, &extra_pins)) { 3480 ret = -EAGAIN; !! 2669 ret = -EBUSY; 3481 goto out_unlock; 2670 goto out_unlock; 3482 } 2671 } 3483 2672 3484 unmap_folio(folio); !! 2673 mlocked = PageMlocked(page); >> 2674 unmap_page(head); >> 2675 VM_BUG_ON_PAGE(compound_mapcount(head), head); >> 2676 >> 2677 /* Make sure the page is not on per-CPU pagevec as it takes pin */ >> 2678 if (mlocked) >> 2679 lru_add_drain(); >> 2680 >> 2681 /* prevent PageLRU to go away from under us, and freeze lru stats */ >> 2682 spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); 3485 2683 3486 /* block interrupt reentry in xa_lock << 3487 local_irq_disable(); << 3488 if (mapping) { 2684 if (mapping) { >> 2685 XA_STATE(xas, &mapping->i_pages, page_index(head)); >> 2686 3489 /* 2687 /* 3490 * Check if the folio is pres !! 2688 * Check if the head page is present in page cache. 3491 * We assume all tail are pre !! 2689 * We assume all tail are present too, if head is there. 3492 */ 2690 */ 3493 xas_lock(&xas); !! 
2691 xa_lock(&mapping->i_pages); 3494 xas_reset(&xas); !! 2692 if (xas_load(&xas) != head) 3495 if (xas_load(&xas) != folio) << 3496 goto fail; 2693 goto fail; 3497 } 2694 } 3498 2695 3499 /* Prevent deferred_split_scan() touc 2696 /* Prevent deferred_split_scan() touching ->_refcount */ 3500 spin_lock(&ds_queue->split_queue_lock !! 2697 spin_lock(&pgdata->split_queue_lock); 3501 if (folio_ref_freeze(folio, 1 + extra !! 2698 count = page_count(head); 3502 if (folio_order(folio) > 1 && !! 2699 mapcount = total_mapcount(head); 3503 !list_empty(&folio->_defe !! 2700 if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { 3504 ds_queue->split_queue !! 2701 if (!list_empty(page_deferred_list(head))) { 3505 if (folio_test_partia !! 2702 pgdata->split_queue_len--; 3506 __folio_clear !! 2703 list_del(page_deferred_list(head)); 3507 mod_mthp_stat << 3508 << 3509 } << 3510 /* << 3511 * Reinitialize page_ << 3512 * page from the spli << 3513 * split will see lis << 3514 * page_deferred_list << 3515 */ << 3516 list_del_init(&folio- << 3517 } << 3518 spin_unlock(&ds_queue->split_ << 3519 if (mapping) { << 3520 int nr = folio_nr_pag << 3521 << 3522 xas_split(&xas, folio << 3523 if (folio_test_pmd_ma << 3524 new_order < HPAGE << 3525 if (folio_tes << 3526 __lru << 3527 << 3528 } else { << 3529 __lru << 3530 << 3531 filem << 3532 } << 3533 } << 3534 } 2704 } >> 2705 if (mapping) >> 2706 __dec_node_page_state(page, NR_SHMEM_THPS); >> 2707 spin_unlock(&pgdata->split_queue_lock); >> 2708 __split_huge_page(page, list, end, flags); >> 2709 if (PageSwapCache(head)) { >> 2710 swp_entry_t entry = { .val = page_private(head) }; 3535 2711 3536 if (is_anon) { !! 2712 ret = split_swap_cluster(entry); 3537 mod_mthp_stat(order, !! 2713 } else 3538 mod_mthp_stat(new_ord !! 2714 ret = 0; 3539 } << 3540 __split_huge_page(page, list, << 3541 ret = 0; << 3542 } else { 2715 } else { 3543 spin_unlock(&ds_queue->split_ !! 2716 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { 3544 fail: !! 2717 pr_alert("total_mapcount: %u, page_count(): %u\n", 3545 if (mapping) !! 2718 mapcount, count); 3546 xas_unlock(&xas); !! 2719 if (PageTail(page)) 3547 local_irq_enable(); !! 2720 dump_page(head, NULL); 3548 remap_page(folio, folio_nr_pa !! 2721 dump_page(page, "total_mapcount(head) > 0"); 3549 ret = -EAGAIN; !! 2722 BUG(); >> 2723 } >> 2724 spin_unlock(&pgdata->split_queue_lock); >> 2725 fail: if (mapping) >> 2726 xa_unlock(&mapping->i_pages); >> 2727 spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); >> 2728 remap_page(head); >> 2729 ret = -EBUSY; 3550 } 2730 } 3551 2731 3552 out_unlock: 2732 out_unlock: 3553 if (anon_vma) { 2733 if (anon_vma) { 3554 anon_vma_unlock_write(anon_vm 2734 anon_vma_unlock_write(anon_vma); 3555 put_anon_vma(anon_vma); 2735 put_anon_vma(anon_vma); 3556 } 2736 } 3557 if (mapping) 2737 if (mapping) 3558 i_mmap_unlock_read(mapping); 2738 i_mmap_unlock_read(mapping); 3559 out: 2739 out: 3560 xas_destroy(&xas); !! 2740 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3561 if (order == HPAGE_PMD_ORDER) << 3562 count_vm_event(!ret ? THP_SPL << 3563 count_mthp_stat(order, !ret ? MTHP_ST << 3564 return ret; 2741 return ret; 3565 } 2742 } 3566 2743 3567 int min_order_for_split(struct folio *folio) !! 2744 void free_transhuge_page(struct page *page) 3568 { 2745 { 3569 if (folio_test_anon(folio)) !! 
2746 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 3570 return 0; << 3571 << 3572 if (!folio->mapping) { << 3573 if (folio_test_pmd_mappable(f << 3574 count_vm_event(THP_SP << 3575 return -EBUSY; << 3576 } << 3577 << 3578 return mapping_min_folio_order(folio- << 3579 } << 3580 << 3581 int split_folio_to_list(struct folio *folio, << 3582 { << 3583 int ret = min_order_for_split(folio); << 3584 << 3585 if (ret < 0) << 3586 return ret; << 3587 << 3588 return split_huge_page_to_list_to_ord << 3589 } << 3590 << 3591 /* << 3592 * __folio_unqueue_deferred_split() is not to << 3593 * the folio_unqueue_deferred_split() inline << 3594 * limits its calls to those folios which may << 3595 * queueing THP splits, and that list is (rac << 3596 * << 3597 * It is unsafe to call folio_unqueue_deferre << 3598 * zero: because even when split_queue_lock i << 3599 * might be in use on deferred_split_scan()'s << 3600 * << 3601 * If memory cgroups are enabled, split_queue << 3602 * therefore important to unqueue deferred sp << 3603 */ << 3604 bool __folio_unqueue_deferred_split(struct fo << 3605 { << 3606 struct deferred_split *ds_queue; << 3607 unsigned long flags; 2747 unsigned long flags; 3608 bool unqueued = false; << 3609 << 3610 WARN_ON_ONCE(folio_ref_count(folio)); << 3611 WARN_ON_ONCE(!mem_cgroup_disabled() & << 3612 2748 3613 ds_queue = get_deferred_split_queue(f !! 2749 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3614 spin_lock_irqsave(&ds_queue->split_qu !! 2750 if (!list_empty(page_deferred_list(page))) { 3615 if (!list_empty(&folio->_deferred_lis !! 2751 pgdata->split_queue_len--; 3616 ds_queue->split_queue_len--; !! 2752 list_del(page_deferred_list(page)); 3617 if (folio_test_partially_mapp << 3618 __folio_clear_partial << 3619 mod_mthp_stat(folio_o << 3620 MTHP_ST << 3621 } << 3622 list_del_init(&folio->_deferr << 3623 unqueued = true; << 3624 } 2753 } 3625 spin_unlock_irqrestore(&ds_queue->spl !! 2754 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3626 !! 2755 free_compound_page(page); 3627 return unqueued; /* useful for << 3628 } 2756 } 3629 2757 3630 /* partially_mapped=false won't clear PG_part !! 2758 void deferred_split_huge_page(struct page *page) 3631 void deferred_split_folio(struct folio *folio << 3632 { 2759 { 3633 struct deferred_split *ds_queue = get !! 2760 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 3634 #ifdef CONFIG_MEMCG << 3635 struct mem_cgroup *memcg = folio_memc << 3636 #endif << 3637 unsigned long flags; 2761 unsigned long flags; 3638 2762 3639 /* !! 2763 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3640 * Order 1 folios have no space for a << 3641 * won't waste much memory by not add << 3642 */ << 3643 if (folio_order(folio) <= 1) << 3644 return; << 3645 << 3646 if (!partially_mapped && !split_under << 3647 return; << 3648 2764 3649 /* !! 2765 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3650 * Exclude swapcache: originally to a !! 2766 if (list_empty(page_deferred_list(page))) { 3651 * queue. Nowadays that is fully prev !! 2767 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 3652 * but if page reclaim is already han !! 2768 list_add_tail(page_deferred_list(page), &pgdata->split_queue); 3653 * unnecessary to handle it again in !! 
2769 pgdata->split_queue_len++; 3654 * swapcache here may still be a usef << 3655 */ << 3656 if (folio_test_swapcache(folio)) << 3657 return; << 3658 << 3659 spin_lock_irqsave(&ds_queue->split_qu << 3660 if (partially_mapped) { << 3661 if (!folio_test_partially_map << 3662 __folio_set_partially << 3663 if (folio_test_pmd_ma << 3664 count_vm_even << 3665 count_mthp_stat(folio << 3666 mod_mthp_stat(folio_o << 3667 << 3668 } << 3669 } else { << 3670 /* partially mapped folios ca << 3671 VM_WARN_ON_FOLIO(folio_test_p << 3672 } << 3673 if (list_empty(&folio->_deferred_list << 3674 list_add_tail(&folio->_deferr << 3675 ds_queue->split_queue_len++; << 3676 #ifdef CONFIG_MEMCG << 3677 if (memcg) << 3678 set_shrinker_bit(memc << 3679 defe << 3680 #endif << 3681 } 2770 } 3682 spin_unlock_irqrestore(&ds_queue->spl !! 2771 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3683 } 2772 } 3684 2773 3685 static unsigned long deferred_split_count(str 2774 static unsigned long deferred_split_count(struct shrinker *shrink, 3686 struct shrink_control *sc) 2775 struct shrink_control *sc) 3687 { 2776 { 3688 struct pglist_data *pgdata = NODE_DAT 2777 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3689 struct deferred_split *ds_queue = &pg !! 2778 return READ_ONCE(pgdata->split_queue_len); 3690 << 3691 #ifdef CONFIG_MEMCG << 3692 if (sc->memcg) << 3693 ds_queue = &sc->memcg->deferr << 3694 #endif << 3695 return READ_ONCE(ds_queue->split_queu << 3696 } << 3697 << 3698 static bool thp_underused(struct folio *folio << 3699 { << 3700 int num_zero_pages = 0, num_filled_pa << 3701 void *kaddr; << 3702 int i; << 3703 << 3704 if (khugepaged_max_ptes_none == HPAGE << 3705 return false; << 3706 << 3707 for (i = 0; i < folio_nr_pages(folio) << 3708 kaddr = kmap_local_folio(foli << 3709 if (!memchr_inv(kaddr, 0, PAG << 3710 num_zero_pages++; << 3711 if (num_zero_pages > << 3712 kunmap_local( << 3713 return true; << 3714 } << 3715 } else { << 3716 /* << 3717 * Another path for e << 3718 * of non-zero filled << 3719 */ << 3720 num_filled_pages++; << 3721 if (num_filled_pages << 3722 kunmap_local( << 3723 return false; << 3724 } << 3725 } << 3726 kunmap_local(kaddr); << 3727 } << 3728 return false; << 3729 } 2779 } 3730 2780 3731 static unsigned long deferred_split_scan(stru 2781 static unsigned long deferred_split_scan(struct shrinker *shrink, 3732 struct shrink_control *sc) 2782 struct shrink_control *sc) 3733 { 2783 { 3734 struct pglist_data *pgdata = NODE_DAT 2784 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3735 struct deferred_split *ds_queue = &pg << 3736 unsigned long flags; 2785 unsigned long flags; 3737 LIST_HEAD(list); !! 2786 LIST_HEAD(list), *pos, *next; 3738 struct folio *folio, *next, *prev = N !! 2787 struct page *page; 3739 int split = 0, removed = 0; !! 2788 int split = 0; 3740 << 3741 #ifdef CONFIG_MEMCG << 3742 if (sc->memcg) << 3743 ds_queue = &sc->memcg->deferr << 3744 #endif << 3745 2789 3746 spin_lock_irqsave(&ds_queue->split_qu !! 2790 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3747 /* Take pin on all head pages to avoi 2791 /* Take pin on all head pages to avoid freeing them under us */ 3748 list_for_each_entry_safe(folio, next, !! 2792 list_for_each_safe(pos, next, &pgdata->split_queue) { 3749 !! 2793 page = list_entry((void *)pos, struct page, mapping); 3750 if (folio_try_get(folio)) { !! 2794 page = compound_head(page); 3751 list_move(&folio->_de !! 
2795 if (get_page_unless_zero(page)) { >> 2796 list_move(page_deferred_list(page), &list); 3752 } else { 2797 } else { 3753 /* We lost race with !! 2798 /* We lost race with put_compound_page() */ 3754 if (folio_test_partia !! 2799 list_del_init(page_deferred_list(page)); 3755 __folio_clear !! 2800 pgdata->split_queue_len--; 3756 mod_mthp_stat << 3757 << 3758 } << 3759 list_del_init(&folio- << 3760 ds_queue->split_queue << 3761 } 2801 } 3762 if (!--sc->nr_to_scan) 2802 if (!--sc->nr_to_scan) 3763 break; 2803 break; 3764 } 2804 } 3765 spin_unlock_irqrestore(&ds_queue->spl !! 2805 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3766 2806 3767 list_for_each_entry_safe(folio, next, !! 2807 list_for_each_safe(pos, next, &list) { 3768 bool did_split = false; !! 2808 page = list_entry((void *)pos, struct page, mapping); 3769 bool underused = false; !! 2809 if (!trylock_page(page)) 3770 << 3771 if (!folio_test_partially_map << 3772 underused = thp_under << 3773 if (!underused) << 3774 goto next; << 3775 } << 3776 if (!folio_trylock(folio)) << 3777 goto next; 2810 goto next; 3778 if (!split_folio(folio)) { !! 2811 /* split_huge_page() removes page from list on success */ 3779 did_split = true; !! 2812 if (!split_huge_page(page)) 3780 if (underused) << 3781 count_vm_even << 3782 split++; 2813 split++; 3783 } !! 2814 unlock_page(page); 3784 folio_unlock(folio); << 3785 next: 2815 next: 3786 /* !! 2816 put_page(page); 3787 * split_folio() removes foli << 3788 * Only add back to the queue << 3789 * If thp_underused returns f << 3790 * in the case it was underus << 3791 * don't add it back to split << 3792 */ << 3793 if (!did_split && !folio_test << 3794 list_del_init(&folio- << 3795 removed++; << 3796 } else { << 3797 /* << 3798 * That unlocked list << 3799 * unless its folio i << 3800 * left on the list ( << 3801 * by one safe folio << 3802 */ << 3803 swap(folio, prev); << 3804 } << 3805 if (folio) << 3806 folio_put(folio); << 3807 } 2817 } 3808 2818 3809 spin_lock_irqsave(&ds_queue->split_qu !! 2819 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3810 list_splice_tail(&list, &ds_queue->sp !! 2820 list_splice_tail(&list, &pgdata->split_queue); 3811 ds_queue->split_queue_len -= removed; !! 2821 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3812 spin_unlock_irqrestore(&ds_queue->spl << 3813 << 3814 if (prev) << 3815 folio_put(prev); << 3816 2822 3817 /* 2823 /* 3818 * Stop shrinker if we didn't split a 2824 * Stop shrinker if we didn't split any page, but the queue is empty. 3819 * This can happen if pages were free 2825 * This can happen if pages were freed under us. 3820 */ 2826 */ 3821 if (!split && list_empty(&ds_queue->s !! 2827 if (!split && list_empty(&pgdata->split_queue)) 3822 return SHRINK_STOP; 2828 return SHRINK_STOP; 3823 return split; 2829 return split; 3824 } 2830 } 3825 2831 >> 2832 static struct shrinker deferred_split_shrinker = { >> 2833 .count_objects = deferred_split_count, >> 2834 .scan_objects = deferred_split_scan, >> 2835 .seeks = DEFAULT_SEEKS, >> 2836 .flags = SHRINKER_NUMA_AWARE, >> 2837 }; >> 2838 3826 #ifdef CONFIG_DEBUG_FS 2839 #ifdef CONFIG_DEBUG_FS 3827 static void split_huge_pages_all(void) !! 
2840 static int split_huge_pages_set(void *data, u64 val) 3828 { 2841 { 3829 struct zone *zone; 2842 struct zone *zone; 3830 struct page *page; 2843 struct page *page; 3831 struct folio *folio; << 3832 unsigned long pfn, max_zone_pfn; 2844 unsigned long pfn, max_zone_pfn; 3833 unsigned long total = 0, split = 0; 2845 unsigned long total = 0, split = 0; 3834 2846 3835 pr_debug("Split all THPs\n"); !! 2847 if (val != 1) 3836 for_each_zone(zone) { !! 2848 return -EINVAL; 3837 if (!managed_zone(zone)) !! 2849 3838 continue; !! 2850 for_each_populated_zone(zone) { 3839 max_zone_pfn = zone_end_pfn(z 2851 max_zone_pfn = zone_end_pfn(zone); 3840 for (pfn = zone->zone_start_p 2852 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 3841 int nr_pages; !! 2853 if (!pfn_valid(pfn)) 3842 << 3843 page = pfn_to_online_ << 3844 if (!page || PageTail << 3845 continue; << 3846 folio = page_folio(pa << 3847 if (!folio_try_get(fo << 3848 continue; 2854 continue; 3849 2855 3850 if (unlikely(page_fol !! 2856 page = pfn_to_page(pfn); 3851 goto next; !! 2857 if (!get_page_unless_zero(page)) >> 2858 continue; 3852 2859 3853 if (zone != folio_zon !! 2860 if (zone != page_zone(page)) 3854 goto next; 2861 goto next; 3855 2862 3856 if (!folio_test_large !! 2863 if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) 3857 || folio_test << 3858 || !folio_tes << 3859 goto next; 2864 goto next; 3860 2865 3861 total++; 2866 total++; 3862 folio_lock(folio); !! 2867 lock_page(page); 3863 nr_pages = folio_nr_p !! 2868 if (!split_huge_page(page)) 3864 if (!split_folio(foli << 3865 split++; 2869 split++; 3866 pfn += nr_pages - 1; !! 2870 unlock_page(page); 3867 folio_unlock(folio); << 3868 next: 2871 next: 3869 folio_put(folio); !! 2872 put_page(page); 3870 cond_resched(); << 3871 } << 3872 } << 3873 << 3874 pr_debug("%lu of %lu THP split\n", sp << 3875 } << 3876 << 3877 static inline bool vma_not_suitable_for_thp_s << 3878 { << 3879 return vma_is_special_huge(vma) || (v << 3880 is_vm_hugetlb_page(vma); << 3881 } << 3882 << 3883 static int split_huge_pages_pid(int pid, unsi << 3884 unsigned long << 3885 { << 3886 int ret = 0; << 3887 struct task_struct *task; << 3888 struct mm_struct *mm; << 3889 unsigned long total = 0, split = 0; << 3890 unsigned long addr; << 3891 << 3892 vaddr_start &= PAGE_MASK; << 3893 vaddr_end &= PAGE_MASK; << 3894 << 3895 task = find_get_task_by_vpid(pid); << 3896 if (!task) { << 3897 ret = -ESRCH; << 3898 goto out; << 3899 } << 3900 << 3901 /* Find the mm_struct */ << 3902 mm = get_task_mm(task); << 3903 put_task_struct(task); << 3904 << 3905 if (!mm) { << 3906 ret = -EINVAL; << 3907 goto out; << 3908 } << 3909 << 3910 pr_debug("Split huge pages in pid: %d << 3911 pid, vaddr_start, vaddr_end) << 3912 << 3913 mmap_read_lock(mm); << 3914 /* << 3915 * always increase addr by PAGE_SIZE, << 3916 * table filled with PTE-mapped THPs, << 3917 */ << 3918 for (addr = vaddr_start; addr < vaddr << 3919 struct vm_area_struct *vma = << 3920 struct folio_walk fw; << 3921 struct folio *folio; << 3922 struct address_space *mapping << 3923 unsigned int target_order = n << 3924 << 3925 if (!vma) << 3926 break; << 3927 << 3928 /* skip special VMA and huget << 3929 if (vma_not_suitable_for_thp_ << 3930 addr = vma->vm_end; << 3931 continue; << 3932 } << 3933 << 3934 folio = folio_walk_start(&fw, << 3935 if (!folio) << 3936 continue; << 3937 << 3938 if (!is_transparent_hugepage( << 3939 goto next; << 3940 << 3941 if (!folio_test_anon(folio)) << 3942 mapping = folio->mapp << 3943 target_order = max(ne << 3944 ma 
<< 3945 } 2873 } 3946 << 3947 if (target_order >= folio_ord << 3948 goto next; << 3949 << 3950 total++; << 3951 /* << 3952 * For folios with private, s << 3953 * will try to drop it before << 3954 * can be split or not. So sk << 3955 */ << 3956 if (!folio_test_private(folio << 3957 !can_split_folio(folio, 0 << 3958 goto next; << 3959 << 3960 if (!folio_trylock(folio)) << 3961 goto next; << 3962 folio_get(folio); << 3963 folio_walk_end(&fw, vma); << 3964 << 3965 if (!folio_test_anon(folio) & << 3966 goto unlock; << 3967 << 3968 if (!split_folio_to_order(fol << 3969 split++; << 3970 << 3971 unlock: << 3972 << 3973 folio_unlock(folio); << 3974 folio_put(folio); << 3975 << 3976 cond_resched(); << 3977 continue; << 3978 next: << 3979 folio_walk_end(&fw, vma); << 3980 cond_resched(); << 3981 } << 3982 mmap_read_unlock(mm); << 3983 mmput(mm); << 3984 << 3985 pr_debug("%lu of %lu THP split\n", sp << 3986 << 3987 out: << 3988 return ret; << 3989 } << 3990 << 3991 static int split_huge_pages_in_file(const cha << 3992 pgoff_t off_e << 3993 { << 3994 struct filename *file; << 3995 struct file *candidate; << 3996 struct address_space *mapping; << 3997 int ret = -EINVAL; << 3998 pgoff_t index; << 3999 int nr_pages = 1; << 4000 unsigned long total = 0, split = 0; << 4001 unsigned int min_order; << 4002 unsigned int target_order; << 4003 << 4004 file = getname_kernel(file_path); << 4005 if (IS_ERR(file)) << 4006 return ret; << 4007 << 4008 candidate = file_open_name(file, O_RD << 4009 if (IS_ERR(candidate)) << 4010 goto out; << 4011 << 4012 pr_debug("split file-backed THPs in f << 4013 file_path, off_start, off_en << 4014 << 4015 mapping = candidate->f_mapping; << 4016 min_order = mapping_min_folio_order(m << 4017 target_order = max(new_order, min_ord << 4018 << 4019 for (index = off_start; index < off_e << 4020 struct folio *folio = filemap << 4021 << 4022 nr_pages = 1; << 4023 if (IS_ERR(folio)) << 4024 continue; << 4025 << 4026 if (!folio_test_large(folio)) << 4027 goto next; << 4028 << 4029 total++; << 4030 nr_pages = folio_nr_pages(fol << 4031 << 4032 if (target_order >= folio_ord << 4033 goto next; << 4034 << 4035 if (!folio_trylock(folio)) << 4036 goto next; << 4037 << 4038 if (folio->mapping != mapping << 4039 goto unlock; << 4040 << 4041 if (!split_folio_to_order(fol << 4042 split++; << 4043 << 4044 unlock: << 4045 folio_unlock(folio); << 4046 next: << 4047 folio_put(folio); << 4048 cond_resched(); << 4049 } 2874 } 4050 2875 4051 filp_close(candidate, NULL); !! 2876 pr_info("%lu of %lu THP split\n", split, total); 4052 ret = 0; << 4053 2877 4054 pr_debug("%lu of %lu file-backed THP !! 2878 return 0; 4055 out: << 4056 putname(file); << 4057 return ret; << 4058 } 2879 } >> 2880 DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, >> 2881 "%llu\n"); 4059 2882 4060 #define MAX_INPUT_BUF_SZ 255 !! 2883 static int __init split_huge_pages_debugfs(void) 4061 << 4062 static ssize_t split_huge_pages_write(struct << 4063 size_t count, << 4064 { 2884 { 4065 static DEFINE_MUTEX(split_debug_mutex !! 
2885 void *ret; 4066 ssize_t ret; << 4067 /* << 4068 * hold pid, start_vaddr, end_vaddr, << 4069 * file_path, off_start, off_end, new << 4070 */ << 4071 char input_buf[MAX_INPUT_BUF_SZ]; << 4072 int pid; << 4073 unsigned long vaddr_start, vaddr_end; << 4074 unsigned int new_order = 0; << 4075 << 4076 ret = mutex_lock_interruptible(&split << 4077 if (ret) << 4078 return ret; << 4079 << 4080 ret = -EFAULT; << 4081 << 4082 memset(input_buf, 0, MAX_INPUT_BUF_SZ << 4083 if (copy_from_user(input_buf, buf, mi << 4084 goto out; << 4085 << 4086 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0 << 4087 << 4088 if (input_buf[0] == '/') { << 4089 char *tok; << 4090 char *buf = input_buf; << 4091 char file_path[MAX_INPUT_BUF_ << 4092 pgoff_t off_start = 0, off_en << 4093 size_t input_len = strlen(inp << 4094 << 4095 tok = strsep(&buf, ","); << 4096 if (tok) { << 4097 strcpy(file_path, tok << 4098 } else { << 4099 ret = -EINVAL; << 4100 goto out; << 4101 } << 4102 << 4103 ret = sscanf(buf, "0x%lx,0x%l << 4104 if (ret != 2 && ret != 3) { << 4105 ret = -EINVAL; << 4106 goto out; << 4107 } << 4108 ret = split_huge_pages_in_fil << 4109 if (!ret) << 4110 ret = input_len; << 4111 << 4112 goto out; << 4113 } << 4114 2886 4115 ret = sscanf(input_buf, "%d,0x%lx,0x% !! 2887 ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 4116 if (ret == 1 && pid == 1) { !! 2888 &split_huge_pages_fops); 4117 split_huge_pages_all(); << 4118 ret = strlen(input_buf); << 4119 goto out; << 4120 } else if (ret != 3 && ret != 4) { << 4121 ret = -EINVAL; << 4122 goto out; << 4123 } << 4124 << 4125 ret = split_huge_pages_pid(pid, vaddr << 4126 if (!ret) 2889 if (!ret) 4127 ret = strlen(input_buf); !! 2890 pr_warn("Failed to create split_huge_pages in debugfs"); 4128 out: << 4129 mutex_unlock(&split_debug_mutex); << 4130 return ret; << 4131 << 4132 } << 4133 << 4134 static const struct file_operations split_hug << 4135 .owner = THIS_MODULE, << 4136 .write = split_huge_pages_write, << 4137 }; << 4138 << 4139 static int __init split_huge_pages_debugfs(vo << 4140 { << 4141 debugfs_create_file("split_huge_pages << 4142 &split_huge_pages << 4143 return 0; 2891 return 0; 4144 } 2892 } 4145 late_initcall(split_huge_pages_debugfs); 2893 late_initcall(split_huge_pages_debugfs); 4146 #endif 2894 #endif 4147 2895 4148 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 2896 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 4149 int set_pmd_migration_entry(struct page_vma_m !! 2897 void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, 4150 struct page *page) 2898 struct page *page) 4151 { 2899 { 4152 struct folio *folio = page_folio(page << 4153 struct vm_area_struct *vma = pvmw->vm 2900 struct vm_area_struct *vma = pvmw->vma; 4154 struct mm_struct *mm = vma->vm_mm; 2901 struct mm_struct *mm = vma->vm_mm; 4155 unsigned long address = pvmw->address 2902 unsigned long address = pvmw->address; 4156 bool anon_exclusive; << 4157 pmd_t pmdval; 2903 pmd_t pmdval; 4158 swp_entry_t entry; 2904 swp_entry_t entry; 4159 pmd_t pmdswp; 2905 pmd_t pmdswp; 4160 2906 4161 if (!(pvmw->pmd && !pvmw->pte)) 2907 if (!(pvmw->pmd && !pvmw->pte)) 4162 return 0; !! 2908 return; 4163 2909 4164 flush_cache_range(vma, address, addre 2910 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); 4165 pmdval = pmdp_invalidate(vma, address !! 2911 pmdval = *pvmw->pmd; 4166 !! 
2912 pmdp_invalidate(vma, address, pvmw->pmd); 4167 /* See folio_try_share_anon_rmap_pmd( << 4168 anon_exclusive = folio_test_anon(foli << 4169 if (anon_exclusive && folio_try_share << 4170 set_pmd_at(mm, address, pvmw- << 4171 return -EBUSY; << 4172 } << 4173 << 4174 if (pmd_dirty(pmdval)) << 4175 folio_mark_dirty(folio); << 4176 if (pmd_write(pmdval)) << 4177 entry = make_writable_migrati << 4178 else if (anon_exclusive) << 4179 entry = make_readable_exclusi << 4180 else << 4181 entry = make_readable_migrati << 4182 if (pmd_young(pmdval)) << 4183 entry = make_migration_entry_ << 4184 if (pmd_dirty(pmdval)) 2913 if (pmd_dirty(pmdval)) 4185 entry = make_migration_entry_ !! 2914 set_page_dirty(page); >> 2915 entry = make_migration_entry(page, pmd_write(pmdval)); 4186 pmdswp = swp_entry_to_pmd(entry); 2916 pmdswp = swp_entry_to_pmd(entry); 4187 if (pmd_soft_dirty(pmdval)) 2917 if (pmd_soft_dirty(pmdval)) 4188 pmdswp = pmd_swp_mksoft_dirty 2918 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 4189 if (pmd_uffd_wp(pmdval)) << 4190 pmdswp = pmd_swp_mkuffd_wp(pm << 4191 set_pmd_at(mm, address, pvmw->pmd, pm 2919 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 4192 folio_remove_rmap_pmd(folio, page, vm !! 2920 page_remove_rmap(page, true); 4193 folio_put(folio); !! 2921 put_page(page); 4194 trace_set_migration_pmd(address, pmd_ << 4195 << 4196 return 0; << 4197 } 2922 } 4198 2923 4199 void remove_migration_pmd(struct page_vma_map 2924 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 4200 { 2925 { 4201 struct folio *folio = page_folio(new) << 4202 struct vm_area_struct *vma = pvmw->vm 2926 struct vm_area_struct *vma = pvmw->vma; 4203 struct mm_struct *mm = vma->vm_mm; 2927 struct mm_struct *mm = vma->vm_mm; 4204 unsigned long address = pvmw->address 2928 unsigned long address = pvmw->address; 4205 unsigned long haddr = address & HPAGE !! 2929 unsigned long mmun_start = address & HPAGE_PMD_MASK; 4206 pmd_t pmde; 2930 pmd_t pmde; 4207 swp_entry_t entry; 2931 swp_entry_t entry; 4208 2932 4209 if (!(pvmw->pmd && !pvmw->pte)) 2933 if (!(pvmw->pmd && !pvmw->pte)) 4210 return; 2934 return; 4211 2935 4212 entry = pmd_to_swp_entry(*pvmw->pmd); 2936 entry = pmd_to_swp_entry(*pvmw->pmd); 4213 folio_get(folio); !! 2937 get_page(new); 4214 pmde = mk_huge_pmd(new, READ_ONCE(vma !! 2938 pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); 4215 if (pmd_swp_soft_dirty(*pvmw->pmd)) 2939 if (pmd_swp_soft_dirty(*pvmw->pmd)) 4216 pmde = pmd_mksoft_dirty(pmde) 2940 pmde = pmd_mksoft_dirty(pmde); 4217 if (is_writable_migration_entry(entry !! 2941 if (is_write_migration_entry(entry)) 4218 pmde = pmd_mkwrite(pmde, vma) !! 2942 pmde = maybe_pmd_mkwrite(pmde, vma); 4219 if (pmd_swp_uffd_wp(*pvmw->pmd)) << 4220 pmde = pmd_mkuffd_wp(pmde); << 4221 if (!is_migration_entry_young(entry)) << 4222 pmde = pmd_mkold(pmde); << 4223 /* NOTE: this may contain setting sof << 4224 if (folio_test_dirty(folio) && is_mig << 4225 pmde = pmd_mkdirty(pmde); << 4226 << 4227 if (folio_test_anon(folio)) { << 4228 rmap_t rmap_flags = RMAP_NONE << 4229 << 4230 if (!is_readable_migration_en << 4231 rmap_flags |= RMAP_EX << 4232 2943 4233 folio_add_anon_rmap_pmd(folio !! 2944 flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); 4234 } else { !! 2945 if (PageAnon(new)) 4235 folio_add_file_rmap_pmd(folio !! 2946 page_add_anon_rmap(new, vma, mmun_start, true); 4236 } !! 2947 else 4237 VM_BUG_ON(pmd_write(pmde) && folio_te !! 2948 page_add_file_rmap(new, true); 4238 set_pmd_at(mm, haddr, pvmw->pmd, pmde !! 
2949 set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); 4239 !! 2950 if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) 4240 /* No need to invalidate - it was non !! 2951 mlock_vma_page(new); 4241 update_mmu_cache_pmd(vma, address, pv 2952 update_mmu_cache_pmd(vma, address, pvmw->pmd); 4242 trace_remove_migration_pmd(address, p << 4243 } 2953 } 4244 #endif 2954 #endif 4245 2955
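Editor's sketch (not part of the kernel sources): the long comment above split_huge_page_to_list() spells out the caller-side contract — the caller already holds a reference on the page, the page must be locked across the call, and the split can fail when unexpected extra pins are found. A minimal illustration of that contract, modelled on the debugfs loop earlier in this file; the helper name try_split_one_page() is hypothetical:

static int try_split_one_page(struct page *page)
{
	int ret;

	/* caller is assumed to already hold a reference (get_page()) */
	if (!trylock_page(page))	/* page lock is a documented prerequisite */
		return -EAGAIN;
	/*
	 * A NULL list means any freed tail pages go back to the LRU.
	 * The split fails (-EBUSY here, -EAGAIN in the newer tree) if
	 * extra pins exist, leaving the compound page intact.
	 */
	ret = split_huge_page(page);
	unlock_page(page);
	return ret;
}

Usage follows the same pattern as the debugfs helper above: pin the head page, attempt the split, then drop the pin with put_page().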
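Editor's sketch (assumption, not taken from this file): deferred_split_count() and deferred_split_scan() implement the standard shrinker contract — count_objects() returns a cheap estimate (the per-node split_queue_len) and scan_objects() does the work, returning how many pages were split or SHRINK_STOP when the queue is empty. In kernels of the vintage shown on the right, the shrinker object is registered during THP init roughly as below; thp_shrinker_init() is a hypothetical stand-in for the real init path:

static int __init thp_shrinker_init(void)
{
	/*
	 * SHRINKER_NUMA_AWARE makes the core call count/scan once per
	 * node, with sc->nid selecting which pgdat's split_queue to
	 * drain.
	 */
	return register_shrinker(&deferred_split_shrinker);
}

The newer tree on the left instead allocates the shrinker dynamically (struct shrinker *deferred_split_shrinker) and keeps a per-memcg deferred-split queue, which is why its deferred_split_scan() consults sc->memcg.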
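Editor's sketch (assumption): set_pmd_migration_entry() above replaces a mapped PMD with a swap-style migration entry, and remove_migration_pmd() installs the new page once migration finishes. The consumer side — how a page-table walker recognises such an entry while migration is in flight — typically looks like the snippet below; the helper name is hypothetical, and migration_entry_to_page() is the accessor of this era (the newer tree uses pfn_swap_entry_to_page()):

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
static struct page *pmd_migration_target(pmd_t pmd)
{
	swp_entry_t entry;

	if (!is_pmd_migration_entry(pmd))
		return NULL;
	entry = pmd_to_swp_entry(pmd);
	return migration_entry_to_page(entry);
}
#endif

Callers that hit such a PMD normally wait for the migration to complete via pmd_migration_entry_wait() rather than touching the target page directly.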