/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
        struct anon_vma *root;          /* Root of this anon_vma tree */
        struct rw_semaphore rwsem;      /* W: modification, R: walking the list */
        /*
         * The refcount is taken on an anon_vma when there is no
         * guarantee that the vma of page tables will exist for
         * the duration of the operation. A caller that takes
         * the reference is responsible for clearing up the
         * anon_vma if they are the last user on release
         */
        atomic_t refcount;

        /*
         * Count of child anon_vmas. Equals to the count of all anon_vmas that
         * have ->parent pointing to this one, including itself.
         *
         * This counter is used for making decision about reusing anon_vma
         * instead of forking new one. See comments in function anon_vma_clone.
         */
        unsigned long num_children;
        /* Count of VMAs whose ->anon_vma pointer points to this object. */
        unsigned long num_active_vmas;

        struct anon_vma *parent;        /* Parent of this anon_vma */

        /*
         * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
         * rb_root must only be read/written after taking the above lock
         * to be sure to see a valid next pointer. The LSB bit itself
         * is serialized by a system wide lock only visible to
         * mm_take_all_locks() (mm_all_locks_mutex).
         */

        /* Interval tree of private "related" vmas */
        struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
        struct vm_area_struct *vma;
        struct anon_vma *anon_vma;
        struct list_head same_vma;      /* locked by mmap_lock & page_table_lock */
        struct rb_node rb;              /* locked by anon_vma->rwsem */
        unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
        unsigned long cached_vma_start, cached_vma_last;
#endif
};

enum ttu_flags {
        TTU_SPLIT_HUGE_PMD      = 0x4,  /* split huge PMD if any */
        TTU_IGNORE_MLOCK        = 0x8,  /* ignore mlock */
        TTU_SYNC                = 0x10, /* avoid racy checks with PVMW_SYNC */
        TTU_HWPOISON            = 0x20, /* do convert pte to hwpoison entry */
        TTU_BATCH_FLUSH         = 0x40, /* Batch TLB flushes where possible
                                         * and caller guarantees they will
                                         * do a final flush if necessary */
        TTU_RMAP_LOCKED         = 0x80, /* do not grab rmap lock:
                                         * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
        atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
        if (atomic_dec_and_test(&anon_vma->refcount))
                __put_anon_vma(anon_vma);
}

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
        down_write(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
        return down_write_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
        up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
        down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
        return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
        up_read(&anon_vma->root->rwsem);
}

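/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * header): a reader that cannot guarantee the VMAs stay alive pins the
 * anon_vma via the refcount before walking it, as the struct anon_vma
 * comment above describes. anon_vma_example_walk() is a hypothetical name.
 */
static inline void anon_vma_example_walk(struct anon_vma *anon_vma)
{
        get_anon_vma(anon_vma);                 /* keep the anon_vma alive */
        anon_vma_lock_read(anon_vma);           /* R: walking the interval tree */
        /* ... scan anon_vma->rb_root for the VMAs mapping the page ... */
        anon_vma_unlock_read(anon_vma);
        put_anon_vma(anon_vma);                 /* frees it if we were the last user */
}
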
/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);       /* create anon_vma_cachep */
int __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
        if (likely(vma->anon_vma))
                return 0;

        return __anon_vma_prepare(vma);
}

static inline void anon_vma_merge(struct vm_area_struct *vma,
                struct vm_area_struct *next)
{
        VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
        unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(struct folio *folio);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE               ((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE          ((__force rmap_t)BIT(0))

/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
 */
enum rmap_level {
        RMAP_LEVEL_PTE = 0,
        RMAP_LEVEL_PMD,
};

static inline void __folio_rmap_sanity_checks(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        /* hugetlb folios are handled separately. */
        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

        /* When (un)mapping zeropages, we should never touch ref+mapcount. */
        VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

        /*
         * TODO: we get driver-allocated folios that have nothing to do with
         * the rmap using vm_insert_page(); therefore, we cannot assume that
         * folio_test_large_rmappable() holds for large folios. We should
         * handle any desired mapcount+stats accounting for these folios in
         * VM_MIXEDMAP VMAs separately, and then sanity-check here that
         * we really only get rmappable folios.
         */

        VM_WARN_ON_ONCE(nr_pages <= 0);
        VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
        VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

        switch (level) {
        case RMAP_LEVEL_PTE:
                break;
        case RMAP_LEVEL_PMD:
                /*
                 * We don't support folios larger than a single PMD yet. So
                 * when RMAP_LEVEL_PMD is set, we assume that we are creating
                 * a single "entire" mapping of the folio.
                 */
                VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
                VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
                break;
        default:
                VM_WARN_ON_ONCE(true);
        }
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
        folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
        folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
        folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address);

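/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * header): the usual order of operations when a brand new anonymous folio is
 * mapped by a write fault. example_map_new_anon() is a hypothetical name; the
 * real callers live in mm/memory.c.
 */
static inline int example_map_new_anon(struct vm_area_struct *vma,
                struct folio *folio, unsigned long addr)
{
        /* May allocate and sleep, so this happens before the PTE lock is taken. */
        if (anon_vma_prepare(vma))
                return -ENOMEM;
        /*
         * With the PTE lock held and the PTE about to be installed, account
         * the new mapping; a freshly allocated private folio is exclusive.
         */
        folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
        return 0;
}
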
/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
                struct vm_area_struct *vma)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        if (PageAnonExclusive(&folio->page)) {
                if (unlikely(folio_needs_cow_for_dma(vma, folio)))
                        return -EBUSY;
                ClearPageAnonExclusive(&folio->page);
        }
        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
        return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        ClearPageAnonExclusive(&folio->page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        atomic_dec(&folio->_entire_mapcount);
        atomic_dec(&folio->_large_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        const int orig_nr_pages = nr_pages;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        switch (level) {
        case RMAP_LEVEL_PTE:
                if (!folio_test_large(folio)) {
                        atomic_inc(&folio->_mapcount);
                        break;
                }

                do {
                        atomic_inc(&page->_mapcount);
                } while (page++, --nr_pages > 0);
                atomic_add(orig_nr_pages, &folio->_large_mapcount);
                break;
        case RMAP_LEVEL_PMD:
                atomic_inc(&folio->_entire_mapcount);
                atomic_inc(&folio->_large_mapcount);
                break;
        }
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mappings will be duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages)
{
        __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
}

static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
                struct page *page)
{
        __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
                struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
#endif
}

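/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * header): duplicating a batch of PTE mappings of a file-backed folio while
 * copying a range of page tables, fork() style. The caller is assumed to hold
 * the page table lock; example_dup_file_batch() is a hypothetical name.
 */
static inline void example_dup_file_batch(struct folio *folio,
                struct page *first_page, int nr_pages)
{
        /* File-backed rmap duplication cannot fail; one call covers the batch. */
        folio_dup_file_rmap_ptes(folio, first_page, nr_pages);
}
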
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *src_vma,
                enum rmap_level level)
{
        const int orig_nr_pages = nr_pages;
        bool maybe_pinned;
        int i;

        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /*
         * If this folio may have been pinned by the parent process,
         * don't allow to duplicate the mappings but instead require to e.g.,
         * copy the subpage immediately for the child so that we'll always
         * guarantee the pinned folio won't be randomly replaced in the
         * future on write faults.
         */
        maybe_pinned = likely(!folio_is_device_private(folio)) &&
                       unlikely(folio_needs_cow_for_dma(src_vma, folio));

        /*
         * No need to check+clear for already shared PTEs/PMDs of the
         * folio. But if any page is PageAnonExclusive, we must fallback to
         * copying if the folio maybe pinned.
         */
        switch (level) {
        case RMAP_LEVEL_PTE:
                if (unlikely(maybe_pinned)) {
                        for (i = 0; i < nr_pages; i++)
                                if (PageAnonExclusive(page + i))
                                        return -EBUSY;
                }

                if (!folio_test_large(folio)) {
                        if (PageAnonExclusive(page))
                                ClearPageAnonExclusive(page);
                        atomic_inc(&folio->_mapcount);
                        break;
                }

                do {
                        if (PageAnonExclusive(page))
                                ClearPageAnonExclusive(page);
                        atomic_inc(&page->_mapcount);
                } while (page++, --nr_pages > 0);
                atomic_add(orig_nr_pages, &folio->_large_mapcount);
                break;
        case RMAP_LEVEL_PMD:
                if (PageAnonExclusive(page)) {
                        if (unlikely(maybe_pinned))
                                return -EBUSY;
                        ClearPageAnonExclusive(page);
                }
                atomic_inc(&folio->_entire_mapcount);
                atomic_inc(&folio->_large_mapcount);
                break;
        }
        return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mappings will be duplicated
 * @src_vma:	The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *src_vma)
{
        return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
                                         RMAP_LEVEL_PTE);
}

static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
                struct page *page, struct vm_area_struct *src_vma)
{
        return __folio_try_dup_anon_rmap(folio, page, 1, src_vma,
                                         RMAP_LEVEL_PTE);
}

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @src_vma:	The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
                struct page *page, struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
                                         RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
        return -EBUSY;
#endif
}

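/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * header): the fork() pattern the comment above describes. If an exclusive
 * page might be pinned, duplication fails and the caller has to copy the page
 * for the child instead. example_dup_anon_pte() is a hypothetical name; the
 * real caller is the page table copy code in mm/memory.c.
 */
static inline int example_dup_anon_pte(struct folio *folio, struct page *page,
                struct vm_area_struct *src_vma)
{
        int err = folio_try_dup_anon_rmap_pte(folio, page, src_vma);

        if (err)
                return err;     /* -EBUSY: copy the page for the child instead */
        /* Success: the PTE must now be write-protected in parent and child. */
        return 0;
}
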
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /* device private folios cannot get pinned via GUP. */
        if (unlikely(folio_is_device_private(folio))) {
                ClearPageAnonExclusive(page);
                return 0;
        }

        /*
         * We have to make sure that when we clear PageAnonExclusive,
         * the page is not pinned and that concurrent GUP-fast won't succeed in
         * concurrently pinning the page.
         *
         * Conceptually, PageAnonExclusive clearing consists of:
         * (A1) Clear PTE
         * (A2) Check if the page is pinned; back off if so.
         * (A3) Clear PageAnonExclusive
         * (A4) Restore PTE (optional, but certainly not writable)
         *
         * When clearing PageAnonExclusive, we cannot possibly map the page
         * writable again, because anon pages that may be shared must never
         * be writable. So in any case, if the PTE was writable, it cannot
         * be writable anymore afterwards and there'd be a PTE change. Only
         * if the PTE wasn't writable, there might not be a PTE change.
         *
         * Conceptually, GUP-fast pinning of an anonymous page consists of:
         * (B1) Read the PTE
         * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
         * (B3) Pin the mapped page
         * (B4) Check if the PTE changed by re-reading it; back off if so.
         * (B5) If the original PTE is not writable, check if
         *	PageAnonExclusive is not set; back off if so.
         *
         * If the PTE was writable, we only have to make sure that GUP-fast
         * observes a PTE change and properly backs off.
         *
         * If the PTE was not writable, we have to make sure that GUP-fast either
         * detects a (temporary) PTE change or that PageAnonExclusive is cleared
         * and properly backs off.
         *
         * Consequently, when clearing PageAnonExclusive(), we have to make
         * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
         * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
         * and (B5) happen in the right memory order.
         *
         * We assume that there might not be a memory barrier after
         * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
         * so we use explicit ones here.
         */

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        ClearPageAnonExclusive(page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio:	The folio to share a mapping of
 * @page:	The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration).
 *
 * Marking the mapped page shared can only fail if the folio maybe pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
                struct page *page)
{
        return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}

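/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * header): the (A1)..(A4) pattern from the comment above — clear the PTE
 * first, then try to mark the page possibly shared, and restore the PTE on
 * failure. example_try_unmap_anon_pte() is a hypothetical name; the real
 * callers are try_to_unmap_one()/try_to_migrate_one() in mm/rmap.c.
 */
static inline bool example_try_unmap_anon_pte(struct folio *folio,
                struct page *page, struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
        pte_t pteval = ptep_clear_flush(vma, addr, ptep);       /* (A1) */

        if (folio_try_share_anon_rmap_pte(folio, page)) {       /* (A2) + (A3) */
                set_pte_at(vma->vm_mm, addr, ptep, pteval);     /* (A4): back off */
                return false;
        }
        /* Success: go on to install the swap/migration entry. */
        return true;
}
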
/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio:	The folio to share the mapping of
 * @page:	The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration).
 *
 * Marking the mapped pages shared can only fail if the folio maybe pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
                struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
                                           RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
        return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
                        struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, struct page **pages,
                                void *arg);

/* Avoid racy checks */
#define PVMW_SYNC               (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION          (1 << 1)

struct page_vma_mapped_walk {
        unsigned long pfn;
        unsigned long nr_pages;
        pgoff_t pgoff;
        struct vm_area_struct *vma;
        unsigned long address;
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;
        unsigned int flags;
};

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)     \
        struct page_vma_mapped_walk name = {                            \
                .pfn = folio_pfn(_folio),                               \
                .nr_pages = folio_nr_pages(_folio),                     \
                .pgoff = folio_pgoff(_folio),                           \
                .vma = _vma,                                            \
                .address = _address,                                    \
                .flags = _flags,                                        \
        }

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
        /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
        if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
                pte_unmap(pvmw->pte);
        if (pvmw->ptl)
                spin_unlock(pvmw->ptl);
}

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
        WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

        if (likely(pvmw->ptl))
                spin_unlock(pvmw->ptl);
        else
                WARN_ON_ONCE(1);

        pvmw->ptl = NULL;
        pvmw->pmd = NULL;
        pvmw->pte = NULL;
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);

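/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * header): visiting every PTE/PMD that maps @folio inside one VMA, in the
 * style of the mm/rmap.c callers. example_count_mappings() is a hypothetical
 * name.
 */
static inline int example_count_mappings(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        int nr = 0;

        while (page_vma_mapped_walk(&pvmw)) {
                /* pvmw.pte (or pvmw.pmd) points at a mapping; the PTL is held here. */
                nr++;
        }
        /* On the final iteration the walk drops the PTL and unmaps the PTE. */
        return nr;
}
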
/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
                      struct vm_area_struct *vma);

enum rmp_flags {
        RMP_LOCKED              = 1 << 0,
        RMP_USE_SHARED_ZEROPAGE = 1 << 1,
};

void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking traversing termination condition
 * anon_lock: for getting anon_lock by optimized way rather than default
 * invalid_vma: for skipping uninterested vma
 */
struct rmap_walk_control {
        void *arg;
        bool try_lock;
        bool contended;
        /*
         * Return false if page table scanning in rmap_walk should be stopped.
         * Otherwise, return true.
         */
        bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
                                        unsigned long addr, void *arg);
        int (*done)(struct folio *folio);
        struct anon_vma *(*anon_lock)(struct folio *folio,
                                      struct rmap_walk_control *rwc);
        bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
                                          struct rmap_walk_control *rwc);

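/*
 * Illustrative usage sketch (editor's addition, not part of the upstream
 * header): a minimal reverse-map walk that visits every VMA mapping @folio,
 * typically called with the folio locked and a reference held.
 * example_rmap_one() and example_walk_folio() are hypothetical names.
 */
static inline bool example_rmap_one(struct folio *folio,
                struct vm_area_struct *vma, unsigned long addr, void *arg)
{
        int *nr_vmas = arg;

        (*nr_vmas)++;
        return true;            /* keep traversing */
}

static inline int example_walk_folio(struct folio *folio)
{
        int nr_vmas = 0;
        struct rmap_walk_control rwc = {
                .arg = &nr_vmas,
                .rmap_one = example_rmap_one,
        };

        rmap_walk(folio, &rwc);
        return nr_vmas;
}
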
#else   /* !CONFIG_MMU */

#define anon_vma_init()         do {} while (0)
#define anon_vma_prepare(vma)   (0)

static inline int folio_referenced(struct folio *folio, int is_locked,
                                  struct mem_cgroup *memcg,
                                  unsigned long *vm_flags)
{
        *vm_flags = 0;
        return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
        return 0;
}
#endif  /* CONFIG_MMU */

#endif  /* _LINUX_RMAP_H */