// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded
 * then a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		/*
		 * pte_offset_map() might apply user-specific validation.
		 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
		 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
		 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
		 */
		if (walk->mm == &init_mm || addr >= TASK_SIZE)
			pte = pte_offset_kernel(pmd, addr);
		else
			pte = pte_offset_map(pmd, addr);
		if (pte) {
			err = walk_pte_range_inner(pte, addr, end, walk);
			if (walk->mm != &init_mm && addr < TASK_SIZE)
				pte_unmap(pte);
		}
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		if (pte) {
			err = walk_pte_range_inner(pte, addr, end, walk);
			pte_unmap_unlock(pte, ptl);
		}
	}
	if (!pte)
		walk->action = ACTION_AGAIN;
	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pmd(walk->vma, pmd, addr);

		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

	} while (pmd++, addr = next, addr != end);

	return err;
}

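/*
 * Example (illustrative sketch, not part of the pagewalk implementation):
 * a minimal ->pmd_entry() handler. As noted in walk_pmd_range() above, such
 * a handler can be called on a pmd_trans_huge() pmd, so it must either deal
 * with the huge entry itself or let the walker split it and descend to the
 * pte level. All example_* names are hypothetical.
 */
static int __maybe_unused example_pmd_entry(pmd_t *pmd, unsigned long addr,
					    unsigned long next,
					    struct mm_walk *walk)
{
	unsigned long *example_thp_pages = walk->private;

	if (pmd_trans_huge(*pmd)) {
		/* Account the whole huge range and skip the pte level. */
		*example_thp_pages += (next - addr) >> PAGE_SHIFT;
		walk->action = ACTION_CONTINUE;
		return 0;
	}
	/* Leave ACTION_SUBTREE set so ->pte_entry() runs on this range. */
	return 0;
}
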
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
		    ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

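/*
 * Example (illustrative sketch): a ->pte_hole() callback. The depth argument
 * follows the convention used by the callers above: 0 for a hole at the pgd
 * level, 1 for p4d, 2 for pud and 3 for pmd (after real_depth() has folded
 * away unused levels), and -1 where the level is not meaningful, e.g. for
 * hugetlb ranges or VM_PFNMAP holes. The example_ prefix marks a hypothetical
 * name.
 */
static int __maybe_unused example_pte_hole(unsigned long addr,
					   unsigned long next, int depth,
					   struct mm_walk *walk)
{
	pr_debug("pagewalk hole %#lx-%#lx at depth %d\n", addr, next, depth);
	return 0;
}
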
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	hugetlb_vma_lock_read(vma);
	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = hugetlb_walk(vma, addr & hmask, sz);
		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);
		if (err)
			break;
	} while (addr = next, addr != end);
	hugetlb_vma_unlock_read(vma);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean
 * error, where we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind the
	 * VM_PFNMAP range, so we don't walk over it as we do for normal vmas.
	 * However, some callers are interested in handling hole ranges and
	 * don't want to just ignore any single address range. Such users
	 * certainly define their ->pte_hole() callbacks, so let's delegate
	 * handling of vma(VM_PFNMAP) to them.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}

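/*
 * Example (illustrative sketch): a ->test_walk() callback replacing the
 * default policy implemented by walk_page_test() above. Returning 1 skips
 * the current vma, 0 walks it, and a negative errno aborts the whole walk.
 * This hypothetical example only walks anonymous vmas.
 */
static int __maybe_unused example_test_walk(unsigned long start,
					    unsigned long end,
					    struct mm_walk *walk)
{
	return vma_is_anonymous(walk->vma) ? 0 : 1;
}
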
static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (ops->post_vma)
		ops->post_vma(walk);

	return err;
}

static inline void process_mm_walk_lock(struct mm_struct *mm,
					enum page_walk_lock walk_lock)
{
	if (walk_lock == PGWALK_RDLOCK)
		mmap_assert_locked(mm);
	else
		mmap_assert_write_locked(mm);
}

static inline void process_vma_walk_lock(struct vm_area_struct *vma,
					 enum page_walk_lock walk_lock)
{
#ifdef CONFIG_PER_VMA_LOCK
	switch (walk_lock) {
	case PGWALK_WRLOCK:
		vma_start_write(vma);
		break;
	case PGWALK_WRLOCK_VERIFY:
		vma_assert_write_locked(vma);
		break;
	case PGWALK_RDLOCK:
		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
		break;
	}
#endif
}

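/*
 * Example (illustrative sketch): pre_vma()/post_vma() callbacks, called by
 * __walk_page_range() above around each vma. pre_vma() can stage per-vma
 * state (a non-zero return is propagated and ends the walk), while
 * post_vma() commits that state and cannot fail. The example_* names are
 * hypothetical.
 */
static int __maybe_unused example_pre_vma(unsigned long start,
					  unsigned long end,
					  struct mm_walk *walk)
{
	/* e.g. allocate or reset per-vma scratch state in walk->private */
	return 0;
}

static void __maybe_unused example_post_vma(struct mm_walk *walk)
{
	/* e.g. flush or publish whatever example_pre_vma() staged */
}
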
/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded to handle the current entry; if the end address has not
 *         been reached yet, continue the walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with the caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with the error code.
 *
 * Before starting to walk the page tables, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		    unsigned long end, const struct mm_walk_ops *ops,
		    void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else { /* inside vma */
			process_vma_walk_lock(vma, ops->walk_lock);
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = find_vma(mm, vma->vm_end);

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
			err = __walk_page_range(start, next, &walk);
		}
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

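/*
 * Example (illustrative sketch): a complete, read-only use of
 * walk_page_range() that counts present ptes in a range. It follows the
 * locking convention described in the kernel-doc above: the caller takes
 * mmap_lock for read, which matches .walk_lock = PGWALK_RDLOCK. All
 * example_* names are hypothetical.
 */
static int example_count_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *example_count = walk->private;

	if (pte_present(ptep_get(pte)))
		(*example_count)++;
	return 0;
}

static const struct mm_walk_ops example_count_ops = {
	.pte_entry	= example_count_pte,
	.walk_lock	= PGWALK_RDLOCK,
};

static unsigned long __maybe_unused example_count_present(struct mm_struct *mm,
							   unsigned long start,
							   unsigned long end)
{
	unsigned long example_count = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, start, end, &example_count_ops, &example_count);
	mmap_read_unlock(mm);
	return example_count;
}
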
/**
 * walk_page_range_novma - walk a range of pagetables not backed by a vma
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 *
 * Note: Be careful when walking the kernel page tables. The caller may need
 * to take other effective measures (the mmap lock may be insufficient) to
 * prevent the intermediate kernel page tables belonging to the specified
 * address range from being freed (e.g. by memory hot-remove).
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	/*
	 * 1) For walking the user virtual address space:
	 *
	 * The mmap lock protects the page walker from changes to the page
	 * tables during the walk. However a read lock is insufficient to
	 * protect those areas which don't have a VMA as munmap() detaches
	 * the VMAs before downgrading to a read lock and actually tearing
	 * down PTEs/page tables. In that case, the mmap write lock should
	 * be held.
	 *
	 * 2) For walking the kernel virtual address space:
	 *
	 * The kernel intermediate page tables are usually not freed, so
	 * the mmap read lock is sufficient. But there are exceptions,
	 * e.g. memory hot-remove, in which case the mmap lock is insufficient
	 * to prevent the intermediate kernel page tables belonging to the
	 * specified address range from being freed. The caller should take
	 * other actions to prevent this race.
	 */
	if (mm == &init_mm)
		mmap_assert_locked(walk.mm);
	else
		mmap_assert_write_locked(walk.mm);

	return walk_pgd_range(start, end, &walk);
}

int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, const struct mm_walk_ops *ops,
			void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};

	if (start >= end || !walk.mm)
		return -EINVAL;
	if (start < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);
	process_vma_walk_lock(vma, ops->walk_lock);
	return __walk_page_range(start, end, &walk);
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};

	if (!walk.mm)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);
	process_vma_walk_lock(vma, ops->walk_lock);
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

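/*
 * Example (illustrative sketch): walking a range of kernel page tables with
 * walk_page_range_novma(). For init_mm only the mmap_lock held for read is
 * asserted; as the comments above stress, protecting the kernel page tables
 * from concurrent freeing (e.g. memory hot-remove) remains the caller's
 * responsibility. The example_ name is hypothetical.
 */
static int __maybe_unused example_walk_kernel_range(unsigned long start,
						    unsigned long end,
						    const struct mm_walk_ops *ops,
						    void *private)
{
	int err;

	mmap_read_lock(&init_mm);
	err = walk_page_range_novma(&init_mm, start, end, ops, NULL, private);
	mmap_read_unlock(&init_mm);
	return err;
}
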
/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}

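/*
 * Example (illustrative sketch): calling walk_page_mapping() under the lock
 * its kernel-doc above requires. i_mmap_rwsem is taken for read here on the
 * assumption that the supplied ops only read the mappings; mmap_lock is
 * deliberately not taken. The example_ name is hypothetical.
 */
static int __maybe_unused example_walk_mapping(struct address_space *mapping,
					       pgoff_t first_index, pgoff_t nr,
					       const struct mm_walk_ops *ops,
					       void *private)
{
	int err;

	i_mmap_lock_read(mapping);
	err = walk_page_mapping(mapping, first_index, nr, ops, private);
	i_mmap_unlock_read(mapping);
	return err;
}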