// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which besides the obvious, schedules all roots
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
	 * ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	kvm_tdp_mmu_zap_invalidated_roots(kvm);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	/*
	 * The TDP MMU itself holds a reference to each root until the root is
	 * explicitly invalidated, i.e. the final reference should never be
	 * put for a valid root.
	 */
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool only_valid)
{
	struct kvm_mmu_page *next_root;

	/*
	 * While the roots themselves are RCU-protected, fields such as
	 * role.invalid are protected by mmu_lock.
	 */
	lockdep_assert_held(&kvm->mmu_lock);

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield in the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
	for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);	\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
	     _root = tdp_mmu_next_root(_kvm, _root, _only_valid))	\
		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = tdp_mmu_next_root(_kvm, NULL, false);		\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
	     _root = tdp_mmu_next_root(_kvm, _root, false))

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid)	\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
		     ((_only_valid) && (_root)->role.invalid))) {	\
		} else

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root(_kvm, _root, _as_id, false)

#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
	__for_each_tdp_mmu_root(_kvm, _root, _as_id, true)

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	union kvm_mmu_page_role role = mmu->root_role;
	int as_id = kvm_mmu_role_as_id(role);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	/*
	 * Check for an existing root before acquiring the pages lock to avoid
	 * unnecessary serialization if multiple vCPUs are loading a new root.
	 * E.g. when bringing up secondary vCPUs, KVM will already have created
	 * a valid root on behalf of the primary vCPU.
	 */
	read_lock(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
		if (root->role.word == role.word)
			goto out_read_unlock;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * Recheck for an existing root after acquiring the pages lock, another
	 * vCPU may have raced ahead and created a new usable root.  Manually
	 * walk the list of roots as the standard iterators require the write
	 * lock, which is *not* held here.  WARN if grabbing a reference to a
	 * usable root fails, as the last reference to a root can only be put
	 * once the root has been invalidated, which requires holding mmu_lock
	 * for write.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (root->role.word == role.word &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			goto out_spin_unlock;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

out_spin_unlock:
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
	read_unlock(&kvm->mmu_lock);
	/*
	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
	 * and actually consuming the root if it's invalidated after dropping
	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
	 */
	mmu->root.hpa = __pa(root->spt);
	mmu->root.pgd = 0;
	return 0;
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as frozen then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the frozen SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
				if (!is_frozen_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
382 */ 383 old_spte = kvm_tdp_mmu 384 if (!is_shadow_present 385 continue; 386 387 /* 388 * Use the common help 389 * the SPTE needs to b 390 * modified by a diffe 391 * Even though the par 392 * hasn't yet been flu 393 * document that A/D a 394 * entries that are ca 395 * still access the pa 396 * 397 * No retry is needed 398 * sole concern is dro 399 * task can zap/remove 400 * write. Marking the 401 * strictly necessary 402 * the frozen SPTE val 403 * paths consistent an 404 * call below to hardc 405 * 406 * Note, even though d 407 * scenario where a no 408 * functional bug, sim 409 * sufficient as a fas 410 * level SPTE before i 411 * target SPTE writabl 412 * Dirty bit between r 413 * it here. 414 */ 415 old_spte = kvm_tdp_mmu 416 417 } 418 handle_changed_spte(kvm, kvm_m 419 old_spte, 420 } 421 422 call_rcu(&sp->rcu_head, tdp_mmu_free_s 423 } 424 425 /** 426 * handle_changed_spte - handle bookkeeping as 427 * @kvm: kvm instance 428 * @as_id: the address space of the paging str 429 * @gfn: the base GFN that was mapped by the S 430 * @old_spte: The value of the SPTE before the 431 * @new_spte: The value of the SPTE after the 432 * @level: the level of the PT the SPTE is par 433 * @shared: This operation may not be running 434 * the MMU lock and the operation mus 435 * threads that might be modifying SP 436 * 437 * Handle bookkeeping that might result from t 438 * dirty logging updates are handled in common 439 * and fast_pf_fix_direct_spte()). 440 */ 441 static void handle_changed_spte(struct kvm *kv 442 u64 old_spte, 443 bool shared) 444 { 445 bool was_present = is_shadow_present_p 446 bool is_present = is_shadow_present_pt 447 bool was_leaf = was_present && is_last 448 bool is_leaf = is_present && is_last_s 449 bool pfn_changed = spte_to_pfn(old_spt 450 451 WARN_ON_ONCE(level > PT64_ROOT_MAX_LEV 452 WARN_ON_ONCE(level < PG_LEVEL_4K); 453 WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAG 454 455 /* 456 * If this warning were to trigger it 457 * missing MMU notifier or a race with 458 * A present, leaf SPTE should never b 459 * present leaf SPTE pointing to a dif 460 * should be zapping the SPTE before t 461 * changed, or the SPTE should be zero 462 * thread before replacement. 463 */ 464 if (was_leaf && is_leaf && pfn_changed 465 pr_err("Invalid SPTE change: c 466 "SPTE with another pres 467 "different PFN!\n" 468 "as_id: %d gfn: %llx ol 469 as_id, gfn, old_spte, n 470 471 /* 472 * Crash the host to prevent e 473 * corruption. 474 */ 475 BUG(); 476 } 477 478 if (old_spte == new_spte) 479 return; 480 481 trace_kvm_tdp_mmu_spte_changed(as_id, 482 483 if (is_leaf) 484 check_spte_writable_invariants 485 486 /* 487 * The only times a SPTE should be cha 488 * non-present state is when an MMIO e 489 * removed. In that case, there is not 490 */ 491 if (!was_present && !is_present) { 492 /* 493 * If this change does not inv 494 * it is unexpected. Log the c 495 * impact the guest since both 496 * are nonpresent. 497 */ 498 if (WARN_ON_ONCE(!is_mmio_spte 499 !is_mmio_spte 500 !is_frozen_sp 501 pr_err("Unexpected SPT 502 "should not be 503 "different nonp 504 "are MMIO SPTEs 505 "a temporary fr 506 "as_id: %d gfn: 507 as_id, gfn, old 508 return; 509 } 510 511 if (is_leaf != was_leaf) 512 kvm_update_page_stats(kvm, lev 513 514 if (was_leaf && is_dirty_spte(old_spte 515 (!is_present || !is_dirty_spte(new 516 kvm_set_pfn_dirty(spte_to_pfn( 517 518 /* 519 * Recursively handle child PTs if the 520 * the paging structure. 
Note the WAR 521 * SPTE being converted to a hugepage 522 * pages are kernel allocations and sh 523 */ 524 if (was_present && !was_leaf && 525 (is_leaf || !is_present || WARN_ON 526 handle_removed_pt(kvm, spte_to 527 528 if (was_leaf && is_accessed_spte(old_s 529 (!is_present || !is_accessed_spte( 530 kvm_set_pfn_accessed(spte_to_p 531 } 532 533 static inline int __must_check __tdp_mmu_set_s 534 535 { 536 u64 *sptep = rcu_dereference(iter->spt 537 538 /* 539 * The caller is responsible for ensur 540 * SPTE. KVM should never attempt to 541 * and pre-checking before inserting a 542 * avoids unnecessary work. 543 */ 544 WARN_ON_ONCE(iter->yielded || is_froze 545 546 /* 547 * Note, fast_pf_fix_direct_spte() can 548 * does not hold the mmu_lock. On fai 549 * CPU modified the SPTE, try_cmpxchg6 550 * the current value, so the caller op 551 * retries tdp_mmu_set_spte_atomic() 552 */ 553 if (!try_cmpxchg64(sptep, &iter->old_s 554 return -EBUSY; 555 556 return 0; 557 } 558 559 /* 560 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPT 561 * and handle the associated bookkeeping. Do 562 * in KVM's dirty bitmaps. 563 * 564 * If setting the SPTE fails because it has ch 565 * refreshed to the current value of the spte. 566 * 567 * @kvm: kvm instance 568 * @iter: a tdp_iter instance currently on the 569 * @new_spte: The value the SPTE should be set 570 * Return: 571 * * 0 - If the SPTE was set. 572 * * -EBUSY - If the SPTE cannot be set. In th 573 * no side-effects other than setti 574 * known value of the spte. 575 */ 576 static inline int __must_check tdp_mmu_set_spt 577 578 579 { 580 int ret; 581 582 lockdep_assert_held_read(&kvm->mmu_loc 583 584 ret = __tdp_mmu_set_spte_atomic(iter, 585 if (ret) 586 return ret; 587 588 handle_changed_spte(kvm, iter->as_id, 589 new_spte, iter->le 590 591 return 0; 592 } 593 594 static inline int __must_check tdp_mmu_zap_spt 595 596 { 597 int ret; 598 599 lockdep_assert_held_read(&kvm->mmu_loc 600 601 /* 602 * Freeze the SPTE by setting it to a 603 * will stop other threads from immedi 604 * in its place before the TLBs are fl 605 * 606 * Delay processing of the zapped SPTE 607 * the FROZEN_SPTE is replaced (see be 608 */ 609 ret = __tdp_mmu_set_spte_atomic(iter, 610 if (ret) 611 return ret; 612 613 kvm_flush_remote_tlbs_gfn(kvm, iter->g 614 615 /* 616 * No other thread can overwrite the f 617 * wait on the MMU lock or use tdp_mmu 618 * overwrite the special frozen SPTE v 619 * avoid an unnecessary check on volat 620 */ 621 __kvm_tdp_mmu_write_spte(iter->sptep, 622 623 /* 624 * Process the zapped SPTE after flush 625 * FROZEN_SPTE with 0. This minimizes 626 * blocked by the FROZEN_SPTE and redu 627 * SPTEs. 628 */ 629 handle_changed_spte(kvm, iter->as_id, 630 SHADOW_NONPRESENT_ 631 632 return 0; 633 } 634 635 636 /* 637 * tdp_mmu_set_spte - Set a TDP MMU SPTE and h 638 * @kvm: KVM instance 639 * @as_id: Address space ID, i.e. r 640 * @sptep: Pointer to the SPTE 641 * @old_spte: The current value of the 642 * @new_spte: The new value that will 643 * @gfn: The base GFN that was (o 644 * @level: The level _containing_ t 645 * 646 * Returns the old SPTE value, which _may_ be 647 * SPTE had voldatile bits. 648 */ 649 static u64 tdp_mmu_set_spte(struct kvm *kvm, i 650 u64 old_spte, u64 651 { 652 lockdep_assert_held_write(&kvm->mmu_lo 653 654 /* 655 * No thread should be using this func 656 * temporary frozen SPTE value. 657 * If operating under the MMU lock in 658 * should be used. 
If operating under 659 * use of the frozen SPTE should not b 660 */ 661 WARN_ON_ONCE(is_frozen_spte(old_spte) 662 663 old_spte = kvm_tdp_mmu_write_spte(spte 664 665 handle_changed_spte(kvm, as_id, gfn, o 666 return old_spte; 667 } 668 669 static inline void tdp_mmu_iter_set_spte(struc 670 u64 n 671 { 672 WARN_ON_ONCE(iter->yielded); 673 iter->old_spte = tdp_mmu_set_spte(kvm, 674 iter 675 iter 676 } 677 678 #define tdp_root_for_each_pte(_iter, _root, _s 679 for_each_tdp_pte(_iter, _root, _start, 680 681 #define tdp_root_for_each_leaf_pte(_iter, _roo 682 tdp_root_for_each_pte(_iter, _root, _s 683 if (!is_shadow_present_pte(_it 684 !is_last_spte(_iter.old_sp 685 continue; 686 else 687 688 #define tdp_mmu_for_each_pte(_iter, _mmu, _sta 689 for_each_tdp_pte(_iter, root_to_sp(_mm 690 691 /* 692 * Yield if the MMU lock is contended or this 693 * to the scheduler. 694 * 695 * If this function should yield and flush is 696 * TLB flush before yielding. 697 * 698 * If this function yields, iter->yielded is s 699 * the next iteration, where tdp_iter_next() w 700 * over the paging structures to allow the ite 701 * from the paging structure root. 702 * 703 * Returns true if this function yielded. 704 */ 705 static inline bool __must_check tdp_mmu_iter_c 706 707 708 { 709 WARN_ON_ONCE(iter->yielded); 710 711 /* Ensure forward progress has been ma 712 if (iter->next_last_level_gfn == iter- 713 return false; 714 715 if (need_resched() || rwlock_needbreak 716 if (flush) 717 kvm_flush_remote_tlbs( 718 719 rcu_read_unlock(); 720 721 if (shared) 722 cond_resched_rwlock_re 723 else 724 cond_resched_rwlock_wr 725 726 rcu_read_lock(); 727 728 WARN_ON_ONCE(iter->gfn > iter- 729 730 iter->yielded = true; 731 } 732 733 return iter->yielded; 734 } 735 736 static inline gfn_t tdp_mmu_max_gfn_exclusive( 737 { 738 /* 739 * Bound TDP MMU walks at host.MAXPHYA 740 * a gpa range that would exceed the m 741 * MMIO SPTEs for "impossible" gfns, i 742 * the slow emulation path every time. 743 */ 744 return kvm_mmu_max_gfn() + 1; 745 } 746 747 static void __tdp_mmu_zap_root(struct kvm *kvm 748 bool shared, in 749 { 750 struct tdp_iter iter; 751 752 gfn_t end = tdp_mmu_max_gfn_exclusive( 753 gfn_t start = 0; 754 755 for_each_tdp_pte_min_level(iter, root, 756 retry: 757 if (tdp_mmu_iter_cond_resched( 758 continue; 759 760 if (!is_shadow_present_pte(ite 761 continue; 762 763 if (iter.level > zap_level) 764 continue; 765 766 if (!shared) 767 tdp_mmu_iter_set_spte( 768 else if (tdp_mmu_set_spte_atom 769 goto retry; 770 } 771 } 772 773 static void tdp_mmu_zap_root(struct kvm *kvm, 774 bool shared) 775 { 776 777 /* 778 * The root must have an elevated refc 779 * mmu_notifier callbacks, which allow 780 * mmu_lock. When handling an unmap/r 781 * must drop all references to relevan 782 * callback. 
Dropping mmu_lock with a 783 * in zapping SPTEs after a relevant m 784 * and lead to use-after-free as zappi 785 * dirty accessed bits to the SPTE's a 786 */ 787 WARN_ON_ONCE(!refcount_read(&root->tdp 788 789 kvm_lockdep_assert_mmu_lock_held(kvm, 790 791 rcu_read_lock(); 792 793 /* 794 * Zap roots in multiple passes of dec 795 * 4KiB=>2MiB=>1GiB=>root, in order to 796 * preempt models) or mmu_lock content 797 * Zapping at finer granularity margin 798 * the zap, but in most cases the zap 799 * 800 * If KVM is configured to prove the M 801 * in order to mimic the page fault pa 802 * table with an equivalent 1GiB hugep 803 * zapping a 1GiB region that's fully 804 * allows verifying that KVM can safel 805 * inducing RCU stalls, without relyin 806 * (zapping roots is orders of magnitu 807 * zapping a SP recurses on its childr 808 * in the iterator itself is unnecessa 809 */ 810 if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) 811 __tdp_mmu_zap_root(kvm, root, 812 __tdp_mmu_zap_root(kvm, root, 813 } 814 __tdp_mmu_zap_root(kvm, root, shared, 815 __tdp_mmu_zap_root(kvm, root, shared, 816 817 rcu_read_unlock(); 818 } 819 820 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struc 821 { 822 u64 old_spte; 823 824 /* 825 * This helper intentionally doesn't a 826 * which doesn't have a parent page ta 827 */ 828 if (WARN_ON_ONCE(!sp->ptep)) 829 return false; 830 831 old_spte = kvm_tdp_mmu_read_spte(sp->p 832 if (WARN_ON_ONCE(!is_shadow_present_pt 833 return false; 834 835 tdp_mmu_set_spte(kvm, kvm_mmu_page_as_ 836 SHADOW_NONPRESENT_VAL 837 838 return true; 839 } 840 841 /* 842 * If can_yield is true, will release the MMU 843 * scheduler needs the CPU or there is content 844 * function cannot yield, it will not release 845 * the caller must ensure it does not supply t 846 * operation can cause a soft lockup. 847 */ 848 static bool tdp_mmu_zap_leafs(struct kvm *kvm, 849 gfn_t start, gfn 850 { 851 struct tdp_iter iter; 852 853 end = min(end, tdp_mmu_max_gfn_exclusi 854 855 lockdep_assert_held_write(&kvm->mmu_lo 856 857 rcu_read_lock(); 858 859 for_each_tdp_pte_min_level(iter, root, 860 if (can_yield && 861 tdp_mmu_iter_cond_resched( 862 flush = false; 863 continue; 864 } 865 866 if (!is_shadow_present_pte(ite 867 !is_last_spte(iter.old_spt 868 continue; 869 870 tdp_mmu_iter_set_spte(kvm, &it 871 872 /* 873 * Zappings SPTEs in invalid r 874 * see kvm_tdp_mmu_zap_invalid 875 */ 876 if (!root->role.invalid) 877 flush = true; 878 } 879 880 rcu_read_unlock(); 881 882 /* 883 * Because this flow zaps _only_ leaf 884 * to provide RCU protection as no 'st 885 */ 886 return flush; 887 } 888 889 /* 890 * Zap leaf SPTEs for the range of gfns, [star 891 * Returns true if a TLB flush is needed befor 892 * one or more SPTEs were zapped since the MMU 893 */ 894 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gf 895 { 896 struct kvm_mmu_page *root; 897 898 lockdep_assert_held_write(&kvm->mmu_lo 899 for_each_valid_tdp_mmu_root_yield_safe 900 flush = tdp_mmu_zap_leafs(kvm, 901 902 return flush; 903 } 904 905 void kvm_tdp_mmu_zap_all(struct kvm *kvm) 906 { 907 struct kvm_mmu_page *root; 908 909 /* 910 * Zap all roots, including invalid ro 911 * before returning to the caller. Za 912 * also being zapped by a worker. Wal 913 * all that expensive and mmu_lock is 914 * worker has yielded, i.e. flushing t 915 * isn't guaranteed to be any faster. 916 * 917 * A TLB flush is unnecessary, KVM zap 918 * is being destroyed or the userspace 919 * KVM_RUN is unreachable, i.e. 
no vCP 920 */ 921 lockdep_assert_held_write(&kvm->mmu_lo 922 for_each_tdp_mmu_root_yield_safe(kvm, 923 tdp_mmu_zap_root(kvm, root, fa 924 } 925 926 /* 927 * Zap all invalidated roots to ensure all SPT 928 * zap" completes. 929 */ 930 void kvm_tdp_mmu_zap_invalidated_roots(struct 931 { 932 struct kvm_mmu_page *root; 933 934 read_lock(&kvm->mmu_lock); 935 936 for_each_tdp_mmu_root_yield_safe(kvm, 937 if (!root->tdp_mmu_scheduled_r 938 continue; 939 940 root->tdp_mmu_scheduled_root_t 941 KVM_BUG_ON(!root->role.invalid 942 943 /* 944 * A TLB flush is not necessar 945 * flush when allocating a new 946 * when migrating a vCPU to a 947 * TLB flush on reuse also inv 948 * entries, i.e. TLB entries f 949 * that may be zapped, as such 950 * ASID on both VMX and SVM. 951 */ 952 tdp_mmu_zap_root(kvm, root, tr 953 954 /* 955 * The referenced needs to be 956 * the root must be reachable 957 * zapped 958 */ 959 kvm_tdp_mmu_put_root(kvm, root 960 } 961 962 read_unlock(&kvm->mmu_lock); 963 } 964 965 /* 966 * Mark each TDP MMU root as invalid to preven 967 * is about to be zapped, e.g. in response to 968 * zapping is done separately so that it happe 969 * whereas invalidating roots must be done wit 970 * the VM is being destroyed). 971 * 972 * Note, kvm_tdp_mmu_zap_invalidated_roots() i 973 * See kvm_tdp_mmu_alloc_root(). 974 */ 975 void kvm_tdp_mmu_invalidate_all_roots(struct k 976 { 977 struct kvm_mmu_page *root; 978 979 /* 980 * mmu_lock must be held for write to 981 * invalid while there are active read 982 * there are active readers may or may 983 * but it's uncharted territory and no 984 * 985 * Waive the assertion if there are no 986 * being destroyed after all reference 987 * have been created (which means ther 988 * being destroyed in an error path of 989 */ 990 if (IS_ENABLED(CONFIG_PROVE_LOCKING) & 991 refcount_read(&kvm->users_count) & 992 lockdep_assert_held_write(&kvm 993 994 /* 995 * As above, mmu_lock isn't held when 996 * be other references to @kvm, i.e. n 997 * or get/put references to roots. 998 */ 999 list_for_each_entry(root, &kvm->arch.t 1000 /* 1001 * Note, invalid roots can ou 1002 * roots must be *zapped* bef 1003 * but a different task can a 1004 * root alive after its been 1005 */ 1006 if (!root->role.invalid) { 1007 root->tdp_mmu_schedul 1008 root->role.invalid = 1009 } 1010 } 1011 } 1012 1013 /* 1014 * Installs a last-level SPTE to handle a TDP 1015 * (NPT/EPT violation/misconfiguration) 1016 */ 1017 static int tdp_mmu_map_handle_target_level(st 1018 str 1019 str 1020 { 1021 struct kvm_mmu_page *sp = sptep_to_sp 1022 u64 new_spte; 1023 int ret = RET_PF_FIXED; 1024 bool wrprot = false; 1025 1026 if (WARN_ON_ONCE(sp->role.level != fa 1027 return RET_PF_RETRY; 1028 1029 if (unlikely(!fault->slot)) 1030 new_spte = make_mmio_spte(vcp 1031 else 1032 wrprot = make_spte(vcpu, sp, 1033 faul 1034 faul 1035 1036 if (new_spte == iter->old_spte) 1037 ret = RET_PF_SPURIOUS; 1038 else if (tdp_mmu_set_spte_atomic(vcpu 1039 return RET_PF_RETRY; 1040 else if (is_shadow_present_pte(iter-> 1041 !is_last_spte(iter->old_spte 1042 kvm_flush_remote_tlbs_gfn(vcp 1043 1044 /* 1045 * If the page fault was caused by a 1046 * protected, emulation is needed. 
If 1047 * the vCPU would have the same fault 1048 */ 1049 if (wrprot && fault->write) 1050 ret = RET_PF_WRITE_PROTECTED; 1051 1052 /* If a MMIO SPTE is installed, the M 1053 if (unlikely(is_mmio_spte(vcpu->kvm, 1054 vcpu->stat.pf_mmio_spte_creat 1055 trace_mark_mmio_spte(rcu_dere 1056 new_spte 1057 ret = RET_PF_EMULATE; 1058 } else { 1059 trace_kvm_mmu_set_spte(iter-> 1060 rcu_de 1061 } 1062 1063 return ret; 1064 } 1065 1066 /* 1067 * tdp_mmu_link_sp - Replace the given spte w 1068 * provided page table. 1069 * 1070 * @kvm: kvm instance 1071 * @iter: a tdp_iter instance currently on th 1072 * @sp: The new TDP page table to install. 1073 * @shared: This operation is running under t 1074 * 1075 * Returns: 0 if the new page table was insta 1076 * could not be installed (e.g. the 1077 */ 1078 static int tdp_mmu_link_sp(struct kvm *kvm, s 1079 struct kvm_mmu_pag 1080 { 1081 u64 spte = make_nonleaf_spte(sp->spt, 1082 int ret = 0; 1083 1084 if (shared) { 1085 ret = tdp_mmu_set_spte_atomic 1086 if (ret) 1087 return ret; 1088 } else { 1089 tdp_mmu_iter_set_spte(kvm, it 1090 } 1091 1092 tdp_account_mmu_page(kvm, sp); 1093 1094 return 0; 1095 } 1096 1097 static int tdp_mmu_split_huge_page(struct kvm 1098 struct kvm 1099 1100 /* 1101 * Handle a TDP page fault (NPT/EPT violation 1102 * page tables and SPTEs to translate the fau 1103 */ 1104 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, st 1105 { 1106 struct kvm_mmu *mmu = vcpu->arch.mmu; 1107 struct kvm *kvm = vcpu->kvm; 1108 struct tdp_iter iter; 1109 struct kvm_mmu_page *sp; 1110 int ret = RET_PF_RETRY; 1111 1112 kvm_mmu_hugepage_adjust(vcpu, fault); 1113 1114 trace_kvm_mmu_spte_requested(fault); 1115 1116 rcu_read_lock(); 1117 1118 tdp_mmu_for_each_pte(iter, mmu, fault 1119 int r; 1120 1121 if (fault->nx_huge_page_worka 1122 disallowed_hugepage_a 1123 1124 /* 1125 * If SPTE has been frozen by 1126 * retry, avoiding unnecessar 1127 */ 1128 if (is_frozen_spte(iter.old_s 1129 goto retry; 1130 1131 if (iter.level == fault->goal 1132 goto map_target_level 1133 1134 /* Step down into the lower l 1135 if (is_shadow_present_pte(ite 1136 !is_large_pte(iter.old_sp 1137 continue; 1138 1139 /* 1140 * The SPTE is either non-pre 1141 * needs to be split. 1142 */ 1143 sp = tdp_mmu_alloc_sp(vcpu); 1144 tdp_mmu_init_child_sp(sp, &it 1145 1146 sp->nx_huge_page_disallowed = 1147 1148 if (is_shadow_present_pte(ite 1149 r = tdp_mmu_split_hug 1150 else 1151 r = tdp_mmu_link_sp(k 1152 1153 /* 1154 * Force the guest to retry i 1155 * failed, e.g. because a dif 1156 */ 1157 if (r) { 1158 tdp_mmu_free_sp(sp); 1159 goto retry; 1160 } 1161 1162 if (fault->huge_page_disallow 1163 fault->req_level >= iter. 1164 spin_lock(&kvm->arch. 
1165 if (sp->nx_huge_page_ 1166 track_possibl 1167 spin_unlock(&kvm->arc 1168 } 1169 } 1170 1171 /* 1172 * The walk aborted before reaching t 1173 * iterator detected an upper level S 1174 */ 1175 WARN_ON_ONCE(iter.level == fault->goa 1176 goto retry; 1177 1178 map_target_level: 1179 ret = tdp_mmu_map_handle_target_level 1180 1181 retry: 1182 rcu_read_unlock(); 1183 return ret; 1184 } 1185 1186 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm * 1187 bool flush) 1188 { 1189 struct kvm_mmu_page *root; 1190 1191 __for_each_tdp_mmu_root_yield_safe(kv 1192 flush = tdp_mmu_zap_leafs(kvm 1193 ran 1194 1195 return flush; 1196 } 1197 1198 typedef bool (*tdp_handler_t)(struct kvm *kvm 1199 struct kvm_gfn_ 1200 1201 static __always_inline bool kvm_tdp_mmu_handl 1202 1203 1204 { 1205 struct kvm_mmu_page *root; 1206 struct tdp_iter iter; 1207 bool ret = false; 1208 1209 /* 1210 * Don't support rescheduling, none o 1211 * into this helper allow blocking; i 1212 */ 1213 for_each_tdp_mmu_root(kvm, root, rang 1214 rcu_read_lock(); 1215 1216 tdp_root_for_each_leaf_pte(it 1217 ret |= handler(kvm, & 1218 1219 rcu_read_unlock(); 1220 } 1221 1222 return ret; 1223 } 1224 1225 /* 1226 * Mark the SPTEs range of GFNs [start, end) 1227 * if any of the GFNs in the range have been 1228 * 1229 * No need to mark the corresponding PFN as a 1230 * from the clear_young() or clear_flush_youn 1231 * return value to determine if the page has 1232 */ 1233 static bool age_gfn_range(struct kvm *kvm, st 1234 struct kvm_gfn_rang 1235 { 1236 u64 new_spte; 1237 1238 /* If we have a non-accessed entry we 1239 if (!is_accessed_spte(iter->old_spte) 1240 return false; 1241 1242 if (spte_ad_enabled(iter->old_spte)) 1243 iter->old_spte = tdp_mmu_clea 1244 1245 1246 1247 new_spte = iter->old_spte & ~ 1248 } else { 1249 /* 1250 * Capture the dirty status o 1251 * lost when the SPTE is mark 1252 */ 1253 if (is_writable_pte(iter->old 1254 kvm_set_pfn_dirty(spt 1255 1256 new_spte = mark_spte_for_acce 1257 iter->old_spte = kvm_tdp_mmu_ 1258 1259 1260 } 1261 1262 trace_kvm_tdp_mmu_spte_changed(iter-> 1263 iter-> 1264 return true; 1265 } 1266 1267 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kv 1268 { 1269 return kvm_tdp_mmu_handle_gfn(kvm, ra 1270 } 1271 1272 static bool test_age_gfn(struct kvm *kvm, str 1273 struct kvm_gfn_range 1274 { 1275 return is_accessed_spte(iter->old_spt 1276 } 1277 1278 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm 1279 { 1280 return kvm_tdp_mmu_handle_gfn(kvm, ra 1281 } 1282 1283 /* 1284 * Remove write access from all SPTEs at or a 1285 * [start, end). Returns true if an SPTE has 1286 * be flushed. 1287 */ 1288 static bool wrprot_gfn_range(struct kvm *kvm, 1289 gfn_t start, gfn 1290 { 1291 struct tdp_iter iter; 1292 u64 new_spte; 1293 bool spte_set = false; 1294 1295 rcu_read_lock(); 1296 1297 BUG_ON(min_level > KVM_MAX_HUGEPAGE_L 1298 1299 for_each_tdp_pte_min_level(iter, root 1300 retry: 1301 if (tdp_mmu_iter_cond_resched 1302 continue; 1303 1304 if (!is_shadow_present_pte(it 1305 !is_last_spte(iter.old_sp 1306 !(iter.old_spte & PT_WRIT 1307 continue; 1308 1309 new_spte = iter.old_spte & ~P 1310 1311 if (tdp_mmu_set_spte_atomic(k 1312 goto retry; 1313 1314 spte_set = true; 1315 } 1316 1317 rcu_read_unlock(); 1318 return spte_set; 1319 } 1320 1321 /* 1322 * Remove write access from all the SPTEs map 1323 * only affect leaf SPTEs down to min_level. 
1324 * Returns true if an SPTE has been changed a 1325 */ 1326 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, 1327 const struct kvm 1328 { 1329 struct kvm_mmu_page *root; 1330 bool spte_set = false; 1331 1332 lockdep_assert_held_read(&kvm->mmu_lo 1333 1334 for_each_valid_tdp_mmu_root_yield_saf 1335 spte_set |= wrprot_gfn_range( 1336 slot->base_gfn + 1337 1338 return spte_set; 1339 } 1340 1341 static struct kvm_mmu_page *tdp_mmu_alloc_sp_ 1342 { 1343 struct kvm_mmu_page *sp; 1344 1345 sp = kmem_cache_zalloc(mmu_page_heade 1346 if (!sp) 1347 return NULL; 1348 1349 sp->spt = (void *)get_zeroed_page(GFP 1350 if (!sp->spt) { 1351 kmem_cache_free(mmu_page_head 1352 return NULL; 1353 } 1354 1355 return sp; 1356 } 1357 1358 /* Note, the caller is responsible for initia 1359 static int tdp_mmu_split_huge_page(struct kvm 1360 struct kvm 1361 { 1362 const u64 huge_spte = iter->old_spte; 1363 const int level = iter->level; 1364 int ret, i; 1365 1366 /* 1367 * No need for atomics when writing t 1368 * not been linked in yet and thus is 1369 */ 1370 for (i = 0; i < SPTE_ENT_PER_PAGE; i+ 1371 sp->spt[i] = make_huge_page_s 1372 1373 /* 1374 * Replace the huge spte with a point 1375 * page table. Since we are making th 1376 * will see a mix of the split mappin 1377 * depending on what's currently in t 1378 * correctness standpoint since the t 1379 * way. 1380 */ 1381 ret = tdp_mmu_link_sp(kvm, iter, sp, 1382 if (ret) 1383 goto out; 1384 1385 /* 1386 * tdp_mmu_link_sp_atomic() will hand 1387 * are overwriting from the page stat 1388 * the page stats with the new presen 1389 */ 1390 kvm_update_page_stats(kvm, level - 1, 1391 1392 out: 1393 trace_kvm_mmu_split_huge_page(iter->g 1394 return ret; 1395 } 1396 1397 static int tdp_mmu_split_huge_pages_root(stru 1398 stru 1399 gfn_ 1400 int 1401 { 1402 struct kvm_mmu_page *sp = NULL; 1403 struct tdp_iter iter; 1404 1405 rcu_read_lock(); 1406 1407 /* 1408 * Traverse the page table splitting 1409 * level into one lower level. For ex 1410 * we split it into 512 2MB pages. 1411 * 1412 * Since the TDP iterator uses a pre- 1413 * to visit an SPTE before ever visit 1414 * will correctly recursively split h 1415 * level above the target level (e.g. 
1416 * and then splitting each of those t 1417 */ 1418 for_each_tdp_pte_min_level(iter, root 1419 retry: 1420 if (tdp_mmu_iter_cond_resched 1421 continue; 1422 1423 if (!is_shadow_present_pte(it 1424 continue; 1425 1426 if (!sp) { 1427 rcu_read_unlock(); 1428 1429 if (shared) 1430 read_unlock(& 1431 else 1432 write_unlock( 1433 1434 sp = tdp_mmu_alloc_sp 1435 1436 if (shared) 1437 read_lock(&kv 1438 else 1439 write_lock(&k 1440 1441 if (!sp) { 1442 trace_kvm_mmu 1443 1444 1445 return -ENOME 1446 } 1447 1448 rcu_read_lock(); 1449 1450 iter.yielded = true; 1451 continue; 1452 } 1453 1454 tdp_mmu_init_child_sp(sp, &it 1455 1456 if (tdp_mmu_split_huge_page(k 1457 goto retry; 1458 1459 sp = NULL; 1460 } 1461 1462 rcu_read_unlock(); 1463 1464 /* 1465 * It's possible to exit the loop hav 1466 * example, a vCPU doing HugePage NX 1467 * installs its own sp in place of th 1468 */ 1469 if (sp) 1470 tdp_mmu_free_sp(sp); 1471 1472 return 0; 1473 } 1474 1475 1476 /* 1477 * Try to split all huge pages mapped by the 1478 */ 1479 void kvm_tdp_mmu_try_split_huge_pages(struct 1480 const s 1481 gfn_t s 1482 int tar 1483 { 1484 struct kvm_mmu_page *root; 1485 int r = 0; 1486 1487 kvm_lockdep_assert_mmu_lock_held(kvm, 1488 for_each_valid_tdp_mmu_root_yield_saf 1489 r = tdp_mmu_split_huge_pages_ 1490 if (r) { 1491 kvm_tdp_mmu_put_root( 1492 break; 1493 } 1494 } 1495 } 1496 1497 static bool tdp_mmu_need_write_protect(struct 1498 { 1499 /* 1500 * All TDP MMU shadow pages share the 1501 * from level, so it is valid to key 1502 * write protection is needed for an 1503 */ 1504 return kvm_mmu_page_ad_need_write_pro 1505 } 1506 1507 static bool clear_dirty_gfn_range(struct kvm 1508 gfn_t start, gfn_t 1509 { 1510 const u64 dbit = tdp_mmu_need_write_p 1511 1512 struct tdp_iter iter; 1513 bool spte_set = false; 1514 1515 rcu_read_lock(); 1516 1517 tdp_root_for_each_pte(iter, root, sta 1518 retry: 1519 if (!is_shadow_present_pte(it 1520 !is_last_spte(iter.old_sp 1521 continue; 1522 1523 if (tdp_mmu_iter_cond_resched 1524 continue; 1525 1526 KVM_MMU_WARN_ON(dbit == shado 1527 spte_ad_need_ 1528 1529 if (!(iter.old_spte & dbit)) 1530 continue; 1531 1532 if (tdp_mmu_set_spte_atomic(k 1533 goto retry; 1534 1535 spte_set = true; 1536 } 1537 1538 rcu_read_unlock(); 1539 return spte_set; 1540 } 1541 1542 /* 1543 * Clear the dirty status (D-bit or W-bit) of 1544 * memslot. Returns true if an SPTE has been 1545 * flushed. 
1546 */ 1547 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm 1548 const struc 1549 { 1550 struct kvm_mmu_page *root; 1551 bool spte_set = false; 1552 1553 lockdep_assert_held_read(&kvm->mmu_lo 1554 for_each_valid_tdp_mmu_root_yield_saf 1555 spte_set |= clear_dirty_gfn_r 1556 slot->base_gf 1557 1558 return spte_set; 1559 } 1560 1561 static void clear_dirty_pt_masked(struct kvm 1562 gfn_t gfn, 1563 { 1564 const u64 dbit = (wrprot || tdp_mmu_n 1565 1566 struct tdp_iter iter; 1567 1568 lockdep_assert_held_write(&kvm->mmu_l 1569 1570 rcu_read_lock(); 1571 1572 tdp_root_for_each_leaf_pte(iter, root 1573 gfn + BIT 1574 if (!mask) 1575 break; 1576 1577 KVM_MMU_WARN_ON(dbit == shado 1578 spte_ad_need_ 1579 1580 if (iter.level > PG_LEVEL_4K 1581 !(mask & (1UL << (iter.gf 1582 continue; 1583 1584 mask &= ~(1UL << (iter.gfn - 1585 1586 if (!(iter.old_spte & dbit)) 1587 continue; 1588 1589 iter.old_spte = tdp_mmu_clear 1590 1591 1592 1593 trace_kvm_tdp_mmu_spte_change 1594 1595 1596 kvm_set_pfn_dirty(spte_to_pfn 1597 } 1598 1599 rcu_read_unlock(); 1600 } 1601 1602 /* 1603 * Clear the dirty status (D-bit or W-bit) of 1604 * which a bit is set in mask, starting at gf 1605 * contain all the GFNs represented by set bi 1606 */ 1607 void kvm_tdp_mmu_clear_dirty_pt_masked(struct 1608 struct 1609 gfn_t 1610 bool w 1611 { 1612 struct kvm_mmu_page *root; 1613 1614 for_each_valid_tdp_mmu_root(kvm, root 1615 clear_dirty_pt_masked(kvm, ro 1616 } 1617 1618 static void zap_collapsible_spte_range(struct 1619 struct 1620 const 1621 { 1622 gfn_t start = slot->base_gfn; 1623 gfn_t end = start + slot->npages; 1624 struct tdp_iter iter; 1625 int max_mapping_level; 1626 1627 rcu_read_lock(); 1628 1629 for_each_tdp_pte_min_level(iter, root 1630 retry: 1631 if (tdp_mmu_iter_cond_resched 1632 continue; 1633 1634 if (iter.level > KVM_MAX_HUGE 1635 !is_shadow_present_pte(it 1636 continue; 1637 1638 /* 1639 * Don't zap leaf SPTEs, if a 1640 * a large page size, then it 1641 * instead of stepping down. 1642 */ 1643 if (is_last_spte(iter.old_spt 1644 continue; 1645 1646 /* 1647 * If iter.gfn resides outsid 1648 * the current level overlaps 1649 * then the SPTE can't be mad 1650 * to query that info from sl 1651 * out-of-bounds access. 1652 */ 1653 if (iter.gfn < start || iter. 1654 continue; 1655 1656 max_mapping_level = kvm_mmu_m 1657 1658 if (max_mapping_level < iter. 
1659 continue; 1660 1661 /* Note, a successful atomic 1662 if (tdp_mmu_zap_spte_atomic(k 1663 goto retry; 1664 } 1665 1666 rcu_read_unlock(); 1667 } 1668 1669 /* 1670 * Zap non-leaf SPTEs (and free their associa 1671 * be replaced by huge pages, for GFNs within 1672 */ 1673 void kvm_tdp_mmu_zap_collapsible_sptes(struct 1674 const 1675 { 1676 struct kvm_mmu_page *root; 1677 1678 lockdep_assert_held_read(&kvm->mmu_lo 1679 for_each_valid_tdp_mmu_root_yield_saf 1680 zap_collapsible_spte_range(kv 1681 } 1682 1683 /* 1684 * Removes write access on the last level SPT 1685 * MMU-writable bit to ensure future writes c 1686 * Returns true if an SPTE was set and a TLB 1687 */ 1688 static bool write_protect_gfn(struct kvm *kvm 1689 gfn_t gfn, int 1690 { 1691 struct tdp_iter iter; 1692 u64 new_spte; 1693 bool spte_set = false; 1694 1695 BUG_ON(min_level > KVM_MAX_HUGEPAGE_L 1696 1697 rcu_read_lock(); 1698 1699 for_each_tdp_pte_min_level(iter, root 1700 if (!is_shadow_present_pte(it 1701 !is_last_spte(iter.old_sp 1702 continue; 1703 1704 new_spte = iter.old_spte & 1705 ~(PT_WRITABLE_MASK | 1706 1707 if (new_spte == iter.old_spte 1708 break; 1709 1710 tdp_mmu_iter_set_spte(kvm, &i 1711 spte_set = true; 1712 } 1713 1714 rcu_read_unlock(); 1715 1716 return spte_set; 1717 } 1718 1719 /* 1720 * Removes write access on the last level SPT 1721 * MMU-writable bit to ensure future writes c 1722 * Returns true if an SPTE was set and a TLB 1723 */ 1724 bool kvm_tdp_mmu_write_protect_gfn(struct kvm 1725 struct kvm 1726 int min_le 1727 { 1728 struct kvm_mmu_page *root; 1729 bool spte_set = false; 1730 1731 lockdep_assert_held_write(&kvm->mmu_l 1732 for_each_valid_tdp_mmu_root(kvm, root 1733 spte_set |= write_protect_gfn 1734 1735 return spte_set; 1736 } 1737 1738 /* 1739 * Return the level of the lowest level SPTE 1740 * That SPTE may be non-present. 1741 * 1742 * Must be called between kvm_tdp_mmu_walk_lo 1743 */ 1744 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcp 1745 int *root_level) 1746 { 1747 struct tdp_iter iter; 1748 struct kvm_mmu *mmu = vcpu->arch.mmu; 1749 gfn_t gfn = addr >> PAGE_SHIFT; 1750 int leaf = -1; 1751 1752 *root_level = vcpu->arch.mmu->root_ro 1753 1754 tdp_mmu_for_each_pte(iter, mmu, gfn, 1755 leaf = iter.level; 1756 sptes[leaf] = iter.old_spte; 1757 } 1758 1759 return leaf; 1760 } 1761 1762 /* 1763 * Returns the last level spte pointer of the 1764 * gpa, and sets *spte to the spte value. Thi 1765 * walk could be performed, returns NULL and 1766 * 1767 * Contract: 1768 * - Must be called between kvm_tdp_mmu_walk 1769 * - The returned sptep must not be used aft 1770 * 1771 * WARNING: This function is only intended to 1772 */ 1773 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struc 1774 u64 * 1775 { 1776 struct tdp_iter iter; 1777 struct kvm_mmu *mmu = vcpu->arch.mmu; 1778 tdp_ptep_t sptep = NULL; 1779 1780 tdp_mmu_for_each_pte(iter, mmu, gfn, 1781 *spte = iter.old_spte; 1782 sptep = iter.sptep; 1783 } 1784 1785 /* 1786 * Perform the rcu_dereference to get 1787 * we are passing it up to fast_page_ 1788 * legacy MMU and thus does not retai 1789 * annotation. 1790 * 1791 * This is safe since fast_page_fault 1792 * function as well as all TDP MMU co 1793 * outside of mmu_lock. 1794 */ 1795 return rcu_dereference(sptep); 1796 } 1797