TOMOYO Linux Cross Reference
Linux/arch/x86/kvm/mmu/tdp_mmu.c (linux-6.12-rc7)

// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
                                                             bool shared)
{
        if (shared)
                lockdep_assert_held_read(&kvm->mmu_lock);
        else
                lockdep_assert_held_write(&kvm->mmu_lock);

        return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        /*
         * Invalidate all roots, which besides the obvious, schedules all roots
         * for zapping and thus puts the TDP MMU's reference to each root, i.e.
         * ultimately frees all roots.
         */
        kvm_tdp_mmu_invalidate_all_roots(kvm);
        kvm_tdp_mmu_zap_invalidated_roots(kvm);

        WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

        /*
         * Ensure that all the outstanding RCU callbacks to free shadow pages
         * can run before the VM is torn down.  Putting the last reference to
         * zapped roots will create new callbacks.
         */
        rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
        free_page((unsigned long)sp->spt);
        kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
        struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
                                               rcu_head);

        tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
                return;

        /*
         * The TDP MMU itself holds a reference to each root until the root is
         * explicitly invalidated, i.e. the final reference should never be
         * put for a valid root.
         */
        KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        list_del_rcu(&root->link);
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
        call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
                                              struct kvm_mmu_page *prev_root,
                                              bool only_valid)
{
        struct kvm_mmu_page *next_root;

        /*
         * While the roots themselves are RCU-protected, fields such as
         * role.invalid are protected by mmu_lock.
         */
        lockdep_assert_held(&kvm->mmu_lock);

        rcu_read_lock();

        if (prev_root)
                next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                                  &prev_root->link,
                                                  typeof(*prev_root), link);
        else
                next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                                   typeof(*next_root),
                                                   link);

        while (next_root) {
                if ((!only_valid || !next_root->role.invalid) &&
                    kvm_tdp_mmu_get_root(next_root))
                        break;

                next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                &next_root->link, typeof(*next_root), link);
        }

        rcu_read_unlock();

        if (prev_root)
                kvm_tdp_mmu_put_root(kvm, prev_root);

        return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
        for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);        \
             ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;      \
             _root = tdp_mmu_next_root(_kvm, _root, _only_valid))       \
                if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {\
                } else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)     \
        __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                   \
        for (_root = tdp_mmu_next_root(_kvm, NULL, false);              \
             ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;      \
             _root = tdp_mmu_next_root(_kvm, _root, false))

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid)             \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)           \
                if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&          \
                    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||  \
                     ((_only_valid) && (_root)->role.invalid))) {             \
                } else

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)                      \
        __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)

#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)                \
        __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)

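/*
 * Usage sketch (illustrative): the yield-safe variants are paired with
 * mmu_lock and take/drop root references on the caller's behalf, e.g.
 * kvm_tdp_mmu_zap_all() further down in this file boils down to:
 *
 *      struct kvm_mmu_page *root;
 *
 *      lockdep_assert_held_write(&kvm->mmu_lock);
 *      for_each_tdp_mmu_root_yield_safe(kvm, root)
 *              tdp_mmu_zap_root(kvm, root, false);
 *
 * A caller that breaks out of such a loop early must put the reference to
 * the most recent root via kvm_tdp_mmu_put_root().
 */
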
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

        return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
                            gfn_t gfn, union kvm_mmu_page_role role)
{
        INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role = role;
        sp->gfn = gfn;
        sp->ptep = sptep;
        sp->tdp_mmu_page = true;

        trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
                                  struct tdp_iter *iter)
{
        struct kvm_mmu_page *parent_sp;
        union kvm_mmu_page_role role;

        parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

        role = parent_sp->role;
        role.level--;

        tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        union kvm_mmu_page_role role = mmu->root_role;
        int as_id = kvm_mmu_role_as_id(role);
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        /*
         * Check for an existing root before acquiring the pages lock to avoid
         * unnecessary serialization if multiple vCPUs are loading a new root.
         * E.g. when bringing up secondary vCPUs, KVM will already have created
         * a valid root on behalf of the primary vCPU.
         */
        read_lock(&kvm->mmu_lock);

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
                if (root->role.word == role.word)
                        goto out_read_unlock;
        }

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);

        /*
         * Recheck for an existing root after acquiring the pages lock, another
         * vCPU may have raced ahead and created a new valid root.  Manually
         * walk the list of roots as the standard macros assume that the pages
         * lock is *not* held.  WARN if grabbing a reference to a usable root
         * fails, as the last reference to a root can only be put *after* the
         * root has been invalidated, which requires holding mmu_lock for write.
         */
        list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
                if (root->role.word == role.word &&
                    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
                        goto out_spin_unlock;
        }

        root = tdp_mmu_alloc_sp(vcpu);
        tdp_mmu_init_sp(root, NULL, 0, role);

        /*
         * TDP MMU roots are kept until they are explicitly invalidated, either
         * by a memslot update or by the destruction of the VM.  Initialize the
         * refcount to two; one reference for the vCPU, and one reference for
         * the TDP MMU itself, which is held until the root is invalidated and
         * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
         */
        refcount_set(&root->tdp_mmu_root_count, 2);
        list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

out_spin_unlock:
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
        read_unlock(&kvm->mmu_lock);
        /*
         * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
         * and actually consuming the root if it's invalidated after dropping
         * mmu_lock, and the root can't be freed as this vCPU holds a reference.
         */
        mmu->root.hpa = __pa(root->spt);
        mmu->root.pgd = 0;
        return 0;
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared);

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        kvm_account_pgtable_pages((void *)sp->spt, +1);
        atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        kvm_account_pgtable_pages((void *)sp->spt, -1);
        atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        tdp_unaccount_mmu_page(kvm, sp);

        if (!sp->nx_huge_page_disallowed)
                return;

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        sp->nx_huge_page_disallowed = false;
        untrack_possible_nx_huge_page(kvm, sp);
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *          of the MMU lock and the operation must synchronize with other
 *          threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
        struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
        int level = sp->role.level;
        gfn_t base_gfn = sp->gfn;
        int i;

        trace_kvm_mmu_prepare_zap_page(sp);

        tdp_mmu_unlink_sp(kvm, sp);

        for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
                tdp_ptep_t sptep = pt + i;
                gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
                u64 old_spte;

                if (shared) {
                        /*
                         * Set the SPTE to a nonpresent value that other
                         * threads will not overwrite. If the SPTE was
                         * already marked as frozen then another thread
                         * handling a page fault could overwrite it, so
                         * set the SPTE until it is set from some other
                         * value to the frozen SPTE value.
                         */
                        for (;;) {
                                old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
                                if (!is_frozen_spte(old_spte))
                                        break;
                                cpu_relax();
                        }
                } else {
                        /*
                         * If the SPTE is not MMU-present, there is no backing
                         * page associated with the SPTE and so no side effects
                         * that need to be recorded, and exclusive ownership of
                         * mmu_lock ensures the SPTE can't be made present.
                         * Note, zapping MMIO SPTEs is also unnecessary as they
                         * are guarded by the memslots generation, not by being
                         * unreachable.
                         */
                        old_spte = kvm_tdp_mmu_read_spte(sptep);
                        if (!is_shadow_present_pte(old_spte))
                                continue;

                        /*
                         * Use the common helper instead of a raw WRITE_ONCE as
                         * the SPTE needs to be updated atomically if it can be
                         * modified by a different vCPU outside of mmu_lock.
                         * Even though the parent SPTE is !PRESENT, the TLB
                         * hasn't yet been flushed, and both Intel and AMD
                         * document that A/D assists can use upper-level PxE
                         * entries that are cached in the TLB, i.e. the CPU can
                         * still access the page and mark it dirty.
                         *
                         * No retry is needed in the atomic update path as the
                         * sole concern is dropping a Dirty bit, i.e. no other
                         * task can zap/remove the SPTE as mmu_lock is held for
                         * write.  Marking the SPTE as a frozen SPTE is not
                         * strictly necessary for the same reason, but using
                         * the frozen SPTE value keeps the shared/exclusive
                         * paths consistent and allows the handle_changed_spte()
                         * call below to hardcode the new value to FROZEN_SPTE.
                         *
                         * Note, even though dropping a Dirty bit is the only
                         * scenario where a non-atomic update could result in a
                         * functional bug, simply checking the Dirty bit isn't
                         * sufficient as a fast page fault could read the upper
                         * level SPTE before it is zapped, and then make this
                         * target SPTE writable, dirty it, and set the
                         * Dirty bit between reading the SPTE above and writing
                         * it here.
                         */
                        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
                                                          FROZEN_SPTE, level);
                }
                handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
                                    old_spte, FROZEN_SPTE, level, shared);
        }

        call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *          the MMU lock and the operation must synchronize with other
 *          threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.  Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON_ONCE(level < PG_LEVEL_4K);
        WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

        /*
         * If this warning were to trigger it would indicate that there was a
         * missing MMU notifier or a race with an MMU notifier handler.
         * A present, leaf SPTE should never be directly replaced with another
         * present leaf SPTE pointing to a different PFN. A notifier handler
         * should be zapping the SPTE before the main MM's page table is
         * changed, or the SPTE should be zeroed, and the TLBs flushed by the
         * thread before replacement.
         */
        if (was_leaf && is_leaf && pfn_changed) {
                pr_err("Invalid SPTE change: cannot replace a present leaf\n"
                       "SPTE with another present leaf SPTE mapping a\n"
                       "different PFN!\n"
                       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                       as_id, gfn, old_spte, new_spte, level);

                /*
                 * Crash the host to prevent error accumulation and potential
                 * corruption.
                 */
                BUG();
        }

        if (old_spte == new_spte)
                return;

        trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

        if (is_leaf)
                check_spte_writable_invariants(new_spte);

        /*
         * The only times a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
         * removed. In that case, there is nothing to do here.
         */
        if (!was_present && !is_present) {
                /*
                 * If this change does not involve a MMIO SPTE or frozen SPTE,
                 * it is unexpected. Log the change, though it should not
                 * impact the guest since both the former and current SPTEs
                 * are nonpresent.
                 */
                if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
                                 !is_mmio_spte(kvm, new_spte) &&
                                 !is_frozen_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
                               "are MMIO SPTEs, or the new SPTE is\n"
                               "a temporary frozen SPTE.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
        }

        if (is_leaf != was_leaf)
                kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

        if (was_leaf && is_dirty_spte(old_spte) &&
            (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));

        /*
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.  Note the WARN on the PFN changing without the
         * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
         * pages are kernel allocations and should never be migrated.
         */
        if (was_present && !was_leaf &&
            (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
                handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);

        if (was_leaf && is_accessed_spte(old_spte) &&
            (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
                                                         u64 new_spte)
{
        u64 *sptep = rcu_dereference(iter->sptep);

        /*
         * The caller is responsible for ensuring the old SPTE is not a FROZEN
         * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
         * and pre-checking before inserting a new SPTE is advantageous as it
         * avoids unnecessary work.
         */
        WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));

        /*
         * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
         * does not hold the mmu_lock.  On failure, i.e. if a different logical
         * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
         * the current value, so the caller operates on fresh data, e.g. if it
         * retries tdp_mmu_set_spte_atomic()
         */
        if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
                return -EBUSY;

        return 0;
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                                       struct tdp_iter *iter,
                                                       u64 new_spte)
{
        int ret;

        lockdep_assert_held_read(&kvm->mmu_lock);

        ret = __tdp_mmu_set_spte_atomic(iter, new_spte);
        if (ret)
                return ret;

        handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
                            new_spte, iter->level, true);

        return 0;
}

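/*
 * Usage sketch (illustrative): a failed tdp_mmu_set_spte_atomic() refreshes
 * iter->old_spte with the current value, so callers that hold mmu_lock for
 * read simply recompute their new SPTE and retry the same entry, e.g. as
 * __tdp_mmu_zap_root() does further down:
 *
 *      retry:
 *              if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
 *                      goto retry;
 */
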
static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                                       struct tdp_iter *iter)
{
        int ret;

        lockdep_assert_held_read(&kvm->mmu_lock);

        /*
         * Freeze the SPTE by setting it to a special, non-present value. This
         * will stop other threads from immediately installing a present entry
         * in its place before the TLBs are flushed.
         *
         * Delay processing of the zapped SPTE until after TLBs are flushed and
         * the FROZEN_SPTE is replaced (see below).
         */
        ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE);
        if (ret)
                return ret;

        kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

        /*
         * No other thread can overwrite the frozen SPTE as they must either
         * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
         * overwrite the special frozen SPTE value. Use the raw write helper to
         * avoid an unnecessary check on volatile bits.
         */
        __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);

        /*
         * Process the zapped SPTE after flushing TLBs, and after replacing
         * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are
         * blocked by the FROZEN_SPTE and reduces contention on the child
         * SPTEs.
         */
        handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
                            SHADOW_NONPRESENT_VALUE, iter->level, true);

        return 0;
}


/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:              KVM instance
 * @as_id:            Address space ID, i.e. regular vs. SMM
 * @sptep:            Pointer to the SPTE
 * @old_spte:         The current value of the SPTE
 * @new_spte:         The new value that will be set for the SPTE
 * @gfn:              The base GFN that was (or will be) mapped by the SPTE
 * @level:            The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
                            u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * No thread should be using this function to set SPTEs to or from the
         * temporary frozen SPTE value.
         * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
         * should be used. If operating under the MMU lock in write mode, the
         * use of the frozen SPTE should not be necessary.
         */
        WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));

        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

        handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
        return old_spte;
}

static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                         u64 new_spte)
{
        WARN_ON_ONCE(iter->yielded);
        iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
                                          iter->old_spte, new_spte,
                                          iter->gfn, iter->level);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
        for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
        tdp_root_for_each_pte(_iter, _root, _start, _end)       \
                if (!is_shadow_present_pte(_iter.old_spte) ||   \
                    !is_last_spte(_iter.old_spte, _iter.level)) \
                        continue;                               \
                else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
        for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
                                                          struct tdp_iter *iter,
                                                          bool flush, bool shared)
{
        WARN_ON_ONCE(iter->yielded);

        /* Ensure forward progress has been made before yielding. */
        if (iter->next_last_level_gfn == iter->yielded_gfn)
                return false;

        if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                if (flush)
                        kvm_flush_remote_tlbs(kvm);

                rcu_read_unlock();

                if (shared)
                        cond_resched_rwlock_read(&kvm->mmu_lock);
                else
                        cond_resched_rwlock_write(&kvm->mmu_lock);

                rcu_read_lock();

                WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

                iter->yielded = true;
        }

        return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
        /*
         * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
         * a gpa range that would exceed the max gfn, and KVM does not create
         * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
         * the slow emulation path every time.
         */
        return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
                               bool shared, int zap_level)
{
        struct tdp_iter iter;

        gfn_t end = tdp_mmu_max_gfn_exclusive();
        gfn_t start = 0;

        for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
                        continue;

                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                if (iter.level > zap_level)
                        continue;

                if (!shared)
                        tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
                else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
                        goto retry;
        }
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
                             bool shared)
{

        /*
         * The root must have an elevated refcount so that it's reachable via
         * mmu_notifier callbacks, which allows this path to yield and drop
         * mmu_lock.  When handling an unmap/release mmu_notifier event, KVM
         * must drop all references to relevant pages prior to completing the
         * callback.  Dropping mmu_lock with an unreachable root would result
         * in zapping SPTEs after a relevant mmu_notifier callback completes
         * and lead to use-after-free as zapping a SPTE triggers "writeback" of
         * dirty accessed bits to the SPTE's associated struct page.
         */
        WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

        kvm_lockdep_assert_mmu_lock_held(kvm, shared);

        rcu_read_lock();

        /*
         * Zap roots in multiple passes of decreasing granularity, i.e. zap at
         * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
         * preempt models) or mmu_lock contention (full or real-time models).
         * Zapping at finer granularity marginally increases the total time of
         * the zap, but in most cases the zap itself isn't latency sensitive.
         *
         * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
         * in order to mimic the page fault path, which can replace a 1GiB page
         * table with an equivalent 1GiB hugepage, i.e. can get saddled with
         * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
         * allows verifying that KVM can safely zap 1GiB regions, e.g. without
         * inducing RCU stalls, without relying on a relatively rare event
         * (zapping roots is orders of magnitude more common).  Note, because
         * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
         * in the iterator itself is unnecessary.
         */
        if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
                __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
                __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
        }
        __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
        __tdp_mmu_zap_root(kvm, root, shared, root->role.level);

        rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        u64 old_spte;

        /*
         * This helper intentionally doesn't allow zapping a root shadow page,
         * which doesn't have a parent page table and thus no associated entry.
         */
        if (WARN_ON_ONCE(!sp->ptep))
                return false;

        old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
        if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
                return false;

        tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
                         SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);

        return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
                              gfn_t start, gfn_t end, bool can_yield, bool flush)
{
        struct tdp_iter iter;

        end = min(end, tdp_mmu_max_gfn_exclusive());

        lockdep_assert_held_write(&kvm->mmu_lock);

        rcu_read_lock();

        for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
                if (can_yield &&
                    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
                        flush = false;
                        continue;
                }

                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);

                /*
                 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
                 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
                 */
                if (!root->role.invalid)
                        flush = true;
        }

        rcu_read_unlock();

        /*
         * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
         * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
         */
        return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all valid roots.
 * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
 * one or more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
        struct kvm_mmu_page *root;

        lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
                flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

        return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        struct kvm_mmu_page *root;

        /*
         * Zap all roots, including invalid roots, as all SPTEs must be dropped
         * before returning to the caller.  Zap directly even if the root is
         * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
         * all that expensive and mmu_lock is already held, which means the
         * worker has yielded, i.e. flushing the work instead of zapping here
         * isn't guaranteed to be any faster.
         *
         * A TLB flush is unnecessary, KVM zaps everything if and only if KVM
         * is being destroyed or the userspace VMM has exited.  In both cases,
         * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
         */
        lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_tdp_mmu_root_yield_safe(kvm, root)
                tdp_mmu_zap_root(kvm, root, false);
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
        struct kvm_mmu_page *root;

        read_lock(&kvm->mmu_lock);

        for_each_tdp_mmu_root_yield_safe(kvm, root) {
                if (!root->tdp_mmu_scheduled_root_to_zap)
                        continue;

                root->tdp_mmu_scheduled_root_to_zap = false;
                KVM_BUG_ON(!root->role.invalid, kvm);

                /*
                 * A TLB flush is not necessary as KVM performs a local TLB
                 * flush when allocating a new root (see kvm_mmu_load()), and
                 * when migrating a vCPU to a different pCPU.  Note, the local
                 * TLB flush on reuse also invalidates paging-structure-cache
                 * entries, i.e. TLB entries for intermediate paging structures,
                 * that may be zapped, as such entries are associated with the
                 * ASID on both VMX and SVM.
                 */
                tdp_mmu_zap_root(kvm, root, true);

                /*
                 * The reference needs to be put *after* zapping the root, as
                 * the root must be reachable by mmu_notifiers while it's being
                 * zapped.
                 */
                kvm_tdp_mmu_put_root(kvm, root);
        }

        read_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslot update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_alloc_root().
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		/*
		 * Note, invalid roots can outlive a memslot update!  Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
		 * root alive after it's been zapped.
		 */
		if (!root->role.invalid) {
			root->tdp_mmu_scheduled_root_to_zap = true;
			root->role.invalid = true;
		}
	}
}
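
The invalidate-vs-zap split above relies on reference counting: flipping the invalid flag is cheap and done under the write lock, while the expensive zap runs later under the read lock and the last reference put frees the root. The following is a minimal userspace model of that idea only; every name in it is hypothetical and none of it is KVM code.

#include <stdbool.h>
#include <stdio.h>

struct demo_root {
	int refcount;       /* last put frees the root             */
	bool invalid;       /* new faults must not pick this root  */
	bool scheduled_zap; /* deferred zap will drop a reference  */
};

static void demo_invalidate(struct demo_root *root)
{
	if (!root->invalid) {
		root->scheduled_zap = true; /* zap happens later, under the read lock */
		root->invalid = true;
	}
}

static void demo_put(struct demo_root *root)
{
	if (--root->refcount == 0)
		printf("root freed\n");
}

int main(void)
{
	struct demo_root root = { .refcount = 2 }; /* owner + one in-flight reader */

	demo_invalidate(&root); /* root is unusable for new faults ...      */
	demo_put(&root);        /* ... but the reader keeps it alive        */
	demo_put(&root);        /* deferred zap path drops the last ref     */
	return 0;
}
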
1012                                                  
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot && fault->write)
		ret = RET_PF_WRITE_PROTECTED;

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}
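
The install path above follows a common lockless pattern: compute the new entry, treat an unchanged value as spurious, and publish it with a compare-and-exchange so a concurrent writer forces a retry. Below is a small stand-alone sketch of that pattern using C11 atomics; the names are hypothetical and it is not the kernel helper.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Returns 0 on success, -1 if another thread changed the entry first. */
static int install_pte(_Atomic uint64_t *ptep, uint64_t old_pte, uint64_t new_pte)
{
	if (new_pte == old_pte)
		return 0; /* spurious: nothing to do */
	if (!atomic_compare_exchange_strong(ptep, &old_pte, new_pte))
		return -1; /* lost the race, caller retries the fault */
	return 0;
}

int main(void)
{
	_Atomic uint64_t pte = 0;

	printf("install: %d, value: %#llx\n",
	       install_pte(&pte, 0, 0x1000 | 0x7),
	       (unsigned long long)atomic_load(&pte));
	return 0;
}
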
1065                                                  
1066 /*                                               
1067  * tdp_mmu_link_sp - Replace the given spte w    
1068  * provided page table.                          
1069  *                                               
1070  * @kvm: kvm instance                            
1071  * @iter: a tdp_iter instance currently on th    
1072  * @sp: The new TDP page table to install.       
1073  * @shared: This operation is running under t    
1074  *                                               
1075  * Returns: 0 if the new page table was insta    
1076  *          could not be installed (e.g. the     
1077  */                                              
1078 static int tdp_mmu_link_sp(struct kvm *kvm, s    
1079                            struct kvm_mmu_pag    
1080 {                                                
1081         u64 spte = make_nonleaf_spte(sp->spt,    
1082         int ret = 0;                             
1083                                                  
1084         if (shared) {                            
1085                 ret = tdp_mmu_set_spte_atomic    
1086                 if (ret)                         
1087                         return ret;              
1088         } else {                                 
1089                 tdp_mmu_iter_set_spte(kvm, it    
1090         }                                        
1091                                                  
1092         tdp_account_mmu_page(kvm, sp);           
1093                                                  
1094         return 0;                                
1095 }                                                
1096                                                  
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);
1099                                                  
/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/*
		 * If SPTE has been frozen by another thread, just give up and
		 * retry, avoiding unnecessary page table allocation and free.
		 */
		if (is_frozen_spte(iter.old_spte))
			goto retry;

		if (iter.level == fault->goal_level)
			goto map_target_level;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		if (is_shadow_present_pte(iter.old_spte))
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		else
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);

		/*
		 * Force the guest to retry if installing an upper level SPTE
		 * failed, e.g. because a different task modified the SPTE.
		 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	/*
	 * The walk aborted before reaching the target level, e.g. because the
	 * iterator detected an upper level SPTE was frozen during traversal.
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}
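
kvm_tdp_mmu_map() walks from the root toward fault->goal_level, with each level consuming another 9 bits of the guest frame number. The following is a tiny illustrative sketch of that index arithmetic only, assuming 4-level paging with 512-entry tables as on x86-64.

#include <stdint.h>
#include <stdio.h>

/* 9 bits of the GFN select the entry at each level (level 1 == 4KiB leaf). */
static unsigned int pte_index(uint64_t gfn, int level)
{
	return (gfn >> ((level - 1) * 9)) & 0x1ff;
}

int main(void)
{
	uint64_t gfn = 0x123456;

	for (int level = 4; level >= 1; level--)
		printf("level %d -> index %u\n", level, pte_index(gfn, level));
	return 0;
}
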
1185                                                  
1186 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *    
1187                                  bool flush)     
1188 {                                                
1189         struct kvm_mmu_page *root;               
1190                                                  
1191         __for_each_tdp_mmu_root_yield_safe(kv    
1192                 flush = tdp_mmu_zap_leafs(kvm    
1193                                           ran    
1194                                                  
1195         return flush;                            
1196 }                                                
1197                                                  
1198 typedef bool (*tdp_handler_t)(struct kvm *kvm    
1199                               struct kvm_gfn_    
1200                                                  
1201 static __always_inline bool kvm_tdp_mmu_handl    
1202                                                  
1203                                                  
1204 {                                                
1205         struct kvm_mmu_page *root;               
1206         struct tdp_iter iter;                    
1207         bool ret = false;                        
1208                                                  
1209         /*                                       
1210          * Don't support rescheduling, none o    
1211          * into this helper allow blocking; i    
1212          */                                      
1213         for_each_tdp_mmu_root(kvm, root, rang    
1214                 rcu_read_lock();                 
1215                                                  
1216                 tdp_root_for_each_leaf_pte(it    
1217                         ret |= handler(kvm, &    
1218                                                  
1219                 rcu_read_unlock();               
1220         }                                        
1221                                                  
1222         return ret;                              
1223 }                                                
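
The helper above simply fans a callback out over every leaf entry in the requested range and ORs the results together. Here is a stripped-down userspace version of the same callback pattern, with hypothetical types and no RCU or root bookkeeping.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef bool (*range_handler_t)(uint64_t gfn, uint64_t *pte);

static bool test_bit0(uint64_t gfn, uint64_t *pte)
{
	(void)gfn;
	return *pte & 1;
}

/* ORs the handler result over every entry in [start, end). */
static bool handle_range(uint64_t *ptes, uint64_t start, uint64_t end,
			 range_handler_t handler)
{
	bool ret = false;

	for (uint64_t gfn = start; gfn < end; gfn++)
		ret |= handler(gfn, &ptes[gfn - start]);
	return ret;
}

int main(void)
{
	uint64_t ptes[4] = { 0, 0, 1, 0 };

	printf("any hit: %d\n", handle_range(ptes, 0, 4, test_bit0));
	return 0;
}
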
1224                                                  
/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	if (spte_ad_enabled(iter->old_spte)) {
		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
							 iter->old_spte,
							 shadow_accessed_mask,
							 iter->level);
		new_spte = iter->old_spte & ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(iter->old_spte))
			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));

		new_spte = mark_spte_for_access_track(iter->old_spte);
		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
							iter->old_spte, new_spte,
							iter->level);
	}

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);
	return true;
}
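
When A/D bits are in use, aging boils down to atomically clearing the accessed bit and reporting whether it was set. A self-contained sketch of that step with C11 atomics follows; DEMO_ACCESSED_BIT is an arbitrary stand-in, not the real shadow_accessed_mask.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_ACCESSED_BIT (1ull << 5)

/* Clears the accessed bit and reports whether it was set (i.e. "young"). */
static bool demo_age_pte(_Atomic uint64_t *ptep)
{
	uint64_t old = atomic_fetch_and(ptep, ~DEMO_ACCESSED_BIT);

	return old & DEMO_ACCESSED_BIT;
}

int main(void)
{
	_Atomic uint64_t pte = 0x1000 | DEMO_ACCESSED_BIT;

	printf("young: %d, then: %d\n", demo_age_pte(&pte), demo_age_pte(&pte));
	return 0;
}
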
1266                                                  
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
1282                                                  
/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}
1320                                                  
1321 /*                                               
1322  * Remove write access from all the SPTEs map    
1323  * only affect leaf SPTEs down to min_level.     
1324  * Returns true if an SPTE has been changed a    
1325  */                                              
1326 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,    
1327                              const struct kvm    
1328 {                                                
1329         struct kvm_mmu_page *root;               
1330         bool spte_set = false;                   
1331                                                  
1332         lockdep_assert_held_read(&kvm->mmu_lo    
1333                                                  
1334         for_each_valid_tdp_mmu_root_yield_saf    
1335                 spte_set |= wrprot_gfn_range(    
1336                              slot->base_gfn +    
1337                                                  
1338         return spte_set;                         
1339 }                                                
1340                                                  
1341 static struct kvm_mmu_page *tdp_mmu_alloc_sp_    
1342 {                                                
1343         struct kvm_mmu_page *sp;                 
1344                                                  
1345         sp = kmem_cache_zalloc(mmu_page_heade    
1346         if (!sp)                                 
1347                 return NULL;                     
1348                                                  
1349         sp->spt = (void *)get_zeroed_page(GFP    
1350         if (!sp->spt) {                          
1351                 kmem_cache_free(mmu_page_head    
1352                 return NULL;                     
1353         }                                        
1354                                                  
1355         return sp;                               
1356 }                                                
1357                                                  
1358 /* Note, the caller is responsible for initia    
1359 static int tdp_mmu_split_huge_page(struct kvm    
1360                                    struct kvm    
1361 {                                                
1362         const u64 huge_spte = iter->old_spte;    
1363         const int level = iter->level;           
1364         int ret, i;                              
1365                                                  
1366         /*                                       
1367          * No need for atomics when writing t    
1368          * not been linked in yet and thus is    
1369          */                                      
1370         for (i = 0; i < SPTE_ENT_PER_PAGE; i+    
1371                 sp->spt[i] = make_huge_page_s    
1372                                                  
1373         /*                                       
1374          * Replace the huge spte with a point    
1375          * page table. Since we are making th    
1376          * will see a mix of the split mappin    
1377          * depending on what's currently in t    
1378          * correctness standpoint since the t    
1379          * way.                                  
1380          */                                      
1381         ret = tdp_mmu_link_sp(kvm, iter, sp,     
1382         if (ret)                                 
1383                 goto out;                        
1384                                                  
1385         /*                                       
1386          * tdp_mmu_link_sp_atomic() will hand    
1387          * are overwriting from the page stat    
1388          * the page stats with the new presen    
1389          */                                      
1390         kvm_update_page_stats(kvm, level - 1,    
1391                                                  
1392 out:                                             
1393         trace_kvm_mmu_split_huge_page(iter->g    
1394         return ret;                              
1395 }                                                
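
Splitting a huge mapping one level down produces 512 children that each map a contiguous slice of the parent's frame range, so the per-child value is just the parent frame plus an offset. The sketch below shows only that arithmetic, with illustrative constants; it is not make_huge_page_split_spte().

#include <stdint.h>
#include <stdio.h>

#define ENTRIES_PER_TABLE 512

/* Pages covered by one entry at @level (level 1 == 4KiB leaf). */
static uint64_t pages_per_entry(int level)
{
	uint64_t pages = 1;

	for (int i = 1; i < level; i++)
		pages *= ENTRIES_PER_TABLE;
	return pages;
}

int main(void)
{
	uint64_t huge_pfn = 0x40000; /* 2MiB-aligned frame, i.e. a level 2 mapping */
	int child_level = 1;

	/* Each child of the split maps a contiguous slice of the parent. */
	for (int i = 0; i < 3; i++)
		printf("child %d -> pfn %#llx\n", i,
		       (unsigned long long)(huge_pfn + i * pages_per_entry(child_level)));
	return 0;
}
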
1396                                                  
1397 static int tdp_mmu_split_huge_pages_root(stru    
1398                                          stru    
1399                                          gfn_    
1400                                          int     
1401 {                                                
1402         struct kvm_mmu_page *sp = NULL;          
1403         struct tdp_iter iter;                    
1404                                                  
1405         rcu_read_lock();                         
1406                                                  
1407         /*                                       
1408          * Traverse the page table splitting     
1409          * level into one lower level. For ex    
1410          * we split it into 512 2MB pages.       
1411          *                                       
1412          * Since the TDP iterator uses a pre-    
1413          * to visit an SPTE before ever visit    
1414          * will correctly recursively split h    
1415          * level above the target level (e.g.    
1416          * and then splitting each of those t    
1417          */                                      
1418         for_each_tdp_pte_min_level(iter, root    
1419 retry:                                           
1420                 if (tdp_mmu_iter_cond_resched    
1421                         continue;                
1422                                                  
1423                 if (!is_shadow_present_pte(it    
1424                         continue;                
1425                                                  
1426                 if (!sp) {                       
1427                         rcu_read_unlock();       
1428                                                  
1429                         if (shared)              
1430                                 read_unlock(&    
1431                         else                     
1432                                 write_unlock(    
1433                                                  
1434                         sp = tdp_mmu_alloc_sp    
1435                                                  
1436                         if (shared)              
1437                                 read_lock(&kv    
1438                         else                     
1439                                 write_lock(&k    
1440                                                  
1441                         if (!sp) {               
1442                                 trace_kvm_mmu    
1443                                                  
1444                                                  
1445                                 return -ENOME    
1446                         }                        
1447                                                  
1448                         rcu_read_lock();         
1449                                                  
1450                         iter.yielded = true;     
1451                         continue;                
1452                 }                                
1453                                                  
1454                 tdp_mmu_init_child_sp(sp, &it    
1455                                                  
1456                 if (tdp_mmu_split_huge_page(k    
1457                         goto retry;              
1458                                                  
1459                 sp = NULL;                       
1460         }                                        
1461                                                  
1462         rcu_read_unlock();                       
1463                                                  
1464         /*                                       
1465          * It's possible to exit the loop hav    
1466          * example, a vCPU doing HugePage NX     
1467          * installs its own sp in place of th    
1468          */                                      
1469         if (sp)                                  
1470                 tdp_mmu_free_sp(sp);             
1471                                                  
1472         return 0;                                
1473 }                                                
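
When the pre-allocated page runs out, the function above drops mmu_lock (and the RCU read lock) to allocate, then marks the iterator as yielded and restarts because the tree may have changed while the lock was released. A minimal userspace analogue of that drop-allocate-retry pattern with a pthread mutex is sketched below; it is hypothetical and carries no real iterator state.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Allocate outside the lock; the caller restarts its walk afterwards. */
static void *alloc_outside_lock(int *restart)
{
	void *page;

	pthread_mutex_unlock(&lock);
	page = calloc(1, 4096);      /* may sleep / take arbitrarily long */
	pthread_mutex_lock(&lock);

	*restart = 1;                /* state may have changed; re-walk */
	return page;
}

int main(void)
{
	int restart = 0;
	void *page;

	pthread_mutex_lock(&lock);
	page = alloc_outside_lock(&restart);
	pthread_mutex_unlock(&lock);

	printf("allocated=%p restart=%d\n", page, restart);
	free(page);
	return 0;
}
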
1474                                                  
1475                                                  
1476 /*                                               
1477  * Try to split all huge pages mapped by the     
1478  */                                              
1479 void kvm_tdp_mmu_try_split_huge_pages(struct     
1480                                       const s    
1481                                       gfn_t s    
1482                                       int tar    
1483 {                                                
1484         struct kvm_mmu_page *root;               
1485         int r = 0;                               
1486                                                  
1487         kvm_lockdep_assert_mmu_lock_held(kvm,    
1488         for_each_valid_tdp_mmu_root_yield_saf    
1489                 r = tdp_mmu_split_huge_pages_    
1490                 if (r) {                         
1491                         kvm_tdp_mmu_put_root(    
1492                         break;                   
1493                 }                                
1494         }                                        
1495 }                                                
1496                                                  
static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
{
	/*
	 * All TDP MMU shadow pages share the same role as their root, aside
	 * from level, so it is valid to key off any shadow page to determine if
	 * write protection is needed for an entire tree.
	 */
	return kvm_mmu_page_ad_need_write_protect(sp);
}
1506                                                  
1507 static bool clear_dirty_gfn_range(struct kvm     
1508                            gfn_t start, gfn_t    
1509 {                                                
1510         const u64 dbit = tdp_mmu_need_write_p    
1511                                                  
1512         struct tdp_iter iter;                    
1513         bool spte_set = false;                   
1514                                                  
1515         rcu_read_lock();                         
1516                                                  
1517         tdp_root_for_each_pte(iter, root, sta    
1518 retry:                                           
1519                 if (!is_shadow_present_pte(it    
1520                     !is_last_spte(iter.old_sp    
1521                         continue;                
1522                                                  
1523                 if (tdp_mmu_iter_cond_resched    
1524                         continue;                
1525                                                  
1526                 KVM_MMU_WARN_ON(dbit == shado    
1527                                 spte_ad_need_    
1528                                                  
1529                 if (!(iter.old_spte & dbit))     
1530                         continue;                
1531                                                  
1532                 if (tdp_mmu_set_spte_atomic(k    
1533                         goto retry;              
1534                                                  
1535                 spte_set = true;                 
1536         }                                        
1537                                                  
1538         rcu_read_unlock();                       
1539         return spte_set;                         
1540 }                                                
1541                                                  
1542 /*                                               
1543  * Clear the dirty status (D-bit or W-bit) of    
1544  * memslot. Returns true if an SPTE has been     
1545  * flushed.                                      
1546  */                                              
1547 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm     
1548                                   const struc    
1549 {                                                
1550         struct kvm_mmu_page *root;               
1551         bool spte_set = false;                   
1552                                                  
1553         lockdep_assert_held_read(&kvm->mmu_lo    
1554         for_each_valid_tdp_mmu_root_yield_saf    
1555                 spte_set |= clear_dirty_gfn_r    
1556                                 slot->base_gf    
1557                                                  
1558         return spte_set;                         
1559 }                                                
1560                                                  
1561 static void clear_dirty_pt_masked(struct kvm     
1562                                   gfn_t gfn,     
1563 {                                                
1564         const u64 dbit = (wrprot || tdp_mmu_n    
1565                                                  
1566         struct tdp_iter iter;                    
1567                                                  
1568         lockdep_assert_held_write(&kvm->mmu_l    
1569                                                  
1570         rcu_read_lock();                         
1571                                                  
1572         tdp_root_for_each_leaf_pte(iter, root    
1573                                     gfn + BIT    
1574                 if (!mask)                       
1575                         break;                   
1576                                                  
1577                 KVM_MMU_WARN_ON(dbit == shado    
1578                                 spte_ad_need_    
1579                                                  
1580                 if (iter.level > PG_LEVEL_4K     
1581                     !(mask & (1UL << (iter.gf    
1582                         continue;                
1583                                                  
1584                 mask &= ~(1UL << (iter.gfn -     
1585                                                  
1586                 if (!(iter.old_spte & dbit))     
1587                         continue;                
1588                                                  
1589                 iter.old_spte = tdp_mmu_clear    
1590                                                  
1591                                                  
1592                                                  
1593                 trace_kvm_tdp_mmu_spte_change    
1594                                                  
1595                                                  
1596                 kvm_set_pfn_dirty(spte_to_pfn    
1597         }                                        
1598                                                  
1599         rcu_read_unlock();                       
1600 }                                                
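
The masked dirty-clear above visits only the GFNs whose offset from the base GFN has a bit set in the 64-bit mask, clears each bit as it is consumed, and stops once the mask is empty. A compact sketch of that loop follows; DEMO_DIRTY_BIT is a stand-in for whichever bit (D-bit or W-bit) was chosen above.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_DIRTY_BIT (1ull << 6)

int main(void)
{
	_Atomic uint64_t ptes[64] = {0};
	uint64_t base_gfn = 0x1000, mask = 0x5; /* bits 0 and 2 set */

	atomic_store(&ptes[0], DEMO_DIRTY_BIT);
	atomic_store(&ptes[2], DEMO_DIRTY_BIT);

	/* Visit only GFNs whose offset from base_gfn has a bit set in mask. */
	for (uint64_t gfn = base_gfn; mask && gfn < base_gfn + 64; gfn++) {
		if (!(mask & (1ull << (gfn - base_gfn))))
			continue;
		mask &= ~(1ull << (gfn - base_gfn));
		atomic_fetch_and(&ptes[gfn - base_gfn], ~DEMO_DIRTY_BIT);
	}

	printf("pte[0]=%#llx pte[2]=%#llx\n",
	       (unsigned long long)atomic_load(&ptes[0]),
	       (unsigned long long)atomic_load(&ptes[2]));
	return 0;
}
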
1601                                                  
1602 /*                                               
1603  * Clear the dirty status (D-bit or W-bit) of    
1604  * which a bit is set in mask, starting at gf    
1605  * contain all the GFNs represented by set bi    
1606  */                                              
1607 void kvm_tdp_mmu_clear_dirty_pt_masked(struct    
1608                                        struct    
1609                                        gfn_t     
1610                                        bool w    
1611 {                                                
1612         struct kvm_mmu_page *root;               
1613                                                  
1614         for_each_valid_tdp_mmu_root(kvm, root    
1615                 clear_dirty_pt_masked(kvm, ro    
1616 }                                                
1617                                                  
1618 static void zap_collapsible_spte_range(struct    
1619                                        struct    
1620                                        const     
1621 {                                                
1622         gfn_t start = slot->base_gfn;            
1623         gfn_t end = start + slot->npages;        
1624         struct tdp_iter iter;                    
1625         int max_mapping_level;                   
1626                                                  
1627         rcu_read_lock();                         
1628                                                  
1629         for_each_tdp_pte_min_level(iter, root    
1630 retry:                                           
1631                 if (tdp_mmu_iter_cond_resched    
1632                         continue;                
1633                                                  
1634                 if (iter.level > KVM_MAX_HUGE    
1635                     !is_shadow_present_pte(it    
1636                         continue;                
1637                                                  
1638                 /*                               
1639                  * Don't zap leaf SPTEs, if a    
1640                  * a large page size, then it    
1641                  * instead of stepping down.     
1642                  */                              
1643                 if (is_last_spte(iter.old_spt    
1644                         continue;                
1645                                                  
1646                 /*                               
1647                  * If iter.gfn resides outsid    
1648                  * the current level overlaps    
1649                  * then the SPTE can't be mad    
1650                  * to query that info from sl    
1651                  * out-of-bounds access.         
1652                  */                              
1653                 if (iter.gfn < start || iter.    
1654                         continue;                
1655                                                  
1656                 max_mapping_level = kvm_mmu_m    
1657                                                  
1658                 if (max_mapping_level < iter.    
1659                         continue;                
1660                                                  
1661                 /* Note, a successful atomic     
1662                 if (tdp_mmu_zap_spte_atomic(k    
1663                         goto retry;              
1664         }                                        
1665                                                  
1666         rcu_read_unlock();                       
1667 }                                                
1668                                                  
1669 /*                                               
1670  * Zap non-leaf SPTEs (and free their associa    
1671  * be replaced by huge pages, for GFNs within    
1672  */                                              
1673 void kvm_tdp_mmu_zap_collapsible_sptes(struct    
1674                                        const     
1675 {                                                
1676         struct kvm_mmu_page *root;               
1677                                                  
1678         lockdep_assert_held_read(&kvm->mmu_lo    
1679         for_each_valid_tdp_mmu_root_yield_saf    
1680                 zap_collapsible_spte_range(kv    
1681 }                                                
1682                                                  
1683 /*                                               
1684  * Removes write access on the last level SPT    
1685  * MMU-writable bit to ensure future writes c    
1686  * Returns true if an SPTE was set and a TLB     
1687  */                                              
1688 static bool write_protect_gfn(struct kvm *kvm    
1689                               gfn_t gfn, int     
1690 {                                                
1691         struct tdp_iter iter;                    
1692         u64 new_spte;                            
1693         bool spte_set = false;                   
1694                                                  
1695         BUG_ON(min_level > KVM_MAX_HUGEPAGE_L    
1696                                                  
1697         rcu_read_lock();                         
1698                                                  
1699         for_each_tdp_pte_min_level(iter, root    
1700                 if (!is_shadow_present_pte(it    
1701                     !is_last_spte(iter.old_sp    
1702                         continue;                
1703                                                  
1704                 new_spte = iter.old_spte &       
1705                         ~(PT_WRITABLE_MASK |     
1706                                                  
1707                 if (new_spte == iter.old_spte    
1708                         break;                   
1709                                                  
1710                 tdp_mmu_iter_set_spte(kvm, &i    
1711                 spte_set = true;                 
1712         }                                        
1713                                                  
1714         rcu_read_unlock();                       
1715                                                  
1716         return spte_set;                         
1717 }                                                
1718                                                  
1719 /*                                               
1720  * Removes write access on the last level SPT    
1721  * MMU-writable bit to ensure future writes c    
1722  * Returns true if an SPTE was set and a TLB     
1723  */                                              
1724 bool kvm_tdp_mmu_write_protect_gfn(struct kvm    
1725                                    struct kvm    
1726                                    int min_le    
1727 {                                                
1728         struct kvm_mmu_page *root;               
1729         bool spte_set = false;                   
1730                                                  
1731         lockdep_assert_held_write(&kvm->mmu_l    
1732         for_each_valid_tdp_mmu_root(kvm, root    
1733                 spte_set |= write_protect_gfn    
1734                                                  
1735         return spte_set;                         
1736 }                                                
1737                                                  
/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}
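
kvm_tdp_mmu_get_walk() records the entry seen at every level into sptes[] and returns the lowest level reached, which may stop early at a huge or non-present entry. The toy version below shows only that bookkeeping, with fake PTE values and no real page tables.

#include <stdint.h>
#include <stdio.h>

#define ROOT_LEVEL 4

int main(void)
{
	uint64_t sptes[ROOT_LEVEL + 1] = {0};
	int leaf = -1;

	/* Pretend walk: record the entry seen at each level, top to bottom. */
	for (int level = ROOT_LEVEL; level >= 1; level--) {
		leaf = level;
		sptes[leaf] = (0x1000 * level) | 0x7; /* fake PTE value */
		if (level == 2)
			break;                        /* e.g. hit a huge mapping */
	}

	printf("lowest level reached: %d, spte %#llx\n",
	       leaf, (unsigned long long)sptes[leaf]);
	return 0;
}
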
1761                                                  
1762 /*                                               
1763  * Returns the last level spte pointer of the    
1764  * gpa, and sets *spte to the spte value. Thi    
1765  * walk could be performed, returns NULL and     
1766  *                                               
1767  * Contract:                                     
1768  *  - Must be called between kvm_tdp_mmu_walk    
1769  *  - The returned sptep must not be used aft    
1770  *                                               
1771  * WARNING: This function is only intended to    
1772  */                                              
1773 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struc    
1774                                         u64 *    
1775 {                                                
1776         struct tdp_iter iter;                    
1777         struct kvm_mmu *mmu = vcpu->arch.mmu;    
1778         tdp_ptep_t sptep = NULL;                 
1779                                                  
1780         tdp_mmu_for_each_pte(iter, mmu, gfn,     
1781                 *spte = iter.old_spte;           
1782                 sptep = iter.sptep;              
1783         }                                        
1784                                                  
1785         /*                                       
1786          * Perform the rcu_dereference to get    
1787          * we are passing it up to fast_page_    
1788          * legacy MMU and thus does not retai    
1789          * annotation.                           
1790          *                                       
1791          * This is safe since fast_page_fault    
1792          * function as well as all TDP MMU co    
1793          * outside of mmu_lock.                  
1794          */                                      
1795         return rcu_dereference(sptep);           
1796 }                                                
1797                                                  
