~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/tools/testing/selftests/kvm/lib/x86_64/processor.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * tools/testing/selftests/kvm/lib/x86_64/processor.c
  4  *
  5  * Copyright (C) 2018, Google LLC.
  6  */
  7 
  8 #include "linux/bitmap.h"
  9 #include "test_util.h"
 10 #include "kvm_util.h"
 11 #include "processor.h"
 12 #include "sev.h"
 13 
 14 #ifndef NUM_INTERRUPTS
 15 #define NUM_INTERRUPTS 256
 16 #endif
 17 
 18 #define KERNEL_CS       0x8
 19 #define KERNEL_DS       0x10
 20 #define KERNEL_TSS      0x18
 21 
 22 #define MAX_NR_CPUID_ENTRIES 100
 23 
 24 vm_vaddr_t exception_handlers;
 25 bool host_cpu_is_amd;
 26 bool host_cpu_is_intel;
 27 bool is_forced_emulation_enabled;
 28 uint64_t guest_tsc_khz;
 29 
 30 static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent)
 31 {
 32         fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
 33                 "rcx: 0x%.16llx rdx: 0x%.16llx\n",
 34                 indent, "",
 35                 regs->rax, regs->rbx, regs->rcx, regs->rdx);
 36         fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
 37                 "rsp: 0x%.16llx rbp: 0x%.16llx\n",
 38                 indent, "",
 39                 regs->rsi, regs->rdi, regs->rsp, regs->rbp);
 40         fprintf(stream, "%*sr8:  0x%.16llx r9:  0x%.16llx "
 41                 "r10: 0x%.16llx r11: 0x%.16llx\n",
 42                 indent, "",
 43                 regs->r8, regs->r9, regs->r10, regs->r11);
 44         fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
 45                 "r14: 0x%.16llx r15: 0x%.16llx\n",
 46                 indent, "",
 47                 regs->r12, regs->r13, regs->r14, regs->r15);
 48         fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
 49                 indent, "",
 50                 regs->rip, regs->rflags);
 51 }
 52 
 53 static void segment_dump(FILE *stream, struct kvm_segment *segment,
 54                          uint8_t indent)
 55 {
 56         fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
 57                 "selector: 0x%.4x type: 0x%.2x\n",
 58                 indent, "", segment->base, segment->limit,
 59                 segment->selector, segment->type);
 60         fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
 61                 "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
 62                 indent, "", segment->present, segment->dpl,
 63                 segment->db, segment->s, segment->l);
 64         fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
 65                 "unusable: 0x%.2x padding: 0x%.2x\n",
 66                 indent, "", segment->g, segment->avl,
 67                 segment->unusable, segment->padding);
 68 }
 69 
 70 static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
 71                         uint8_t indent)
 72 {
 73         fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
 74                 "padding: 0x%.4x 0x%.4x 0x%.4x\n",
 75                 indent, "", dtable->base, dtable->limit,
 76                 dtable->padding[0], dtable->padding[1], dtable->padding[2]);
 77 }
 78 
 79 static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent)
 80 {
 81         unsigned int i;
 82 
 83         fprintf(stream, "%*scs:\n", indent, "");
 84         segment_dump(stream, &sregs->cs, indent + 2);
 85         fprintf(stream, "%*sds:\n", indent, "");
 86         segment_dump(stream, &sregs->ds, indent + 2);
 87         fprintf(stream, "%*ses:\n", indent, "");
 88         segment_dump(stream, &sregs->es, indent + 2);
 89         fprintf(stream, "%*sfs:\n", indent, "");
 90         segment_dump(stream, &sregs->fs, indent + 2);
 91         fprintf(stream, "%*sgs:\n", indent, "");
 92         segment_dump(stream, &sregs->gs, indent + 2);
 93         fprintf(stream, "%*sss:\n", indent, "");
 94         segment_dump(stream, &sregs->ss, indent + 2);
 95         fprintf(stream, "%*str:\n", indent, "");
 96         segment_dump(stream, &sregs->tr, indent + 2);
 97         fprintf(stream, "%*sldt:\n", indent, "");
 98         segment_dump(stream, &sregs->ldt, indent + 2);
 99 
100         fprintf(stream, "%*sgdt:\n", indent, "");
101         dtable_dump(stream, &sregs->gdt, indent + 2);
102         fprintf(stream, "%*sidt:\n", indent, "");
103         dtable_dump(stream, &sregs->idt, indent + 2);
104 
105         fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
106                 "cr3: 0x%.16llx cr4: 0x%.16llx\n",
107                 indent, "",
108                 sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
109         fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
110                 "apic_base: 0x%.16llx\n",
111                 indent, "",
112                 sregs->cr8, sregs->efer, sregs->apic_base);
113 
114         fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
115         for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
116                 fprintf(stream, "%*s%.16llx\n", indent + 2, "",
117                         sregs->interrupt_bitmap[i]);
118         }
119 }
120 
121 bool kvm_is_tdp_enabled(void)
122 {
123         if (host_cpu_is_intel)
124                 return get_kvm_intel_param_bool("ept");
125         else
126                 return get_kvm_amd_param_bool("npt");
127 }
128 
129 void virt_arch_pgd_alloc(struct kvm_vm *vm)
130 {
131         TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
132                 "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
133 
134         /* If needed, create page map l4 table. */
135         if (!vm->pgd_created) {
136                 vm->pgd = vm_alloc_page_table(vm);
137                 vm->pgd_created = true;
138         }
139 }
140 
141 static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte,
142                           uint64_t vaddr, int level)
143 {
144         uint64_t pt_gpa = PTE_GET_PA(*parent_pte);
145         uint64_t *page_table = addr_gpa2hva(vm, pt_gpa);
146         int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
147 
148         TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd,
149                     "Parent PTE (level %d) not PRESENT for gva: 0x%08lx",
150                     level + 1, vaddr);
151 
152         return &page_table[index];
153 }
154 
155 static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
156                                        uint64_t *parent_pte,
157                                        uint64_t vaddr,
158                                        uint64_t paddr,
159                                        int current_level,
160                                        int target_level)
161 {
162         uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level);
163 
164         paddr = vm_untag_gpa(vm, paddr);
165 
166         if (!(*pte & PTE_PRESENT_MASK)) {
167                 *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
168                 if (current_level == target_level)
169                         *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
170                 else
171                         *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
172         } else {
173                 /*
174                  * Entry already present.  Assert that the caller doesn't want
175                  * a hugepage at this level, and that there isn't a hugepage at
176                  * this level.
177                  */
178                 TEST_ASSERT(current_level != target_level,
179                             "Cannot create hugepage at level: %u, vaddr: 0x%lx",
180                             current_level, vaddr);
181                 TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
182                             "Cannot create page table at level: %u, vaddr: 0x%lx",
183                             current_level, vaddr);
184         }
185         return pte;
186 }
187 
188 void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
189 {
190         const uint64_t pg_size = PG_LEVEL_SIZE(level);
191         uint64_t *pml4e, *pdpe, *pde;
192         uint64_t *pte;
193 
194         TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
195                     "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
196 
197         TEST_ASSERT((vaddr % pg_size) == 0,
198                     "Virtual address not aligned,\n"
199                     "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size);
200         TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)),
201                     "Invalid virtual address, vaddr: 0x%lx", vaddr);
202         TEST_ASSERT((paddr % pg_size) == 0,
203                     "Physical address not aligned,\n"
204                     "  paddr: 0x%lx page size: 0x%lx", paddr, pg_size);
205         TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
206                     "Physical address beyond maximum supported,\n"
207                     "  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
208                     paddr, vm->max_gfn, vm->page_size);
209         TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr,
210                     "Unexpected bits in paddr: %lx", paddr);
211 
212         /*
213          * Allocate upper level page tables, if not already present.  Return
214          * early if a hugepage was created.
215          */
216         pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level);
217         if (*pml4e & PTE_LARGE_MASK)
218                 return;
219 
220         pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level);
221         if (*pdpe & PTE_LARGE_MASK)
222                 return;
223 
224         pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level);
225         if (*pde & PTE_LARGE_MASK)
226                 return;
227 
228         /* Fill in page table entry. */
229         pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K);
230         TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
231                     "PTE already present for 4k page at vaddr: 0x%lx", vaddr);
232         *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
233 
234         /*
235          * Neither SEV nor TDX supports shared page tables, so only the final
236          * leaf PTE needs manually set the C/S-bit.
237          */
238         if (vm_is_gpa_protected(vm, paddr))
239                 *pte |= vm->arch.c_bit;
240         else
241                 *pte |= vm->arch.s_bit;
242 }
243 
244 void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
245 {
246         __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K);
247 }
248 
249 void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
250                     uint64_t nr_bytes, int level)
251 {
252         uint64_t pg_size = PG_LEVEL_SIZE(level);
253         uint64_t nr_pages = nr_bytes / pg_size;
254         int i;
255 
256         TEST_ASSERT(nr_bytes % pg_size == 0,
257                     "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx",
258                     nr_bytes, pg_size);
259 
260         for (i = 0; i < nr_pages; i++) {
261                 __virt_pg_map(vm, vaddr, paddr, level);
262 
263                 vaddr += pg_size;
264                 paddr += pg_size;
265         }
266 }
267 
268 static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)
269 {
270         if (*pte & PTE_LARGE_MASK) {
271                 TEST_ASSERT(*level == PG_LEVEL_NONE ||
272                             *level == current_level,
273                             "Unexpected hugepage at level %d", current_level);
274                 *level = current_level;
275         }
276 
277         return *level == current_level;
278 }
279 
280 uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
281                                     int *level)
282 {
283         uint64_t *pml4e, *pdpe, *pde;
284 
285         TEST_ASSERT(!vm->arch.is_pt_protected,
286                     "Walking page tables of protected guests is impossible");
287 
288         TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM,
289                     "Invalid PG_LEVEL_* '%d'", *level);
290 
291         TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
292                 "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
293         TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
294                 (vaddr >> vm->page_shift)),
295                 "Invalid virtual address, vaddr: 0x%lx",
296                 vaddr);
297         /*
298          * Based on the mode check above there are 48 bits in the vaddr, so
299          * shift 16 to sign extend the last bit (bit-47),
300          */
301         TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16),
302                 "Canonical check failed.  The virtual address is invalid.");
303 
304         pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G);
305         if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G))
306                 return pml4e;
307 
308         pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G);
309         if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G))
310                 return pdpe;
311 
312         pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M);
313         if (vm_is_target_pte(pde, level, PG_LEVEL_2M))
314                 return pde;
315 
316         return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K);
317 }
318 
319 uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr)
320 {
321         int level = PG_LEVEL_4K;
322 
323         return __vm_get_page_table_entry(vm, vaddr, &level);
324 }
325 
326 void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
327 {
328         uint64_t *pml4e, *pml4e_start;
329         uint64_t *pdpe, *pdpe_start;
330         uint64_t *pde, *pde_start;
331         uint64_t *pte, *pte_start;
332 
333         if (!vm->pgd_created)
334                 return;
335 
336         fprintf(stream, "%*s                                          "
337                 "                no\n", indent, "");
338         fprintf(stream, "%*s      index hvaddr         gpaddr         "
339                 "addr         w exec dirty\n",
340                 indent, "");
341         pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
342         for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
343                 pml4e = &pml4e_start[n1];
344                 if (!(*pml4e & PTE_PRESENT_MASK))
345                         continue;
346                 fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
347                         " %u\n",
348                         indent, "",
349                         pml4e - pml4e_start, pml4e,
350                         addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
351                         !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
352 
353                 pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
354                 for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
355                         pdpe = &pdpe_start[n2];
356                         if (!(*pdpe & PTE_PRESENT_MASK))
357                                 continue;
358                         fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
359                                 "%u  %u\n",
360                                 indent, "",
361                                 pdpe - pdpe_start, pdpe,
362                                 addr_hva2gpa(vm, pdpe),
363                                 PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
364                                 !!(*pdpe & PTE_NX_MASK));
365 
366                         pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
367                         for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
368                                 pde = &pde_start[n3];
369                                 if (!(*pde & PTE_PRESENT_MASK))
370                                         continue;
371                                 fprintf(stream, "%*spde   0x%-3zx %p "
372                                         "0x%-12lx 0x%-10llx %u  %u\n",
373                                         indent, "", pde - pde_start, pde,
374                                         addr_hva2gpa(vm, pde),
375                                         PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
376                                         !!(*pde & PTE_NX_MASK));
377 
378                                 pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
379                                 for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
380                                         pte = &pte_start[n4];
381                                         if (!(*pte & PTE_PRESENT_MASK))
382                                                 continue;
383                                         fprintf(stream, "%*spte   0x%-3zx %p "
384                                                 "0x%-12lx 0x%-10llx %u  %u "
385                                                 "    %u    0x%-10lx\n",
386                                                 indent, "",
387                                                 pte - pte_start, pte,
388                                                 addr_hva2gpa(vm, pte),
389                                                 PTE_GET_PFN(*pte),
390                                                 !!(*pte & PTE_WRITABLE_MASK),
391                                                 !!(*pte & PTE_NX_MASK),
392                                                 !!(*pte & PTE_DIRTY_MASK),
393                                                 ((uint64_t) n1 << 27)
394                                                         | ((uint64_t) n2 << 18)
395                                                         | ((uint64_t) n3 << 9)
396                                                         | ((uint64_t) n4));
397                                 }
398                         }
399                 }
400         }
401 }
402 
403 /*
404  * Set Unusable Segment
405  *
406  * Input Args: None
407  *
408  * Output Args:
409  *   segp - Pointer to segment register
410  *
411  * Return: None
412  *
413  * Sets the segment register pointed to by @segp to an unusable state.
414  */
415 static void kvm_seg_set_unusable(struct kvm_segment *segp)
416 {
417         memset(segp, 0, sizeof(*segp));
418         segp->unusable = true;
419 }
420 
421 static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
422 {
423         void *gdt = addr_gva2hva(vm, vm->arch.gdt);
424         struct desc64 *desc = gdt + (segp->selector >> 3) * 8;
425 
426         desc->limit0 = segp->limit & 0xFFFF;
427         desc->base0 = segp->base & 0xFFFF;
428         desc->base1 = segp->base >> 16;
429         desc->type = segp->type;
430         desc->s = segp->s;
431         desc->dpl = segp->dpl;
432         desc->p = segp->present;
433         desc->limit1 = segp->limit >> 16;
434         desc->avl = segp->avl;
435         desc->l = segp->l;
436         desc->db = segp->db;
437         desc->g = segp->g;
438         desc->base2 = segp->base >> 24;
439         if (!segp->s)
440                 desc->base3 = segp->base >> 32;
441 }
442 
443 static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp)
444 {
445         memset(segp, 0, sizeof(*segp));
446         segp->selector = KERNEL_CS;
447         segp->limit = 0xFFFFFFFFu;
448         segp->s = 0x1; /* kTypeCodeData */
449         segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
450                                           * | kFlagCodeReadable
451                                           */
452         segp->g = true;
453         segp->l = true;
454         segp->present = 1;
455 }
456 
457 static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp)
458 {
459         memset(segp, 0, sizeof(*segp));
460         segp->selector = KERNEL_DS;
461         segp->limit = 0xFFFFFFFFu;
462         segp->s = 0x1; /* kTypeCodeData */
463         segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
464                                           * | kFlagDataWritable
465                                           */
466         segp->g = true;
467         segp->present = true;
468 }
469 
470 vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
471 {
472         int level = PG_LEVEL_NONE;
473         uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level);
474 
475         TEST_ASSERT(*pte & PTE_PRESENT_MASK,
476                     "Leaf PTE not PRESENT for gva: 0x%08lx", gva);
477 
478         /*
479          * No need for a hugepage mask on the PTE, x86-64 requires the "unused"
480          * address bits to be zero.
481          */
482         return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level));
483 }
484 
485 static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp)
486 {
487         memset(segp, 0, sizeof(*segp));
488         segp->base = base;
489         segp->limit = 0x67;
490         segp->selector = KERNEL_TSS;
491         segp->type = 0xb;
492         segp->present = 1;
493 }
494 
495 static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
496 {
497         struct kvm_sregs sregs;
498 
499         TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K);
500 
501         /* Set mode specific system register values. */
502         vcpu_sregs_get(vcpu, &sregs);
503 
504         sregs.idt.base = vm->arch.idt;
505         sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
506         sregs.gdt.base = vm->arch.gdt;
507         sregs.gdt.limit = getpagesize() - 1;
508 
509         sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
510         sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
511         sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
512 
513         kvm_seg_set_unusable(&sregs.ldt);
514         kvm_seg_set_kernel_code_64bit(&sregs.cs);
515         kvm_seg_set_kernel_data_64bit(&sregs.ds);
516         kvm_seg_set_kernel_data_64bit(&sregs.es);
517         kvm_seg_set_kernel_data_64bit(&sregs.gs);
518         kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr);
519 
520         sregs.cr3 = vm->pgd;
521         vcpu_sregs_set(vcpu, &sregs);
522 }
523 
524 static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
525                           int dpl, unsigned short selector)
526 {
527         struct idt_entry *base =
528                 (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt);
529         struct idt_entry *e = &base[vector];
530 
531         memset(e, 0, sizeof(*e));
532         e->offset0 = addr;
533         e->selector = selector;
534         e->ist = 0;
535         e->type = 14;
536         e->dpl = dpl;
537         e->p = 1;
538         e->offset1 = addr >> 16;
539         e->offset2 = addr >> 32;
540 }
541 
542 static bool kvm_fixup_exception(struct ex_regs *regs)
543 {
544         if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10)
545                 return false;
546 
547         if (regs->vector == DE_VECTOR)
548                 return false;
549 
550         regs->rip = regs->r11;
551         regs->r9 = regs->vector;
552         regs->r10 = regs->error_code;
553         return true;
554 }
555 
556 void route_exception(struct ex_regs *regs)
557 {
558         typedef void(*handler)(struct ex_regs *);
559         handler *handlers = (handler *)exception_handlers;
560 
561         if (handlers && handlers[regs->vector]) {
562                 handlers[regs->vector](regs);
563                 return;
564         }
565 
566         if (kvm_fixup_exception(regs))
567                 return;
568 
569         ucall_assert(UCALL_UNHANDLED,
570                      "Unhandled exception in guest", __FILE__, __LINE__,
571                      "Unhandled exception '0x%lx' at guest RIP '0x%lx'",
572                      regs->vector, regs->rip);
573 }
574 
575 static void vm_init_descriptor_tables(struct kvm_vm *vm)
576 {
577         extern void *idt_handlers;
578         struct kvm_segment seg;
579         int i;
580 
581         vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
582         vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
583         vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
584         vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
585 
586         /* Handlers have the same address in both address spaces.*/
587         for (i = 0; i < NUM_INTERRUPTS; i++)
588                 set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS);
589 
590         *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
591 
592         kvm_seg_set_kernel_code_64bit(&seg);
593         kvm_seg_fill_gdt_64bit(vm, &seg);
594 
595         kvm_seg_set_kernel_data_64bit(&seg);
596         kvm_seg_fill_gdt_64bit(vm, &seg);
597 
598         kvm_seg_set_tss_64bit(vm->arch.tss, &seg);
599         kvm_seg_fill_gdt_64bit(vm, &seg);
600 }
601 
602 void vm_install_exception_handler(struct kvm_vm *vm, int vector,
603                                void (*handler)(struct ex_regs *))
604 {
605         vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers);
606 
607         handlers[vector] = (vm_vaddr_t)handler;
608 }
609 
610 void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
611 {
612         struct ucall uc;
613 
614         if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED)
615                 REPORT_GUEST_ASSERT(uc);
616 }
617 
618 void kvm_arch_vm_post_create(struct kvm_vm *vm)
619 {
620         int r;
621 
622         TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ),
623                     "Require KVM_GET_TSC_KHZ to provide udelay() to guest.");
624 
625         vm_create_irqchip(vm);
626         vm_init_descriptor_tables(vm);
627 
628         sync_global_to_guest(vm, host_cpu_is_intel);
629         sync_global_to_guest(vm, host_cpu_is_amd);
630         sync_global_to_guest(vm, is_forced_emulation_enabled);
631 
632         if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
633                 struct kvm_sev_init init = { 0 };
634 
635                 vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
636         }
637 
638         r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
639         TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
640         guest_tsc_khz = r;
641         sync_global_to_guest(vm, guest_tsc_khz);
642 }
643 
644 void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
645 {
646         struct kvm_regs regs;
647 
648         vcpu_regs_get(vcpu, &regs);
649         regs.rip = (unsigned long) guest_code;
650         vcpu_regs_set(vcpu, &regs);
651 }
652 
653 struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
654 {
655         struct kvm_mp_state mp_state;
656         struct kvm_regs regs;
657         vm_vaddr_t stack_vaddr;
658         struct kvm_vcpu *vcpu;
659 
660         stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
661                                        DEFAULT_GUEST_STACK_VADDR_MIN,
662                                        MEM_REGION_DATA);
663 
664         stack_vaddr += DEFAULT_STACK_PGS * getpagesize();
665 
666         /*
667          * Align stack to match calling sequence requirements in section "The
668          * Stack Frame" of the System V ABI AMD64 Architecture Processor
669          * Supplement, which requires the value (%rsp + 8) to be a multiple of
670          * 16 when control is transferred to the function entry point.
671          *
672          * If this code is ever used to launch a vCPU with 32-bit entry point it
673          * may need to subtract 4 bytes instead of 8 bytes.
674          */
675         TEST_ASSERT(IS_ALIGNED(stack_vaddr, PAGE_SIZE),
676                     "__vm_vaddr_alloc() did not provide a page-aligned address");
677         stack_vaddr -= 8;
678 
679         vcpu = __vm_vcpu_add(vm, vcpu_id);
680         vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
681         vcpu_init_sregs(vm, vcpu);
682 
683         /* Setup guest general purpose registers */
684         vcpu_regs_get(vcpu, &regs);
685         regs.rflags = regs.rflags | 0x2;
686         regs.rsp = stack_vaddr;
687         vcpu_regs_set(vcpu, &regs);
688 
689         /* Setup the MP state */
690         mp_state.mp_state = 0;
691         vcpu_mp_state_set(vcpu, &mp_state);
692 
693         return vcpu;
694 }
695 
696 struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id)
697 {
698         struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id);
699 
700         vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
701 
702         return vcpu;
703 }
704 
705 void vcpu_arch_free(struct kvm_vcpu *vcpu)
706 {
707         if (vcpu->cpuid)
708                 free(vcpu->cpuid);
709 }
710 
711 /* Do not use kvm_supported_cpuid directly except for validity checks. */
712 static void *kvm_supported_cpuid;
713 
714 const struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
715 {
716         int kvm_fd;
717 
718         if (kvm_supported_cpuid)
719                 return kvm_supported_cpuid;
720 
721         kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
722         kvm_fd = open_kvm_dev_path_or_exit();
723 
724         kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID,
725                   (struct kvm_cpuid2 *)kvm_supported_cpuid);
726 
727         close(kvm_fd);
728         return kvm_supported_cpuid;
729 }
730 
731 static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid,
732                               uint32_t function, uint32_t index,
733                               uint8_t reg, uint8_t lo, uint8_t hi)
734 {
735         const struct kvm_cpuid_entry2 *entry;
736         int i;
737 
738         for (i = 0; i < cpuid->nent; i++) {
739                 entry = &cpuid->entries[i];
740 
741                 /*
742                  * The output registers in kvm_cpuid_entry2 are in alphabetical
743                  * order, but kvm_x86_cpu_feature matches that mess, so yay
744                  * pointer shenanigans!
745                  */
746                 if (entry->function == function && entry->index == index)
747                         return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo;
748         }
749 
750         return 0;
751 }
752 
753 bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,
754                    struct kvm_x86_cpu_feature feature)
755 {
756         return __kvm_cpu_has(cpuid, feature.function, feature.index,
757                              feature.reg, feature.bit, feature.bit);
758 }
759 
760 uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid,
761                             struct kvm_x86_cpu_property property)
762 {
763         return __kvm_cpu_has(cpuid, property.function, property.index,
764                              property.reg, property.lo_bit, property.hi_bit);
765 }
766 
767 uint64_t kvm_get_feature_msr(uint64_t msr_index)
768 {
769         struct {
770                 struct kvm_msrs header;
771                 struct kvm_msr_entry entry;
772         } buffer = {};
773         int r, kvm_fd;
774 
775         buffer.header.nmsrs = 1;
776         buffer.entry.index = msr_index;
777         kvm_fd = open_kvm_dev_path_or_exit();
778 
779         r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
780         TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r));
781 
782         close(kvm_fd);
783         return buffer.entry.data;
784 }
785 
786 void __vm_xsave_require_permission(uint64_t xfeature, const char *name)
787 {
788         int kvm_fd;
789         u64 bitmask;
790         long rc;
791         struct kvm_device_attr attr = {
792                 .group = 0,
793                 .attr = KVM_X86_XCOMP_GUEST_SUPP,
794                 .addr = (unsigned long) &bitmask,
795         };
796 
797         TEST_ASSERT(!kvm_supported_cpuid,
798                     "kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM");
799 
800         TEST_ASSERT(is_power_of_2(xfeature),
801                     "Dynamic XFeatures must be enabled one at a time");
802 
803         kvm_fd = open_kvm_dev_path_or_exit();
804         rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
805         close(kvm_fd);
806 
807         if (rc == -1 && (errno == ENXIO || errno == EINVAL))
808                 __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported");
809 
810         TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);
811 
812         __TEST_REQUIRE(bitmask & xfeature,
813                        "Required XSAVE feature '%s' not supported", name);
814 
815         TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, ilog2(xfeature)));
816 
817         rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
818         TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
819         TEST_ASSERT(bitmask & xfeature,
820                     "'%s' (0x%lx) not permitted after prctl(ARCH_REQ_XCOMP_GUEST_PERM) permitted=0x%lx",
821                     name, xfeature, bitmask);
822 }
823 
824 void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid)
825 {
826         TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID");
827 
828         /* Allow overriding the default CPUID. */
829         if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) {
830                 free(vcpu->cpuid);
831                 vcpu->cpuid = NULL;
832         }
833 
834         if (!vcpu->cpuid)
835                 vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent);
836 
837         memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
838         vcpu_set_cpuid(vcpu);
839 }
840 
841 void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu,
842                              struct kvm_x86_cpu_property property,
843                              uint32_t value)
844 {
845         struct kvm_cpuid_entry2 *entry;
846 
847         entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index);
848 
849         (&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit);
850         (&entry->eax)[property.reg] |= value << property.lo_bit;
851 
852         vcpu_set_cpuid(vcpu);
853 
854         /* Sanity check that @value doesn't exceed the bounds in any way. */
855         TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value);
856 }
857 
858 void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function)
859 {
860         struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function);
861 
862         entry->eax = 0;
863         entry->ebx = 0;
864         entry->ecx = 0;
865         entry->edx = 0;
866         vcpu_set_cpuid(vcpu);
867 }
868 
869 void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu,
870                                      struct kvm_x86_cpu_feature feature,
871                                      bool set)
872 {
873         struct kvm_cpuid_entry2 *entry;
874         u32 *reg;
875 
876         entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index);
877         reg = (&entry->eax) + feature.reg;
878 
879         if (set)
880                 *reg |= BIT(feature.bit);
881         else
882                 *reg &= ~BIT(feature.bit);
883 
884         vcpu_set_cpuid(vcpu);
885 }
886 
887 uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index)
888 {
889         struct {
890                 struct kvm_msrs header;
891                 struct kvm_msr_entry entry;
892         } buffer = {};
893 
894         buffer.header.nmsrs = 1;
895         buffer.entry.index = msr_index;
896 
897         vcpu_msrs_get(vcpu, &buffer.header);
898 
899         return buffer.entry.data;
900 }
901 
902 int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value)
903 {
904         struct {
905                 struct kvm_msrs header;
906                 struct kvm_msr_entry entry;
907         } buffer = {};
908 
909         memset(&buffer, 0, sizeof(buffer));
910         buffer.header.nmsrs = 1;
911         buffer.entry.index = msr_index;
912         buffer.entry.data = msr_value;
913 
914         return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header);
915 }
916 
917 void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
918 {
919         va_list ap;
920         struct kvm_regs regs;
921 
922         TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
923                     "  num: %u",
924                     num);
925 
926         va_start(ap, num);
927         vcpu_regs_get(vcpu, &regs);
928 
929         if (num >= 1)
930                 regs.rdi = va_arg(ap, uint64_t);
931 
932         if (num >= 2)
933                 regs.rsi = va_arg(ap, uint64_t);
934 
935         if (num >= 3)
936                 regs.rdx = va_arg(ap, uint64_t);
937 
938         if (num >= 4)
939                 regs.rcx = va_arg(ap, uint64_t);
940 
941         if (num >= 5)
942                 regs.r8 = va_arg(ap, uint64_t);
943 
944         if (num >= 6)
945                 regs.r9 = va_arg(ap, uint64_t);
946 
947         vcpu_regs_set(vcpu, &regs);
948         va_end(ap);
949 }
950 
951 void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent)
952 {
953         struct kvm_regs regs;
954         struct kvm_sregs sregs;
955 
956         fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id);
957 
958         fprintf(stream, "%*sregs:\n", indent + 2, "");
959         vcpu_regs_get(vcpu, &regs);
960         regs_dump(stream, &regs, indent + 4);
961 
962         fprintf(stream, "%*ssregs:\n", indent + 2, "");
963         vcpu_sregs_get(vcpu, &sregs);
964         sregs_dump(stream, &sregs, indent + 4);
965 }
966 
967 static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs)
968 {
969         struct kvm_msr_list *list;
970         struct kvm_msr_list nmsrs;
971         int kvm_fd, r;
972 
973         kvm_fd = open_kvm_dev_path_or_exit();
974 
975         nmsrs.nmsrs = 0;
976         if (!feature_msrs)
977                 r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
978         else
979                 r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs);
980 
981         TEST_ASSERT(r == -1 && errno == E2BIG,
982                     "Expected -E2BIG, got rc: %i errno: %i (%s)",
983                     r, errno, strerror(errno));
984 
985         list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0]));
986         TEST_ASSERT(list, "-ENOMEM when allocating MSR index list");
987         list->nmsrs = nmsrs.nmsrs;
988 
989         if (!feature_msrs)
990                 kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
991         else
992                 kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list);
993         close(kvm_fd);
994 
995         TEST_ASSERT(list->nmsrs == nmsrs.nmsrs,
996                     "Number of MSRs in list changed, was %d, now %d",
997                     nmsrs.nmsrs, list->nmsrs);
998         return list;
999 }
1000 
1001 const struct kvm_msr_list *kvm_get_msr_index_list(void)
1002 {
1003         static const struct kvm_msr_list *list;
1004 
1005         if (!list)
1006                 list = __kvm_get_msr_index_list(false);
1007         return list;
1008 }
1009 
1010 
1011 const struct kvm_msr_list *kvm_get_feature_msr_index_list(void)
1012 {
1013         static const struct kvm_msr_list *list;
1014 
1015         if (!list)
1016                 list = __kvm_get_msr_index_list(true);
1017         return list;
1018 }
1019 
1020 bool kvm_msr_is_in_save_restore_list(uint32_t msr_index)
1021 {
1022         const struct kvm_msr_list *list = kvm_get_msr_index_list();
1023         int i;
1024 
1025         for (i = 0; i < list->nmsrs; ++i) {
1026                 if (list->indices[i] == msr_index)
1027                         return true;
1028         }
1029 
1030         return false;
1031 }
1032 
1033 static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu,
1034                                   struct kvm_x86_state *state)
1035 {
1036         int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2);
1037 
1038         if (size) {
1039                 state->xsave = malloc(size);
1040                 vcpu_xsave2_get(vcpu, state->xsave);
1041         } else {
1042                 state->xsave = malloc(sizeof(struct kvm_xsave));
1043                 vcpu_xsave_get(vcpu, state->xsave);
1044         }
1045 }
1046 
1047 struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu)
1048 {
1049         const struct kvm_msr_list *msr_list = kvm_get_msr_index_list();
1050         struct kvm_x86_state *state;
1051         int i;
1052 
1053         static int nested_size = -1;
1054 
1055         if (nested_size == -1) {
1056                 nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
1057                 TEST_ASSERT(nested_size <= sizeof(state->nested_),
1058                             "Nested state size too big, %i > %zi",
1059                             nested_size, sizeof(state->nested_));
1060         }
1061 
1062         /*
1063          * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
1064          * guest state is consistent only after userspace re-enters the
1065          * kernel with KVM_RUN.  Complete IO prior to migrating state
1066          * to a new VM.
1067          */
1068         vcpu_run_complete_io(vcpu);
1069 
1070         state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0]));
1071         TEST_ASSERT(state, "-ENOMEM when allocating kvm state");
1072 
1073         vcpu_events_get(vcpu, &state->events);
1074         vcpu_mp_state_get(vcpu, &state->mp_state);
1075         vcpu_regs_get(vcpu, &state->regs);
1076         vcpu_save_xsave_state(vcpu, state);
1077 
1078         if (kvm_has_cap(KVM_CAP_XCRS))
1079                 vcpu_xcrs_get(vcpu, &state->xcrs);
1080 
1081         vcpu_sregs_get(vcpu, &state->sregs);
1082 
1083         if (nested_size) {
1084                 state->nested.size = sizeof(state->nested_);
1085 
1086                 vcpu_nested_state_get(vcpu, &state->nested);
1087                 TEST_ASSERT(state->nested.size <= nested_size,
1088                             "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
1089                             state->nested.size, nested_size);
1090         } else {
1091                 state->nested.size = 0;
1092         }
1093 
1094         state->msrs.nmsrs = msr_list->nmsrs;
1095         for (i = 0; i < msr_list->nmsrs; i++)
1096                 state->msrs.entries[i].index = msr_list->indices[i];
1097         vcpu_msrs_get(vcpu, &state->msrs);
1098 
1099         vcpu_debugregs_get(vcpu, &state->debugregs);
1100 
1101         return state;
1102 }
1103 
1104 void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state)
1105 {
1106         vcpu_sregs_set(vcpu, &state->sregs);
1107         vcpu_msrs_set(vcpu, &state->msrs);
1108 
1109         if (kvm_has_cap(KVM_CAP_XCRS))
1110                 vcpu_xcrs_set(vcpu, &state->xcrs);
1111 
1112         vcpu_xsave_set(vcpu,  state->xsave);
1113         vcpu_events_set(vcpu, &state->events);
1114         vcpu_mp_state_set(vcpu, &state->mp_state);
1115         vcpu_debugregs_set(vcpu, &state->debugregs);
1116         vcpu_regs_set(vcpu, &state->regs);
1117 
1118         if (state->nested.size)
1119                 vcpu_nested_state_set(vcpu, &state->nested);
1120 }
1121 
1122 void kvm_x86_state_cleanup(struct kvm_x86_state *state)
1123 {
1124         free(state->xsave);
1125         free(state);
1126 }
1127 
1128 void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
1129 {
1130         if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) {
1131                 *pa_bits = kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32;
1132                 *va_bits = 32;
1133         } else {
1134                 *pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
1135                 *va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR);
1136         }
1137 }
1138 
1139 void kvm_init_vm_address_properties(struct kvm_vm *vm)
1140 {
1141         if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
1142                 vm->arch.sev_fd = open_sev_dev_path_or_exit();
1143                 vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
1144                 vm->gpa_tag_mask = vm->arch.c_bit;
1145         } else {
1146                 vm->arch.sev_fd = -1;
1147         }
1148 }
1149 
1150 const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid,
1151                                                uint32_t function, uint32_t index)
1152 {
1153         int i;
1154 
1155         for (i = 0; i < cpuid->nent; i++) {
1156                 if (cpuid->entries[i].function == function &&
1157                     cpuid->entries[i].index == index)
1158                         return &cpuid->entries[i];
1159         }
1160 
1161         TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
1162 
1163         return NULL;
1164 }
1165 
1166 #define X86_HYPERCALL(inputs...)                                        \
1167 ({                                                                      \
1168         uint64_t r;                                                     \
1169                                                                         \
1170         asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t"          \
1171                      "jnz 1f\n\t"                                       \
1172                      "vmcall\n\t"                                       \
1173                      "jmp 2f\n\t"                                       \
1174                      "1: vmmcall\n\t"                                   \
1175                      "2:"                                               \
1176                      : "=a"(r)                                          \
1177                      : [use_vmmcall] "r" (host_cpu_is_amd), inputs);    \
1178                                                                         \
1179         r;                                                              \
1180 })
1181 
1182 uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
1183                        uint64_t a3)
1184 {
1185         return X86_HYPERCALL("a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3));
1186 }
1187 
1188 uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1)
1189 {
1190         return X86_HYPERCALL("a"(nr), "D"(a0), "S"(a1));
1191 }
1192 
1193 void xen_hypercall(uint64_t nr, uint64_t a0, void *a1)
1194 {
1195         GUEST_ASSERT(!__xen_hypercall(nr, a0, a1));
1196 }
1197 
1198 const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void)
1199 {
1200         static struct kvm_cpuid2 *cpuid;
1201         int kvm_fd;
1202 
1203         if (cpuid)
1204                 return cpuid;
1205 
1206         cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
1207         kvm_fd = open_kvm_dev_path_or_exit();
1208 
1209         kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1210 
1211         close(kvm_fd);
1212         return cpuid;
1213 }
1214 
1215 void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu)
1216 {
1217         static struct kvm_cpuid2 *cpuid_full;
1218         const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv;
1219         int i, nent = 0;
1220 
1221         if (!cpuid_full) {
1222                 cpuid_sys = kvm_get_supported_cpuid();
1223                 cpuid_hv = kvm_get_supported_hv_cpuid();
1224 
1225                 cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent);
1226                 if (!cpuid_full) {
1227                         perror("malloc");
1228                         abort();
1229                 }
1230 
1231                 /* Need to skip KVM CPUID leaves 0x400000xx */
1232                 for (i = 0; i < cpuid_sys->nent; i++) {
1233                         if (cpuid_sys->entries[i].function >= 0x40000000 &&
1234                             cpuid_sys->entries[i].function < 0x40000100)
1235                                 continue;
1236                         cpuid_full->entries[nent] = cpuid_sys->entries[i];
1237                         nent++;
1238                 }
1239 
1240                 memcpy(&cpuid_full->entries[nent], cpuid_hv->entries,
1241                        cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2));
1242                 cpuid_full->nent = nent + cpuid_hv->nent;
1243         }
1244 
1245         vcpu_init_cpuid(vcpu, cpuid_full);
1246 }
1247 
1248 const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu)
1249 {
1250         struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
1251 
1252         vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1253 
1254         return cpuid;
1255 }
1256 
1257 unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
1258 {
1259         const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
1260         unsigned long ht_gfn, max_gfn, max_pfn;
1261         uint8_t maxphyaddr, guest_maxphyaddr;
1262 
1263         /*
1264          * Use "guest MAXPHYADDR" from KVM if it's available.  Guest MAXPHYADDR
1265          * enumerates the max _mappable_ GPA, which can be less than the raw
1266          * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU
1267          * doesn't support 5-level TDP.
1268          */
1269         guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR);
1270         guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits;
1271         TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits,
1272                     "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR");
1273 
1274         max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1;
1275 
1276         /* Avoid reserved HyperTransport region on AMD processors.  */
1277         if (!host_cpu_is_amd)
1278                 return max_gfn;
1279 
1280         /* On parts with <40 physical address bits, the area is fully hidden */
1281         if (vm->pa_bits < 40)
1282                 return max_gfn;
1283 
1284         /* Before family 17h, the HyperTransport area is just below 1T.  */
1285         ht_gfn = (1 << 28) - num_ht_pages;
1286         if (this_cpu_family() < 0x17)
1287                 goto done;
1288 
1289         /*
1290          * Otherwise it's at the top of the physical address space, possibly
1291          * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX.  Use
1292          * the old conservative value if MAXPHYADDR is not enumerated.
1293          */
1294         if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR))
1295                 goto done;
1296 
1297         maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
1298         max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1;
1299 
1300         if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION))
1301                 max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION);
1302 
1303         ht_gfn = max_pfn - num_ht_pages;
1304 done:
1305         return min(max_gfn, ht_gfn - 1);
1306 }
1307 
1308 /* Returns true if kvm_intel was loaded with unrestricted_guest=1. */
1309 bool vm_is_unrestricted_guest(struct kvm_vm *vm)
1310 {
1311         /* Ensure that a KVM vendor-specific module is loaded. */
1312         if (vm == NULL)
1313                 close(open_kvm_dev_path_or_exit());
1314 
1315         return get_kvm_intel_param_bool("unrestricted_guest");
1316 }
1317 
1318 void kvm_selftest_arch_init(void)
1319 {
1320         host_cpu_is_intel = this_cpu_is_intel();
1321         host_cpu_is_amd = this_cpu_is_amd();
1322         is_forced_emulation_enabled = kvm_is_forced_emulation_enabled();
1323 }
1324 
1325 bool sys_clocksource_is_based_on_tsc(void)
1326 {
1327         char *clk_name = sys_get_cur_clocksource();
1328         bool ret = !strcmp(clk_name, "tsc\n") ||
1329                    !strcmp(clk_name, "hyperv_clocksource_tsc_page\n");
1330 
1331         free(clk_name);
1332 
1333         return ret;
1334 }
1335 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php