TOMOYO Linux Cross Reference
Linux/arch/x86/virt/vmx/tdx/tdx.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright(c) 2023 Intel Corporation.
  4  *
  5  * Intel Trust Domain Extensions (TDX) support
  6  */
  7 
  8 #define pr_fmt(fmt)     "virt/tdx: " fmt
  9 
 10 #include <linux/types.h>
 11 #include <linux/cache.h>
 12 #include <linux/init.h>
 13 #include <linux/errno.h>
 14 #include <linux/printk.h>
 15 #include <linux/cpu.h>
 16 #include <linux/spinlock.h>
 17 #include <linux/percpu-defs.h>
 18 #include <linux/mutex.h>
 19 #include <linux/list.h>
 20 #include <linux/memblock.h>
 21 #include <linux/memory.h>
 22 #include <linux/minmax.h>
 23 #include <linux/sizes.h>
 24 #include <linux/pfn.h>
 25 #include <linux/align.h>
 26 #include <linux/sort.h>
 27 #include <linux/log2.h>
 28 #include <linux/acpi.h>
 29 #include <linux/suspend.h>
 30 #include <asm/page.h>
 31 #include <asm/special_insns.h>
 32 #include <asm/msr-index.h>
 33 #include <asm/msr.h>
 34 #include <asm/cpufeature.h>
 35 #include <asm/tdx.h>
 36 #include <asm/cpu_device_id.h>
 37 #include <asm/processor.h>
 38 #include <asm/mce.h>
 39 #include "tdx.h"
 40 
 41 static u32 tdx_global_keyid __ro_after_init;
 42 static u32 tdx_guest_keyid_start __ro_after_init;
 43 static u32 tdx_nr_guest_keyids __ro_after_init;
 44 
 45 static DEFINE_PER_CPU(bool, tdx_lp_initialized);
 46 
 47 static struct tdmr_info_list tdx_tdmr_list;
 48 
 49 static enum tdx_module_status_t tdx_module_status;
 50 static DEFINE_MUTEX(tdx_module_lock);
 51 
 52 /* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
 53 static LIST_HEAD(tdx_memlist);
 54 
 55 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
 56 
 57 static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
 58 {
 59         pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
 60 }
 61 
 62 static inline void seamcall_err_ret(u64 fn, u64 err,
 63                                     struct tdx_module_args *args)
 64 {
 65         seamcall_err(fn, err, args);
 66         pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
 67                         args->rcx, args->rdx, args->r8);
 68         pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
 69                         args->r9, args->r10, args->r11);
 70 }
 71 
 72 static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
 73                                  u64 fn, struct tdx_module_args *args)
 74 {
 75         u64 sret = sc_retry(func, fn, args);
 76 
 77         if (sret == TDX_SUCCESS)
 78                 return 0;
 79 
 80         if (sret == TDX_SEAMCALL_VMFAILINVALID)
 81                 return -ENODEV;
 82 
 83         if (sret == TDX_SEAMCALL_GP)
 84                 return -EOPNOTSUPP;
 85 
 86         if (sret == TDX_SEAMCALL_UD)
 87                 return -EACCES;
 88 
 89         err_func(fn, sret, args);
 90         return -EIO;
 91 }
 92 
 93 #define seamcall_prerr(__fn, __args)                                            \
 94         sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
 95 
 96 #define seamcall_prerr_ret(__fn, __args)                                        \
 97         sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
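/*
 * Illustrative usage sketch (editorial note, not part of the kernel
 * source): callers pass a SEAMCALL leaf function ID plus an argument
 * struct, and the wrappers above collapse the raw SEAMCALL status into
 * an errno:
 *
 *	struct tdx_module_args args = { .rdx = some_field_id };
 *	int ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
 *
 *	ret ==  0           success, outputs (e.g. args.r8) are valid
 *	ret == -ENODEV      TDX module not loaded (VMFAILINVALID)
 *	ret == -EOPNOTSUPP  SEAMCALL raised #GP
 *	ret == -EACCES      SEAMCALL raised #UD
 *	ret == -EIO         any other module error (already printed)
 *
 * 'some_field_id' is a placeholder; real callers such as
 * read_sys_metadata_field() below use concrete metadata field IDs.
 */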
 98 
 99 /*
100  * Do the module global initialization once and return its result.
101  * It can be done on any cpu.  It's always called with interrupts
102  * disabled.
103  */
104 static int try_init_module_global(void)
105 {
106         struct tdx_module_args args = {};
107         static DEFINE_RAW_SPINLOCK(sysinit_lock);
108         static bool sysinit_done;
109         static int sysinit_ret;
110 
111         lockdep_assert_irqs_disabled();
112 
113         raw_spin_lock(&sysinit_lock);
114 
115         if (sysinit_done)
116                 goto out;
117 
118         /* RCX is module attributes and all bits are reserved */
119         args.rcx = 0;
120         sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
121 
122         /*
123          * The first SEAMCALL also detects the TDX module, thus
124          * it can fail because the TDX module is not loaded.
125          * Dump message to let the user know.
126          */
127         if (sysinit_ret == -ENODEV)
128                 pr_err("module not loaded\n");
129 
130         sysinit_done = true;
131 out:
132         raw_spin_unlock(&sysinit_lock);
133         return sysinit_ret;
134 }
135 
136 /**
137  * tdx_cpu_enable - Enable TDX on local cpu
138  *
139  * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
140  * global initialization SEAMCALL if not done) on local cpu to make this
141  * cpu ready to run any other SEAMCALLs.
142  *
143  * Always call this function via IPI function calls.
144  *
145  * Return 0 on success, otherwise errors.
146  */
147 int tdx_cpu_enable(void)
148 {
149         struct tdx_module_args args = {};
150         int ret;
151 
152         if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
153                 return -ENODEV;
154 
155         lockdep_assert_irqs_disabled();
156 
157         if (__this_cpu_read(tdx_lp_initialized))
158                 return 0;
159 
160         /*
161          * The TDX module global initialization is the very first step
162          * to enable TDX.  Need to do it first (if it hasn't been done)
163          * before the per-cpu initialization.
164          */
165         ret = try_init_module_global();
166         if (ret)
167                 return ret;
168 
169         ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
170         if (ret)
171                 return ret;
172 
173         __this_cpu_write(tdx_lp_initialized, true);
174 
175         return 0;
176 }
177 EXPORT_SYMBOL_GPL(tdx_cpu_enable);
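/*
 * Illustrative usage sketch (editorial note, not part of the kernel
 * source): tdx_cpu_enable() must run with interrupts disabled, so a
 * caller such as KVM would typically invoke it from an IPI callback,
 * for example:
 *
 *	static void do_tdx_cpu_enable(void *failed)
 *	{
 *		if (tdx_cpu_enable())
 *			*(bool *)failed = true;
 *	}
 *
 *	bool failed = false;
 *	on_each_cpu(do_tdx_cpu_enable, &failed, true);
 *
 * 'do_tdx_cpu_enable' and 'failed' are hypothetical names used only for
 * illustration; real callers are also expected to have done VMXON on
 * each CPU beforehand.
 */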
178 
179 /*
180  * Add a memory region as a TDX memory block.  The caller must make sure
181  * all memory regions are added in address ascending order and don't
182  * overlap.
183  */
184 static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
185                             unsigned long end_pfn, int nid)
186 {
187         struct tdx_memblock *tmb;
188 
189         tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
190         if (!tmb)
191                 return -ENOMEM;
192 
193         INIT_LIST_HEAD(&tmb->list);
194         tmb->start_pfn = start_pfn;
195         tmb->end_pfn = end_pfn;
196         tmb->nid = nid;
197 
198         /* @tmb_list is protected by mem_hotplug_lock */
199         list_add_tail(&tmb->list, tmb_list);
200         return 0;
201 }
202 
203 static void free_tdx_memlist(struct list_head *tmb_list)
204 {
205         /* @tmb_list is protected by mem_hotplug_lock */
206         while (!list_empty(tmb_list)) {
207                 struct tdx_memblock *tmb = list_first_entry(tmb_list,
208                                 struct tdx_memblock, list);
209 
210                 list_del(&tmb->list);
211                 kfree(tmb);
212         }
213 }
214 
215 /*
216  * Ensure that all memblock memory regions are convertible to TDX
217  * memory.  Once this has been established, stash the memblock
218  * ranges off in a secondary structure because memblock is modified
219  * in memory hotplug while TDX memory regions are fixed.
220  */
221 static int build_tdx_memlist(struct list_head *tmb_list)
222 {
223         unsigned long start_pfn, end_pfn;
224         int i, nid, ret;
225 
226         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
227                 /*
228                  * The first 1MB is not reported as TDX convertible memory.
229                  * Although the first 1MB is always reserved and won't end up
230                  * in the page allocator, it is still in memblock's memory
231                  * regions.  Skip them manually to exclude them as TDX memory.
232                  */
233                 start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
234                 if (start_pfn >= end_pfn)
235                         continue;
236 
237                 /*
238                  * Add the memory regions as TDX memory.  The regions in
239                  * memblock are already guaranteed to be in address
240                  * ascending order and to not overlap.
241                  */
242                 ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
243                 if (ret)
244                         goto err;
245         }
246 
247         return 0;
248 err:
249         free_tdx_memlist(tmb_list);
250         return ret;
251 }
252 
253 static int read_sys_metadata_field(u64 field_id, u64 *data)
254 {
255         struct tdx_module_args args = {};
256         int ret;
257 
258         /*
259          * TDH.SYS.RD -- reads one global metadata field
260          *  - RDX (in): the field to read
261          *  - R8 (out): the field data
262          */
263         args.rdx = field_id;
264         ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
265         if (ret)
266                 return ret;
267 
268         *data = args.r8;
269 
270         return 0;
271 }
272 
273 static int read_sys_metadata_field16(u64 field_id,
274                                      int offset,
275                                      struct tdx_tdmr_sysinfo *ts)
276 {
277         u16 *ts_member = ((void *)ts) + offset;
278         u64 tmp;
279         int ret;
280 
281         if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) !=
282                         MD_FIELD_ID_ELE_SIZE_16BIT))
283                 return -EINVAL;
284 
285         ret = read_sys_metadata_field(field_id, &tmp);
286         if (ret)
287                 return ret;
288 
289         *ts_member = tmp;
290 
291         return 0;
292 }
293 
294 struct field_mapping {
295         u64 field_id;
296         int offset;
297 };
298 
299 #define TD_SYSINFO_MAP(_field_id, _offset) \
300         { .field_id = MD_FIELD_ID_##_field_id,     \
301           .offset   = offsetof(struct tdx_tdmr_sysinfo, _offset) }
302 
303 /* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */
304 static const struct field_mapping fields[] = {
305         TD_SYSINFO_MAP(MAX_TDMRS,             max_tdmrs),
306         TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr),
307         TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE,    pamt_entry_size[TDX_PS_4K]),
308         TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE,    pamt_entry_size[TDX_PS_2M]),
309         TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE,    pamt_entry_size[TDX_PS_1G]),
310 };
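/*
 * Editorial note: for illustration, TD_SYSINFO_MAP(MAX_TDMRS, max_tdmrs)
 * above expands to roughly:
 *
 *	{ .field_id = MD_FIELD_ID_MAX_TDMRS,
 *	  .offset   = offsetof(struct tdx_tdmr_sysinfo, max_tdmrs) }
 *
 * i.e. each table entry pairs a global metadata field ID with the byte
 * offset of the struct member that receives its value.
 */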
311 
312 static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo)
313 {
314         int ret;
315         int i;
316 
317         /* Populate 'tdmr_sysinfo' fields using the mapping structure above: */
318         for (i = 0; i < ARRAY_SIZE(fields); i++) {
319                 ret = read_sys_metadata_field16(fields[i].field_id,
320                                                 fields[i].offset,
321                                                 tdmr_sysinfo);
322                 if (ret)
323                         return ret;
324         }
325 
326         return 0;
327 }
328 
329 /* Calculate the actual TDMR size */
330 static int tdmr_size_single(u16 max_reserved_per_tdmr)
331 {
332         int tdmr_sz;
333 
334         /*
335          * The actual size of TDMR depends on the maximum
336          * number of reserved areas.
337          */
338         tdmr_sz = sizeof(struct tdmr_info);
339         tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
340 
341         return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
342 }
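/*
 * Editorial example (illustrative, with assumed numbers): 'struct
 * tdmr_reserved_area' holds two u64s (16 bytes), so with a hypothetical
 * max_reserved_per_tdmr of 16 the raw size would be
 * sizeof(struct tdmr_info) + 16 * 16 bytes.  ALIGN() then rounds that
 * up to the next TDMR_INFO_ALIGNMENT boundary so that, in the single
 * contiguous allocation below, every TDMR_INFO entry stays aligned.
 */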
343 
344 static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
345                            struct tdx_tdmr_sysinfo *tdmr_sysinfo)
346 {
347         size_t tdmr_sz, tdmr_array_sz;
348         void *tdmr_array;
349 
350         tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr);
351         tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs;
352 
353         /*
354          * To keep things simple, allocate all TDMRs together.
355          * The buffer needs to be physically contiguous to make
356          * sure each TDMR is physically contiguous.
357          */
358         tdmr_array = alloc_pages_exact(tdmr_array_sz,
359                         GFP_KERNEL | __GFP_ZERO);
360         if (!tdmr_array)
361                 return -ENOMEM;
362 
363         tdmr_list->tdmrs = tdmr_array;
364 
365         /*
366          * Keep the size of TDMR to find the target TDMR
367          * at a given index in the TDMR list.
368          */
369         tdmr_list->tdmr_sz = tdmr_sz;
370         tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs;
371         tdmr_list->nr_consumed_tdmrs = 0;
372 
373         return 0;
374 }
375 
376 static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
377 {
378         free_pages_exact(tdmr_list->tdmrs,
379                         tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
380 }
381 
382 /* Get the TDMR from the list at the given index. */
383 static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
384                                     int idx)
385 {
386         int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
387 
388         return (void *)tdmr_list->tdmrs + tdmr_info_offset;
389 }
390 
391 #define TDMR_ALIGNMENT          SZ_1G
392 #define TDMR_ALIGN_DOWN(_addr)  ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
393 #define TDMR_ALIGN_UP(_addr)    ALIGN((_addr), TDMR_ALIGNMENT)
394 
395 static inline u64 tdmr_end(struct tdmr_info *tdmr)
396 {
397         return tdmr->base + tdmr->size;
398 }
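/*
 * Editorial example: with TDMR_ALIGNMENT of 1G, a hypothetical memory
 * region spanning [0x40200000, 0x80400000) is covered by a TDMR with
 * base TDMR_ALIGN_DOWN(0x40200000) = 0x40000000 and end
 * TDMR_ALIGN_UP(0x80400000) = 0xC0000000, i.e. a 2G TDMR.  The parts
 * of that TDMR which are not actual memory are handled later as
 * reserved areas.
 */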
399 
400 /*
401  * Take the memory referenced in @tmb_list and populate the
402  * preallocated @tdmr_list, following all the special alignment
403  * and size rules for TDMR.
404  */
405 static int fill_out_tdmrs(struct list_head *tmb_list,
406                           struct tdmr_info_list *tdmr_list)
407 {
408         struct tdx_memblock *tmb;
409         int tdmr_idx = 0;
410 
411         /*
412          * Loop over TDX memory regions and fill out TDMRs to cover them.
413          * To keep it simple, always try to use one TDMR to cover one
414          * memory region.
415          *
416          * In practice TDX supports at least 64 TDMRs.  A 2-socket system
417          * typically consumes fewer than 10 of those.  This code is
418          * dumb and simple and may use more TDMRs than is strictly
419          * required.
420          */
421         list_for_each_entry(tmb, tmb_list, list) {
422                 struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
423                 u64 start, end;
424 
425                 start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
426                 end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
427 
428                 /*
429                  * A valid size indicates the current TDMR has already
430                  * been filled out to cover the previous memory region(s).
431                  */
432                 if (tdmr->size) {
433                         /*
434                          * Loop to the next if the current memory region
435                          * has already been fully covered.
436                          */
437                         if (end <= tdmr_end(tdmr))
438                                 continue;
439 
440                         /* Otherwise, skip the already covered part. */
441                         if (start < tdmr_end(tdmr))
442                                 start = tdmr_end(tdmr);
443 
444                         /*
445                          * Create a new TDMR to cover the current memory
446                          * region, or the remaining part of it.
447                          */
448                         tdmr_idx++;
449                         if (tdmr_idx >= tdmr_list->max_tdmrs) {
450                                 pr_warn("initialization failed: TDMRs exhausted.\n");
451                                 return -ENOSPC;
452                         }
453 
454                         tdmr = tdmr_entry(tdmr_list, tdmr_idx);
455                 }
456 
457                 tdmr->base = start;
458                 tdmr->size = end - start;
459         }
460 
461         /* @tdmr_idx is always the index of the last valid TDMR. */
462         tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
463 
464         /*
465          * Warn early that kernel is about to run out of TDMRs.
466          *
467          * This is an indication that TDMR allocation has to be
468          * reworked to be smarter to not run into an issue.
469          */
470         if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
471                 pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
472                                 tdmr_list->nr_consumed_tdmrs,
473                                 tdmr_list->max_tdmrs);
474 
475         return 0;
476 }
477 
478 /*
479  * Calculate PAMT size given a TDMR and a page size.  The returned
480  * PAMT size is always aligned up to 4K page boundary.
481  */
482 static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
483                                       u16 pamt_entry_size)
484 {
485         unsigned long pamt_sz, nr_pamt_entries;
486 
487         switch (pgsz) {
488         case TDX_PS_4K:
489                 nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
490                 break;
491         case TDX_PS_2M:
492                 nr_pamt_entries = tdmr->size >> PMD_SHIFT;
493                 break;
494         case TDX_PS_1G:
495                 nr_pamt_entries = tdmr->size >> PUD_SHIFT;
496                 break;
497         default:
498                 WARN_ON_ONCE(1);
499                 return 0;
500         }
501 
502         pamt_sz = nr_pamt_entries * pamt_entry_size;
503         /* TDX requires the PAMT size to be 4K aligned */
504         pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
505 
506         return pamt_sz;
507 }
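/*
 * Editorial example (assumed entry size): for a hypothetical 1G TDMR
 * and a 16-byte 4K PAMT entry size reported by the TDX module, the
 * 4K-level PAMT needs 1G / 4K = 262144 entries, i.e. 4M, which is
 * already 4K aligned.  The 2M and 1G levels need proportionally fewer
 * entries (512 and 1 respectively).
 */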
508 
509 /*
510  * Locate a NUMA node which should hold the allocation of the @tdmr
511  * PAMT.  This node will have some memory covered by the TDMR.  The
512  * relative amount of memory covered is not considered.
513  */
514 static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
515 {
516         struct tdx_memblock *tmb;
517 
518         /*
519          * A TDMR must cover at least part of one TMB.  That TMB will end
520          * after the TDMR begins.  But, that TMB may have started before
521          * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
522          * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
523          */
524         list_for_each_entry(tmb, tmb_list, list) {
525                 if (tmb->end_pfn > PHYS_PFN(tdmr->base))
526                         return tmb->nid;
527         }
528 
529         /*
530          * Fall back to allocating the TDMR's metadata from node 0 when
531          * no TDX memory block can be found.  This should never happen
532          * since TDMRs originate from TDX memory blocks.
533          */
534         pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
535                         tdmr->base, tdmr_end(tdmr));
536         return 0;
537 }
538 
539 /*
540  * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
541  * within @tdmr, and set up PAMTs for @tdmr.
542  */
543 static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
544                             struct list_head *tmb_list,
545                             u16 pamt_entry_size[])
546 {
547         unsigned long pamt_base[TDX_PS_NR];
548         unsigned long pamt_size[TDX_PS_NR];
549         unsigned long tdmr_pamt_base;
550         unsigned long tdmr_pamt_size;
551         struct page *pamt;
552         int pgsz, nid;
553 
554         nid = tdmr_get_nid(tdmr, tmb_list);
555 
556         /*
557          * Calculate the PAMT size for each TDX supported page size
558          * and the total PAMT size.
559          */
560         tdmr_pamt_size = 0;
561         for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
562                 pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
563                                         pamt_entry_size[pgsz]);
564                 tdmr_pamt_size += pamt_size[pgsz];
565         }
566 
567         /*
568          * Allocate one chunk of physically contiguous memory for all
569          * PAMTs.  This helps minimize the PAMT's use of reserved areas
570          * in overlapped TDMRs.
571          */
572         pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
573                         nid, &node_online_map);
574         if (!pamt)
575                 return -ENOMEM;
576 
577         /*
578          * Break the contiguous allocation back up into the
579          * individual PAMTs for each page size.
580          */
581         tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
582         for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
583                 pamt_base[pgsz] = tdmr_pamt_base;
584                 tdmr_pamt_base += pamt_size[pgsz];
585         }
586 
587         tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
588         tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
589         tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
590         tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
591         tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
592         tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
593 
594         return 0;
595 }
596 
597 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
598                           unsigned long *pamt_size)
599 {
600         unsigned long pamt_bs, pamt_sz;
601 
602         /*
603          * The PAMT was allocated in one contiguous unit.  The 4K PAMT
604          * should always point to the beginning of that allocation.
605          */
606         pamt_bs = tdmr->pamt_4k_base;
607         pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
608 
609         WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
610 
611         *pamt_base = pamt_bs;
612         *pamt_size = pamt_sz;
613 }
614 
615 static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
616                 void (*pamt_func)(unsigned long base, unsigned long size))
617 {
618         unsigned long pamt_base, pamt_size;
619 
620         tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
621 
622         /* Do nothing if PAMT hasn't been allocated for this TDMR */
623         if (!pamt_size)
624                 return;
625 
626         if (WARN_ON_ONCE(!pamt_base))
627                 return;
628 
629         pamt_func(pamt_base, pamt_size);
630 }
631 
632 static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
633 {
634         free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
635 }
636 
637 static void tdmr_free_pamt(struct tdmr_info *tdmr)
638 {
639         tdmr_do_pamt_func(tdmr, free_pamt);
640 }
641 
642 static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
643 {
644         int i;
645 
646         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
647                 tdmr_free_pamt(tdmr_entry(tdmr_list, i));
648 }
649 
650 /* Allocate and set up PAMTs for all TDMRs */
651 static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
652                                  struct list_head *tmb_list,
653                                  u16 pamt_entry_size[])
654 {
655         int i, ret = 0;
656 
657         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
658                 ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
659                                 pamt_entry_size);
660                 if (ret)
661                         goto err;
662         }
663 
664         return 0;
665 err:
666         tdmrs_free_pamt_all(tdmr_list);
667         return ret;
668 }
669 
670 /*
671  * Convert TDX private pages back to normal by using MOVDIR64B to
672  * clear these pages.  Note this function doesn't flush cache of
673  * these TDX private pages.  The caller should make sure of that.
674  */
675 static void reset_tdx_pages(unsigned long base, unsigned long size)
676 {
677         const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
678         unsigned long phys, end;
679 
680         end = base + size;
681         for (phys = base; phys < end; phys += 64)
682                 movdir64b(__va(phys), zero_page);
683 
684         /*
685          * MOVDIR64B uses WC protocol.  Use memory barrier to
686          * make sure any later user of these pages sees the
687          * updated data.
688          */
689         mb();
690 }
691 
692 static void tdmr_reset_pamt(struct tdmr_info *tdmr)
693 {
694         tdmr_do_pamt_func(tdmr, reset_tdx_pages);
695 }
696 
697 static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
698 {
699         int i;
700 
701         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
702                 tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
703 }
704 
705 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
706 {
707         unsigned long pamt_size = 0;
708         int i;
709 
710         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
711                 unsigned long base, size;
712 
713                 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
714                 pamt_size += size;
715         }
716 
717         return pamt_size / 1024;
718 }
719 
720 static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
721                               u64 size, u16 max_reserved_per_tdmr)
722 {
723         struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
724         int idx = *p_idx;
725 
726         /* Reserved area must be 4K aligned in offset and size */
727         if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
728                 return -EINVAL;
729 
730         if (idx >= max_reserved_per_tdmr) {
731                 pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
732                                 tdmr->base, tdmr_end(tdmr));
733                 return -ENOSPC;
734         }
735 
736         /*
737          * Consume one reserved area per call.  Make no effort to
738          * optimize or reduce the number of reserved areas which are
739          * consumed by contiguous reserved areas, for instance.
740          */
741         rsvd_areas[idx].offset = addr - tdmr->base;
742         rsvd_areas[idx].size = size;
743 
744         *p_idx = idx + 1;
745 
746         return 0;
747 }
748 
749 /*
750  * Go through @tmb_list to find holes between memory areas.  If any of
751  * those holes fall within @tdmr, set up a TDMR reserved area to cover
752  * the hole.
753  */
754 static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
755                                     struct tdmr_info *tdmr,
756                                     int *rsvd_idx,
757                                     u16 max_reserved_per_tdmr)
758 {
759         struct tdx_memblock *tmb;
760         u64 prev_end;
761         int ret;
762 
763         /*
764          * Start looking for reserved blocks at the
765          * beginning of the TDMR.
766          */
767         prev_end = tdmr->base;
768         list_for_each_entry(tmb, tmb_list, list) {
769                 u64 start, end;
770 
771                 start = PFN_PHYS(tmb->start_pfn);
772                 end   = PFN_PHYS(tmb->end_pfn);
773 
774                 /* Break if this region is after the TDMR */
775                 if (start >= tdmr_end(tdmr))
776                         break;
777 
778                 /* Exclude regions before this TDMR */
779                 if (end < tdmr->base)
780                         continue;
781 
782                 /*
783                  * Skip over memory areas that
784                  * have already been dealt with.
785                  */
786                 if (start <= prev_end) {
787                         prev_end = end;
788                         continue;
789                 }
790 
791                 /* Add the hole before this region */
792                 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
793                                 start - prev_end,
794                                 max_reserved_per_tdmr);
795                 if (ret)
796                         return ret;
797 
798                 prev_end = end;
799         }
800 
801         /* Add the hole after the last region if it exists. */
802         if (prev_end < tdmr_end(tdmr)) {
803                 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
804                                 tdmr_end(tdmr) - prev_end,
805                                 max_reserved_per_tdmr);
806                 if (ret)
807                         return ret;
808         }
809 
810         return 0;
811 }
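/*
 * Editorial example: consider a hypothetical TDMR covering [0, 4G) with
 * two TDX memory blocks at [1M, 2G) and [3G, 4G).  The loop above adds
 * two reserved areas: [0, 1M) (the gap before the first block) and
 * [2G, 3G) (the gap between the blocks).  No trailing area is added
 * because the last block ends exactly at tdmr_end().
 */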
812 
813 /*
814  * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
815  * overlaps with @tdmr, set up a TDMR reserved area to cover the
816  * overlapping part.
817  */
818 static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
819                                     struct tdmr_info *tdmr,
820                                     int *rsvd_idx,
821                                     u16 max_reserved_per_tdmr)
822 {
823         int i, ret;
824 
825         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
826                 struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
827                 unsigned long pamt_base, pamt_size, pamt_end;
828 
829                 tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
830                 /* Each TDMR must already have PAMT allocated */
831                 WARN_ON_ONCE(!pamt_size || !pamt_base);
832 
833                 pamt_end = pamt_base + pamt_size;
834                 /* Skip PAMTs outside of the given TDMR */
835                 if ((pamt_end <= tdmr->base) ||
836                                 (pamt_base >= tdmr_end(tdmr)))
837                         continue;
838 
839                 /* Only mark the part within the TDMR as reserved */
840                 if (pamt_base < tdmr->base)
841                         pamt_base = tdmr->base;
842                 if (pamt_end > tdmr_end(tdmr))
843                         pamt_end = tdmr_end(tdmr);
844 
845                 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
846                                 pamt_end - pamt_base,
847                                 max_reserved_per_tdmr);
848                 if (ret)
849                         return ret;
850         }
851 
852         return 0;
853 }
854 
855 /* Compare function called by sort() for TDMR reserved areas */
856 static int rsvd_area_cmp_func(const void *a, const void *b)
857 {
858         struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
859         struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
860 
861         if (r1->offset + r1->size <= r2->offset)
862                 return -1;
863         if (r1->offset >= r2->offset + r2->size)
864                 return 1;
865 
866         /* Reserved areas cannot overlap; the caller must guarantee that. */
867         WARN_ON_ONCE(1);
868         return -1;
869 }
870 
871 /*
872  * Populate reserved areas for the given @tdmr, including memory holes
873  * (via @tmb_list) and PAMTs (via @tdmr_list).
874  */
875 static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
876                                     struct list_head *tmb_list,
877                                     struct tdmr_info_list *tdmr_list,
878                                     u16 max_reserved_per_tdmr)
879 {
880         int ret, rsvd_idx = 0;
881 
882         ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
883                         max_reserved_per_tdmr);
884         if (ret)
885                 return ret;
886 
887         ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
888                         max_reserved_per_tdmr);
889         if (ret)
890                 return ret;
891 
892         /* TDX requires reserved areas listed in address ascending order */
893         sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
894                         rsvd_area_cmp_func, NULL);
895 
896         return 0;
897 }
898 
899 /*
900  * Populate reserved areas for all TDMRs in @tdmr_list, including memory
901  * holes (via @tmb_list) and PAMTs.
902  */
903 static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
904                                          struct list_head *tmb_list,
905                                          u16 max_reserved_per_tdmr)
906 {
907         int i;
908 
909         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
910                 int ret;
911 
912                 ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
913                                 tmb_list, tdmr_list, max_reserved_per_tdmr);
914                 if (ret)
915                         return ret;
916         }
917 
918         return 0;
919 }
920 
921 /*
922  * Construct a list of TDMRs on the preallocated space in @tdmr_list
923  * to cover all TDX memory regions in @tmb_list based on the TDX module
924  * TDMR global information in @tdmr_sysinfo.
925  */
926 static int construct_tdmrs(struct list_head *tmb_list,
927                            struct tdmr_info_list *tdmr_list,
928                            struct tdx_tdmr_sysinfo *tdmr_sysinfo)
929 {
930         int ret;
931 
932         ret = fill_out_tdmrs(tmb_list, tdmr_list);
933         if (ret)
934                 return ret;
935 
936         ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
937                         tdmr_sysinfo->pamt_entry_size);
938         if (ret)
939                 return ret;
940 
941         ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
942                         tdmr_sysinfo->max_reserved_per_tdmr);
943         if (ret)
944                 tdmrs_free_pamt_all(tdmr_list);
945 
946         /*
947          * The tdmr_info_list is read-only from here on out.
948          * Ensure that these writes are seen by other CPUs.
949          * Pairs with a smp_rmb() in is_pamt_page().
950          */
951         smp_wmb();
952 
953         return ret;
954 }
955 
956 static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
957 {
958         struct tdx_module_args args = {};
959         u64 *tdmr_pa_array;
960         size_t array_sz;
961         int i, ret;
962 
963         /*
964          * TDMRs are passed to the TDX module via an array of physical
965          * addresses of each TDMR.  The array itself also has certain
966          * alignment requirement.
967          */
968         array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
969         array_sz = roundup_pow_of_two(array_sz);
970         if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
971                 array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
972 
973         tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
974         if (!tdmr_pa_array)
975                 return -ENOMEM;
976 
977         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
978                 tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
979 
980         args.rcx = __pa(tdmr_pa_array);
981         args.rdx = tdmr_list->nr_consumed_tdmrs;
982         args.r8 = global_keyid;
983         ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
984 
985         /* Free the array as it is not required anymore. */
986         kfree(tdmr_pa_array);
987 
988         return ret;
989 }
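/*
 * Editorial example (illustrative sizing): with a hypothetical three
 * consumed TDMRs the raw array size is 3 * sizeof(u64) = 24 bytes,
 * rounded up to the next power of two (32) and then, if still smaller
 * than TDMR_INFO_PA_ARRAY_ALIGNMENT, bumped to that value.  Allocating
 * a power-of-two size relies on kzalloc()'s natural alignment for such
 * sizes to satisfy the TDX module's alignment requirement for the array.
 */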
990 
991 static int do_global_key_config(void *unused)
992 {
993         struct tdx_module_args args = {};
994 
995         return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
996 }
997 
998 /*
999  * Attempt to configure the global KeyID on all physical packages.
1000  *
1001  * This requires running code on at least one CPU in each package.
1002  * TDMR initialization will fail if any package in the
1003  * system has no online CPUs.
1004  *
1005  * This code takes no affirmative steps to online CPUs.  Callers (e.g.
1006  * KVM) can ensure success by ensuring sufficient CPUs are online and
1007  * can run SEAMCALLs.
1008  */
1009 static int config_global_keyid(void)
1010 {
1011         cpumask_var_t packages;
1012         int cpu, ret = -EINVAL;
1013 
1014         if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
1015                 return -ENOMEM;
1016 
1017         /*
1018          * Hardware doesn't guarantee cache coherency across different
1019          * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
1020          * (associated with KeyID 0) before the TDX module can use the
1021          * global KeyID to access the PAMT.  Given PAMTs are potentially
1022          * large (~1/256th of system RAM), just use WBINVD.
1023          */
1024         wbinvd_on_all_cpus();
1025 
1026         for_each_online_cpu(cpu) {
1027                 /*
1028                  * The key configuration only needs to be done once per
1029                  * package and will return an error if configured more
1030                  * than once.  Avoid doing it multiple times per package.
1031                  */
1032                 if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
1033                                         packages))
1034                         continue;
1035 
1036                 /*
1037                  * TDH.SYS.KEY.CONFIG cannot run concurrently on
1038                  * different cpus.  Do it one by one.
1039                  */
1040                 ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1041                 if (ret)
1042                         break;
1043         }
1044 
1045         free_cpumask_var(packages);
1046         return ret;
1047 }
1048 
1049 static int init_tdmr(struct tdmr_info *tdmr)
1050 {
1051         u64 next;
1052 
1053         /*
1054          * Initializing a TDMR can be time consuming.  To avoid long
1055          * SEAMCALLs, the TDX module may only initialize a part of the
1056          * TDMR in each call.
1057          */
1058         do {
1059                 struct tdx_module_args args = {
1060                         .rcx = tdmr->base,
1061                 };
1062                 int ret;
1063 
1064                 ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1065                 if (ret)
1066                         return ret;
1067                 /*
1068                  * RDX contains 'next-to-initialize' address if
1069                  * TDH.SYS.TDMR.INIT did not fully complete and
1070                  * should be retried.
1071                  */
1072                 next = args.rdx;
1073                 cond_resched();
1074                 /* Keep making SEAMCALLs until the TDMR is done */
1075         } while (next < tdmr->base + tdmr->size);
1076 
1077         return 0;
1078 }
1079 
1080 static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1081 {
1082         int i;
1083 
1084         /*
1085          * This operation is costly.  It can be parallelized,
1086          * but keep it simple for now.
1087          */
1088         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1089                 int ret;
1090 
1091                 ret = init_tdmr(tdmr_entry(tdmr_list, i));
1092                 if (ret)
1093                         return ret;
1094         }
1095 
1096         return 0;
1097 }
1098 
1099 static int init_tdx_module(void)
1100 {
1101         struct tdx_tdmr_sysinfo tdmr_sysinfo;
1102         int ret;
1103 
1104         /*
1105          * To keep things simple, assume that all TDX-protected memory
1106          * will come from the page allocator.  Make sure all pages in the
1107          * page allocator are TDX-usable memory.
1108          *
1109          * Build the list of "TDX-usable" memory regions which cover all
1110          * pages in the page allocator to guarantee that.  Do it while
1111          * holding mem_hotplug_lock read-lock as the memory hotplug code
1112          * path reads the @tdx_memlist to reject any new memory.
1113          */
1114         get_online_mems();
1115 
1116         ret = build_tdx_memlist(&tdx_memlist);
1117         if (ret)
1118                 goto out_put_tdxmem;
1119 
1120         ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo);
1121         if (ret)
1122                 goto err_free_tdxmem;
1123 
1124         /* Allocate enough space for constructing TDMRs */
1125         ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo);
1126         if (ret)
1127                 goto err_free_tdxmem;
1128 
1129         /* Cover all TDX-usable memory regions in TDMRs */
1130         ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo);
1131         if (ret)
1132                 goto err_free_tdmrs;
1133 
1134         /* Pass the TDMRs and the global KeyID to the TDX module */
1135         ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1136         if (ret)
1137                 goto err_free_pamts;
1138 
1139         /* Config the key of global KeyID on all packages */
1140         ret = config_global_keyid();
1141         if (ret)
1142                 goto err_reset_pamts;
1143 
1144         /* Initialize TDMRs to complete the TDX module initialization */
1145         ret = init_tdmrs(&tdx_tdmr_list);
1146         if (ret)
1147                 goto err_reset_pamts;
1148 
1149         pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1150 
1151 out_put_tdxmem:
1152         /*
1153          * @tdx_memlist is written here and read at memory hotplug time.
1154          * Lock out memory hotplug code while building it.
1155          */
1156         put_online_mems();
1157         return ret;
1158 
1159 err_reset_pamts:
1160         /*
1161          * Part of PAMTs may already have been initialized by the
1162          * TDX module.  Flush cache before returning PAMTs back
1163          * to the kernel.
1164          */
1165         wbinvd_on_all_cpus();
1166         /*
1167          * According to the TDX hardware spec, if the platform
1168          * doesn't have the "partial write machine check"
1169          * erratum, any kernel read/write will never cause #MC
1170          * in kernel space, thus it's OK to not convert PAMTs
1171          * back to normal.  But do the conversion anyway here
1172          * as suggested by the TDX spec.
1173          */
1174         tdmrs_reset_pamt_all(&tdx_tdmr_list);
1175 err_free_pamts:
1176         tdmrs_free_pamt_all(&tdx_tdmr_list);
1177 err_free_tdmrs:
1178         free_tdmr_list(&tdx_tdmr_list);
1179 err_free_tdxmem:
1180         free_tdx_memlist(&tdx_memlist);
1181         goto out_put_tdxmem;
1182 }
1183 
1184 static int __tdx_enable(void)
1185 {
1186         int ret;
1187 
1188         ret = init_tdx_module();
1189         if (ret) {
1190                 pr_err("module initialization failed (%d)\n", ret);
1191                 tdx_module_status = TDX_MODULE_ERROR;
1192                 return ret;
1193         }
1194 
1195         pr_info("module initialized\n");
1196         tdx_module_status = TDX_MODULE_INITIALIZED;
1197 
1198         return 0;
1199 }
1200 
1201 /**
1202  * tdx_enable - Enable TDX module to make it ready to run TDX guests
1203  *
1204  * This function assumes the caller has: 1) held read lock of CPU hotplug
1205  * lock to prevent any new cpu from becoming online; 2) done both VMXON
1206  * and tdx_cpu_enable() on all online cpus.
1207  *
1208  * This function requires there's at least one online cpu for each CPU
1209  * package to succeed.
1210  *
1211  * This function can be called in parallel by multiple callers.
1212  *
1213  * Return 0 if TDX is enabled successfully, otherwise error.
1214  */
1215 int tdx_enable(void)
1216 {
1217         int ret;
1218 
1219         if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1220                 return -ENODEV;
1221 
1222         lockdep_assert_cpus_held();
1223 
1224         mutex_lock(&tdx_module_lock);
1225 
1226         switch (tdx_module_status) {
1227         case TDX_MODULE_UNINITIALIZED:
1228                 ret = __tdx_enable();
1229                 break;
1230         case TDX_MODULE_INITIALIZED:
1231                 /* Already initialized, great, tell the caller. */
1232                 ret = 0;
1233                 break;
1234         default:
1235                 /* Failed to initialize in the previous attempts */
1236                 ret = -EINVAL;
1237                 break;
1238         }
1239 
1240         mutex_unlock(&tdx_module_lock);
1241 
1242         return ret;
1243 }
1244 EXPORT_SYMBOL_GPL(tdx_enable);
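/*
 * Illustrative caller-side ordering sketch (editorial note): per the
 * requirements documented above, a user such as KVM is expected to do
 * roughly:
 *
 *	cpus_read_lock();
 *	... VMXON + tdx_cpu_enable() on all online CPUs ...
 *	ret = tdx_enable();
 *	cpus_read_unlock();
 *
 * i.e. hold the CPU hotplug read lock so no CPU comes online without
 * having gone through tdx_cpu_enable(), then initialize the module.
 */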
1245 
1246 static bool is_pamt_page(unsigned long phys)
1247 {
1248         struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1249         int i;
1250 
1251         /* Ensure that all remote 'tdmr_list' writes are visible: */
1252         smp_rmb();
1253 
1254         /*
1255          * The TDX module is no longer returning TDX_SYS_NOT_READY and
1256          * is initialized.  The 'tdmr_list' was initialized long ago
1257          * and is now read-only.
1258          */
1259         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1260                 unsigned long base, size;
1261 
1262                 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1263 
1264                 if (phys >= base && phys < (base + size))
1265                         return true;
1266         }
1267 
1268         return false;
1269 }
1270 
1271 /*
1272  * Return whether the memory page at the given physical address is TDX
1273  * private memory or not.
1274  *
1275  * This can be imprecise for two known reasons:
1276  * 1. PAMTs are private memory and exist before the TDX module is
1277  *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
1278  *    short window that occurs once per boot.
1279  * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1280  *    page.  However, the page can still cause #MC until it has been
1281  *    fully converted to shared using 64-byte writes like MOVDIR64B.
1282  *    Buggy hosts might still leave #MC-causing memory in place which
1283  *    this function can not detect.
1284  */
1285 static bool paddr_is_tdx_private(unsigned long phys)
1286 {
1287         struct tdx_module_args args = {
1288                 .rcx = phys & PAGE_MASK,
1289         };
1290         u64 sret;
1291 
1292         if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1293                 return false;
1294 
1295         /* Get page type from the TDX module */
1296         sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
1297 
1298         /*
1299          * The SEAMCALL will not return success unless there is a
1300          * working, "ready" TDX module.  Assume an absence of TDX
1301          * private pages until SEAMCALL is working.
1302          */
1303         if (sret)
1304                 return false;
1305 
1306         /*
1307          * SEAMCALL was successful -- read page type (via RCX):
1308          *
1309          *  - PT_NDA:   Page is not used by the TDX module
1310          *  - PT_RSVD:  Reserved for Non-TDX use
1311          *  - Others:   Page is used by the TDX module
1312          *
1313          * Note PAMT pages are marked as PT_RSVD but they are also TDX
1314          * private memory.
1315          */
1316         switch (args.rcx) {
1317         case PT_NDA:
1318                 return false;
1319         case PT_RSVD:
1320                 return is_pamt_page(phys);
1321         default:
1322                 return true;
1323         }
1324 }
1325 
1326 /*
1327  * Some TDX-capable CPUs have an erratum.  A write to TDX private
1328  * memory poisons that memory, and a subsequent read of that memory
1329  * triggers #MC.
1330  *
1331  * Help distinguish erratum-triggered #MCs from a normal hardware one.
1332  * Just print an additional message to show that such an #MC may be the
1333  * result of the erratum.
1334  */
1335 const char *tdx_dump_mce_info(struct mce *m)
1336 {
1337         if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1338                 return NULL;
1339 
1340         if (!paddr_is_tdx_private(m->addr))
1341                 return NULL;
1342 
1343         return "TDX private memory error. Possible kernel bug.";
1344 }
1345 
1346 static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1347                                             u32 *nr_tdx_keyids)
1348 {
1349         u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1350         int ret;
1351 
1352         /*
1353          * IA32_MKTME_KEYID_PARTITIONING:
1354          *   Bit [31:0]:        Number of MKTME KeyIDs.
1355          *   Bit [63:32]:       Number of TDX private KeyIDs.
1356          */
1357         ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1358                         &_nr_tdx_keyids);
1359         if (ret || !_nr_tdx_keyids)
1360                 return -EINVAL;
1361 
1362         /* TDX KeyIDs start after the last MKTME KeyID. */
1363         _tdx_keyid_start = _nr_mktme_keyids + 1;
1364 
1365         *tdx_keyid_start = _tdx_keyid_start;
1366         *nr_tdx_keyids = _nr_tdx_keyids;
1367 
1368         return 0;
1369 }
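/*
 * Editorial example: if the MSR reports a hypothetical 32 MKTME KeyIDs
 * and 32 TDX private KeyIDs, then *tdx_keyid_start = 32 + 1 = 33 and
 * *nr_tdx_keyids = 32, i.e. the TDX private KeyID range is [33, 65).
 * tdx_init() later takes KeyID 33 as the global KeyID and leaves
 * [34, 65) for TDX guests.
 */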
1370 
1371 static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1372 {
1373         struct tdx_memblock *tmb;
1374 
1375         /*
1376          * This check assumes that the start_pfn<->end_pfn range does not
1377          * cross multiple @tdx_memlist entries.  A single memory online
1378          * event across multiple memblocks (from which @tdx_memlist
1379          * entries are derived at the time of module initialization) is
1380          * not possible.  This is because memory offline/online is done
1381          * on the granularity of 'struct memory_block', and the hotpluggable
1382          * memory region (one memblock) must be a multiple of memory_block.
1383          */
1384         list_for_each_entry(tmb, &tdx_memlist, list) {
1385                 if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1386                         return true;
1387         }
1388         return false;
1389 }
1390 
1391 static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1392                                void *v)
1393 {
1394         struct memory_notify *mn = v;
1395 
1396         if (action != MEM_GOING_ONLINE)
1397                 return NOTIFY_OK;
1398 
1399         /*
1400          * Empty list means TDX isn't enabled.  Allow any memory
1401          * to go online.
1402          */
1403         if (list_empty(&tdx_memlist))
1404                 return NOTIFY_OK;
1405 
1406         /*
1407          * The TDX memory configuration is static and can not be
1408          * changed.  Reject onlining any memory which is outside of
1409          * the static configuration whether it supports TDX or not.
1410          */
1411         if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1412                 return NOTIFY_OK;
1413 
1414         return NOTIFY_BAD;
1415 }
1416 
1417 static struct notifier_block tdx_memory_nb = {
1418         .notifier_call = tdx_memory_notifier,
1419 };
1420 
1421 static void __init check_tdx_erratum(void)
1422 {
1423         /*
1424          * These CPUs have an erratum.  A partial write from non-TD
1425          * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1426          * private memory poisons that memory, and a subsequent read of
1427          * that memory triggers #MC.
1428          */
1429         switch (boot_cpu_data.x86_vfm) {
1430         case INTEL_SAPPHIRERAPIDS_X:
1431         case INTEL_EMERALDRAPIDS_X:
1432                 setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1433         }
1434 }
1435 
1436 void __init tdx_init(void)
1437 {
1438         u32 tdx_keyid_start, nr_tdx_keyids;
1439         int err;
1440 
1441         err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1442         if (err)
1443                 return;
1444 
1445         pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1446                         tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1447 
1448         /*
1449          * The TDX module itself requires one 'global KeyID' to protect
1450          * its metadata.  If there's only one TDX KeyID, there won't be
1451          * any left for TDX guests thus there's no point to enable TDX
1452          * at all.
1453          */
1454         if (nr_tdx_keyids < 2) {
1455                 pr_err("initialization failed: too few private KeyIDs available.\n");
1456                 return;
1457         }
1458 
1459         /*
1460          * At this point, hibernation_available() indicates whether or
1461          * not hibernation support has been permanently disabled.
1462          */
1463         if (hibernation_available()) {
1464                 pr_err("initialization failed: Hibernation support is enabled\n");
1465                 return;
1466         }
1467 
1468         err = register_memory_notifier(&tdx_memory_nb);
1469         if (err) {
1470                 pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1471                                 err);
1472                 return;
1473         }
1474 
1475 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1476         pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1477         acpi_suspend_lowlevel = NULL;
1478 #endif
1479 
1480         /*
1481          * Just use the first TDX KeyID as the 'global KeyID' and
1482          * leave the rest for TDX guests.
1483          */
1484         tdx_global_keyid = tdx_keyid_start;
1485         tdx_guest_keyid_start = tdx_keyid_start + 1;
1486         tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1487 
1488         setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1489 
1490         check_tdx_erratum();
1491 }
1492 
