TOMOYO Linux Cross Reference
Linux/arch/x86/virt/vmx/tdx/tdx.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright(c) 2023 Intel Corporation.
  4  *
  5  * Intel Trust Domain Extensions (TDX) support
  6  */
  7 
  8 #define pr_fmt(fmt)     "virt/tdx: " fmt
  9 
 10 #include <linux/types.h>
 11 #include <linux/cache.h>
 12 #include <linux/init.h>
 13 #include <linux/errno.h>
 14 #include <linux/printk.h>
 15 #include <linux/cpu.h>
 16 #include <linux/spinlock.h>
 17 #include <linux/percpu-defs.h>
 18 #include <linux/mutex.h>
 19 #include <linux/list.h>
 20 #include <linux/memblock.h>
 21 #include <linux/memory.h>
 22 #include <linux/minmax.h>
 23 #include <linux/sizes.h>
 24 #include <linux/pfn.h>
 25 #include <linux/align.h>
 26 #include <linux/sort.h>
 27 #include <linux/log2.h>
 28 #include <linux/acpi.h>
 29 #include <linux/suspend.h>
 30 #include <asm/page.h>
 31 #include <asm/special_insns.h>
 32 #include <asm/msr-index.h>
 33 #include <asm/msr.h>
 34 #include <asm/cpufeature.h>
 35 #include <asm/tdx.h>
 36 #include <asm/cpu_device_id.h>
 37 #include <asm/processor.h>
 38 #include <asm/mce.h>
 39 #include "tdx.h"
 40 
 41 static u32 tdx_global_keyid __ro_after_init;
 42 static u32 tdx_guest_keyid_start __ro_after_init;
 43 static u32 tdx_nr_guest_keyids __ro_after_init;
 44 
 45 static DEFINE_PER_CPU(bool, tdx_lp_initialized);
 46 
 47 static struct tdmr_info_list tdx_tdmr_list;
 48 
 49 static enum tdx_module_status_t tdx_module_status;
 50 static DEFINE_MUTEX(tdx_module_lock);
 51 
 52 /* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
 53 static LIST_HEAD(tdx_memlist);
 54 
 55 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
 56 
 57 static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
 58 {
 59         pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
 60 }
 61 
 62 static inline void seamcall_err_ret(u64 fn, u64 err,
 63                                     struct tdx_module_args *args)
 64 {
 65         seamcall_err(fn, err, args);
 66         pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
 67                         args->rcx, args->rdx, args->r8);
 68         pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
 69                         args->r9, args->r10, args->r11);
 70 }
 71 
 72 static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
 73                                  u64 fn, struct tdx_module_args *args)
 74 {
 75         u64 sret = sc_retry(func, fn, args);
 76 
 77         if (sret == TDX_SUCCESS)
 78                 return 0;
 79 
 80         if (sret == TDX_SEAMCALL_VMFAILINVALID)
 81                 return -ENODEV;
 82 
 83         if (sret == TDX_SEAMCALL_GP)
 84                 return -EOPNOTSUPP;
 85 
 86         if (sret == TDX_SEAMCALL_UD)
 87                 return -EACCES;
 88 
 89         err_func(fn, sret, args);
 90         return -EIO;
 91 }
 92 
 93 #define seamcall_prerr(__fn, __args)                                            \
 94         sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
 95 
 96 #define seamcall_prerr_ret(__fn, __args)                                        \
 97         sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
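/*
 * Illustrative usage sketch (editorial note, not part of the kernel
 * source): callers pass a SEAMCALL leaf function ID plus an argument
 * struct, and the wrappers above collapse the raw SEAMCALL status into
 * an errno:
 *
 *	struct tdx_module_args args = { .rdx = some_field_id };
 *	int ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
 *
 *	ret ==  0           success, outputs (e.g. args.r8) are valid
 *	ret == -ENODEV      TDX module not loaded (VMFAILINVALID)
 *	ret == -EOPNOTSUPP  SEAMCALL raised #GP
 *	ret == -EACCES      SEAMCALL raised #UD
 *	ret == -EIO         any other module error (already printed)
 *
 * 'some_field_id' is a placeholder; real callers such as
 * read_sys_metadata_field() below use concrete metadata field IDs.
 */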
 98 
 99 /*
100  * Do the module global initialization once and return its result.
101  * It can be done on any cpu.  It's always called with interrupts
102  * disabled.
103  */
104 static int try_init_module_global(void)
105 {
106         struct tdx_module_args args = {};
107         static DEFINE_RAW_SPINLOCK(sysinit_lock);
108         static bool sysinit_done;
109         static int sysinit_ret;
110 
111         lockdep_assert_irqs_disabled();
112 
113         raw_spin_lock(&sysinit_lock);
114 
115         if (sysinit_done)
116                 goto out;
117 
118         /* RCX is module attributes and all bits are reserved */
119         args.rcx = 0;
120         sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
121 
122         /*
123          * The first SEAMCALL also detects the TDX module, thus
124          * it can fail because the TDX module is not loaded.
125          * Dump message to let the user know.
126          */
127         if (sysinit_ret == -ENODEV)
128                 pr_err("module not loaded\n");
129 
130         sysinit_done = true;
131 out:
132         raw_spin_unlock(&sysinit_lock);
133         return sysinit_ret;
134 }
135 
136 /**
137  * tdx_cpu_enable - Enable TDX on local cpu
138  *
139  * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
140  * global initialization SEAMCALL if not done) on local cpu to make this
141  * cpu ready to run any other SEAMCALLs.
142  *
143  * Always call this function via IPI function calls.
144  *
145  * Return 0 on success, otherwise errors.
146  */
147 int tdx_cpu_enable(void)
148 {
149         struct tdx_module_args args = {};
150         int ret;
151 
152         if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
153                 return -ENODEV;
154 
155         lockdep_assert_irqs_disabled();
156 
157         if (__this_cpu_read(tdx_lp_initialized))
158                 return 0;
159 
160         /*
161          * The TDX module global initialization is the very first step
162          * to enable TDX.  Need to do it first (if it hasn't been done)
163          * before the per-cpu initialization.
164          */
165         ret = try_init_module_global();
166         if (ret)
167                 return ret;
168 
169         ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
170         if (ret)
171                 return ret;
172 
173         __this_cpu_write(tdx_lp_initialized, true);
174 
175         return 0;
176 }
177 EXPORT_SYMBOL_GPL(tdx_cpu_enable);
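/*
 * Illustrative usage sketch (editorial note, not part of the kernel
 * source): tdx_cpu_enable() must run with interrupts disabled, so a
 * caller such as KVM would typically invoke it from an IPI callback,
 * for example:
 *
 *	static void do_tdx_cpu_enable(void *failed)
 *	{
 *		if (tdx_cpu_enable())
 *			*(bool *)failed = true;
 *	}
 *
 *	bool failed = false;
 *	on_each_cpu(do_tdx_cpu_enable, &failed, true);
 *
 * 'do_tdx_cpu_enable' and 'failed' are hypothetical names used only for
 * illustration; real callers are also expected to have done VMXON on
 * each CPU beforehand.
 */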
178 
179 /*
180  * Add a memory region as a TDX memory block.  The caller must make sure
181  * all memory regions are added in address ascending order and don't
182  * overlap.
183  */
184 static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
185                             unsigned long end_pfn, int nid)
186 {
187         struct tdx_memblock *tmb;
188 
189         tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
190         if (!tmb)
191                 return -ENOMEM;
192 
193         INIT_LIST_HEAD(&tmb->list);
194         tmb->start_pfn = start_pfn;
195         tmb->end_pfn = end_pfn;
196         tmb->nid = nid;
197 
198         /* @tmb_list is protected by mem_hotplug_lock */
199         list_add_tail(&tmb->list, tmb_list);
200         return 0;
201 }
202 
203 static void free_tdx_memlist(struct list_head *tmb_list)
204 {
205         /* @tmb_list is protected by mem_hotplug_lock */
206         while (!list_empty(tmb_list)) {
207                 struct tdx_memblock *tmb = list_first_entry(tmb_list,
208                                 struct tdx_memblock, list);
209 
210                 list_del(&tmb->list);
211                 kfree(tmb);
212         }
213 }
214 
215 /*
216  * Ensure that all memblock memory regions are convertible to TDX
217  * memory.  Once this has been established, stash the memblock
218  * ranges off in a secondary structure because memblock is modified
219  * in memory hotplug while TDX memory regions are fixed.
220  */
221 static int build_tdx_memlist(struct list_head *tmb_list)
222 {
223         unsigned long start_pfn, end_pfn;
224         int i, nid, ret;
225 
226         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
227                 /*
228                  * The first 1MB is not reported as TDX convertible memory.
229                  * Although the first 1MB is always reserved and won't end up
230                  * in the page allocator, it is still in memblock's memory
231                  * regions.  Skip them manually to exclude them as TDX memory.
232                  */
233                 start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
234                 if (start_pfn >= end_pfn)
235                         continue;
236 
237                 /*
238                  * Add the memory regions as TDX memory.  The regions in
239                  * memblock are already guaranteed to be in address
240                  * ascending order and to not overlap.
241                  */
242                 ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
243                 if (ret)
244                         goto err;
245         }
246 
247         return 0;
248 err:
249         free_tdx_memlist(tmb_list);
250         return ret;
251 }
252 
253 static int read_sys_metadata_field(u64 field_id, u64 *data)
254 {
255         struct tdx_module_args args = {};
256         int ret;
257 
258         /*
259          * TDH.SYS.RD -- reads one global metadata field
260          *  - RDX (in): the field to read
261          *  - R8 (out): the field data
262          */
263         args.rdx = field_id;
264         ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
265         if (ret)
266                 return ret;
267 
268         *data = args.r8;
269 
270         return 0;
271 }
272 
273 static int read_sys_metadata_field16(u64 field_id,
274                                      int offset,
275                                      struct tdx_tdmr_sysinfo *ts)
276 {
277         u16 *ts_member = ((void *)ts) + offset;
278         u64 tmp;
279         int ret;
280 
281         if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) !=
282                         MD_FIELD_ID_ELE_SIZE_16BIT))
283                 return -EINVAL;
284 
285         ret = read_sys_metadata_field(field_id, &tmp);
286         if (ret)
287                 return ret;
288 
289         *ts_member = tmp;
290 
291         return 0;
292 }
293 
294 struct field_mapping {
295         u64 field_id;
296         int offset;
297 };
298 
299 #define TD_SYSINFO_MAP(_field_id, _offset) \
300         { .field_id = MD_FIELD_ID_##_field_id,     \
301           .offset   = offsetof(struct tdx_tdmr_sysinfo, _offset) }
302 
303 /* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */
304 static const struct field_mapping fields[] = {
305         TD_SYSINFO_MAP(MAX_TDMRS,             max_tdmrs),
306         TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr),
307         TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE,    pamt_entry_size[TDX_PS_4K]),
308         TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE,    pamt_entry_size[TDX_PS_2M]),
309         TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE,    pamt_entry_size[TDX_PS_1G]),
310 };
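/*
 * Editorial note: for illustration, TD_SYSINFO_MAP(MAX_TDMRS, max_tdmrs)
 * above expands to roughly:
 *
 *	{ .field_id = MD_FIELD_ID_MAX_TDMRS,
 *	  .offset   = offsetof(struct tdx_tdmr_sysinfo, max_tdmrs) }
 *
 * i.e. each table entry pairs a global metadata field ID with the byte
 * offset of the struct member that receives its value.
 */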
311 
312 static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo)
313 {
314         int ret;
315         int i;
316 
317         /* Populate 'tdmr_sysinfo' fields using the mapping structure above: */
318         for (i = 0; i < ARRAY_SIZE(fields); i++) {
319                 ret = read_sys_metadata_field16(fields[i].field_id,
320                                                 fields[i].offset,
321                                                 tdmr_sysinfo);
322                 if (ret)
323                         return ret;
324         }
325 
326         return 0;
327 }
328 
329 /* Calculate the actual TDMR size */
330 static int tdmr_size_single(u16 max_reserved_per_tdmr)
331 {
332         int tdmr_sz;
333 
334         /*
335          * The actual size of TDMR depends on the maximum
336          * number of reserved areas.
337          */
338         tdmr_sz = sizeof(struct tdmr_info);
339         tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
340 
341         return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
342 }
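/*
 * Editorial example (illustrative, with assumed numbers): 'struct
 * tdmr_reserved_area' holds two u64s (16 bytes), so with a hypothetical
 * max_reserved_per_tdmr of 16 the raw size would be
 * sizeof(struct tdmr_info) + 16 * 16 bytes.  ALIGN() then rounds that
 * up to the next TDMR_INFO_ALIGNMENT boundary so that, in the single
 * contiguous allocation below, every TDMR_INFO entry stays aligned.
 */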
343 
344 static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
345                            struct tdx_tdmr_sysinfo *tdmr_sysinfo)
346 {
347         size_t tdmr_sz, tdmr_array_sz;
348         void *tdmr_array;
349 
350         tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr);
351         tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs;
352 
353         /*
354          * To keep things simple, allocate all TDMRs together.
355          * The buffer needs to be physically contiguous to make
356          * sure each TDMR is physically contiguous.
357          */
358         tdmr_array = alloc_pages_exact(tdmr_array_sz,
359                         GFP_KERNEL | __GFP_ZERO);
360         if (!tdmr_array)
361                 return -ENOMEM;
362 
363         tdmr_list->tdmrs = tdmr_array;
364 
365         /*
366          * Keep the size of TDMR to find the target TDMR
367          * at a given index in the TDMR list.
368          */
369         tdmr_list->tdmr_sz = tdmr_sz;
370         tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs;
371         tdmr_list->nr_consumed_tdmrs = 0;
372 
373         return 0;
374 }
375 
376 static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
377 {
378         free_pages_exact(tdmr_list->tdmrs,
379                         tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
380 }
381 
382 /* Get the TDMR from the list at the given index. */
383 static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
384                                     int idx)
385 {
386         int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
387 
388         return (void *)tdmr_list->tdmrs + tdmr_info_offset;
389 }
390 
391 #define TDMR_ALIGNMENT          SZ_1G
392 #define TDMR_ALIGN_DOWN(_addr)  ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
393 #define TDMR_ALIGN_UP(_addr)    ALIGN((_addr), TDMR_ALIGNMENT)
394 
395 static inline u64 tdmr_end(struct tdmr_info *tdmr)
396 {
397         return tdmr->base + tdmr->size;
398 }
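/*
 * Editorial example: with TDMR_ALIGNMENT of 1G, a hypothetical memory
 * region spanning [0x40200000, 0x80400000) is covered by a TDMR with
 * base TDMR_ALIGN_DOWN(0x40200000) = 0x40000000 and end
 * TDMR_ALIGN_UP(0x80400000) = 0xC0000000, i.e. a 2G TDMR.  The parts
 * of that TDMR which are not actual memory are handled later as
 * reserved areas.
 */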
399 
400 /*
401  * Take the memory referenced in @tmb_list and populate the
402  * preallocated @tdmr_list, following all the special alignment
403  * and size rules for TDMR.
404  */
405 static int fill_out_tdmrs(struct list_head *tmb_list,
406                           struct tdmr_info_list *tdmr_list)
407 {
408         struct tdx_memblock *tmb;
409         int tdmr_idx = 0;
410 
411         /*
412          * Loop over TDX memory regions and fill out TDMRs to cover them.
413          * To keep it simple, always try to use one TDMR to cover one
414          * memory region.
415          *
416          * In practice TDX supports at least 64 TDMRs.  A 2-socket system
417          * typically consumes fewer than 10 of those.  This code is
418          * dumb and simple and may use more TDMRs than is strictly
419          * required.
420          */
421         list_for_each_entry(tmb, tmb_list, list) {
422                 struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
423                 u64 start, end;
424 
425                 start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
426                 end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
427 
428                 /*
429                  * A valid size indicates the current TDMR has already
430                  * been filled out to cover the previous memory region(s).
431                  */
432                 if (tdmr->size) {
433                         /*
434                          * Loop to the next if the current memory region
435                          * has already been fully covered.
436                          */
437                         if (end <= tdmr_end(tdmr))
438                                 continue;
439 
440                         /* Otherwise, skip the already covered part. */
441                         if (start < tdmr_end(tdmr))
442                                 start = tdmr_end(tdmr);
443 
444                         /*
445                          * Create a new TDMR to cover the current memory
446                          * region, or the remaining part of it.
447                          */
448                         tdmr_idx++;
449                         if (tdmr_idx >= tdmr_list->max_tdmrs) {
450                                 pr_warn("initialization failed: TDMRs exhausted.\n");
451                                 return -ENOSPC;
452                         }
453 
454                         tdmr = tdmr_entry(tdmr_list, tdmr_idx);
455                 }
456 
457                 tdmr->base = start;
458                 tdmr->size = end - start;
459         }
460 
461         /* @tdmr_idx is always the index of the last valid TDMR. */
462         tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
463 
464         /*
465          * Warn early that kernel is about to run out of TDMRs.
466          *
467          * This is an indication that TDMR allocation has to be
468          * reworked to be smarter to not run into an issue.
469          */
470         if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
471                 pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
472                                 tdmr_list->nr_consumed_tdmrs,
473                                 tdmr_list->max_tdmrs);
474 
475         return 0;
476 }
477 
478 /*
479  * Calculate PAMT size given a TDMR and a page size.  The returned
480  * PAMT size is always aligned up to 4K page boundary.
481  */
482 static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
483                                       u16 pamt_entry_size)
484 {
485         unsigned long pamt_sz, nr_pamt_entries;
486 
487         switch (pgsz) {
488         case TDX_PS_4K:
489                 nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
490                 break;
491         case TDX_PS_2M:
492                 nr_pamt_entries = tdmr->size >> PMD_SHIFT;
493                 break;
494         case TDX_PS_1G:
495                 nr_pamt_entries = tdmr->size >> PUD_SHIFT;
496                 break;
497         default:
498                 WARN_ON_ONCE(1);
499                 return 0;
500         }
501 
502         pamt_sz = nr_pamt_entries * pamt_entry_size;
503         /* TDX requires the PAMT size to be 4K aligned */
504         pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
505 
506         return pamt_sz;
507 }
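/*
 * Editorial example (assumed entry size): for a hypothetical 1G TDMR
 * and a 16-byte 4K PAMT entry size reported by the TDX module, the
 * 4K-level PAMT needs 1G / 4K = 262144 entries, i.e. 4M, which is
 * already 4K aligned.  The 2M and 1G levels need proportionally fewer
 * entries (512 and 1 respectively).
 */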
508 
509 /*
510  * Locate a NUMA node which should hold the allocation of the @tdmr
511  * PAMT.  This node will have some memory covered by the TDMR.  The
512  * relative amount of memory covered is not considered.
513  */
514 static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
515 {
516         struct tdx_memblock *tmb;
517 
518         /*
519          * A TDMR must cover at least part of one TMB.  That TMB will end
520          * after the TDMR begins.  But, that TMB may have started before
521          * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
522          * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
523          */
524         list_for_each_entry(tmb, tmb_list, list) {
525                 if (tmb->end_pfn > PHYS_PFN(tdmr->base))
526                         return tmb->nid;
527         }
528 
529         /*
530          * Fall back to allocating the TDMR's metadata from node 0 when
531          * no TDX memory block can be found.  This should never happen
532          * since TDMRs originate from TDX memory blocks.
533          */
534         pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
535                         tdmr->base, tdmr_end(tdmr));
536         return 0;
537 }
538 
539 /*
540  * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
541  * within @tdmr, and set up PAMTs for @tdmr.
542  */
543 static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
544                             struct list_head *tmb_list,
545                             u16 pamt_entry_size[])
546 {
547         unsigned long pamt_base[TDX_PS_NR];
548         unsigned long pamt_size[TDX_PS_NR];
549         unsigned long tdmr_pamt_base;
550         unsigned long tdmr_pamt_size;
551         struct page *pamt;
552         int pgsz, nid;
553 
554         nid = tdmr_get_nid(tdmr, tmb_list);
555 
556         /*
557          * Calculate the PAMT size for each TDX supported page size
558          * and the total PAMT size.
559          */
560         tdmr_pamt_size = 0;
561         for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
562                 pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
563                                         pamt_entry_size[pgsz]);
564                 tdmr_pamt_size += pamt_size[pgsz];
565         }
566 
567         /*
568          * Allocate one chunk of physically contiguous memory for all
569          * PAMTs.  This helps minimize the PAMT's use of reserved areas
570          * in overlapped TDMRs.
571          */
572         pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
573                         nid, &node_online_map);
574         if (!pamt)
575                 return -ENOMEM;
576 
577         /*
578          * Break the contiguous allocation back up into the
579          * individual PAMTs for each page size.
580          */
581         tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
582         for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
583                 pamt_base[pgsz] = tdmr_pamt_base;
584                 tdmr_pamt_base += pamt_size[pgsz];
585         }
586 
587         tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
588         tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
589         tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
590         tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
591         tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
592         tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
593 
594         return 0;
595 }
596 
597 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
598                           unsigned long *pamt_size)
599 {
600         unsigned long pamt_bs, pamt_sz;
601 
602         /*
603          * The PAMT was allocated in one contiguous unit.  The 4K PAMT
604          * should always point to the beginning of that allocation.
605          */
606         pamt_bs = tdmr->pamt_4k_base;
607         pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
608 
609         WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
610 
611         *pamt_base = pamt_bs;
612         *pamt_size = pamt_sz;
613 }
614 
615 static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
616                 void (*pamt_func)(unsigned long base, unsigned long size))
617 {
618         unsigned long pamt_base, pamt_size;
619 
620         tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
621 
622         /* Do nothing if PAMT hasn't been allocated for this TDMR */
623         if (!pamt_size)
624                 return;
625 
626         if (WARN_ON_ONCE(!pamt_base))
627                 return;
628 
629         pamt_func(pamt_base, pamt_size);
630 }
631 
632 static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
633 {
634         free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
635 }
636 
637 static void tdmr_free_pamt(struct tdmr_info *tdmr)
638 {
639         tdmr_do_pamt_func(tdmr, free_pamt);
640 }
641 
642 static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
643 {
644         int i;
645 
646         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
647                 tdmr_free_pamt(tdmr_entry(tdmr_list, i));
648 }
649 
650 /* Allocate and set up PAMTs for all TDMRs */
651 static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
652                                  struct list_head *tmb_list,
653                                  u16 pamt_entry_size[])
654 {
655         int i, ret = 0;
656 
657         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
658                 ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
659                                 pamt_entry_size);
660                 if (ret)
661                         goto err;
662         }
663 
664         return 0;
665 err:
666         tdmrs_free_pamt_all(tdmr_list);
667         return ret;
668 }
669 
670 /*
671  * Convert TDX private pages back to normal by using MOVDIR64B to
672  * clear these pages.  Note this function doesn't flush cache of
673  * these TDX private pages.  The caller should make sure of that.
674  */
675 static void reset_tdx_pages(unsigned long base, unsigned long size)
676 {
677         const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
678         unsigned long phys, end;
679 
680         end = base + size;
681         for (phys = base; phys < end; phys += 64)
682                 movdir64b(__va(phys), zero_page);
683 
684         /*
685          * MOVDIR64B uses WC protocol.  Use memory barrier to
686          * make sure any later user of these pages sees the
687          * updated data.
688          */
689         mb();
690 }
691 
692 static void tdmr_reset_pamt(struct tdmr_info *tdmr)
693 {
694         tdmr_do_pamt_func(tdmr, reset_tdx_pages);
695 }
696 
697 static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
698 {
699         int i;
700 
701         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
702                 tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
703 }
704 
705 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
706 {
707         unsigned long pamt_size = 0;
708         int i;
709 
710         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
711                 unsigned long base, size;
712 
713                 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
714                 pamt_size += size;
715         }
716 
717         return pamt_size / 1024;
718 }
719 
720 static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
721                               u64 size, u16 max_reserved_per_tdmr)
722 {
723         struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
724         int idx = *p_idx;
725 
726         /* Reserved area must be 4K aligned in offset and size */
727         if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
728                 return -EINVAL;
729 
730         if (idx >= max_reserved_per_tdmr) {
731                 pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
732                                 tdmr->base, tdmr_end(tdmr));
733                 return -ENOSPC;
734         }
735 
736         /*
737          * Consume one reserved area per call.  Make no effort to
738          * optimize or reduce the number of reserved areas which are
739          * consumed by contiguous reserved areas, for instance.
740          */
741         rsvd_areas[idx].offset = addr - tdmr->base;
742         rsvd_areas[idx].size = size;
743 
744         *p_idx = idx + 1;
745 
746         return 0;
747 }
748 
749 /*
750  * Go through @tmb_list to find holes between memory areas.  If any of
751  * those holes fall within @tdmr, set up a TDMR reserved area to cover
752  * the hole.
753  */
754 static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
755                                     struct tdmr_info *tdmr,
756                                     int *rsvd_idx,
757                                     u16 max_reserved_per_tdmr)
758 {
759         struct tdx_memblock *tmb;
760         u64 prev_end;
761         int ret;
762 
763         /*
764          * Start looking for reserved blocks at the
765          * beginning of the TDMR.
766          */
767         prev_end = tdmr->base;
768         list_for_each_entry(tmb, tmb_list, list) {
769                 u64 start, end;
770 
771                 start = PFN_PHYS(tmb->start_pfn);
772                 end   = PFN_PHYS(tmb->end_pfn);
773 
774                 /* Break if this region is after the TDMR */
775                 if (start >= tdmr_end(tdmr))
776                         break;
777 
778                 /* Exclude regions before this TDMR */
779                 if (end < tdmr->base)
780                         continue;
781 
782                 /*
783                  * Skip over memory areas that
784                  * have already been dealt with.
785                  */
786                 if (start <= prev_end) {
787                         prev_end = end;
788                         continue;
789                 }
790 
791                 /* Add the hole before this region */
792                 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
793                                 start - prev_end,
794                                 max_reserved_per_tdmr);
795                 if (ret)
796                         return ret;
797 
798                 prev_end = end;
799         }
800 
801         /* Add the hole after the last region if it exists. */
802         if (prev_end < tdmr_end(tdmr)) {
803                 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
804                                 tdmr_end(tdmr) - prev_end,
805                                 max_reserved_per_tdmr);
806                 if (ret)
807                         return ret;
808         }
809 
810         return 0;
811 }
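/*
 * Editorial example: consider a hypothetical TDMR covering [0, 4G) with
 * two TDX memory blocks at [1M, 2G) and [3G, 4G).  The loop above adds
 * two reserved areas: [0, 1M) (the gap before the first block) and
 * [2G, 3G) (the gap between the blocks).  No trailing area is added
 * because the last block ends exactly at tdmr_end().
 */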
812 
813 /*
814  * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
815  * overlaps with @tdmr, set up a TDMR reserved area to cover the
816  * overlapping part.
817  */
818 static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
819                                     struct tdmr_info *tdmr,
820                                     int *rsvd_idx,
821                                     u16 max_reserved_per_tdmr)
822 {
823         int i, ret;
824 
825         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
826                 struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
827                 unsigned long pamt_base, pamt_size, pamt_end;
828 
829                 tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
830                 /* Each TDMR must already have PAMT allocated */
831                 WARN_ON_ONCE(!pamt_size || !pamt_base);
832 
833                 pamt_end = pamt_base + pamt_size;
834                 /* Skip PAMTs outside of the given TDMR */
835                 if ((pamt_end <= tdmr->base) ||
836                                 (pamt_base >= tdmr_end(tdmr)))
837                         continue;
838 
839                 /* Only mark the part within the TDMR as reserved */
840                 if (pamt_base < tdmr->base)
841                         pamt_base = tdmr->base;
842                 if (pamt_end > tdmr_end(tdmr))
843                         pamt_end = tdmr_end(tdmr);
844 
845                 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
846                                 pamt_end - pamt_base,
847                                 max_reserved_per_tdmr);
848                 if (ret)
849                         return ret;
850         }
851 
852         return 0;
853 }
854 
855 /* Compare function called by sort() for TDMR reserved areas */
856 static int rsvd_area_cmp_func(const void *a, const void *b)
857 {
858         struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
859         struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
860 
861         if (r1->offset + r1->size <= r2->offset)
862                 return -1;
863         if (r1->offset >= r2->offset + r2->size)
864                 return 1;
865 
866         /* Reserved areas cannot overlap; the caller must guarantee that. */
867         WARN_ON_ONCE(1);
868         return -1;
869 }
870 
871 /*
872  * Populate reserved areas for the given @tdmr, including memory holes
873  * (via @tmb_list) and PAMTs (via @tdmr_list).
874  */
875 static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
876                                     struct list_head *tmb_list,
877                                     struct tdmr_info_list *tdmr_list,
878                                     u16 max_reserved_per_tdmr)
879 {
880         int ret, rsvd_idx = 0;
881 
882         ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
883                         max_reserved_per_tdmr);
884         if (ret)
885                 return ret;
886 
887         ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
888                         max_reserved_per_tdmr);
889         if (ret)
890                 return ret;
891 
892         /* TDX requires reserved areas listed in address ascending order */
893         sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
894                         rsvd_area_cmp_func, NULL);
895 
896         return 0;
897 }
898 
899 /*
900  * Populate reserved areas for all TDMRs in @tdmr_list, including memory
901  * holes (via @tmb_list) and PAMTs.
902  */
903 static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
904                                          struct list_head *tmb_list,
905                                          u16 max_reserved_per_tdmr)
906 {
907         int i;
908 
909         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
910                 int ret;
911 
912                 ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
913                                 tmb_list, tdmr_list, max_reserved_per_tdmr);
914                 if (ret)
915                         return ret;
916         }
917 
918         return 0;
919 }
920 
921 /*
922  * Construct a list of TDMRs on the preallocated space in @tdmr_list
923  * to cover all TDX memory regions in @tmb_list based on the TDX module
924  * TDMR global information in @tdmr_sysinfo.
925  */
926 static int construct_tdmrs(struct list_head *tmb_list,
927                            struct tdmr_info_list *tdmr_list,
928                            struct tdx_tdmr_sysinfo *tdmr_sysinfo)
929 {
930         int ret;
931 
932         ret = fill_out_tdmrs(tmb_list, tdmr_list);
933         if (ret)
934                 return ret;
935 
936         ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
937                         tdmr_sysinfo->pamt_entry_size);
938         if (ret)
939                 return ret;
940 
941         ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
942                         tdmr_sysinfo->max_reserved_per_tdmr);
943         if (ret)
944                 tdmrs_free_pamt_all(tdmr_list);
945 
946         /*
947          * The tdmr_info_list is read-only from here on out.
948          * Ensure that these writes are seen by other CPUs.
949          * Pairs with a smp_rmb() in is_pamt_page().
950          */
951         smp_wmb();
952 
953         return ret;
954 }
955 
956 static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
957 {
958         struct tdx_module_args args = {};
959         u64 *tdmr_pa_array;
960         size_t array_sz;
961         int i, ret;
962 
963         /*
964          * TDMRs are passed to the TDX module via an array of physical
965          * addresses of each TDMR.  The array itself also has certain
966          * alignment requirement.
967          */
968         array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
969         array_sz = roundup_pow_of_two(array_sz);
970         if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
971                 array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
972 
973         tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
974         if (!tdmr_pa_array)
975                 return -ENOMEM;
976 
977         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
978                 tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
979 
980         args.rcx = __pa(tdmr_pa_array);
981         args.rdx = tdmr_list->nr_consumed_tdmrs;
982         args.r8 = global_keyid;
983         ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
984 
985         /* Free the array as it is not required anymore. */
986         kfree(tdmr_pa_array);
987 
988         return ret;
989 }
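/*
 * Editorial example (illustrative sizing): with a hypothetical three
 * consumed TDMRs the raw array size is 3 * sizeof(u64) = 24 bytes,
 * rounded up to the next power of two (32) and then, if still smaller
 * than TDMR_INFO_PA_ARRAY_ALIGNMENT, bumped to that value.  Allocating
 * a power-of-two size relies on kzalloc()'s natural alignment for such
 * sizes to satisfy the TDX module's alignment requirement for the array.
 */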
990 
991 static int do_global_key_config(void *unused)
992 {
993         struct tdx_module_args args = {};
994 
995         return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
996 }
997 
998 /*
999  * Attempt to configure the global KeyID on all physical packages.
1000  *
1001  * This requires running code on at least one CPU in each package.
1002  * TDMR initialization will fail if any package in the
1003  * system has no online CPUs.
1004  *
1005  * This code takes no affirmative steps to online CPUs.  Callers (e.g.
1006  * KVM) can ensure success by ensuring sufficient CPUs are online and
1007  * can run SEAMCALLs.
1008  */
1009 static int config_global_keyid(void)
1010 {
1011         cpumask_var_t packages;
1012         int cpu, ret = -EINVAL;
1013 
1014         if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
1015                 return -ENOMEM;
1016 
1017         /*
1018          * Hardware doesn't guarantee cache coherency across different
1019          * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
1020          * (associated with KeyID 0) before the TDX module can use the
1021          * global KeyID to access the PAMT.  Given PAMTs are potentially
1022          * large (~1/256th of system RAM), just use WBINVD.
1023          */
1024         wbinvd_on_all_cpus();
1025 
1026         for_each_online_cpu(cpu) {
1027                 /*
1028                  * The key configuration only needs to be done once per
1029                  * package and will return an error if configured more
1030                  * than once.  Avoid doing it multiple times per package.
1031                  */
1032                 if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
1033                                         packages))
1034                         continue;
1035 
1036                 /*
1037                  * TDH.SYS.KEY.CONFIG cannot run concurrently on
1038                  * different cpus.  Do it one by one.
1039                  */
1040                 ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1041                 if (ret)
1042                         break;
1043         }
1044 
1045         free_cpumask_var(packages);
1046         return ret;
1047 }
1048 
1049 static int init_tdmr(struct tdmr_info *tdmr)
1050 {
1051         u64 next;
1052 
1053         /*
1054          * Initializing a TDMR can be time consuming.  To avoid long
1055          * SEAMCALLs, the TDX module may only initialize a part of the
1056          * TDMR in each call.
1057          */
1058         do {
1059                 struct tdx_module_args args = {
1060                         .rcx = tdmr->base,
1061                 };
1062                 int ret;
1063 
1064                 ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1065                 if (ret)
1066                         return ret;
1067                 /*
1068                  * RDX contains 'next-to-initialize' address if
1069                  * TDH.SYS.TDMR.INIT did not fully complete and
1070                  * should be retried.
1071                  */
1072                 next = args.rdx;
1073                 cond_resched();
1074                 /* Keep making SEAMCALLs until the TDMR is done */
1075         } while (next < tdmr->base + tdmr->size);
1076 
1077         return 0;
1078 }
1079 
1080 static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1081 {
1082         int i;
1083 
1084         /*
1085          * This operation is costly.  It can be parallelized,
1086          * but keep it simple for now.
1087          */
1088         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1089                 int ret;
1090 
1091                 ret = init_tdmr(tdmr_entry(tdmr_list, i));
1092                 if (ret)
1093                         return ret;
1094         }
1095 
1096         return 0;
1097 }
1098 
1099 static int init_tdx_module(void)
1100 {
1101         struct tdx_tdmr_sysinfo tdmr_sysinfo;
1102         int ret;
1103 
1104         /*
1105          * To keep things simple, assume that all TDX-protected memory
1106          * will come from the page allocator.  Make sure all pages in the
1107          * page allocator are TDX-usable memory.
1108          *
1109          * Build the list of "TDX-usable" memory regions which cover all
1110          * pages in the page allocator to guarantee that.  Do it while
1111          * holding mem_hotplug_lock read-lock as the memory hotplug code
1112          * path reads the @tdx_memlist to reject any new memory.
1113          */
1114         get_online_mems();
1115 
1116         ret = build_tdx_memlist(&tdx_memlist);
1117         if (ret)
1118                 goto out_put_tdxmem;
1119 
1120         ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo);
1121         if (ret)
1122                 goto err_free_tdxmem;
1123 
1124         /* Allocate enough space for constructing TDMRs */
1125         ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo);
1126         if (ret)
1127                 goto err_free_tdxmem;
1128 
1129         /* Cover all TDX-usable memory regions in TDMRs */
1130         ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo);
1131         if (ret)
1132                 goto err_free_tdmrs;
1133 
1134         /* Pass the TDMRs and the global KeyID to the TDX module */
1135         ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1136         if (ret)
1137                 goto err_free_pamts;
1138 
1139         /* Config the key of global KeyID on all packages */
1140         ret = config_global_keyid();
1141         if (ret)
1142                 goto err_reset_pamts;
1143 
1144         /* Initialize TDMRs to complete the TDX module initialization */
1145         ret = init_tdmrs(&tdx_tdmr_list);
1146         if (ret)
1147                 goto err_reset_pamts;
1148 
1149         pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1150 
1151 out_put_tdxmem:
1152         /*
1153          * @tdx_memlist is written here and read at memory hotplug time.
1154          * Lock out memory hotplug code while building it.
1155          */
1156         put_online_mems();
1157         return ret;
1158 
1159 err_reset_pamts:
1160         /*
1161          * Part of PAMTs may already have been initialized by the
1162          * TDX module.  Flush cache before returning PAMTs back
1163          * to the kernel.
1164          */
1165         wbinvd_on_all_cpus();
1166         /*
1167          * According to the TDX hardware spec, if the platform
1168          * doesn't have the "partial write machine check"
1169          * erratum, any kernel read/write will never cause #MC
1170          * in kernel space, thus it's OK to not convert PAMTs
1171          * back to normal.  But do the conversion anyway here
1172          * as suggested by the TDX spec.
1173          */
1174         tdmrs_reset_pamt_all(&tdx_tdmr_list);
1175 err_free_pamts:
1176         tdmrs_free_pamt_all(&tdx_tdmr_list);
1177 err_free_tdmrs:
1178         free_tdmr_list(&tdx_tdmr_list);
1179 err_free_tdxmem:
1180         free_tdx_memlist(&tdx_memlist);
1181         goto out_put_tdxmem;
1182 }
1183 
1184 static int __tdx_enable(void)
1185 {
1186         int ret;
1187 
1188         ret = init_tdx_module();
1189         if (ret) {
1190                 pr_err("module initialization failed (%d)\n", ret);
1191                 tdx_module_status = TDX_MODULE_ERROR;
1192                 return ret;
1193         }
1194 
1195         pr_info("module initialized\n");
1196         tdx_module_status = TDX_MODULE_INITIALIZED;
1197 
1198         return 0;
1199 }
1200 
1201 /**
1202  * tdx_enable - Enable TDX module to make it ready to run TDX guests
1203  *
1204  * This function assumes the caller has: 1) held read lock of CPU hotplug
1205  * lock to prevent any new cpu from becoming online; 2) done both VMXON
1206  * and tdx_cpu_enable() on all online cpus.
1207  *
1208  * This function requires there's at least one online cpu for each CPU
1209  * package to succeed.
1210  *
1211  * This function can be called in parallel by multiple callers.
1212  *
1213  * Return 0 if TDX is enabled successfully, otherwise error.
1214  */
1215 int tdx_enable(void)
1216 {
1217         int ret;
1218 
1219         if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1220                 return -ENODEV;
1221 
1222         lockdep_assert_cpus_held();
1223 
1224         mutex_lock(&tdx_module_lock);
1225 
1226         switch (tdx_module_status) {
1227         case TDX_MODULE_UNINITIALIZED:
1228                 ret = __tdx_enable();
1229                 break;
1230         case TDX_MODULE_INITIALIZED:
1231                 /* Already initialized, great, tell the caller. */
1232                 ret = 0;
1233                 break;
1234         default:
1235                 /* Failed to initialize in the previous attempts */
1236                 ret = -EINVAL;
1237                 break;
1238         }
1239 
1240         mutex_unlock(&tdx_module_lock);
1241 
1242         return ret;
1243 }
1244 EXPORT_SYMBOL_GPL(tdx_enable);
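/*
 * Illustrative caller-side ordering sketch (editorial note): per the
 * requirements documented above, a user such as KVM is expected to do
 * roughly:
 *
 *	cpus_read_lock();
 *	... VMXON + tdx_cpu_enable() on all online CPUs ...
 *	ret = tdx_enable();
 *	cpus_read_unlock();
 *
 * i.e. hold the CPU hotplug read lock so no CPU comes online without
 * having gone through tdx_cpu_enable(), then initialize the module.
 */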
1245 
1246 static bool is_pamt_page(unsigned long phys)
1247 {
1248         struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1249         int i;
1250 
1251         /* Ensure that all remote 'tdmr_list' writes are visible: */
1252         smp_rmb();
1253 
1254         /*
1255          * The TDX module is no longer returning TDX_SYS_NOT_READY and
1256          * is initialized.  The 'tdmr_list' was initialized long ago
1257          * and is now read-only.
1258          */
1259         for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1260                 unsigned long base, size;
1261 
1262                 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1263 
1264                 if (phys >= base && phys < (base + size))
1265                         return true;
1266         }
1267 
1268         return false;
1269 }
1270 
1271 /*
1272  * Return whether the memory page at the given physical address is TDX
1273  * private memory or not.
1274  *
1275  * This can be imprecise for two known reasons:
1276  * 1. PAMTs are private memory and exist before the TDX module is
1277  *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
1278  *    short window that occurs once per boot.
1279  * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1280  *    page.  However, the page can still cause #MC until it has been
1281  *    fully converted to shared using 64-byte writes like MOVDIR64B.
1282  *    Buggy hosts might still leave #MC-causing memory in place which
1283  *    this function can not detect.
1284  */
1285 static bool paddr_is_tdx_private(unsigned long phys)
1286 {
1287         struct tdx_module_args args = {
1288                 .rcx = phys & PAGE_MASK,
1289         };
1290         u64 sret;
1291 
1292         if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1293                 return false;
1294 
1295         /* Get page type from the TDX module */
1296         sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
1297 
1298         /*
1299          * The SEAMCALL will not return success unless there is a
1300          * working, "ready" TDX module.  Assume an absence of TDX
1301          * private pages until SEAMCALL is working.
1302          */
1303         if (sret)
1304                 return false;
1305 
1306         /*
1307          * SEAMCALL was successful -- read page type (via RCX):
1308          *
1309          *  - PT_NDA:   Page is not used by the TDX module
1310          *  - PT_RSVD:  Reserved for Non-TDX use
1311          *  - Others:   Page is used by the TDX module
1312          *
1313          * Note PAMT pages are marked as PT_RSVD but they are also TDX
1314          * private memory.
1315          */
1316         switch (args.rcx) {
1317         case PT_NDA:
1318                 return false;
1319         case PT_RSVD:
1320                 return is_pamt_page(phys);
1321         default:
1322                 return true;
1323         }
1324 }
1325 
1326 /*
1327  * Some TDX-capable CPUs have an erratum.  A write to TDX private
1328  * memory poisons that memory, and a subsequent read of that memory
1329  * triggers #MC.
1330  *
1331  * Help distinguish erratum-triggered #MCs from a normal hardware one.
1332  * Just print an additional message to show that such an #MC may be the
1333  * result of the erratum.
1334  */
1335 const char *tdx_dump_mce_info(struct mce *m)
1336 {
1337         if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1338                 return NULL;
1339 
1340         if (!paddr_is_tdx_private(m->addr))
1341                 return NULL;
1342 
1343         return "TDX private memory error. Possible kernel bug.";
1344 }
1345 
1346 static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1347                                             u32 *nr_tdx_keyids)
1348 {
1349         u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1350         int ret;
1351 
1352         /*
1353          * IA32_MKTME_KEYID_PARTITIONING:
1354          *   Bit [31:0]:        Number of MKTME KeyIDs.
1355          *   Bit [63:32]:       Number of TDX private KeyIDs.
1356          */
1357         ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1358                         &_nr_tdx_keyids);
1359         if (ret || !_nr_tdx_keyids)
1360                 return -EINVAL;
1361 
1362         /* TDX KeyIDs start after the last MKTME KeyID. */
1363         _tdx_keyid_start = _nr_mktme_keyids + 1;
1364 
1365         *tdx_keyid_start = _tdx_keyid_start;
1366         *nr_tdx_keyids = _nr_tdx_keyids;
1367 
1368         return 0;
1369 }
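/*
 * Editorial example: if the MSR reports a hypothetical 32 MKTME KeyIDs
 * and 32 TDX private KeyIDs, then *tdx_keyid_start = 32 + 1 = 33 and
 * *nr_tdx_keyids = 32, i.e. the TDX private KeyID range is [33, 65).
 * tdx_init() later takes KeyID 33 as the global KeyID and leaves
 * [34, 65) for TDX guests.
 */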
1370 
1371 static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1372 {
1373         struct tdx_memblock *tmb;
1374 
1375         /*
1376          * This check assumes that the start_pfn<->end_pfn range does not
1377          * cross multiple @tdx_memlist entries.  A single memory online
1378          * event across multiple memblocks (from which @tdx_memlist
1379          * entries are derived at the time of module initialization) is
1380          * not possible.  This is because memory offline/online is done
1381          * on the granularity of 'struct memory_block', and the hotpluggable
1382          * memory region (one memblock) must be a multiple of memory_block.
1383          */
1384         list_for_each_entry(tmb, &tdx_memlist, list) {
1385                 if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1386                         return true;
1387         }
1388         return false;
1389 }
1390 
1391 static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1392                                void *v)
1393 {
1394         struct memory_notify *mn = v;
1395 
1396         if (action != MEM_GOING_ONLINE)
1397                 return NOTIFY_OK;
1398 
1399         /*
1400          * Empty list means TDX isn't enabled.  Allow any memory
1401          * to go online.
1402          */
1403         if (list_empty(&tdx_memlist))
1404                 return NOTIFY_OK;
1405 
1406         /*
1407          * The TDX memory configuration is static and can not be
1408          * changed.  Reject onlining any memory which is outside of
1409          * the static configuration whether it supports TDX or not.
1410          */
1411         if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1412                 return NOTIFY_OK;
1413 
1414         return NOTIFY_BAD;
1415 }
1416 
1417 static struct notifier_block tdx_memory_nb = {
1418         .notifier_call = tdx_memory_notifier,
1419 };
1420 
1421 static void __init check_tdx_erratum(void)
1422 {
1423         /*
1424          * These CPUs have an erratum.  A partial write from non-TD
1425          * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1426          * private memory poisons that memory, and a subsequent read of
1427          * that memory triggers #MC.
1428          */
1429         switch (boot_cpu_data.x86_vfm) {
1430         case INTEL_SAPPHIRERAPIDS_X:
1431         case INTEL_EMERALDRAPIDS_X:
1432                 setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1433         }
1434 }
1435 
1436 void __init tdx_init(void)
1437 {
1438         u32 tdx_keyid_start, nr_tdx_keyids;
1439         int err;
1440 
1441         err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1442         if (err)
1443                 return;
1444 
1445         pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1446                         tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1447 
1448         /*
1449          * The TDX module itself requires one 'global KeyID' to protect
1450          * its metadata.  If there's only one TDX KeyID, there won't be
1451          * any left for TDX guests thus there's no point to enable TDX
1452          * at all.
1453          */
1454         if (nr_tdx_keyids < 2) {
1455                 pr_err("initialization failed: too few private KeyIDs available.\n");
1456                 return;
1457         }
1458 
1459         /*
1460          * At this point, hibernation_available() indicates whether or
1461          * not hibernation support has been permanently disabled.
1462          */
1463         if (hibernation_available()) {
1464                 pr_err("initialization failed: Hibernation support is enabled\n");
1465                 return;
1466         }
1467 
1468         err = register_memory_notifier(&tdx_memory_nb);
1469         if (err) {
1470                 pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1471                                 err);
1472                 return;
1473         }
1474 
1475 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1476         pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1477         acpi_suspend_lowlevel = NULL;
1478 #endif
1479 
1480         /*
1481          * Just use the first TDX KeyID as the 'global KeyID' and
1482          * leave the rest for TDX guests.
1483          */
1484         tdx_global_keyid = tdx_keyid_start;
1485         tdx_guest_keyid_start = tdx_keyid_start + 1;
1486         tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1487 
1488         setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1489 
1490         check_tdx_erratum();
1491 }
1492 
