GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/virt/vmx/tdx/tdx.c
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright(c) 2023 Intel Corporation.
4
*
5
* Intel Trusted Domain Extensions (TDX) support
6
*/
7
8
#include "asm/page_types.h"
9
#define pr_fmt(fmt) "virt/tdx: " fmt
10
11
#include <linux/types.h>
12
#include <linux/cache.h>
13
#include <linux/init.h>
14
#include <linux/errno.h>
15
#include <linux/printk.h>
16
#include <linux/cpu.h>
17
#include <linux/spinlock.h>
18
#include <linux/percpu-defs.h>
19
#include <linux/mutex.h>
20
#include <linux/list.h>
21
#include <linux/memblock.h>
22
#include <linux/memory.h>
23
#include <linux/minmax.h>
24
#include <linux/sizes.h>
25
#include <linux/pfn.h>
26
#include <linux/align.h>
27
#include <linux/sort.h>
28
#include <linux/log2.h>
29
#include <linux/acpi.h>
30
#include <linux/suspend.h>
31
#include <linux/idr.h>
32
#include <asm/page.h>
33
#include <asm/special_insns.h>
34
#include <asm/msr-index.h>
35
#include <asm/msr.h>
36
#include <asm/cpufeature.h>
37
#include <asm/tdx.h>
38
#include <asm/cpu_device_id.h>
39
#include <asm/processor.h>
40
#include <asm/mce.h>
41
#include "tdx.h"
42
43
static u32 tdx_global_keyid __ro_after_init;
44
static u32 tdx_guest_keyid_start __ro_after_init;
45
static u32 tdx_nr_guest_keyids __ro_after_init;
46
47
static DEFINE_IDA(tdx_guest_keyid_pool);
48
49
static DEFINE_PER_CPU(bool, tdx_lp_initialized);
50
51
static struct tdmr_info_list tdx_tdmr_list;
52
53
static enum tdx_module_status_t tdx_module_status;
54
static DEFINE_MUTEX(tdx_module_lock);
55
56
/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
57
static LIST_HEAD(tdx_memlist);
58
59
static struct tdx_sys_info tdx_sysinfo;
60
61
typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
62
63
static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
64
{
65
pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
66
}
67
68
static inline void seamcall_err_ret(u64 fn, u64 err,
69
struct tdx_module_args *args)
70
{
71
seamcall_err(fn, err, args);
72
pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
73
args->rcx, args->rdx, args->r8);
74
pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
75
args->r9, args->r10, args->r11);
76
}
77
78
static __always_inline int sc_retry_prerr(sc_func_t func,
79
sc_err_func_t err_func,
80
u64 fn, struct tdx_module_args *args)
81
{
82
u64 sret = sc_retry(func, fn, args);
83
84
if (sret == TDX_SUCCESS)
85
return 0;
86
87
if (sret == TDX_SEAMCALL_VMFAILINVALID)
88
return -ENODEV;
89
90
if (sret == TDX_SEAMCALL_GP)
91
return -EOPNOTSUPP;
92
93
if (sret == TDX_SEAMCALL_UD)
94
return -EACCES;
95
96
err_func(fn, sret, args);
97
return -EIO;
98
}
99
100
#define seamcall_prerr(__fn, __args) \
101
sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
102
103
#define seamcall_prerr_ret(__fn, __args) \
104
sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
105
106
/*
107
* Do the module global initialization once and return its result.
108
* It can be done on any cpu. It's always called with interrupts
109
* disabled.
110
*/
111
static int try_init_module_global(void)
112
{
113
struct tdx_module_args args = {};
114
static DEFINE_RAW_SPINLOCK(sysinit_lock);
115
static bool sysinit_done;
116
static int sysinit_ret;
117
118
lockdep_assert_irqs_disabled();
119
120
raw_spin_lock(&sysinit_lock);
121
122
if (sysinit_done)
123
goto out;
124
125
/* RCX is module attributes and all bits are reserved */
126
args.rcx = 0;
127
sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
128
129
/*
130
* The first SEAMCALL also detects the TDX module, thus
131
* it can fail because the TDX module is not loaded.
132
* Dump message to let the user know.
133
*/
134
if (sysinit_ret == -ENODEV)
135
pr_err("module not loaded\n");
136
137
sysinit_done = true;
138
out:
139
raw_spin_unlock(&sysinit_lock);
140
return sysinit_ret;
141
}
142
143
/**
144
* tdx_cpu_enable - Enable TDX on local cpu
145
*
146
* Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
147
* global initialization SEAMCALL if not done) on local cpu to make this
148
* cpu be ready to run any other SEAMCALLs.
149
*
150
* Always call this function via IPI function calls.
151
*
152
* Return 0 on success, otherwise errors.
153
*/
154
int tdx_cpu_enable(void)
155
{
156
struct tdx_module_args args = {};
157
int ret;
158
159
if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
160
return -ENODEV;
161
162
lockdep_assert_irqs_disabled();
163
164
if (__this_cpu_read(tdx_lp_initialized))
165
return 0;
166
167
/*
168
* The TDX module global initialization is the very first step
169
* to enable TDX. Need to do it first (if it hasn't been done)
170
* before the per-cpu initialization.
171
*/
172
ret = try_init_module_global();
173
if (ret)
174
return ret;
175
176
ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
177
if (ret)
178
return ret;
179
180
__this_cpu_write(tdx_lp_initialized, true);
181
182
return 0;
183
}
184
EXPORT_SYMBOL_GPL(tdx_cpu_enable);
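/*
 * Illustrative usage sketch (not part of the kernel source): a
 * hypothetical caller could run tdx_cpu_enable() on every online CPU
 * via an IPI function call, which satisfies the interrupts-disabled
 * requirement above.  VMXON is assumed to have already been done on
 * each CPU.  The helper name and error plumbing are made up for
 * illustration:
 *
 *	static void hypothetical_cpu_enable(void *failed)
 *	{
 *		if (tdx_cpu_enable())
 *			atomic_inc((atomic_t *)failed);
 *	}
 *
 *	atomic_t failed = ATOMIC_INIT(0);
 *
 *	on_each_cpu(hypothetical_cpu_enable, &failed, 1);
 *	if (atomic_read(&failed))
 *		return -EIO;
 */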
185
186
/*
187
* Add a memory region as a TDX memory block. The caller must make sure
188
* all memory regions are added in address ascending order and don't
189
* overlap.
190
*/
191
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
192
unsigned long end_pfn, int nid)
193
{
194
struct tdx_memblock *tmb;
195
196
tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
197
if (!tmb)
198
return -ENOMEM;
199
200
INIT_LIST_HEAD(&tmb->list);
201
tmb->start_pfn = start_pfn;
202
tmb->end_pfn = end_pfn;
203
tmb->nid = nid;
204
205
/* @tmb_list is protected by mem_hotplug_lock */
206
list_add_tail(&tmb->list, tmb_list);
207
return 0;
208
}
209
210
static void free_tdx_memlist(struct list_head *tmb_list)
211
{
212
/* @tmb_list is protected by mem_hotplug_lock */
213
while (!list_empty(tmb_list)) {
214
struct tdx_memblock *tmb = list_first_entry(tmb_list,
215
struct tdx_memblock, list);
216
217
list_del(&tmb->list);
218
kfree(tmb);
219
}
220
}
221
222
/*
223
* Ensure that all memblock memory regions are convertible to TDX
224
* memory. Once this has been established, stash the memblock
225
* ranges off in a secondary structure because memblock is modified
226
* in memory hotplug while TDX memory regions are fixed.
227
*/
228
static int build_tdx_memlist(struct list_head *tmb_list)
229
{
230
unsigned long start_pfn, end_pfn;
231
int i, nid, ret;
232
233
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
234
/*
235
* The first 1MB is not reported as TDX convertible memory.
236
* Although the first 1MB is always reserved and won't end up
237
* to the page allocator, it is still in memblock's memory
238
* regions. Skip them manually to exclude them as TDX memory.
239
*/
240
start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
241
if (start_pfn >= end_pfn)
242
continue;
243
244
/*
245
* Add the memory regions as TDX memory. The regions in
246
* memblock are already guaranteed to be in address
247
* ascending order and not to overlap.
248
*/
249
ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
250
if (ret)
251
goto err;
252
}
253
254
return 0;
255
err:
256
free_tdx_memlist(tmb_list);
257
return ret;
258
}
259
260
static int read_sys_metadata_field(u64 field_id, u64 *data)
261
{
262
struct tdx_module_args args = {};
263
int ret;
264
265
/*
266
* TDH.SYS.RD -- reads one global metadata field
267
* - RDX (in): the field to read
268
* - R8 (out): the field data
269
*/
270
args.rdx = field_id;
271
ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
272
if (ret)
273
return ret;
274
275
*data = args.r8;
276
277
return 0;
278
}
279
280
#include "tdx_global_metadata.c"
281
282
static int check_features(struct tdx_sys_info *sysinfo)
283
{
284
u64 tdx_features0 = sysinfo->features.tdx_features0;
285
286
if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
287
pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
288
return -EINVAL;
289
}
290
291
return 0;
292
}
293
294
/* Calculate the actual TDMR size */
295
static int tdmr_size_single(u16 max_reserved_per_tdmr)
296
{
297
int tdmr_sz;
298
299
/*
300
* The actual size of TDMR depends on the maximum
301
* number of reserved areas.
302
*/
303
tdmr_sz = sizeof(struct tdmr_info);
304
tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
305
306
return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
307
}
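/*
 * Worked example of the calculation above (all sizes are assumptions
 * for illustration only): if 'struct tdmr_info' were 64 bytes, each
 * 'struct tdmr_reserved_area' 16 bytes, max_reserved_per_tdmr 16 and
 * TDMR_INFO_ALIGNMENT 512, then:
 *
 *	tdmr_sz = 64 + 16 * 16 = 320 bytes
 *	ALIGN(320, 512)        = 512 bytes per TDMR
 */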
308
309
static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
310
struct tdx_sys_info_tdmr *sysinfo_tdmr)
311
{
312
size_t tdmr_sz, tdmr_array_sz;
313
void *tdmr_array;
314
315
tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
316
tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;
317
318
/*
319
* To keep things simple, allocate all TDMRs together.
320
* The buffer needs to be physically contiguous to make
321
* sure each TDMR is physically contiguous.
322
*/
323
tdmr_array = alloc_pages_exact(tdmr_array_sz,
324
GFP_KERNEL | __GFP_ZERO);
325
if (!tdmr_array)
326
return -ENOMEM;
327
328
tdmr_list->tdmrs = tdmr_array;
329
330
/*
331
* Keep the size of TDMR to find the target TDMR
332
* at a given index in the TDMR list.
333
*/
334
tdmr_list->tdmr_sz = tdmr_sz;
335
tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
336
tdmr_list->nr_consumed_tdmrs = 0;
337
338
return 0;
339
}
340
341
static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
342
{
343
free_pages_exact(tdmr_list->tdmrs,
344
tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
345
}
346
347
/* Get the TDMR from the list at the given index. */
348
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
349
int idx)
350
{
351
int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
352
353
return (void *)tdmr_list->tdmrs + tdmr_info_offset;
354
}
355
356
#define TDMR_ALIGNMENT SZ_1G
357
#define TDMR_ALIGN_DOWN(_addr) ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
358
#define TDMR_ALIGN_UP(_addr) ALIGN((_addr), TDMR_ALIGNMENT)
359
360
static inline u64 tdmr_end(struct tdmr_info *tdmr)
361
{
362
return tdmr->base + tdmr->size;
363
}
364
365
/*
366
* Take the memory referenced in @tmb_list and populate the
367
* preallocated @tdmr_list, following all the special alignment
368
* and size rules for TDMR.
369
*/
370
static int fill_out_tdmrs(struct list_head *tmb_list,
371
struct tdmr_info_list *tdmr_list)
372
{
373
struct tdx_memblock *tmb;
374
int tdmr_idx = 0;
375
376
/*
377
* Loop over TDX memory regions and fill out TDMRs to cover them.
378
* To keep it simple, always try to use one TDMR to cover one
379
* memory region.
380
*
381
* In practice TDX supports at least 64 TDMRs. A 2-socket system
382
* typically consumes fewer than 10 of those. This code is
383
* dumb and simple and may use more TDMRs than is strictly
384
* required.
385
*/
386
list_for_each_entry(tmb, tmb_list, list) {
387
struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
388
u64 start, end;
389
390
start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
391
end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
392
393
/*
394
* A valid size indicates the current TDMR has already
395
* been filled out to cover the previous memory region(s).
396
*/
397
if (tdmr->size) {
398
/*
399
* Loop to the next if the current memory region
400
* has already been fully covered.
401
*/
402
if (end <= tdmr_end(tdmr))
403
continue;
404
405
/* Otherwise, skip the already covered part. */
406
if (start < tdmr_end(tdmr))
407
start = tdmr_end(tdmr);
408
409
/*
410
* Create a new TDMR to cover the current memory
411
* region, or the remaining part of it.
412
*/
413
tdmr_idx++;
414
if (tdmr_idx >= tdmr_list->max_tdmrs) {
415
pr_warn("initialization failed: TDMRs exhausted.\n");
416
return -ENOSPC;
417
}
418
419
tdmr = tdmr_entry(tdmr_list, tdmr_idx);
420
}
421
422
tdmr->base = start;
423
tdmr->size = end - start;
424
}
425
426
/* @tdmr_idx is always the index of the last valid TDMR. */
427
tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
428
429
/*
430
* Warn early that the kernel is about to run out of TDMRs.
431
*
432
* This is an indication that TDMR allocation has to be
433
* reworked to be smarter to not run into an issue.
434
*/
435
if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
436
pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
437
tdmr_list->nr_consumed_tdmrs,
438
tdmr_list->max_tdmrs);
439
440
return 0;
441
}
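/*
 * Worked example for fill_out_tdmrs() (hypothetical layout): with two
 * TDX memory blocks [1M, 3G) and [3G + 2M, 4G) and 1G TDMR alignment:
 *
 *	tmb0: start aligns down to 0, end aligns up to 3G
 *	      -> TDMR0 = [0, 3G)
 *	tmb1: start aligns down to 3G (not below TDMR0's end), end
 *	      aligns up to 4G -> a new TDMR1 = [3G, 4G)
 *
 * so nr_consumed_tdmrs ends up as 2.
 */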
442
443
/*
444
* Calculate PAMT size given a TDMR and a page size. The returned
445
* PAMT size is always aligned up to 4K page boundary.
446
*/
447
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
448
u16 pamt_entry_size)
449
{
450
unsigned long pamt_sz, nr_pamt_entries;
451
452
switch (pgsz) {
453
case TDX_PS_4K:
454
nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
455
break;
456
case TDX_PS_2M:
457
nr_pamt_entries = tdmr->size >> PMD_SHIFT;
458
break;
459
case TDX_PS_1G:
460
nr_pamt_entries = tdmr->size >> PUD_SHIFT;
461
break;
462
default:
463
WARN_ON_ONCE(1);
464
return 0;
465
}
466
467
pamt_sz = nr_pamt_entries * pamt_entry_size;
468
/* TDX requires the PAMT size to be 4K aligned */
469
pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
470
471
return pamt_sz;
472
}
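/*
 * Worked example for tdmr_get_pamt_sz() (the entry size is an
 * assumption for illustration): for a 1G TDMR and a hypothetical
 * 16-byte PAMT entry for every page size:
 *
 *	4K: 1G / 4K = 262144 entries * 16 bytes = 4M
 *	2M: 1G / 2M =    512 entries * 16 bytes = 8K
 *	1G: 1G / 1G =      1 entry   * 16 bytes -> aligned up to 4K
 *
 * i.e. roughly 1/256th of the covered range, dominated by the 4K PAMT.
 */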
473
474
/*
475
* Locate a NUMA node which should hold the allocation of the @tdmr
476
* PAMT. This node will have some memory covered by the TDMR. The
477
* relative amount of memory covered is not considered.
478
*/
479
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
480
{
481
struct tdx_memblock *tmb;
482
483
/*
484
* A TDMR must cover at least part of one TMB. That TMB will end
485
* after the TDMR begins. But, that TMB may have started before
486
* the TDMR. Find the next 'tmb' that _ends_ after this TDMR
487
* begins. Ignore 'tmb' start addresses. They are irrelevant.
488
*/
489
list_for_each_entry(tmb, tmb_list, list) {
490
if (tmb->end_pfn > PHYS_PFN(tdmr->base))
491
return tmb->nid;
492
}
493
494
/*
495
* Fall back to allocating the TDMR's metadata from node 0 when
496
* no TDX memory block can be found. This should never happen
497
* since TDMRs originate from TDX memory blocks.
498
*/
499
pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
500
tdmr->base, tdmr_end(tdmr));
501
return 0;
502
}
503
504
/*
505
* Allocate PAMTs from the local NUMA node of some memory in @tmb_list
506
* within @tdmr, and set up PAMTs for @tdmr.
507
*/
508
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
509
struct list_head *tmb_list,
510
u16 pamt_entry_size[])
511
{
512
unsigned long pamt_base[TDX_PS_NR];
513
unsigned long pamt_size[TDX_PS_NR];
514
unsigned long tdmr_pamt_base;
515
unsigned long tdmr_pamt_size;
516
struct page *pamt;
517
int pgsz, nid;
518
519
nid = tdmr_get_nid(tdmr, tmb_list);
520
521
/*
522
* Calculate the PAMT size for each TDX supported page size
523
* and the total PAMT size.
524
*/
525
tdmr_pamt_size = 0;
526
for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
527
pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
528
pamt_entry_size[pgsz]);
529
tdmr_pamt_size += pamt_size[pgsz];
530
}
531
532
/*
533
* Allocate one chunk of physically contiguous memory for all
534
* PAMTs. This helps minimize the PAMT's use of reserved areas
535
* in overlapped TDMRs.
536
*/
537
pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
538
nid, &node_online_map);
539
if (!pamt)
540
return -ENOMEM;
541
542
/*
543
* Break the contiguous allocation back up into the
544
* individual PAMTs for each page size.
545
*/
546
tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
547
for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
548
pamt_base[pgsz] = tdmr_pamt_base;
549
tdmr_pamt_base += pamt_size[pgsz];
550
}
551
552
tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
553
tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
554
tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
555
tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
556
tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
557
tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
558
559
return 0;
560
}
561
562
static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
563
unsigned long *pamt_size)
564
{
565
unsigned long pamt_bs, pamt_sz;
566
567
/*
568
* The PAMT was allocated in one contiguous unit. The 4K PAMT
569
* should always point to the beginning of that allocation.
570
*/
571
pamt_bs = tdmr->pamt_4k_base;
572
pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
573
574
WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
575
576
*pamt_base = pamt_bs;
577
*pamt_size = pamt_sz;
578
}
579
580
static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
581
void (*pamt_func)(unsigned long base, unsigned long size))
582
{
583
unsigned long pamt_base, pamt_size;
584
585
tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
586
587
/* Do nothing if PAMT hasn't been allocated for this TDMR */
588
if (!pamt_size)
589
return;
590
591
if (WARN_ON_ONCE(!pamt_base))
592
return;
593
594
pamt_func(pamt_base, pamt_size);
595
}
596
597
static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
598
{
599
free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
600
}
601
602
static void tdmr_free_pamt(struct tdmr_info *tdmr)
603
{
604
tdmr_do_pamt_func(tdmr, free_pamt);
605
}
606
607
static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
608
{
609
int i;
610
611
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
612
tdmr_free_pamt(tdmr_entry(tdmr_list, i));
613
}
614
615
/* Allocate and set up PAMTs for all TDMRs */
616
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
617
struct list_head *tmb_list,
618
u16 pamt_entry_size[])
619
{
620
int i, ret = 0;
621
622
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
623
ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
624
pamt_entry_size);
625
if (ret)
626
goto err;
627
}
628
629
return 0;
630
err:
631
tdmrs_free_pamt_all(tdmr_list);
632
return ret;
633
}
634
635
/*
636
* Convert TDX private pages back to normal by using MOVDIR64B to clear these
637
* pages. Typically, any write to the page will convert it from TDX private back
638
* to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to
639
* do the conversion explicitly via MOVDIR64B.
640
*/
641
static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
642
{
643
const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
644
unsigned long phys, end;
645
646
if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
647
return;
648
649
end = base + size;
650
for (phys = base; phys < end; phys += 64)
651
movdir64b(__va(phys), zero_page);
652
653
/*
654
* MOVDIR64B uses WC protocol. Use memory barrier to
655
* make sure any later user of these pages sees the
656
* updated data.
657
*/
658
mb();
659
}
660
661
void tdx_quirk_reset_page(struct page *page)
662
{
663
tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
664
}
665
EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);
666
667
static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
668
{
669
tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr);
670
}
671
672
static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list)
673
{
674
int i;
675
676
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
677
tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i));
678
}
679
680
static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
681
{
682
unsigned long pamt_size = 0;
683
int i;
684
685
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
686
unsigned long base, size;
687
688
tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
689
pamt_size += size;
690
}
691
692
return pamt_size / 1024;
693
}
694
695
static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
696
u64 size, u16 max_reserved_per_tdmr)
697
{
698
struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
699
int idx = *p_idx;
700
701
/* Reserved area must be 4K aligned in offset and size */
702
if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
703
return -EINVAL;
704
705
if (idx >= max_reserved_per_tdmr) {
706
pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
707
tdmr->base, tdmr_end(tdmr));
708
return -ENOSPC;
709
}
710
711
/*
712
* Consume one reserved area per call. Make no effort to
713
* optimize or reduce the number of reserved areas which are
714
* consumed by contiguous reserved areas, for instance.
715
*/
716
rsvd_areas[idx].offset = addr - tdmr->base;
717
rsvd_areas[idx].size = size;
718
719
*p_idx = idx + 1;
720
721
return 0;
722
}
723
724
/*
725
* Go through @tmb_list to find holes between memory areas. If any of
726
* those holes fall within @tdmr, set up a TDMR reserved area to cover
727
* the hole.
728
*/
729
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
730
struct tdmr_info *tdmr,
731
int *rsvd_idx,
732
u16 max_reserved_per_tdmr)
733
{
734
struct tdx_memblock *tmb;
735
u64 prev_end;
736
int ret;
737
738
/*
739
* Start looking for reserved blocks at the
740
* beginning of the TDMR.
741
*/
742
prev_end = tdmr->base;
743
list_for_each_entry(tmb, tmb_list, list) {
744
u64 start, end;
745
746
start = PFN_PHYS(tmb->start_pfn);
747
end = PFN_PHYS(tmb->end_pfn);
748
749
/* Break if this region is after the TDMR */
750
if (start >= tdmr_end(tdmr))
751
break;
752
753
/* Exclude regions before this TDMR */
754
if (end < tdmr->base)
755
continue;
756
757
/*
758
* Skip over memory areas that
759
* have already been dealt with.
760
*/
761
if (start <= prev_end) {
762
prev_end = end;
763
continue;
764
}
765
766
/* Add the hole before this region */
767
ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
768
start - prev_end,
769
max_reserved_per_tdmr);
770
if (ret)
771
return ret;
772
773
prev_end = end;
774
}
775
776
/* Add the hole after the last region if it exists. */
777
if (prev_end < tdmr_end(tdmr)) {
778
ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
779
tdmr_end(tdmr) - prev_end,
780
max_reserved_per_tdmr);
781
if (ret)
782
return ret;
783
}
784
785
return 0;
786
}
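/*
 * Worked example for tdmr_populate_rsvd_holes() (hypothetical layout):
 * for a TDMR [0, 3G) covering TDX memory blocks [1M, 1G) and [2G, 3G):
 *
 *	prev_end starts at 0
 *	tmb [1M, 1G): 1M > prev_end -> reserve hole [0, 1M),  prev_end = 1G
 *	tmb [2G, 3G): 2G > prev_end -> reserve hole [1G, 2G), prev_end = 3G
 *	prev_end == tdmr_end(tdmr)  -> no trailing hole
 */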
787
788
/*
789
* Go through @tdmr_list to find all PAMTs. If any of those PAMTs
790
* overlaps with @tdmr, set up a TDMR reserved area to cover the
791
* overlapping part.
792
*/
793
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
794
struct tdmr_info *tdmr,
795
int *rsvd_idx,
796
u16 max_reserved_per_tdmr)
797
{
798
int i, ret;
799
800
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
801
struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
802
unsigned long pamt_base, pamt_size, pamt_end;
803
804
tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
805
/* Each TDMR must already have PAMT allocated */
806
WARN_ON_ONCE(!pamt_size || !pamt_base);
807
808
pamt_end = pamt_base + pamt_size;
809
/* Skip PAMTs outside of the given TDMR */
810
if ((pamt_end <= tdmr->base) ||
811
(pamt_base >= tdmr_end(tdmr)))
812
continue;
813
814
/* Only mark the part within the TDMR as reserved */
815
if (pamt_base < tdmr->base)
816
pamt_base = tdmr->base;
817
if (pamt_end > tdmr_end(tdmr))
818
pamt_end = tdmr_end(tdmr);
819
820
ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
821
pamt_end - pamt_base,
822
max_reserved_per_tdmr);
823
if (ret)
824
return ret;
825
}
826
827
return 0;
828
}
829
830
/* Compare function called by sort() for TDMR reserved areas */
831
static int rsvd_area_cmp_func(const void *a, const void *b)
832
{
833
struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
834
struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
835
836
if (r1->offset + r1->size <= r2->offset)
837
return -1;
838
if (r1->offset >= r2->offset + r2->size)
839
return 1;
840
841
/* Reserved areas cannot overlap; the caller must guarantee that. */
842
WARN_ON_ONCE(1);
843
return -1;
844
}
845
846
/*
847
* Populate reserved areas for the given @tdmr, including memory holes
848
* (via @tmb_list) and PAMTs (via @tdmr_list).
849
*/
850
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
851
struct list_head *tmb_list,
852
struct tdmr_info_list *tdmr_list,
853
u16 max_reserved_per_tdmr)
854
{
855
int ret, rsvd_idx = 0;
856
857
ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
858
max_reserved_per_tdmr);
859
if (ret)
860
return ret;
861
862
ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
863
max_reserved_per_tdmr);
864
if (ret)
865
return ret;
866
867
/* TDX requires reserved areas listed in address ascending order */
868
sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
869
rsvd_area_cmp_func, NULL);
870
871
return 0;
872
}
873
874
/*
875
* Populate reserved areas for all TDMRs in @tdmr_list, including memory
876
* holes (via @tmb_list) and PAMTs.
877
*/
878
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
879
struct list_head *tmb_list,
880
u16 max_reserved_per_tdmr)
881
{
882
int i;
883
884
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
885
int ret;
886
887
ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
888
tmb_list, tdmr_list, max_reserved_per_tdmr);
889
if (ret)
890
return ret;
891
}
892
893
return 0;
894
}
895
896
/*
897
* Construct a list of TDMRs on the preallocated space in @tdmr_list
898
* to cover all TDX memory regions in @tmb_list based on the TDX module
899
* TDMR global information in @sysinfo_tdmr.
900
*/
901
static int construct_tdmrs(struct list_head *tmb_list,
902
struct tdmr_info_list *tdmr_list,
903
struct tdx_sys_info_tdmr *sysinfo_tdmr)
904
{
905
u16 pamt_entry_size[TDX_PS_NR] = {
906
sysinfo_tdmr->pamt_4k_entry_size,
907
sysinfo_tdmr->pamt_2m_entry_size,
908
sysinfo_tdmr->pamt_1g_entry_size,
909
};
910
int ret;
911
912
ret = fill_out_tdmrs(tmb_list, tdmr_list);
913
if (ret)
914
return ret;
915
916
ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
917
if (ret)
918
return ret;
919
920
ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
921
sysinfo_tdmr->max_reserved_per_tdmr);
922
if (ret)
923
tdmrs_free_pamt_all(tdmr_list);
924
925
/*
926
* The tdmr_info_list is read-only from here on out.
927
* Ensure that these writes are seen by other CPUs.
928
* Pairs with a smp_rmb() in is_pamt_page().
929
*/
930
smp_wmb();
931
932
return ret;
933
}
934
935
static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
936
{
937
struct tdx_module_args args = {};
938
u64 *tdmr_pa_array;
939
size_t array_sz;
940
int i, ret;
941
942
/*
943
* TDMRs are passed to the TDX module via an array of physical
944
* addresses of each TDMR. The array itself also has certain
945
* alignment requirement.
946
*/
947
array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
948
array_sz = roundup_pow_of_two(array_sz);
949
if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
950
array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
951
952
tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
953
if (!tdmr_pa_array)
954
return -ENOMEM;
955
956
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
957
tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
958
959
args.rcx = __pa(tdmr_pa_array);
960
args.rdx = tdmr_list->nr_consumed_tdmrs;
961
args.r8 = global_keyid;
962
ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
963
964
/* Free the array as it is not required anymore. */
965
kfree(tdmr_pa_array);
966
967
return ret;
968
}
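/*
 * Worked example for the array sizing above (the alignment value is an
 * assumption for illustration): with 3 consumed TDMRs and a
 * hypothetical TDMR_INFO_PA_ARRAY_ALIGNMENT of 512:
 *
 *	array_sz = 3 * 8 = 24  ->  roundup_pow_of_two  ->  32
 *	32 < 512               ->  array_sz = 512
 *
 * so one 512-byte allocation holds the whole physical-address array.
 */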
969
970
static int do_global_key_config(void *unused)
971
{
972
struct tdx_module_args args = {};
973
974
return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
975
}
976
977
/*
978
* Attempt to configure the global KeyID on all physical packages.
979
*
980
* This requires running code on at least one CPU in each package.
981
* TDMR initialization will fail if any package in the
982
* system has no online CPUs.
983
*
984
* This code takes no affirmative steps to online CPUs. Callers (aka.
985
* KVM) can ensure success by ensuring sufficient CPUs are online and
986
* can run SEAMCALLs.
987
*/
988
static int config_global_keyid(void)
989
{
990
cpumask_var_t packages;
991
int cpu, ret = -EINVAL;
992
993
if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
994
return -ENOMEM;
995
996
/*
997
* Hardware doesn't guarantee cache coherency across different
998
* KeyIDs. The kernel needs to flush PAMT's dirty cachelines
999
* (associated with KeyID 0) before the TDX module can use the
1000
* global KeyID to access the PAMT. Given PAMTs are potentially
1001
* large (~1/256th of system RAM), just use WBINVD.
1002
*/
1003
wbinvd_on_all_cpus();
1004
1005
for_each_online_cpu(cpu) {
1006
/*
1007
* The key configuration only needs to be done once per
1008
* package and will return an error if configured more
1009
* than once. Avoid doing it multiple times per package.
1010
*/
1011
if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
1012
packages))
1013
continue;
1014
1015
/*
1016
* TDH.SYS.KEY.CONFIG cannot run concurrently on
1017
* different cpus. Do it one by one.
1018
*/
1019
ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1020
if (ret)
1021
break;
1022
}
1023
1024
free_cpumask_var(packages);
1025
return ret;
1026
}
1027
1028
static int init_tdmr(struct tdmr_info *tdmr)
1029
{
1030
u64 next;
1031
1032
/*
1033
* Initializing a TDMR can be time consuming. To avoid long
1034
* SEAMCALLs, the TDX module may only initialize a part of the
1035
* TDMR in each call.
1036
*/
1037
do {
1038
struct tdx_module_args args = {
1039
.rcx = tdmr->base,
1040
};
1041
int ret;
1042
1043
ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1044
if (ret)
1045
return ret;
1046
/*
1047
* RDX contains 'next-to-initialize' address if
1048
* TDH.SYS.TDMR.INIT did not fully complete and
1049
* should be retried.
1050
*/
1051
next = args.rdx;
1052
cond_resched();
1053
/* Keep making SEAMCALLs until the TDMR is done */
1054
} while (next < tdmr->base + tdmr->size);
1055
1056
return 0;
1057
}
1058
1059
static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1060
{
1061
int i;
1062
1063
/*
1064
* This operation is costly. It can be parallelized,
1065
* but keep it simple for now.
1066
*/
1067
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1068
int ret;
1069
1070
ret = init_tdmr(tdmr_entry(tdmr_list, i));
1071
if (ret)
1072
return ret;
1073
}
1074
1075
return 0;
1076
}
1077
1078
static int init_tdx_module(void)
1079
{
1080
int ret;
1081
1082
ret = get_tdx_sys_info(&tdx_sysinfo);
1083
if (ret)
1084
return ret;
1085
1086
/* Check whether the kernel can support this module */
1087
ret = check_features(&tdx_sysinfo);
1088
if (ret)
1089
return ret;
1090
1091
/*
1092
* To keep things simple, assume that all TDX-protected memory
1093
* will come from the page allocator. Make sure all pages in the
1094
* page allocator are TDX-usable memory.
1095
*
1096
* Build the list of "TDX-usable" memory regions which cover all
1097
* pages in the page allocator to guarantee that. Do it while
1098
* holding mem_hotplug_lock read-lock as the memory hotplug code
1099
* path reads the @tdx_memlist to reject any new memory.
1100
*/
1101
get_online_mems();
1102
1103
ret = build_tdx_memlist(&tdx_memlist);
1104
if (ret)
1105
goto out_put_tdxmem;
1106
1107
/* Allocate enough space for constructing TDMRs */
1108
ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
1109
if (ret)
1110
goto err_free_tdxmem;
1111
1112
/* Cover all TDX-usable memory regions in TDMRs */
1113
ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
1114
if (ret)
1115
goto err_free_tdmrs;
1116
1117
/* Pass the TDMRs and the global KeyID to the TDX module */
1118
ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1119
if (ret)
1120
goto err_free_pamts;
1121
1122
/* Config the key of global KeyID on all packages */
1123
ret = config_global_keyid();
1124
if (ret)
1125
goto err_reset_pamts;
1126
1127
/* Initialize TDMRs to complete the TDX module initialization */
1128
ret = init_tdmrs(&tdx_tdmr_list);
1129
if (ret)
1130
goto err_reset_pamts;
1131
1132
pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1133
1134
out_put_tdxmem:
1135
/*
1136
* @tdx_memlist is written here and read at memory hotplug time.
1137
* Lock out memory hotplug code while building it.
1138
*/
1139
put_online_mems();
1140
return ret;
1141
1142
err_reset_pamts:
1143
/*
1144
* Part of PAMTs may already have been initialized by the
1145
* TDX module. Flush cache before returning PAMTs back
1146
* to the kernel.
1147
*/
1148
wbinvd_on_all_cpus();
1149
tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list);
1150
err_free_pamts:
1151
tdmrs_free_pamt_all(&tdx_tdmr_list);
1152
err_free_tdmrs:
1153
free_tdmr_list(&tdx_tdmr_list);
1154
err_free_tdxmem:
1155
free_tdx_memlist(&tdx_memlist);
1156
goto out_put_tdxmem;
1157
}
1158
1159
static int __tdx_enable(void)
1160
{
1161
int ret;
1162
1163
ret = init_tdx_module();
1164
if (ret) {
1165
pr_err("module initialization failed (%d)\n", ret);
1166
tdx_module_status = TDX_MODULE_ERROR;
1167
return ret;
1168
}
1169
1170
pr_info("module initialized\n");
1171
tdx_module_status = TDX_MODULE_INITIALIZED;
1172
1173
return 0;
1174
}
1175
1176
/**
1177
* tdx_enable - Enable TDX module to make it ready to run TDX guests
1178
*
1179
* This function assumes the caller has: 1) held read lock of CPU hotplug
1180
* lock to prevent any new cpu from becoming online; 2) done both VMXON
1181
* and tdx_cpu_enable() on all online cpus.
1182
*
1183
* This function requires there's at least one online cpu for each CPU
1184
* package to succeed.
1185
*
1186
* This function can be called in parallel by multiple callers.
1187
*
1188
* Return 0 if TDX is enabled successfully, otherwise error.
1189
*/
1190
int tdx_enable(void)
1191
{
1192
int ret;
1193
1194
if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1195
return -ENODEV;
1196
1197
lockdep_assert_cpus_held();
1198
1199
mutex_lock(&tdx_module_lock);
1200
1201
switch (tdx_module_status) {
1202
case TDX_MODULE_UNINITIALIZED:
1203
ret = __tdx_enable();
1204
break;
1205
case TDX_MODULE_INITIALIZED:
1206
/* Already initialized, great, tell the caller. */
1207
ret = 0;
1208
break;
1209
default:
1210
/* Failed to initialize in the previous attempts */
1211
ret = -EINVAL;
1212
break;
1213
}
1214
1215
mutex_unlock(&tdx_module_lock);
1216
1217
return ret;
1218
}
1219
EXPORT_SYMBOL_GPL(tdx_enable);
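/*
 * Illustrative enabling sequence (a sketch; only the two exported TDX
 * helpers are real, the VMXON step is the caller's own code and is
 * named here as an assumption):
 *
 *	cpus_read_lock();
 *	// 1) Do VMXON plus tdx_cpu_enable() on all online CPUs, e.g.
 *	//    via IPI function calls with interrupts disabled.
 *	// 2) Then initialize the TDX module:
 *	ret = tdx_enable();
 *	cpus_read_unlock();
 */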
1220
1221
static bool is_pamt_page(unsigned long phys)
1222
{
1223
struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1224
int i;
1225
1226
/* Ensure that all remote 'tdmr_list' writes are visible: */
1227
smp_rmb();
1228
1229
/*
1230
* The TDX module is no longer returning TDX_SYS_NOT_READY and
1231
* is initialized. The 'tdmr_list' was initialized long ago
1232
* and is now read-only.
1233
*/
1234
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1235
unsigned long base, size;
1236
1237
tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1238
1239
if (phys >= base && phys < (base + size))
1240
return true;
1241
}
1242
1243
return false;
1244
}
1245
1246
/*
1247
* Return whether the memory page at the given physical address is TDX
1248
* private memory or not.
1249
*
1250
* This can be imprecise for two known reasons:
1251
* 1. PAMTs are private memory and exist before the TDX module is
1252
* ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively
1253
* short window that occurs once per boot.
1254
* 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1255
* page. However, the page can still cause #MC until it has been
1256
* fully converted to shared using 64-byte writes like MOVDIR64B.
1257
* Buggy hosts might still leave #MC-causing memory in place which
1258
* this function can not detect.
1259
*/
1260
static bool paddr_is_tdx_private(unsigned long phys)
1261
{
1262
struct tdx_module_args args = {
1263
.rcx = phys & PAGE_MASK,
1264
};
1265
u64 sret;
1266
1267
if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1268
return false;
1269
1270
/* Get page type from the TDX module */
1271
sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args);
1272
1273
/*
1274
* The SEAMCALL will not return success unless there is a
1275
* working, "ready" TDX module. Assume an absence of TDX
1276
* private pages until SEAMCALL is working.
1277
*/
1278
if (sret)
1279
return false;
1280
1281
/*
1282
* SEAMCALL was successful -- read page type (via RCX):
1283
*
1284
* - PT_NDA: Page is not used by the TDX module
1285
* - PT_RSVD: Reserved for Non-TDX use
1286
* - Others: Page is used by the TDX module
1287
*
1288
* Note PAMT pages are marked as PT_RSVD but they are also TDX
1289
* private memory.
1290
*/
1291
switch (args.rcx) {
1292
case PT_NDA:
1293
return false;
1294
case PT_RSVD:
1295
return is_pamt_page(phys);
1296
default:
1297
return true;
1298
}
1299
}
1300
1301
/*
1302
* Some TDX-capable CPUs have an erratum. A write to TDX private
1303
* memory poisons that memory, and a subsequent read of that memory
1304
* triggers #MC.
1305
*
1306
* Help distinguish erratum-triggered #MCs from a normal hardware one.
1307
* Just print an additional message to show that such an #MC may be
1308
* the result of the erratum.
1309
*/
1310
const char *tdx_dump_mce_info(struct mce *m)
1311
{
1312
if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1313
return NULL;
1314
1315
if (!paddr_is_tdx_private(m->addr))
1316
return NULL;
1317
1318
return "TDX private memory error. Possible kernel bug.";
1319
}
1320
1321
static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1322
u32 *nr_tdx_keyids)
1323
{
1324
u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1325
int ret;
1326
1327
/*
1328
* IA32_MKTME_KEYID_PARTITIONING:
1329
* Bit [31:0]: Number of MKTME KeyIDs.
1330
* Bit [63:32]: Number of TDX private KeyIDs.
1331
*/
1332
ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1333
&_nr_tdx_keyids);
1334
if (ret || !_nr_tdx_keyids)
1335
return -EINVAL;
1336
1337
/* TDX KeyIDs start after the last MKTME KeyID. */
1338
_tdx_keyid_start = _nr_mktme_keyids + 1;
1339
1340
*tdx_keyid_start = _tdx_keyid_start;
1341
*nr_tdx_keyids = _nr_tdx_keyids;
1342
1343
return 0;
1344
}
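/*
 * Worked example for record_keyid_partitioning() (the values are made
 * up): if the MSR reported 3 MKTME KeyIDs and 4 TDX private KeyIDs,
 * then tdx_keyid_start = 3 + 1 = 4 and the TDX KeyID range is [4, 8).
 * tdx_init() would later use KeyID 4 as the global KeyID and leave
 * [5, 8) for TDX guests.
 */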
1345
1346
static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1347
{
1348
struct tdx_memblock *tmb;
1349
1350
/*
1351
* This check assumes that the start_pfn<->end_pfn range does not
1352
* cross multiple @tdx_memlist entries. A single memory online
1353
* event across multiple memblocks (from which @tdx_memlist
1354
* entries are derived at the time of module initialization) is
1355
* not possible. This is because memory offline/online is done
1356
* on the granularity of 'struct memory_block', and the hotpluggable
1357
* memory region (one memblock) must be a multiple of the memory_block size.
1358
*/
1359
list_for_each_entry(tmb, &tdx_memlist, list) {
1360
if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1361
return true;
1362
}
1363
return false;
1364
}
1365
1366
static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1367
void *v)
1368
{
1369
struct memory_notify *mn = v;
1370
1371
if (action != MEM_GOING_ONLINE)
1372
return NOTIFY_OK;
1373
1374
/*
1375
* Empty list means TDX isn't enabled. Allow any memory
1376
* to go online.
1377
*/
1378
if (list_empty(&tdx_memlist))
1379
return NOTIFY_OK;
1380
1381
/*
1382
* The TDX memory configuration is static and cannot be
1383
* changed. Reject onlining any memory which is outside of
1384
* the static configuration whether it supports TDX or not.
1385
*/
1386
if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1387
return NOTIFY_OK;
1388
1389
return NOTIFY_BAD;
1390
}
1391
1392
static struct notifier_block tdx_memory_nb = {
1393
.notifier_call = tdx_memory_notifier,
1394
};
1395
1396
static void __init check_tdx_erratum(void)
1397
{
1398
/*
1399
* These CPUs have an erratum. A partial write from non-TD
1400
* software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1401
* private memory poisons that memory, and a subsequent read of
1402
* that memory triggers #MC.
1403
*/
1404
switch (boot_cpu_data.x86_vfm) {
1405
case INTEL_SAPPHIRERAPIDS_X:
1406
case INTEL_EMERALDRAPIDS_X:
1407
setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1408
}
1409
}
1410
1411
void __init tdx_init(void)
1412
{
1413
u32 tdx_keyid_start, nr_tdx_keyids;
1414
int err;
1415
1416
err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1417
if (err)
1418
return;
1419
1420
pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1421
tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1422
1423
/*
1424
* The TDX module itself requires one 'global KeyID' to protect
1425
* its metadata. If there's only one TDX KeyID, there won't be
1426
* any left for TDX guests, so there's no point in enabling TDX
1427
* at all.
1428
*/
1429
if (nr_tdx_keyids < 2) {
1430
pr_err("initialization failed: too few private KeyIDs available.\n");
1431
return;
1432
}
1433
1434
/*
1435
* At this point, hibernation_available() indicates whether or
1436
* not hibernation support has been permanently disabled.
1437
*/
1438
if (hibernation_available()) {
1439
pr_err("initialization failed: Hibernation support is enabled\n");
1440
return;
1441
}
1442
1443
err = register_memory_notifier(&tdx_memory_nb);
1444
if (err) {
1445
pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1446
err);
1447
return;
1448
}
1449
1450
#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1451
pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1452
acpi_suspend_lowlevel = NULL;
1453
#endif
1454
1455
/*
1456
* Just use the first TDX KeyID as the 'global KeyID' and
1457
* leave the rest for TDX guests.
1458
*/
1459
tdx_global_keyid = tdx_keyid_start;
1460
tdx_guest_keyid_start = tdx_keyid_start + 1;
1461
tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1462
1463
setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1464
1465
check_tdx_erratum();
1466
}
1467
1468
const struct tdx_sys_info *tdx_get_sysinfo(void)
1469
{
1470
const struct tdx_sys_info *p = NULL;
1471
1472
/* Make sure all fields in @tdx_sysinfo have been populated */
1473
mutex_lock(&tdx_module_lock);
1474
if (tdx_module_status == TDX_MODULE_INITIALIZED)
1475
p = (const struct tdx_sys_info *)&tdx_sysinfo;
1476
mutex_unlock(&tdx_module_lock);
1477
1478
return p;
1479
}
1480
EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
1481
1482
u32 tdx_get_nr_guest_keyids(void)
1483
{
1484
return tdx_nr_guest_keyids;
1485
}
1486
EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
1487
1488
int tdx_guest_keyid_alloc(void)
1489
{
1490
return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
1491
tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
1492
GFP_KERNEL);
1493
}
1494
EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
1495
1496
void tdx_guest_keyid_free(unsigned int keyid)
1497
{
1498
ida_free(&tdx_guest_keyid_pool, keyid);
1499
}
1500
EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
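/*
 * Illustrative pairing of the two helpers above (a sketch, not from
 * this file):
 *
 *	int keyid = tdx_guest_keyid_alloc();
 *
 *	if (keyid < 0)
 *		return keyid;	// no guest KeyID available
 *
 *	// ... use 'keyid' as the guest's HKID, e.g. for tdh_mng_create() ...
 *
 *	tdx_guest_keyid_free(keyid);
 */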
1501
1502
static inline u64 tdx_tdr_pa(struct tdx_td *td)
1503
{
1504
return page_to_phys(td->tdr_page);
1505
}
1506
1507
/*
1508
* The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
1509
* a CLFLUSH of pages is required before handing them to the TDX module.
1510
* Be conservative and make the code simpler by doing the CLFLUSH
1511
* unconditionally.
1512
*/
1513
static void tdx_clflush_page(struct page *page)
1514
{
1515
clflush_cache_range(page_to_virt(page), PAGE_SIZE);
1516
}
1517
1518
noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
1519
{
1520
args->rcx = td->tdvpr_pa;
1521
1522
return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
1523
}
1524
EXPORT_SYMBOL_GPL(tdh_vp_enter);
1525
1526
u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
1527
{
1528
struct tdx_module_args args = {
1529
.rcx = page_to_phys(tdcs_page),
1530
.rdx = tdx_tdr_pa(td),
1531
};
1532
1533
tdx_clflush_page(tdcs_page);
1534
return seamcall(TDH_MNG_ADDCX, &args);
1535
}
1536
EXPORT_SYMBOL_GPL(tdh_mng_addcx);
1537
1538
u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
1539
{
1540
struct tdx_module_args args = {
1541
.rcx = gpa,
1542
.rdx = tdx_tdr_pa(td),
1543
.r8 = page_to_phys(page),
1544
.r9 = page_to_phys(source),
1545
};
1546
u64 ret;
1547
1548
tdx_clflush_page(page);
1549
ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
1550
1551
*ext_err1 = args.rcx;
1552
*ext_err2 = args.rdx;
1553
1554
return ret;
1555
}
1556
EXPORT_SYMBOL_GPL(tdh_mem_page_add);
1557
1558
u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1559
{
1560
struct tdx_module_args args = {
1561
.rcx = gpa | level,
1562
.rdx = tdx_tdr_pa(td),
1563
.r8 = page_to_phys(page),
1564
};
1565
u64 ret;
1566
1567
tdx_clflush_page(page);
1568
ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args);
1569
1570
*ext_err1 = args.rcx;
1571
*ext_err2 = args.rdx;
1572
1573
return ret;
1574
}
1575
EXPORT_SYMBOL_GPL(tdh_mem_sept_add);
1576
1577
u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
1578
{
1579
struct tdx_module_args args = {
1580
.rcx = page_to_phys(tdcx_page),
1581
.rdx = vp->tdvpr_pa,
1582
};
1583
1584
tdx_clflush_page(tdcx_page);
1585
return seamcall(TDH_VP_ADDCX, &args);
1586
}
1587
EXPORT_SYMBOL_GPL(tdh_vp_addcx);
1588
1589
u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
1590
{
1591
struct tdx_module_args args = {
1592
.rcx = gpa | level,
1593
.rdx = tdx_tdr_pa(td),
1594
.r8 = page_to_phys(page),
1595
};
1596
u64 ret;
1597
1598
tdx_clflush_page(page);
1599
ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
1600
1601
*ext_err1 = args.rcx;
1602
*ext_err2 = args.rdx;
1603
1604
return ret;
1605
}
1606
EXPORT_SYMBOL_GPL(tdh_mem_page_aug);
1607
1608
u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
1609
{
1610
struct tdx_module_args args = {
1611
.rcx = gpa | level,
1612
.rdx = tdx_tdr_pa(td),
1613
};
1614
u64 ret;
1615
1616
ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args);
1617
1618
*ext_err1 = args.rcx;
1619
*ext_err2 = args.rdx;
1620
1621
return ret;
1622
}
1623
EXPORT_SYMBOL_GPL(tdh_mem_range_block);
1624
1625
u64 tdh_mng_key_config(struct tdx_td *td)
1626
{
1627
struct tdx_module_args args = {
1628
.rcx = tdx_tdr_pa(td),
1629
};
1630
1631
return seamcall(TDH_MNG_KEY_CONFIG, &args);
1632
}
1633
EXPORT_SYMBOL_GPL(tdh_mng_key_config);
1634
1635
u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
1636
{
1637
struct tdx_module_args args = {
1638
.rcx = tdx_tdr_pa(td),
1639
.rdx = hkid,
1640
};
1641
1642
tdx_clflush_page(td->tdr_page);
1643
return seamcall(TDH_MNG_CREATE, &args);
1644
}
1645
EXPORT_SYMBOL_GPL(tdh_mng_create);
1646
1647
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
1648
{
1649
struct tdx_module_args args = {
1650
.rcx = vp->tdvpr_pa,
1651
.rdx = tdx_tdr_pa(td),
1652
};
1653
1654
tdx_clflush_page(vp->tdvpr_page);
1655
return seamcall(TDH_VP_CREATE, &args);
1656
}
1657
EXPORT_SYMBOL_GPL(tdh_vp_create);
1658
1659
u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
1660
{
1661
struct tdx_module_args args = {
1662
.rcx = tdx_tdr_pa(td),
1663
.rdx = field,
1664
};
1665
u64 ret;
1666
1667
ret = seamcall_ret(TDH_MNG_RD, &args);
1668
1669
/* R8: Content of the field, or 0 in case of error. */
1670
*data = args.r8;
1671
1672
return ret;
1673
}
1674
EXPORT_SYMBOL_GPL(tdh_mng_rd);
1675
1676
u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
1677
{
1678
struct tdx_module_args args = {
1679
.rcx = gpa,
1680
.rdx = tdx_tdr_pa(td),
1681
};
1682
u64 ret;
1683
1684
ret = seamcall_ret(TDH_MR_EXTEND, &args);
1685
1686
*ext_err1 = args.rcx;
1687
*ext_err2 = args.rdx;
1688
1689
return ret;
1690
}
1691
EXPORT_SYMBOL_GPL(tdh_mr_extend);
1692
1693
u64 tdh_mr_finalize(struct tdx_td *td)
1694
{
1695
struct tdx_module_args args = {
1696
.rcx = tdx_tdr_pa(td),
1697
};
1698
1699
return seamcall(TDH_MR_FINALIZE, &args);
1700
}
1701
EXPORT_SYMBOL_GPL(tdh_mr_finalize);
1702
1703
u64 tdh_vp_flush(struct tdx_vp *vp)
1704
{
1705
struct tdx_module_args args = {
1706
.rcx = vp->tdvpr_pa,
1707
};
1708
1709
return seamcall(TDH_VP_FLUSH, &args);
1710
}
1711
EXPORT_SYMBOL_GPL(tdh_vp_flush);
1712
1713
u64 tdh_mng_vpflushdone(struct tdx_td *td)
1714
{
1715
struct tdx_module_args args = {
1716
.rcx = tdx_tdr_pa(td),
1717
};
1718
1719
return seamcall(TDH_MNG_VPFLUSHDONE, &args);
1720
}
1721
EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
1722
1723
u64 tdh_mng_key_freeid(struct tdx_td *td)
1724
{
1725
struct tdx_module_args args = {
1726
.rcx = tdx_tdr_pa(td),
1727
};
1728
1729
return seamcall(TDH_MNG_KEY_FREEID, &args);
1730
}
1731
EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
1732
1733
u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
1734
{
1735
struct tdx_module_args args = {
1736
.rcx = tdx_tdr_pa(td),
1737
.rdx = td_params,
1738
};
1739
u64 ret;
1740
1741
ret = seamcall_ret(TDH_MNG_INIT, &args);
1742
1743
*extended_err = args.rcx;
1744
1745
return ret;
1746
}
1747
EXPORT_SYMBOL_GPL(tdh_mng_init);
1748
1749
u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
1750
{
1751
struct tdx_module_args args = {
1752
.rcx = vp->tdvpr_pa,
1753
.rdx = field,
1754
};
1755
u64 ret;
1756
1757
ret = seamcall_ret(TDH_VP_RD, &args);
1758
1759
/* R8: Content of the field, or 0 in case of error. */
1760
*data = args.r8;
1761
1762
return ret;
1763
}
1764
EXPORT_SYMBOL_GPL(tdh_vp_rd);
1765
1766
u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
1767
{
1768
struct tdx_module_args args = {
1769
.rcx = vp->tdvpr_pa,
1770
.rdx = field,
1771
.r8 = data,
1772
.r9 = mask,
1773
};
1774
1775
return seamcall(TDH_VP_WR, &args);
1776
}
1777
EXPORT_SYMBOL_GPL(tdh_vp_wr);
1778
1779
u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
1780
{
1781
struct tdx_module_args args = {
1782
.rcx = vp->tdvpr_pa,
1783
.rdx = initial_rcx,
1784
.r8 = x2apicid,
1785
};
1786
1787
/* apicid requires version == 1. */
1788
return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
1789
}
1790
EXPORT_SYMBOL_GPL(tdh_vp_init);
1791
1792
/*
1793
* The TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX-defined formats.
1794
* So despite the names, they must be interpreted specially as described by the spec. Return
1795
* them only for error reporting purposes.
1796
*/
1797
u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size)
1798
{
1799
struct tdx_module_args args = {
1800
.rcx = page_to_phys(page),
1801
};
1802
u64 ret;
1803
1804
ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args);
1805
1806
*tdx_pt = args.rcx;
1807
*tdx_owner = args.rdx;
1808
*tdx_size = args.r8;
1809
1810
return ret;
1811
}
1812
EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
1813
1814
u64 tdh_mem_track(struct tdx_td *td)
1815
{
1816
struct tdx_module_args args = {
1817
.rcx = tdx_tdr_pa(td),
1818
};
1819
1820
return seamcall(TDH_MEM_TRACK, &args);
1821
}
1822
EXPORT_SYMBOL_GPL(tdh_mem_track);
1823
1824
u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
1825
{
1826
struct tdx_module_args args = {
1827
.rcx = gpa | level,
1828
.rdx = tdx_tdr_pa(td),
1829
};
1830
u64 ret;
1831
1832
ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args);
1833
1834
*ext_err1 = args.rcx;
1835
*ext_err2 = args.rdx;
1836
1837
return ret;
1838
}
1839
EXPORT_SYMBOL_GPL(tdh_mem_page_remove);
1840
1841
u64 tdh_phymem_cache_wb(bool resume)
1842
{
1843
struct tdx_module_args args = {
1844
.rcx = resume ? 1 : 0,
1845
};
1846
1847
return seamcall(TDH_PHYMEM_CACHE_WB, &args);
1848
}
1849
EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
1850
1851
u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
1852
{
1853
struct tdx_module_args args = {};
1854
1855
args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page);
1856
1857
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1858
}
1859
EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
1860
1861
u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
1862
{
1863
struct tdx_module_args args = {};
1864
1865
args.rcx = mk_keyed_paddr(hkid, page);
1866
1867
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
1868
}
1869
EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
1870
1871
#ifdef CONFIG_KEXEC_CORE
1872
void tdx_cpu_flush_cache_for_kexec(void)
1873
{
1874
lockdep_assert_preemption_disabled();
1875
1876
if (!this_cpu_read(cache_state_incoherent))
1877
return;
1878
1879
/*
1880
* Private memory cachelines need to be clean at the time of
1881
* kexec. Write them back now, as the caller promises that
1882
* there should be no more SEAMCALLs on this CPU.
1883
*/
1884
wbinvd();
1885
this_cpu_write(cache_state_incoherent, false);
1886
}
1887
EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec);
1888
#endif
1889
1890