Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/vmx/tdx.c
29524 views
1
// SPDX-License-Identifier: GPL-2.0
2
#include <linux/cleanup.h>
3
#include <linux/cpu.h>
4
#include <asm/cpufeature.h>
5
#include <asm/fpu/xcr.h>
6
#include <linux/misc_cgroup.h>
7
#include <linux/mmu_context.h>
8
#include <asm/tdx.h>
9
#include "capabilities.h"
10
#include "mmu.h"
11
#include "x86_ops.h"
12
#include "lapic.h"
13
#include "tdx.h"
14
#include "vmx.h"
15
#include "mmu/spte.h"
16
#include "common.h"
17
#include "posted_intr.h"
18
#include "irq.h"
19
#include <trace/events/kvm.h>
20
#include "trace.h"
21
22
#pragma GCC poison to_vmx
23
24
#undef pr_fmt
25
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27
#define pr_tdx_error(__fn, __err) \
28
pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
29
30
#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
31
pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
32
33
#define pr_tdx_error_1(__fn, __err, __rcx) \
34
__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
35
36
#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
37
__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
38
39
#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
40
__pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
41
42
bool enable_tdx __ro_after_init;
43
module_param_named(tdx, enable_tdx, bool, 0444);
44
45
#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
46
#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
47
48
static enum cpuhp_state tdx_cpuhp_state;
49
50
static const struct tdx_sys_info *tdx_sysinfo;
51
52
void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
53
{
54
KVM_BUG_ON(1, tdx->vcpu.kvm);
55
pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
56
}
57
58
void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
59
u64 val, u64 err)
60
{
61
KVM_BUG_ON(1, tdx->vcpu.kvm);
62
pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
63
}
64
65
#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
66
67
static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
68
{
69
return container_of(kvm, struct kvm_tdx, kvm);
70
}
71
72
static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
73
{
74
return container_of(vcpu, struct vcpu_tdx, vcpu);
75
}
76
77
static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
78
{
79
u64 val = KVM_SUPPORTED_TD_ATTRS;
80
81
if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
82
return 0;
83
84
val &= td_conf->attributes_fixed0;
85
86
return val;
87
}
88
89
static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
90
{
91
u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
92
93
if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
94
return 0;
95
96
val &= td_conf->xfam_fixed0;
97
98
return val;
99
}
100
101
static int tdx_get_guest_phys_addr_bits(const u32 eax)
102
{
103
return (eax & GENMASK(23, 16)) >> 16;
104
}
105
106
static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
107
{
108
return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
109
}
110
111
#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
112
113
static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
114
{
115
return entry->function == 7 && entry->index == 0 &&
116
(entry->ebx & TDX_FEATURE_TSX);
117
}
118
119
static void clear_tsx(struct kvm_cpuid_entry2 *entry)
120
{
121
entry->ebx &= ~TDX_FEATURE_TSX;
122
}
123
124
static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
125
{
126
return entry->function == 7 && entry->index == 0 &&
127
(entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
128
}
129
130
static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
131
{
132
entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
133
}
134
135
static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
136
{
137
if (has_tsx(entry))
138
clear_tsx(entry);
139
140
if (has_waitpkg(entry))
141
clear_waitpkg(entry);
142
}
143
144
static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
145
{
146
return has_tsx(entry) || has_waitpkg(entry);
147
}
148
149
#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
150
151
static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
152
{
153
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
154
155
entry->function = (u32)td_conf->cpuid_config_leaves[idx];
156
entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
157
entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
158
entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
159
entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
160
entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
161
162
if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
163
entry->index = 0;
164
165
/*
166
* The TDX module doesn't allow configuring the guest phys addr bits
167
* (EAX[23:16]). However, KVM uses it as an interface to the userspace
168
* to configure the GPAW. Report these bits as configurable.
169
*/
170
if (entry->function == 0x80000008)
171
entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
172
173
tdx_clear_unsupported_cpuid(entry);
174
}
175
176
#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
177
178
static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
179
struct kvm_tdx_capabilities *caps)
180
{
181
int i;
182
183
caps->supported_attrs = tdx_get_supported_attrs(td_conf);
184
if (!caps->supported_attrs)
185
return -EIO;
186
187
caps->supported_xfam = tdx_get_supported_xfam(td_conf);
188
if (!caps->supported_xfam)
189
return -EIO;
190
191
caps->cpuid.nent = td_conf->num_cpuid_config;
192
193
caps->user_tdvmcallinfo_1_r11 =
194
TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
195
196
for (i = 0; i < td_conf->num_cpuid_config; i++)
197
td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
198
199
return 0;
200
}
201
202
/*
203
* Some SEAMCALLs acquire the TDX module globally, and can fail with
204
* TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
205
*/
206
static DEFINE_MUTEX(tdx_lock);
207
208
static atomic_t nr_configured_hkid;
209
210
static bool tdx_operand_busy(u64 err)
211
{
212
return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
213
}
214
215
216
/*
217
* A per-CPU list of TD vCPUs associated with a given CPU.
218
* Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
219
* list.
220
* - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
221
* the old CPU during the IPI callback running on the old CPU, and then added
222
* to the per-CPU list of the new CPU.
223
* - When a TD is tearing down, all vCPUs are disassociated from their current
224
* running CPUs and removed from the per-CPU list during the IPI callback
225
* running on those CPUs.
226
* - When a CPU is brought down, traverse the per-CPU list to disassociate all
227
* associated TD vCPUs and remove them from the per-CPU list.
228
*/
229
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
230
231
static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
232
{
233
return to_tdx(vcpu)->vp_enter_args.r10;
234
}
235
236
static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
237
{
238
return to_tdx(vcpu)->vp_enter_args.r11;
239
}
240
241
static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
242
long val)
243
{
244
to_tdx(vcpu)->vp_enter_args.r10 = val;
245
}
246
247
static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
248
unsigned long val)
249
{
250
to_tdx(vcpu)->vp_enter_args.r11 = val;
251
}
252
253
static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
254
{
255
tdx_guest_keyid_free(kvm_tdx->hkid);
256
kvm_tdx->hkid = -1;
257
atomic_dec(&nr_configured_hkid);
258
misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
259
put_misc_cg(kvm_tdx->misc_cg);
260
kvm_tdx->misc_cg = NULL;
261
}
262
263
static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
264
{
265
return kvm_tdx->hkid > 0;
266
}
267
268
static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
269
{
270
lockdep_assert_irqs_disabled();
271
272
list_del(&to_tdx(vcpu)->cpu_list);
273
274
/*
275
* Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
276
* otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
277
* to its list before it's deleted from this CPU's list.
278
*/
279
smp_wmb();
280
281
vcpu->cpu = -1;
282
}
283
284
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
285
{
286
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
287
288
lockdep_assert_held_write(&kvm->mmu_lock);
289
290
WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
291
292
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
293
}
294
295
static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
296
{
297
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
298
299
lockdep_assert_held_write(&kvm->mmu_lock);
300
301
WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
302
}
303
304
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
305
static int __tdx_reclaim_page(struct page *page)
306
{
307
u64 err, rcx, rdx, r8;
308
309
err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
310
311
/*
312
* No need to check for TDX_OPERAND_BUSY; all TD pages are freed
313
* before the HKID is released and control pages have also been
314
* released at this point, so there is no possibility of contention.
315
*/
316
if (WARN_ON_ONCE(err)) {
317
pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
318
return -EIO;
319
}
320
return 0;
321
}
322
323
static int tdx_reclaim_page(struct page *page)
324
{
325
int r;
326
327
r = __tdx_reclaim_page(page);
328
if (!r)
329
tdx_quirk_reset_page(page);
330
return r;
331
}
332
333
334
/*
335
* Reclaim the TD control page(s) which are crypto-protected by TDX guest's
336
* private KeyID. Assume the cache associated with the TDX private KeyID has
337
* been flushed.
338
*/
339
static void tdx_reclaim_control_page(struct page *ctrl_page)
340
{
341
/*
342
* Leak the page if the kernel failed to reclaim the page.
343
* The kernel cannot use it safely anymore.
344
*/
345
if (tdx_reclaim_page(ctrl_page))
346
return;
347
348
__free_page(ctrl_page);
349
}
350
351
struct tdx_flush_vp_arg {
352
struct kvm_vcpu *vcpu;
353
u64 err;
354
};
355
356
static void tdx_flush_vp(void *_arg)
357
{
358
struct tdx_flush_vp_arg *arg = _arg;
359
struct kvm_vcpu *vcpu = arg->vcpu;
360
u64 err;
361
362
arg->err = 0;
363
lockdep_assert_irqs_disabled();
364
365
/* Task migration can race with CPU offlining. */
366
if (unlikely(vcpu->cpu != raw_smp_processor_id()))
367
return;
368
369
/*
370
* No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
371
* list tracking still needs to be updated so that it's correct if/when
372
* the vCPU does get initialized.
373
*/
374
if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
375
/*
376
* No need to retry. TDX Resources needed for TDH.VP.FLUSH are:
377
* TDVPR as exclusive, TDR as shared, and TDCS as shared. This
378
* vp flush function is called when destructing vCPU/TD or vCPU
379
* migration. No other thread uses TDVPR in those cases.
380
*/
381
err = tdh_vp_flush(&to_tdx(vcpu)->vp);
382
if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
383
/*
384
* This function is called in IPI context. Do not use
385
* printk to avoid console semaphore.
386
* The caller prints out the error message, instead.
387
*/
388
if (err)
389
arg->err = err;
390
}
391
}
392
393
tdx_disassociate_vp(vcpu);
394
}
395
396
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
397
{
398
struct tdx_flush_vp_arg arg = {
399
.vcpu = vcpu,
400
};
401
int cpu = vcpu->cpu;
402
403
if (unlikely(cpu == -1))
404
return;
405
406
smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
407
if (KVM_BUG_ON(arg.err, vcpu->kvm))
408
pr_tdx_error(TDH_VP_FLUSH, arg.err);
409
}
410
411
void tdx_disable_virtualization_cpu(void)
412
{
413
int cpu = raw_smp_processor_id();
414
struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
415
struct tdx_flush_vp_arg arg;
416
struct vcpu_tdx *tdx, *tmp;
417
unsigned long flags;
418
419
local_irq_save(flags);
420
/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
421
list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
422
arg.vcpu = &tdx->vcpu;
423
tdx_flush_vp(&arg);
424
}
425
local_irq_restore(flags);
426
427
/*
428
* Flush cache now if kexec is possible: this is necessary to avoid
429
* having dirty private memory cachelines when the new kernel boots,
430
* but WBINVD is a relatively expensive operation and doing it during
431
* kexec can exacerbate races in native_stop_other_cpus(). Do it
432
* now, since this is a safe moment and there is going to be no more
433
* TDX activity on this CPU from this point on.
434
*/
435
tdx_cpu_flush_cache_for_kexec();
436
}
437
438
#define TDX_SEAMCALL_RETRIES 10000
439
440
static void smp_func_do_phymem_cache_wb(void *unused)
441
{
442
u64 err = 0;
443
bool resume;
444
int i;
445
446
/*
447
* TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
448
* KeyID on the package or core. The TDX module may not finish the
449
* cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The
450
* kernel should retry it until it returns success w/o rescheduling.
451
*/
452
for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
453
resume = !!err;
454
err = tdh_phymem_cache_wb(resume);
455
switch (err) {
456
case TDX_INTERRUPTED_RESUMABLE:
457
continue;
458
case TDX_NO_HKID_READY_TO_WBCACHE:
459
err = TDX_SUCCESS; /* Already done by other thread */
460
fallthrough;
461
default:
462
goto out;
463
}
464
}
465
466
out:
467
if (WARN_ON_ONCE(err))
468
pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
469
}
470
471
void tdx_mmu_release_hkid(struct kvm *kvm)
472
{
473
bool packages_allocated, targets_allocated;
474
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
475
cpumask_var_t packages, targets;
476
struct kvm_vcpu *vcpu;
477
unsigned long j;
478
int i;
479
u64 err;
480
481
if (!is_hkid_assigned(kvm_tdx))
482
return;
483
484
packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
485
targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
486
cpus_read_lock();
487
488
kvm_for_each_vcpu(j, vcpu, kvm)
489
tdx_flush_vp_on_cpu(vcpu);
490
491
/*
492
* TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
493
* and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
494
* Multiple TDX guests can be destroyed simultaneously. Take the
495
* mutex to prevent it from getting error.
496
*/
497
mutex_lock(&tdx_lock);
498
499
/*
500
* Releasing HKID is in vm_destroy().
501
* After the above flushing vps, there should be no more vCPU
502
* associations, as all vCPU fds have been released at this stage.
503
*/
504
err = tdh_mng_vpflushdone(&kvm_tdx->td);
505
if (err == TDX_FLUSHVP_NOT_DONE)
506
goto out;
507
if (KVM_BUG_ON(err, kvm)) {
508
pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
509
pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
510
kvm_tdx->hkid);
511
goto out;
512
}
513
514
for_each_online_cpu(i) {
515
if (packages_allocated &&
516
cpumask_test_and_set_cpu(topology_physical_package_id(i),
517
packages))
518
continue;
519
if (targets_allocated)
520
cpumask_set_cpu(i, targets);
521
}
522
if (targets_allocated)
523
on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
524
else
525
on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
526
/*
527
* In the case of error in smp_func_do_phymem_cache_wb(), the following
528
* tdh_mng_key_freeid() will fail.
529
*/
530
err = tdh_mng_key_freeid(&kvm_tdx->td);
531
if (KVM_BUG_ON(err, kvm)) {
532
pr_tdx_error(TDH_MNG_KEY_FREEID, err);
533
pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
534
kvm_tdx->hkid);
535
} else {
536
tdx_hkid_free(kvm_tdx);
537
}
538
539
out:
540
mutex_unlock(&tdx_lock);
541
cpus_read_unlock();
542
free_cpumask_var(targets);
543
free_cpumask_var(packages);
544
}
545
546
static void tdx_reclaim_td_control_pages(struct kvm *kvm)
547
{
548
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
549
u64 err;
550
int i;
551
552
/*
553
* tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong
554
* heavily with TDX module. Give up freeing TD pages. As the function
555
* already warned, don't warn it again.
556
*/
557
if (is_hkid_assigned(kvm_tdx))
558
return;
559
560
if (kvm_tdx->td.tdcs_pages) {
561
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
562
if (!kvm_tdx->td.tdcs_pages[i])
563
continue;
564
565
tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
566
}
567
kfree(kvm_tdx->td.tdcs_pages);
568
kvm_tdx->td.tdcs_pages = NULL;
569
}
570
571
if (!kvm_tdx->td.tdr_page)
572
return;
573
574
if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
575
return;
576
577
/*
578
* Use a SEAMCALL to ask the TDX module to flush the cache based on the
579
* KeyID. TDX module may access TDR while operating on TD (Especially
580
* when it is reclaiming TDCS).
581
*/
582
err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
583
if (KVM_BUG_ON(err, kvm)) {
584
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
585
return;
586
}
587
tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
588
589
__free_page(kvm_tdx->td.tdr_page);
590
kvm_tdx->td.tdr_page = NULL;
591
}
592
593
void tdx_vm_destroy(struct kvm *kvm)
594
{
595
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
596
597
tdx_reclaim_td_control_pages(kvm);
598
599
kvm_tdx->state = TD_STATE_UNINITIALIZED;
600
}
601
602
static int tdx_do_tdh_mng_key_config(void *param)
603
{
604
struct kvm_tdx *kvm_tdx = param;
605
u64 err;
606
607
/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
608
err = tdh_mng_key_config(&kvm_tdx->td);
609
610
if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
611
pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
612
return -EIO;
613
}
614
615
return 0;
616
}
617
618
int tdx_vm_init(struct kvm *kvm)
619
{
620
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
621
622
kvm->arch.has_protected_state = true;
623
/*
624
* TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
625
* i.e. all EOIs are accelerated and never trigger exits.
626
*/
627
kvm->arch.has_protected_eoi = true;
628
kvm->arch.has_private_mem = true;
629
kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
630
631
/*
632
* Because guest TD is protected, VMM can't parse the instruction in TD.
633
* Instead, guest uses MMIO hypercall. For unmodified device driver,
634
* #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
635
* instruction into MMIO hypercall.
636
*
637
* SPTE value for MMIO needs to be setup so that #VE is injected into
638
* TD instead of triggering EPT MISCONFIG.
639
* - RWX=0 so that EPT violation is triggered.
640
* - suppress #VE bit is cleared to inject #VE.
641
*/
642
kvm_mmu_set_mmio_spte_value(kvm, 0);
643
644
/*
645
* TDX has its own limit of maximum vCPUs it can support for all
646
* TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
647
* such limit via the MAX_VCPU_PER_TD global metadata. In
648
* practice, it reflects the number of logical CPUs that ALL
649
* platforms that the TDX module supports can possibly have.
650
*
651
* Limit TDX guest's maximum vCPUs to the number of logical CPUs
652
* the platform has. Simply forwarding the MAX_VCPU_PER_TD to
653
* userspace would result in an unpredictable ABI.
654
*/
655
kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
656
657
kvm_tdx->state = TD_STATE_UNINITIALIZED;
658
659
return 0;
660
}
661
662
int tdx_vcpu_create(struct kvm_vcpu *vcpu)
663
{
664
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
665
struct vcpu_tdx *tdx = to_tdx(vcpu);
666
667
if (kvm_tdx->state != TD_STATE_INITIALIZED)
668
return -EIO;
669
670
/*
671
* TDX module mandates APICv, which requires an in-kernel local APIC.
672
* Disallow an in-kernel I/O APIC, because level-triggered interrupts
673
* and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
674
*/
675
if (!irqchip_split(vcpu->kvm))
676
return -EINVAL;
677
678
fpstate_set_confidential(&vcpu->arch.guest_fpu);
679
vcpu->arch.apic->guest_apic_protected = true;
680
INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
681
682
vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
683
684
vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
685
vcpu->arch.cr0_guest_owned_bits = -1ul;
686
vcpu->arch.cr4_guest_owned_bits = -1ul;
687
688
/* KVM can't change TSC offset/multiplier as TDX module manages them. */
689
vcpu->arch.guest_tsc_protected = true;
690
vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
691
vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
692
vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
693
vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
694
695
vcpu->arch.guest_state_protected =
696
!(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
697
698
if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
699
vcpu->arch.xfd_no_write_intercept = true;
700
701
tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
702
__pi_set_sn(&tdx->vt.pi_desc);
703
704
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
705
706
return 0;
707
}
708
709
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
710
{
711
struct vcpu_tdx *tdx = to_tdx(vcpu);
712
713
vmx_vcpu_pi_load(vcpu, cpu);
714
if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
715
return;
716
717
tdx_flush_vp_on_cpu(vcpu);
718
719
KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
720
local_irq_disable();
721
/*
722
* Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
723
* vcpu->cpu is read before tdx->cpu_list.
724
*/
725
smp_rmb();
726
727
list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
728
local_irq_enable();
729
}
730
731
bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
732
{
733
/*
734
* KVM can't get the interrupt status of TDX guest and it assumes
735
* interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
736
* which passes the interrupt blocked flag.
737
*/
738
return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
739
!to_tdx(vcpu)->vp_enter_args.r12;
740
}
741
742
static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
743
{
744
u64 vcpu_state_details;
745
746
if (pi_has_pending_interrupt(vcpu))
747
return true;
748
749
/*
750
* Only check RVI pending for HALTED case with IRQ enabled.
751
* For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
752
* interrupt was pending before TD exit, then it _must_ be blocked,
753
* otherwise the interrupt would have been serviced at the instruction
754
* boundary.
755
*/
756
if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
757
to_tdx(vcpu)->vp_enter_args.r12)
758
return false;
759
760
vcpu_state_details =
761
td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
762
763
return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
764
}
765
766
/*
767
* Compared to vmx_prepare_switch_to_guest(), there is not much to do
768
* as SEAMCALL/SEAMRET calls take care of most of save and restore.
769
*/
770
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
771
{
772
struct vcpu_vt *vt = to_vt(vcpu);
773
774
if (vt->guest_state_loaded)
775
return;
776
777
if (likely(is_64bit_mm(current->mm)))
778
vt->msr_host_kernel_gs_base = current->thread.gsbase;
779
else
780
vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
781
782
vt->guest_state_loaded = true;
783
}
784
785
struct tdx_uret_msr {
786
u32 msr;
787
unsigned int slot;
788
u64 defval;
789
};
790
791
static struct tdx_uret_msr tdx_uret_msrs[] = {
792
{.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
793
{.msr = MSR_STAR,},
794
{.msr = MSR_LSTAR,},
795
{.msr = MSR_TSC_AUX,},
796
};
797
798
static void tdx_user_return_msr_update_cache(void)
799
{
800
int i;
801
802
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
803
kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
804
tdx_uret_msrs[i].defval);
805
}
806
807
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
808
{
809
struct vcpu_vt *vt = to_vt(vcpu);
810
struct vcpu_tdx *tdx = to_tdx(vcpu);
811
812
if (!vt->guest_state_loaded)
813
return;
814
815
++vcpu->stat.host_state_reload;
816
wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
817
818
if (tdx->guest_entered) {
819
tdx_user_return_msr_update_cache();
820
tdx->guest_entered = false;
821
}
822
823
vt->guest_state_loaded = false;
824
}
825
826
void tdx_vcpu_put(struct kvm_vcpu *vcpu)
827
{
828
vmx_vcpu_pi_put(vcpu);
829
tdx_prepare_switch_to_host(vcpu);
830
}
831
832
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
833
{
834
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
835
struct vcpu_tdx *tdx = to_tdx(vcpu);
836
int i;
837
838
/*
839
* It is not possible to reclaim pages while hkid is assigned. It might
840
* be assigned if:
841
* 1. the TD VM is being destroyed but freeing hkid failed, in which
842
* case the pages are leaked
843
* 2. TD VCPU creation failed and this on the error path, in which case
844
* there is nothing to do anyway
845
*/
846
if (is_hkid_assigned(kvm_tdx))
847
return;
848
849
if (tdx->vp.tdcx_pages) {
850
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
851
if (tdx->vp.tdcx_pages[i])
852
tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
853
}
854
kfree(tdx->vp.tdcx_pages);
855
tdx->vp.tdcx_pages = NULL;
856
}
857
if (tdx->vp.tdvpr_page) {
858
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
859
tdx->vp.tdvpr_page = 0;
860
tdx->vp.tdvpr_pa = 0;
861
}
862
863
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
864
}
865
866
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
867
{
868
if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
869
to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
870
return -EINVAL;
871
872
return 1;
873
}
874
875
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
876
{
877
switch (tdvmcall_leaf(vcpu)) {
878
case EXIT_REASON_CPUID:
879
case EXIT_REASON_HLT:
880
case EXIT_REASON_IO_INSTRUCTION:
881
case EXIT_REASON_MSR_READ:
882
case EXIT_REASON_MSR_WRITE:
883
return tdvmcall_leaf(vcpu);
884
case EXIT_REASON_EPT_VIOLATION:
885
return EXIT_REASON_EPT_MISCONFIG;
886
default:
887
break;
888
}
889
890
return EXIT_REASON_TDCALL;
891
}
892
893
static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
894
{
895
struct vcpu_tdx *tdx = to_tdx(vcpu);
896
u32 exit_reason;
897
898
switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
899
case TDX_SUCCESS:
900
case TDX_NON_RECOVERABLE_VCPU:
901
case TDX_NON_RECOVERABLE_TD:
902
case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
903
case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
904
break;
905
default:
906
return -1u;
907
}
908
909
exit_reason = tdx->vp_enter_ret;
910
911
switch (exit_reason) {
912
case EXIT_REASON_TDCALL:
913
if (tdvmcall_exit_type(vcpu))
914
return EXIT_REASON_VMCALL;
915
916
return tdcall_to_vmx_exit_reason(vcpu);
917
case EXIT_REASON_EPT_MISCONFIG:
918
/*
919
* Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
920
* non-instrumentable code with interrupts disabled.
921
*/
922
return -1u;
923
default:
924
break;
925
}
926
927
return exit_reason;
928
}
929
930
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
931
{
932
struct vcpu_tdx *tdx = to_tdx(vcpu);
933
struct vcpu_vt *vt = to_vt(vcpu);
934
935
guest_state_enter_irqoff();
936
937
tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
938
939
vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
940
941
vt->exit_qualification = tdx->vp_enter_args.rcx;
942
tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
943
tdx->exit_gpa = tdx->vp_enter_args.r8;
944
vt->exit_intr_info = tdx->vp_enter_args.r9;
945
946
vmx_handle_nmi(vcpu);
947
948
guest_state_exit_irqoff();
949
}
950
951
static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
952
{
953
return vmx_get_exit_reason(vcpu).failed_vmentry &&
954
vmx_get_exit_reason(vcpu).full != -1u;
955
}
956
957
static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
958
{
959
u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
960
961
/*
962
* TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
963
* or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
964
*
965
* When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
966
* KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
967
* vCPUs leaving fastpath so that interrupt can be enabled to ensure the
968
* IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
969
* EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
970
* requester may be blocked endlessly.
971
*/
972
if (unlikely(tdx_operand_busy(vp_enter_ret)))
973
return EXIT_FASTPATH_EXIT_HANDLED;
974
975
return EXIT_FASTPATH_NONE;
976
}
977
978
#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
979
BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
980
BIT_ULL(VCPU_REGS_RAX) | \
981
BIT_ULL(VCPU_REGS_RBX) | \
982
BIT_ULL(VCPU_REGS_RCX) | \
983
BIT_ULL(VCPU_REGS_RDX) | \
984
BIT_ULL(VCPU_REGS_RBP) | \
985
BIT_ULL(VCPU_REGS_RSI) | \
986
BIT_ULL(VCPU_REGS_RDI) | \
987
BIT_ULL(VCPU_REGS_R8) | \
988
BIT_ULL(VCPU_REGS_R9) | \
989
BIT_ULL(VCPU_REGS_R10) | \
990
BIT_ULL(VCPU_REGS_R11) | \
991
BIT_ULL(VCPU_REGS_R12) | \
992
BIT_ULL(VCPU_REGS_R13) | \
993
BIT_ULL(VCPU_REGS_R14) | \
994
BIT_ULL(VCPU_REGS_R15))
995
996
static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
997
{
998
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
999
1000
/*
1001
* All TDX hosts support PKRU; but even if they didn't,
1002
* vcpu->arch.host_pkru would be 0 and the wrpkru would be
1003
* skipped.
1004
*/
1005
if (vcpu->arch.host_pkru != 0)
1006
wrpkru(vcpu->arch.host_pkru);
1007
1008
if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1009
xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1010
1011
/*
1012
* Likewise, even if a TDX hosts didn't support XSS both arms of
1013
* the comparison would be 0 and the wrmsrl would be skipped.
1014
*/
1015
if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1016
wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1017
}
1018
1019
#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1020
DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1021
DEBUGCTLMSR_FREEZE_IN_SMM)
1022
1023
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
1024
{
1025
struct vcpu_tdx *tdx = to_tdx(vcpu);
1026
struct vcpu_vt *vt = to_vt(vcpu);
1027
1028
/*
1029
* WARN if KVM wants to force an immediate exit, as the TDX module does
1030
* not guarantee entry into the guest, i.e. it's possible for KVM to
1031
* _think_ it completed entry to the guest and forced an immediate exit
1032
* without actually having done so. Luckily, KVM never needs to force
1033
* an immediate exit for TDX (KVM can't do direct event injection, so
1034
* just WARN and continue on.
1035
*/
1036
WARN_ON_ONCE(run_flags);
1037
1038
/*
1039
* Wait until retry of SEPT-zap-related SEAMCALL completes before
1040
* allowing vCPU entry to avoid contention with tdh_vp_enter() and
1041
* TDCALLs.
1042
*/
1043
if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1044
return EXIT_FASTPATH_EXIT_HANDLED;
1045
1046
trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
1047
1048
if (pi_test_on(&vt->pi_desc)) {
1049
apic->send_IPI_self(POSTED_INTR_VECTOR);
1050
1051
if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1052
APIC_VECTOR_MASK, &vt->pi_desc))
1053
kvm_wait_lapic_expire(vcpu);
1054
}
1055
1056
tdx_vcpu_enter_exit(vcpu);
1057
1058
if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
1059
update_debugctlmsr(vcpu->arch.host_debugctl);
1060
1061
tdx_load_host_xsave_state(vcpu);
1062
tdx->guest_entered = true;
1063
1064
vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1065
1066
if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1067
return EXIT_FASTPATH_NONE;
1068
1069
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1070
return EXIT_FASTPATH_NONE;
1071
1072
if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
1073
kvm_machine_check();
1074
1075
trace_kvm_exit(vcpu, KVM_ISA_VMX);
1076
1077
if (unlikely(tdx_failed_vmentry(vcpu)))
1078
return EXIT_FASTPATH_NONE;
1079
1080
return tdx_exit_handlers_fastpath(vcpu);
1081
}
1082
1083
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1084
{
1085
++vcpu->stat.nmi_injections;
1086
td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1087
/*
1088
* From KVM's perspective, NMI injection is completed right after
1089
* writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
1090
* the TDX module or not.
1091
*/
1092
vcpu->arch.nmi_injected = false;
1093
/*
1094
* TDX doesn't support KVM to request NMI window exit. If there is
1095
* still a pending vNMI, KVM is not able to inject it along with the
1096
* one pending in TDX module in a back-to-back way. Since the previous
1097
* vNMI is still pending in TDX module, i.e. it has not been delivered
1098
* to TDX guest yet, it's OK to collapse the pending vNMI into the
1099
* previous one. The guest is expected to handle all the NMI sources
1100
* when handling the first vNMI.
1101
*/
1102
vcpu->arch.nmi_pending = 0;
1103
}
1104
1105
static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1106
{
1107
u32 intr_info = vmx_get_intr_info(vcpu);
1108
1109
/*
1110
* Machine checks are handled by handle_exception_irqoff(), or by
1111
* tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1112
* VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
1113
*/
1114
if (is_nmi(intr_info) || is_machine_check(intr_info))
1115
return 1;
1116
1117
vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1118
vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1119
vcpu->run->ex.error_code = 0;
1120
1121
return 0;
1122
}
1123
1124
static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1125
{
1126
tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1127
return 1;
1128
}
1129
1130
static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1131
{
1132
kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1133
kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1134
kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1135
kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1136
kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1137
1138
return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1139
}
1140
1141
/*
1142
* Split into chunks and check interrupt pending between chunks. This allows
1143
* for timely injection of interrupts to prevent issues with guest lockup
1144
* detection.
1145
*/
1146
#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1147
static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1148
1149
static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1150
{
1151
struct vcpu_tdx *tdx = to_tdx(vcpu);
1152
1153
if (vcpu->run->hypercall.ret) {
1154
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1155
tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1156
return 1;
1157
}
1158
1159
tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1160
if (tdx->map_gpa_next >= tdx->map_gpa_end)
1161
return 1;
1162
1163
/*
1164
* Stop processing the remaining part if there is a pending interrupt,
1165
* which could be qualified to deliver. Skip checking pending RVI for
1166
* TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1167
*/
1168
if (kvm_vcpu_has_events(vcpu)) {
1169
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1170
tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1171
return 1;
1172
}
1173
1174
__tdx_map_gpa(tdx);
1175
return 0;
1176
}
1177
1178
static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1179
{
1180
u64 gpa = tdx->map_gpa_next;
1181
u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1182
1183
if (size > TDX_MAP_GPA_MAX_LEN)
1184
size = TDX_MAP_GPA_MAX_LEN;
1185
1186
tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
1187
tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
1188
/*
1189
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
1190
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
1191
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
1192
* vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
1193
*/
1194
tdx->vcpu.run->hypercall.ret = 0;
1195
tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1196
tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1197
tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1198
KVM_MAP_GPA_RANGE_ENCRYPTED :
1199
KVM_MAP_GPA_RANGE_DECRYPTED;
1200
tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
1201
1202
tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1203
}
1204
1205
static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1206
{
1207
struct vcpu_tdx *tdx = to_tdx(vcpu);
1208
u64 gpa = tdx->vp_enter_args.r12;
1209
u64 size = tdx->vp_enter_args.r13;
1210
u64 ret;
1211
1212
/*
1213
* Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1214
* userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1215
* bit set. This is a base call so it should always be supported, but
1216
* KVM has no way to ensure that userspace implements the GHCI correctly.
1217
* So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1218
* to the guest.
1219
*/
1220
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1221
ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1222
goto error;
1223
}
1224
1225
if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1226
!kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1227
(vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1228
vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1229
ret = TDVMCALL_STATUS_INVALID_OPERAND;
1230
goto error;
1231
}
1232
1233
if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1234
ret = TDVMCALL_STATUS_ALIGN_ERROR;
1235
goto error;
1236
}
1237
1238
tdx->map_gpa_end = gpa + size;
1239
tdx->map_gpa_next = gpa;
1240
1241
__tdx_map_gpa(tdx);
1242
return 0;
1243
1244
error:
1245
tdvmcall_set_return_code(vcpu, ret);
1246
tdx->vp_enter_args.r11 = gpa;
1247
return 1;
1248
}
1249
1250
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1251
{
1252
struct vcpu_tdx *tdx = to_tdx(vcpu);
1253
u64 *regs = vcpu->run->system_event.data;
1254
u64 *module_regs = &tdx->vp_enter_args.r8;
1255
int index = VCPU_REGS_RAX;
1256
1257
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1258
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1259
vcpu->run->system_event.ndata = 16;
1260
1261
/* Dump 16 general-purpose registers to userspace in ascending order. */
1262
regs[index++] = tdx->vp_enter_ret;
1263
regs[index++] = tdx->vp_enter_args.rcx;
1264
regs[index++] = tdx->vp_enter_args.rdx;
1265
regs[index++] = tdx->vp_enter_args.rbx;
1266
regs[index++] = 0;
1267
regs[index++] = 0;
1268
regs[index++] = tdx->vp_enter_args.rsi;
1269
regs[index] = tdx->vp_enter_args.rdi;
1270
for (index = 0; index < 8; index++)
1271
regs[VCPU_REGS_R8 + index] = module_regs[index];
1272
1273
return 0;
1274
}
1275
1276
static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1277
{
1278
u32 eax, ebx, ecx, edx;
1279
struct vcpu_tdx *tdx = to_tdx(vcpu);
1280
1281
/* EAX and ECX for cpuid is stored in R12 and R13. */
1282
eax = tdx->vp_enter_args.r12;
1283
ecx = tdx->vp_enter_args.r13;
1284
1285
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1286
1287
tdx->vp_enter_args.r12 = eax;
1288
tdx->vp_enter_args.r13 = ebx;
1289
tdx->vp_enter_args.r14 = ecx;
1290
tdx->vp_enter_args.r15 = edx;
1291
1292
return 1;
1293
}
1294
1295
static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1296
{
1297
vcpu->arch.pio.count = 0;
1298
return 1;
1299
}
1300
1301
static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1302
{
1303
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1304
unsigned long val = 0;
1305
int ret;
1306
1307
ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1308
vcpu->arch.pio.port, &val, 1);
1309
1310
WARN_ON_ONCE(!ret);
1311
1312
tdvmcall_set_return_val(vcpu, val);
1313
1314
return 1;
1315
}
1316
1317
static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1318
{
1319
struct vcpu_tdx *tdx = to_tdx(vcpu);
1320
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1321
unsigned long val = 0;
1322
unsigned int port;
1323
u64 size, write;
1324
int ret;
1325
1326
++vcpu->stat.io_exits;
1327
1328
size = tdx->vp_enter_args.r12;
1329
write = tdx->vp_enter_args.r13;
1330
port = tdx->vp_enter_args.r14;
1331
1332
if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1333
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1334
return 1;
1335
}
1336
1337
if (write) {
1338
val = tdx->vp_enter_args.r15;
1339
ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1340
} else {
1341
ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1342
}
1343
1344
if (!ret)
1345
vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1346
tdx_complete_pio_in;
1347
else if (!write)
1348
tdvmcall_set_return_val(vcpu, val);
1349
1350
return ret;
1351
}
1352
1353
static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1354
{
1355
unsigned long val = 0;
1356
gpa_t gpa;
1357
int size;
1358
1359
gpa = vcpu->mmio_fragments[0].gpa;
1360
size = vcpu->mmio_fragments[0].len;
1361
1362
memcpy(&val, vcpu->run->mmio.data, size);
1363
tdvmcall_set_return_val(vcpu, val);
1364
trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1365
return 1;
1366
}
1367
1368
static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1369
unsigned long val)
1370
{
1371
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1372
trace_kvm_fast_mmio(gpa);
1373
return 0;
1374
}
1375
1376
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1377
if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1378
return -EOPNOTSUPP;
1379
1380
return 0;
1381
}
1382
1383
static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1384
{
1385
unsigned long val;
1386
1387
if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1388
return -EOPNOTSUPP;
1389
1390
tdvmcall_set_return_val(vcpu, val);
1391
trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1392
return 0;
1393
}
1394
1395
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1396
{
1397
struct vcpu_tdx *tdx = to_tdx(vcpu);
1398
int size, write, r;
1399
unsigned long val;
1400
gpa_t gpa;
1401
1402
size = tdx->vp_enter_args.r12;
1403
write = tdx->vp_enter_args.r13;
1404
gpa = tdx->vp_enter_args.r14;
1405
val = write ? tdx->vp_enter_args.r15 : 0;
1406
1407
if (size != 1 && size != 2 && size != 4 && size != 8)
1408
goto error;
1409
if (write != 0 && write != 1)
1410
goto error;
1411
1412
/*
1413
* TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
1414
* do MMIO emulation for private GPA.
1415
*/
1416
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1417
vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1418
goto error;
1419
1420
gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1421
1422
if (write)
1423
r = tdx_mmio_write(vcpu, gpa, size, val);
1424
else
1425
r = tdx_mmio_read(vcpu, gpa, size);
1426
if (!r)
1427
/* Kernel completed device emulation. */
1428
return 1;
1429
1430
/* Request the device emulation to userspace device model. */
1431
vcpu->mmio_is_write = write;
1432
if (!write)
1433
vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1434
1435
vcpu->run->mmio.phys_addr = gpa;
1436
vcpu->run->mmio.len = size;
1437
vcpu->run->mmio.is_write = write;
1438
vcpu->run->exit_reason = KVM_EXIT_MMIO;
1439
1440
if (write) {
1441
memcpy(vcpu->run->mmio.data, &val, size);
1442
} else {
1443
vcpu->mmio_fragments[0].gpa = gpa;
1444
vcpu->mmio_fragments[0].len = size;
1445
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1446
}
1447
return 0;
1448
1449
error:
1450
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1451
return 1;
1452
}
1453
1454
static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1455
{
1456
struct vcpu_tdx *tdx = to_tdx(vcpu);
1457
1458
tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1459
1460
/*
1461
* For now, there is no TDVMCALL beyond GHCI base API supported by KVM
1462
* directly without the support from userspace, just set the value
1463
* returned from userspace.
1464
*/
1465
tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1466
tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1467
tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1468
tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1469
1470
return 1;
1471
}
1472
1473
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1474
{
1475
struct vcpu_tdx *tdx = to_tdx(vcpu);
1476
1477
switch (tdx->vp_enter_args.r12) {
1478
case 0:
1479
tdx->vp_enter_args.r11 = 0;
1480
tdx->vp_enter_args.r12 = 0;
1481
tdx->vp_enter_args.r13 = 0;
1482
tdx->vp_enter_args.r14 = 0;
1483
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1484
return 1;
1485
case 1:
1486
vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1487
vcpu->run->exit_reason = KVM_EXIT_TDX;
1488
vcpu->run->tdx.flags = 0;
1489
vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1490
vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1491
vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1492
vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1493
vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1494
vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1495
vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1496
return 0;
1497
default:
1498
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1499
return 1;
1500
}
1501
}
1502
1503
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1504
{
1505
tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1506
return 1;
1507
}
1508
1509
static int tdx_get_quote(struct kvm_vcpu *vcpu)
1510
{
1511
struct vcpu_tdx *tdx = to_tdx(vcpu);
1512
u64 gpa = tdx->vp_enter_args.r12;
1513
u64 size = tdx->vp_enter_args.r13;
1514
1515
/* The gpa of buffer must have shared bit set. */
1516
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1517
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1518
return 1;
1519
}
1520
1521
vcpu->run->exit_reason = KVM_EXIT_TDX;
1522
vcpu->run->tdx.flags = 0;
1523
vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1524
vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1525
vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1526
vcpu->run->tdx.get_quote.size = size;
1527
1528
vcpu->arch.complete_userspace_io = tdx_complete_simple;
1529
1530
return 0;
1531
}
1532
1533
static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1534
{
1535
struct vcpu_tdx *tdx = to_tdx(vcpu);
1536
u64 vector = tdx->vp_enter_args.r12;
1537
1538
if (vector < 32 || vector > 255) {
1539
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1540
return 1;
1541
}
1542
1543
vcpu->run->exit_reason = KVM_EXIT_TDX;
1544
vcpu->run->tdx.flags = 0;
1545
vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1546
vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1547
vcpu->run->tdx.setup_event_notify.vector = vector;
1548
1549
vcpu->arch.complete_userspace_io = tdx_complete_simple;
1550
1551
return 0;
1552
}
1553
1554
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1555
{
1556
switch (tdvmcall_leaf(vcpu)) {
1557
case TDVMCALL_MAP_GPA:
1558
return tdx_map_gpa(vcpu);
1559
case TDVMCALL_REPORT_FATAL_ERROR:
1560
return tdx_report_fatal_error(vcpu);
1561
case TDVMCALL_GET_TD_VM_CALL_INFO:
1562
return tdx_get_td_vm_call_info(vcpu);
1563
case TDVMCALL_GET_QUOTE:
1564
return tdx_get_quote(vcpu);
1565
case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
1566
return tdx_setup_event_notify_interrupt(vcpu);
1567
default:
1568
break;
1569
}
1570
1571
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1572
return 1;
1573
}
1574
1575
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1576
{
1577
u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1578
TDX_SHARED_BIT_PWL_4;
1579
1580
if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1581
return;
1582
1583
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1584
}
1585
1586
static void tdx_unpin(struct kvm *kvm, struct page *page)
1587
{
1588
put_page(page);
1589
}
1590
1591
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
1592
enum pg_level level, struct page *page)
1593
{
1594
int tdx_level = pg_level_to_tdx_sept_level(level);
1595
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1596
gpa_t gpa = gfn_to_gpa(gfn);
1597
u64 entry, level_state;
1598
u64 err;
1599
1600
err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1601
if (unlikely(tdx_operand_busy(err))) {
1602
tdx_unpin(kvm, page);
1603
return -EBUSY;
1604
}
1605
1606
if (KVM_BUG_ON(err, kvm)) {
1607
pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
1608
tdx_unpin(kvm, page);
1609
return -EIO;
1610
}
1611
1612
return 0;
1613
}
1614
1615
/*
1616
* KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
1617
* callback tdx_gmem_post_populate() then maps pages into private memory.
1618
* through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
1619
* private EPT structures for the page to have been built before, which is
1620
* done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
1621
* were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
1622
* The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
1623
* are no half-initialized shared EPT pages.
1624
*/
1625
static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
1626
enum pg_level level, kvm_pfn_t pfn)
1627
{
1628
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1629
1630
if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
1631
return -EINVAL;
1632
1633
/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
1634
atomic64_inc(&kvm_tdx->nr_premapped);
1635
return 0;
1636
}
1637
1638
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
1639
enum pg_level level, kvm_pfn_t pfn)
1640
{
1641
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1642
struct page *page = pfn_to_page(pfn);
1643
1644
/* TODO: handle large pages. */
1645
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1646
return -EINVAL;
1647
1648
/*
1649
* Because guest_memfd doesn't support page migration with
1650
* a_ops->migrate_folio (yet), no callback is triggered for KVM on page
1651
* migration. Until guest_memfd supports page migration, prevent page
1652
* migration.
1653
* TODO: Once guest_memfd introduces callback on page migration,
1654
* implement it and remove get_page/put_page().
1655
*/
1656
get_page(page);
1657
1658
/*
1659
* Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
1660
* barrier in tdx_td_finalize().
1661
*/
1662
smp_rmb();
1663
if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
1664
return tdx_mem_page_aug(kvm, gfn, level, page);
1665
1666
return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
1667
}
1668
1669
static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
1670
enum pg_level level, struct page *page)
1671
{
1672
int tdx_level = pg_level_to_tdx_sept_level(level);
1673
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1674
gpa_t gpa = gfn_to_gpa(gfn);
1675
u64 err, entry, level_state;
1676
1677
/* TODO: handle large pages. */
1678
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1679
return -EINVAL;
1680
1681
if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
1682
return -EINVAL;
1683
1684
/*
1685
* When zapping private page, write lock is held. So no race condition
1686
* with other vcpu sept operation.
1687
* Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
1688
*/
1689
err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1690
&level_state);
1691
1692
if (unlikely(tdx_operand_busy(err))) {
1693
/*
1694
* The second retry is expected to succeed after kicking off all
1695
* other vCPUs and prevent them from invoking TDH.VP.ENTER.
1696
*/
1697
tdx_no_vcpus_enter_start(kvm);
1698
err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1699
&level_state);
1700
tdx_no_vcpus_enter_stop(kvm);
1701
}
1702
1703
if (KVM_BUG_ON(err, kvm)) {
1704
pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
1705
return -EIO;
1706
}
1707
1708
err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1709
1710
if (KVM_BUG_ON(err, kvm)) {
1711
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
1712
return -EIO;
1713
}
1714
tdx_quirk_reset_page(page);
1715
tdx_unpin(kvm, page);
1716
return 0;
1717
}
1718
1719
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
1720
enum pg_level level, void *private_spt)
1721
{
1722
int tdx_level = pg_level_to_tdx_sept_level(level);
1723
gpa_t gpa = gfn_to_gpa(gfn);
1724
struct page *page = virt_to_page(private_spt);
1725
u64 err, entry, level_state;
1726
1727
err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1728
&level_state);
1729
if (unlikely(tdx_operand_busy(err)))
1730
return -EBUSY;
1731
1732
if (KVM_BUG_ON(err, kvm)) {
1733
pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
1734
return -EIO;
1735
}
1736
1737
return 0;
1738
}
1739
1740
/*
1741
* Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
1742
* mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
1743
* successfully.
1744
*
1745
* Since tdh_mem_sept_add() must have been invoked successfully before a
1746
* non-leaf entry present in the mirrored page table, the SEPT ZAP related
1747
* SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
1748
* find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
1749
* SEPT.
1750
*
1751
* Further check if the returned entry from SEPT walking is with RWX permissions
1752
* to filter out anything unexpected.
1753
*
1754
* Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
1755
* level_state returned from a SEAMCALL error is the same as that passed into
1756
* the SEAMCALL.
1757
*/
1758
static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
1759
u64 entry, int level)
1760
{
1761
if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1762
return false;
1763
1764
if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1765
return false;
1766
1767
if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1768
return false;
1769
1770
return true;
1771
}
1772
1773
static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
1774
enum pg_level level, struct page *page)
1775
{
1776
int tdx_level = pg_level_to_tdx_sept_level(level);
1777
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1778
gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
1779
u64 err, entry, level_state;
1780
1781
/* For now large page isn't supported yet. */
1782
WARN_ON_ONCE(level != PG_LEVEL_4K);
1783
1784
err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1785
1786
if (unlikely(tdx_operand_busy(err))) {
1787
/* After no vCPUs enter, the second retry is expected to succeed */
1788
tdx_no_vcpus_enter_start(kvm);
1789
err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1790
tdx_no_vcpus_enter_stop(kvm);
1791
}
1792
if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
1793
!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
1794
atomic64_dec(&kvm_tdx->nr_premapped);
1795
tdx_unpin(kvm, page);
1796
return 0;
1797
}
1798
1799
if (KVM_BUG_ON(err, kvm)) {
1800
pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
1801
return -EIO;
1802
}
1803
return 1;
1804
}
1805
1806
/*
 * Ensure shared and private EPTs are flushed on all vCPUs.
 * tdh_mem_track() is the only caller that increases the TD epoch. An increase
 * in the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
 * running in guest mode with the value "N - 1".
 *
 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
 * is increased to "N + 1".
 *
 * Kicking off all vCPUs after that further ensures that no vCPU can run in
 * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
 * (e.g., to increase the TD epoch to "N + 2").
 *
 * The TDX module will flush the EPT on the next TD enter and make vCPUs run in
 * guest mode with TD epoch value "N + 1".
 *
 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
 * waiting for the empty IPI handler ack_kick().
 *
 * No action is required for the vCPUs being kicked off, since the kick
 * necessarily occurs after the TD epoch increment and before the next
 * tdh_mem_track().
 */
static void tdx_track(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	u64 err;

	/* If the TD isn't finalized, it's before any vCPU has run. */
	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
		return;

	lockdep_assert_held_write(&kvm->mmu_lock);

	err = tdh_mem_track(&kvm_tdx->td);
	if (unlikely(tdx_operand_busy(err))) {
		/* After no vCPUs enter, the second retry is expected to succeed */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_track(&kvm_tdx->td);
		tdx_no_vcpus_enter_stop(kvm);
	}

	if (KVM_BUG_ON(err, kvm))
		pr_tdx_error(TDH_MEM_TRACK, err);

	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
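
/*
 * Illustrative sketch (not part of the build, not authoritative): the ordering
 * that the removal path relies on, spelled out as pseudo-C. Every name here
 * appears in this file; the sequence mirrors tdx_sept_remove_private_spte()
 * below.
 *
 *	tdx_sept_zap_private_spte(kvm, gfn, level, page);  // TDH.MEM.RANGE.BLOCK
 *	tdx_track(kvm);                                    // TDH.MEM.TRACK + kick vCPUs
 *	tdx_sept_drop_private_spte(kvm, gfn, level, page); // actually drop the page
 *
 * i.e. the epoch bump plus the KVM_REQ_OUTSIDE_GUEST_MODE kick act as the TLB
 * flush barrier between "unmap" and "free" for private memory.
 */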
1854
1855
static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
				     enum pg_level level, void *private_spt)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	/*
	 * free_external_spt() is only called after the hkid has been freed,
	 * when the TD is being torn down.
	 * KVM doesn't (yet) zap page table pages in the mirror page table
	 * while the TD is active, though guest pages mapped in the mirror
	 * page table could be zapped while the TD is active, e.g. for
	 * shared <-> private conversion and slot move/deletion.
	 */
	if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
		return -EINVAL;

	/*
	 * The HKID assigned to this TD was already freed and the cache was
	 * already flushed. We don't have to flush again.
	 */
	return tdx_reclaim_page(virt_to_page(private_spt));
}
1877
1878
static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
					enum pg_level level, kvm_pfn_t pfn)
{
	struct page *page = pfn_to_page(pfn);
	int ret;

	/*
	 * HKID is released after all private pages have been removed, and set
	 * before any might be populated. Warn if zapping is attempted when
	 * there can't be anything populated in the private EPT.
	 */
	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
		return -EINVAL;

	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
	if (ret <= 0)
		return ret;

	/*
	 * TDX requires TLB tracking before dropping a private page. Do
	 * it here, although it is also done later.
	 */
	tdx_track(kvm);

	return tdx_sept_drop_private_spte(kvm, gfn, level, page);
}
1904
1905
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1906
int trig_mode, int vector)
1907
{
1908
struct kvm_vcpu *vcpu = apic->vcpu;
1909
struct vcpu_tdx *tdx = to_tdx(vcpu);
1910
1911
/* TDX supports only posted interrupt. No lapic emulation. */
1912
__vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1913
1914
trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1915
}
1916
1917
static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1918
{
1919
u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1920
u64 eq = vmx_get_exit_qual(vcpu);
1921
1922
if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1923
return false;
1924
1925
return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1926
}
1927
1928
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1929
{
1930
unsigned long exit_qual;
1931
gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1932
bool local_retry = false;
1933
int ret;
1934
1935
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1936
if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1937
pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1938
gpa, vcpu->vcpu_id);
1939
kvm_vm_dead(vcpu->kvm);
1940
return -EIO;
1941
}
1942
/*
1943
* Always treat SEPT violations as write faults. Ignore the
1944
* EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1945
* TD private pages are always RWX in the SEPT tables,
1946
* i.e. they're always mapped writable. Just as importantly,
1947
* treating SEPT violations as write faults is necessary to
1948
* avoid COW allocations, which will cause TDAUGPAGE failures
1949
* due to aliasing a single HPA to multiple GPAs.
1950
*/
1951
exit_qual = EPT_VIOLATION_ACC_WRITE;
1952
1953
/* Only private GPA triggers zero-step mitigation */
1954
local_retry = true;
1955
} else {
1956
exit_qual = vmx_get_exit_qual(vcpu);
1957
/*
1958
* EPT violation due to instruction fetch should never be
1959
* triggered from shared memory in TDX guest. If such EPT
1960
* violation occurs, treat it as broken hardware.
1961
*/
1962
if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1963
return -EIO;
1964
}
1965
1966
trace_kvm_page_fault(vcpu, gpa, exit_qual);
1967
1968
/*
1969
* To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1970
* mapping in TDX.
1971
*
1972
* KVM may return RET_PF_RETRY for private GPA due to
1973
* - contentions when atomically updating SPTEs of the mirror page table
1974
* - in-progress GFN invalidation or memslot removal.
1975
* - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1976
* caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1977
* or certain TDCALLs.
1978
*
1979
* If TDH.VP.ENTER is invoked more times than the threshold set by the
1980
* TDX module before KVM resolves the private GPA mapping, the TDX
1981
* module will activate zero-step mitigation during TDH.VP.ENTER. This
1982
* process acquires an SEPT tree lock in the TDX module, leading to
1983
* further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1984
* operations on other vCPUs.
1985
*
1986
* Breaking out of local retries for kvm_vcpu_has_events() is for
1987
* interrupt injection. kvm_vcpu_has_events() should not see pending
1988
* events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1989
* blocked by TDs, false positives are inevitable i.e., KVM may re-enter
1990
* the guest even if the IRQ/NMI can't be delivered.
1991
*
1992
* Note: even without breaking out of local retries, zero-step
1993
* mitigation may still occur due to
1994
* - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1995
* - a single RIP causing EPT violations for more GFNs than the
1996
* threshold count.
1997
* This is safe, as triggering zero-step mitigation only introduces
1998
* contentions to page installation SEAMCALLs on other vCPUs, which will
1999
* handle retries locally in their EPT violation handlers.
2000
*/
2001
while (1) {
2002
struct kvm_memory_slot *slot;
2003
2004
ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
2005
2006
if (ret != RET_PF_RETRY || !local_retry)
2007
break;
2008
2009
if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
2010
break;
2011
2012
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
2013
ret = -EIO;
2014
break;
2015
}
2016
2017
/*
2018
* Bail if the memslot is invalid, i.e. is being deleted, as
2019
* faulting in will never succeed and this task needs to drop
2020
* SRCU in order to let memslot deletion complete.
2021
*/
2022
slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
2023
if (slot && slot->flags & KVM_MEMSLOT_INVALID)
2024
break;
2025
2026
cond_resched();
2027
}
2028
return ret;
2029
}
2030
2031
int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2032
{
2033
if (err) {
2034
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2035
return 1;
2036
}
2037
2038
if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2039
tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2040
2041
return 1;
2042
}
2043
2044
2045
int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
2046
{
2047
struct vcpu_tdx *tdx = to_tdx(vcpu);
2048
u64 vp_enter_ret = tdx->vp_enter_ret;
2049
union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
2050
2051
if (fastpath != EXIT_FASTPATH_NONE)
2052
return 1;
2053
2054
if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
2055
KVM_BUG_ON(1, vcpu->kvm);
2056
return -EIO;
2057
}
2058
2059
/*
2060
* Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
2061
* TDX_SEAMCALL_VMFAILINVALID.
2062
*/
2063
if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
2064
KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
2065
goto unhandled_exit;
2066
}
2067
2068
if (unlikely(tdx_failed_vmentry(vcpu))) {
2069
/*
2070
* If the guest state is protected, that means off-TD debug is
2071
* not enabled, TDX_NON_RECOVERABLE must be set.
2072
*/
2073
WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2074
!(vp_enter_ret & TDX_NON_RECOVERABLE));
2075
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2076
vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2077
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2078
return 0;
2079
}
2080
2081
if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2082
exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2083
kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2084
goto unhandled_exit;
2085
}
2086
2087
WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2088
(vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2089
2090
switch (exit_reason.basic) {
2091
case EXIT_REASON_TRIPLE_FAULT:
2092
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2093
vcpu->mmio_needed = 0;
2094
return 0;
2095
case EXIT_REASON_EXCEPTION_NMI:
2096
return tdx_handle_exception_nmi(vcpu);
2097
case EXIT_REASON_EXTERNAL_INTERRUPT:
2098
++vcpu->stat.irq_exits;
2099
return 1;
2100
case EXIT_REASON_CPUID:
2101
return tdx_emulate_cpuid(vcpu);
2102
case EXIT_REASON_HLT:
2103
return kvm_emulate_halt_noskip(vcpu);
2104
case EXIT_REASON_TDCALL:
2105
return handle_tdvmcall(vcpu);
2106
case EXIT_REASON_VMCALL:
2107
return tdx_emulate_vmcall(vcpu);
2108
case EXIT_REASON_IO_INSTRUCTION:
2109
return tdx_emulate_io(vcpu);
2110
case EXIT_REASON_MSR_READ:
2111
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2112
return kvm_emulate_rdmsr(vcpu);
2113
case EXIT_REASON_MSR_WRITE:
2114
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2115
kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2116
kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2117
return kvm_emulate_wrmsr(vcpu);
2118
case EXIT_REASON_EPT_MISCONFIG:
2119
return tdx_emulate_mmio(vcpu);
2120
case EXIT_REASON_EPT_VIOLATION:
2121
return tdx_handle_ept_violation(vcpu);
2122
case EXIT_REASON_OTHER_SMI:
2123
/*
2124
* Unlike VMX, SMI in SEAM non-root mode (i.e. when
2125
* TD guest vCPU is running) will cause VM exit to TDX module,
2126
* then SEAMRET to KVM. Once it exits to KVM, SMI is delivered
2127
* and handled by kernel handler right away.
2128
*
2129
* The Other SMI exit can also be caused by the SEAM non-root
2130
* machine check delivered via Machine Check System Management
2131
* Interrupt (MSMI), but it has already been handled by the
2132
* kernel machine check handler, i.e., the memory page has been
2133
* marked as poisoned and it won't be freed to the free list
2134
* when the TDX guest is terminated (the TDX module marks the
2135
* guest as dead and prevent it from further running when
2136
* machine check happens in SEAM non-root).
2137
*
2138
* - A MSMI will not reach here, it's handled as non_recoverable
2139
* case above.
2140
* - If it's not an MSMI, no need to do anything here.
2141
*/
2142
return 1;
2143
default:
2144
break;
2145
}
2146
2147
unhandled_exit:
2148
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2149
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
2150
vcpu->run->internal.ndata = 2;
2151
vcpu->run->internal.data[0] = vp_enter_ret;
2152
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
2153
return 0;
2154
}
2155
2156
void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2157
u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2158
{
2159
struct vcpu_tdx *tdx = to_tdx(vcpu);
2160
2161
*reason = tdx->vt.exit_reason.full;
2162
if (*reason != -1u) {
2163
*info1 = vmx_get_exit_qual(vcpu);
2164
*info2 = tdx->ext_exit_qualification;
2165
*intr_info = vmx_get_intr_info(vcpu);
2166
} else {
2167
*info1 = 0;
2168
*info2 = 0;
2169
*intr_info = 0;
2170
}
2171
2172
*error_code = 0;
2173
}
2174
2175
bool tdx_has_emulated_msr(u32 index)
2176
{
2177
switch (index) {
2178
case MSR_IA32_UCODE_REV:
2179
case MSR_IA32_ARCH_CAPABILITIES:
2180
case MSR_IA32_POWER_CTL:
2181
case MSR_IA32_CR_PAT:
2182
case MSR_MTRRcap:
2183
case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2184
case MSR_MTRRdefType:
2185
case MSR_IA32_TSC_DEADLINE:
2186
case MSR_IA32_MISC_ENABLE:
2187
case MSR_PLATFORM_INFO:
2188
case MSR_MISC_FEATURES_ENABLES:
2189
case MSR_IA32_APICBASE:
2190
case MSR_EFER:
2191
case MSR_IA32_FEAT_CTL:
2192
case MSR_IA32_MCG_CAP:
2193
case MSR_IA32_MCG_STATUS:
2194
case MSR_IA32_MCG_CTL:
2195
case MSR_IA32_MCG_EXT_CTL:
2196
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2197
case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2198
/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2199
case MSR_KVM_POLL_CONTROL:
2200
return true;
2201
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2202
/*
2203
* x2APIC registers that are virtualized by the CPU can't be
2204
* emulated, KVM doesn't have access to the virtual APIC page.
2205
*/
2206
switch (index) {
2207
case X2APIC_MSR(APIC_TASKPRI):
2208
case X2APIC_MSR(APIC_PROCPRI):
2209
case X2APIC_MSR(APIC_EOI):
2210
case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2211
case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2212
case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2213
return false;
2214
default:
2215
return true;
2216
}
2217
default:
2218
return false;
2219
}
2220
}
2221
2222
static bool tdx_is_read_only_msr(u32 index)
2223
{
2224
return index == MSR_IA32_APICBASE || index == MSR_EFER ||
2225
index == MSR_IA32_FEAT_CTL;
2226
}
2227
2228
int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2229
{
2230
switch (msr->index) {
2231
case MSR_IA32_FEAT_CTL:
2232
/*
2233
* MCE and MCA are advertised via cpuid. Guest kernel could
2234
* check if LMCE is enabled or not.
2235
*/
2236
msr->data = FEAT_CTL_LOCKED;
2237
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2238
msr->data |= FEAT_CTL_LMCE_ENABLED;
2239
return 0;
2240
case MSR_IA32_MCG_EXT_CTL:
2241
if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2242
return 1;
2243
msr->data = vcpu->arch.mcg_ext_ctl;
2244
return 0;
2245
default:
2246
if (!tdx_has_emulated_msr(msr->index))
2247
return 1;
2248
2249
return kvm_get_msr_common(vcpu, msr);
2250
}
2251
}
2252
2253
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2254
{
2255
switch (msr->index) {
2256
case MSR_IA32_MCG_EXT_CTL:
2257
if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2258
(msr->data & ~MCG_EXT_CTL_LMCE_EN))
2259
return 1;
2260
vcpu->arch.mcg_ext_ctl = msr->data;
2261
return 0;
2262
default:
2263
if (tdx_is_read_only_msr(msr->index))
2264
return 1;
2265
2266
if (!tdx_has_emulated_msr(msr->index))
2267
return 1;
2268
2269
return kvm_set_msr_common(vcpu, msr);
2270
}
2271
}
2272
2273
static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2274
{
2275
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2276
struct kvm_tdx_capabilities __user *user_caps;
2277
struct kvm_tdx_capabilities *caps = NULL;
2278
u32 nr_user_entries;
2279
int ret = 0;
2280
2281
/* flags is reserved for future use */
2282
if (cmd->flags)
2283
return -EINVAL;
2284
2285
caps = kzalloc(sizeof(*caps) +
2286
sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2287
GFP_KERNEL);
2288
if (!caps)
2289
return -ENOMEM;
2290
2291
user_caps = u64_to_user_ptr(cmd->data);
2292
if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
2293
ret = -EFAULT;
2294
goto out;
2295
}
2296
2297
if (nr_user_entries < td_conf->num_cpuid_config) {
2298
ret = -E2BIG;
2299
goto out;
2300
}
2301
2302
ret = init_kvm_tdx_caps(td_conf, caps);
2303
if (ret)
2304
goto out;
2305
2306
if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2307
ret = -EFAULT;
2308
goto out;
2309
}
2310
2311
if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2312
caps->cpuid.nent *
2313
sizeof(caps->cpuid.entries[0])))
2314
ret = -EFAULT;
2315
2316
out:
2317
/* kfree() accepts NULL. */
2318
kfree(caps);
2319
return ret;
2320
}
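
/*
 * Illustrative userspace sketch (comment only, not part of the build, and not
 * authoritative ABI documentation): roughly how KVM_TDX_CAPABILITIES is
 * reached from userspace via KVM_MEMORY_ENCRYPT_OP on the VM fd. Field names
 * follow the kvm_tdx_cmd/kvm_tdx_capabilities usage above; nr_cpuid_entries
 * is an arbitrary example size, and -E2BIG means cpuid.nent was too small.
 *
 *	struct kvm_tdx_capabilities *caps =
 *		calloc(1, sizeof(*caps) +
 *			  nr_cpuid_entries * sizeof(struct kvm_cpuid_entry2));
 *	caps->cpuid.nent = nr_cpuid_entries;
 *
 *	struct kvm_tdx_cmd cmd = {
 *		.id = KVM_TDX_CAPABILITIES,
 *		.data = (__u64)(unsigned long)caps,
 *	};
 *	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
 *		err(1, "KVM_TDX_CAPABILITIES");
 */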
2321
2322
/*
2323
* KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
2324
* similar to TDX's GPAW. Use this field as the interface for userspace to
2325
* configure the GPAW and EPT level for TDs.
2326
*
2327
* Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
2328
* 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2329
* supported. Value 52 is only supported when the platform supports 5 level
2330
* EPT.
2331
*/
2332
static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2333
struct td_params *td_params)
2334
{
2335
const struct kvm_cpuid_entry2 *entry;
2336
int guest_pa;
2337
2338
entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2339
if (!entry)
2340
return -EINVAL;
2341
2342
guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2343
2344
if (guest_pa != 48 && guest_pa != 52)
2345
return -EINVAL;
2346
2347
if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2348
return -EINVAL;
2349
2350
td_params->eptp_controls = VMX_EPTP_MT_WB;
2351
if (guest_pa == 52) {
2352
td_params->eptp_controls |= VMX_EPTP_PWL_5;
2353
td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2354
} else {
2355
td_params->eptp_controls |= VMX_EPTP_PWL_4;
2356
}
2357
2358
return 0;
2359
}
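
/*
 * Worked example (comment only): with CPUID.0x80000008.EAX[23:16] == 52 the
 * function above yields eptp_controls = VMX_EPTP_MT_WB | VMX_EPTP_PWL_5 and
 * sets TDX_CONFIG_FLAGS_MAX_GPAW; with 48 it yields
 * VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 and leaves MAX_GPAW clear. Any other value,
 * or 52 without 5-level EPT support, is rejected with -EINVAL.
 */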
2360
2361
static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2362
struct td_params *td_params)
2363
{
2364
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2365
const struct kvm_cpuid_entry2 *entry;
2366
struct tdx_cpuid_value *value;
2367
int i, copy_cnt = 0;
2368
2369
	/*
	 * td_params.cpuid_values: The number and order of cpuid_values must
	 * match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
	 * It's assumed that td_params was zeroed.
	 */
2374
for (i = 0; i < td_conf->num_cpuid_config; i++) {
2375
struct kvm_cpuid_entry2 tmp;
2376
2377
td_init_cpuid_entry2(&tmp, i);
2378
2379
entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2380
tmp.function, tmp.index);
2381
if (!entry)
2382
continue;
2383
2384
if (tdx_unsupported_cpuid(entry))
2385
return -EINVAL;
2386
2387
copy_cnt++;
2388
2389
value = &td_params->cpuid_values[i];
2390
value->eax = entry->eax;
2391
value->ebx = entry->ebx;
2392
value->ecx = entry->ecx;
2393
value->edx = entry->edx;
2394
2395
/*
2396
* TDX module does not accept nonzero bits 16..23 for the
2397
* CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2398
*/
2399
if (tmp.function == 0x80000008)
2400
value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2401
}
2402
2403
	/*
	 * Rely on the TDX module to reject invalid configurations, but it
	 * can't check leafs that don't have a proper slot in
	 * td_params->cpuid_values to stick them in. So fail if there were
	 * entries that didn't get copied to td_params.
	 */
2409
if (copy_cnt != cpuid->nent)
2410
return -EINVAL;
2411
2412
return 0;
2413
}
2414
2415
static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2416
struct kvm_tdx_init_vm *init_vm)
2417
{
2418
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2419
struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2420
int ret;
2421
2422
if (kvm->created_vcpus)
2423
return -EBUSY;
2424
2425
if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2426
return -EINVAL;
2427
2428
if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2429
return -EINVAL;
2430
2431
td_params->max_vcpus = kvm->max_vcpus;
2432
td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2433
td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2434
2435
td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2436
td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2437
2438
ret = setup_tdparams_eptp_controls(cpuid, td_params);
2439
if (ret)
2440
return ret;
2441
2442
ret = setup_tdparams_cpuids(cpuid, td_params);
2443
if (ret)
2444
return ret;
2445
2446
#define MEMCPY_SAME_SIZE(dst, src) \
2447
do { \
2448
BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
2449
memcpy((dst), (src), sizeof(dst)); \
2450
} while (0)
2451
2452
MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2453
MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2454
MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2455
2456
return 0;
2457
}
2458
2459
static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2460
u64 *seamcall_err)
2461
{
2462
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2463
cpumask_var_t packages;
2464
struct page **tdcs_pages = NULL;
2465
struct page *tdr_page;
2466
int ret, i;
2467
u64 err, rcx;
2468
2469
*seamcall_err = 0;
2470
ret = tdx_guest_keyid_alloc();
2471
if (ret < 0)
2472
return ret;
2473
kvm_tdx->hkid = ret;
2474
kvm_tdx->misc_cg = get_current_misc_cg();
2475
ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2476
if (ret)
2477
goto free_hkid;
2478
2479
ret = -ENOMEM;
2480
2481
atomic_inc(&nr_configured_hkid);
2482
2483
tdr_page = alloc_page(GFP_KERNEL);
2484
if (!tdr_page)
2485
goto free_hkid;
2486
2487
kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2488
/* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2489
kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2490
tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2491
GFP_KERNEL);
2492
if (!tdcs_pages)
2493
goto free_tdr;
2494
2495
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2496
tdcs_pages[i] = alloc_page(GFP_KERNEL);
2497
if (!tdcs_pages[i])
2498
goto free_tdcs;
2499
}
2500
2501
if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2502
goto free_tdcs;
2503
2504
cpus_read_lock();
2505
2506
	/*
	 * At least one CPU of each package must be online in order to
	 * program all packages for the host key id. Check that.
	 */
2510
for_each_present_cpu(i)
2511
cpumask_set_cpu(topology_physical_package_id(i), packages);
2512
for_each_online_cpu(i)
2513
cpumask_clear_cpu(topology_physical_package_id(i), packages);
2514
if (!cpumask_empty(packages)) {
2515
ret = -EIO;
2516
		/*
		 * Because it's hard for a human operator to figure out the
		 * reason, warn about it.
		 */
2520
#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
2521
pr_warn_ratelimited(MSG_ALLPKG);
2522
goto free_packages;
2523
}
2524
2525
/*
2526
* TDH.MNG.CREATE tries to grab the global TDX module and fails
2527
* with TDX_OPERAND_BUSY when it fails to grab. Take the global
2528
* lock to prevent it from failure.
2529
*/
2530
mutex_lock(&tdx_lock);
2531
kvm_tdx->td.tdr_page = tdr_page;
2532
err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2533
mutex_unlock(&tdx_lock);
2534
2535
if (err == TDX_RND_NO_ENTROPY) {
2536
ret = -EAGAIN;
2537
goto free_packages;
2538
}
2539
2540
if (WARN_ON_ONCE(err)) {
2541
pr_tdx_error(TDH_MNG_CREATE, err);
2542
ret = -EIO;
2543
goto free_packages;
2544
}
2545
2546
for_each_online_cpu(i) {
2547
int pkg = topology_physical_package_id(i);
2548
2549
if (cpumask_test_and_set_cpu(pkg, packages))
2550
continue;
2551
2552
/*
2553
* Program the memory controller in the package with an
2554
* encryption key associated to a TDX private host key id
2555
* assigned to this TDR. Concurrent operations on same memory
2556
* controller results in TDX_OPERAND_BUSY. No locking needed
2557
* beyond the cpus_read_lock() above as it serializes against
2558
* hotplug and the first online CPU of the package is always
2559
* used. We never have two CPUs in the same socket trying to
2560
* program the key.
2561
*/
2562
ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2563
kvm_tdx, true);
2564
if (ret)
2565
break;
2566
}
2567
cpus_read_unlock();
2568
free_cpumask_var(packages);
2569
if (ret) {
2570
i = 0;
2571
goto teardown;
2572
}
2573
2574
kvm_tdx->td.tdcs_pages = tdcs_pages;
2575
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2576
err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2577
if (err == TDX_RND_NO_ENTROPY) {
2578
/* Here it's hard to allow userspace to retry. */
2579
ret = -EAGAIN;
2580
goto teardown;
2581
}
2582
if (WARN_ON_ONCE(err)) {
2583
pr_tdx_error(TDH_MNG_ADDCX, err);
2584
ret = -EIO;
2585
goto teardown;
2586
}
2587
}
2588
2589
err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2590
if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2591
/*
2592
* Because a user gives operands, don't warn.
2593
* Return a hint to the user because it's sometimes hard for the
2594
* user to figure out which operand is invalid. SEAMCALL status
2595
* code includes which operand caused invalid operand error.
2596
*/
2597
*seamcall_err = err;
2598
ret = -EINVAL;
2599
goto teardown;
2600
} else if (WARN_ON_ONCE(err)) {
2601
pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
2602
ret = -EIO;
2603
goto teardown;
2604
}
2605
2606
return 0;
2607
2608
/*
2609
* The sequence for freeing resources from a partially initialized TD
2610
* varies based on where in the initialization flow failure occurred.
2611
* Simply use the full teardown and destroy, which naturally play nice
2612
* with partial initialization.
2613
*/
2614
teardown:
2615
/* Only free pages not yet added, so start at 'i' */
2616
for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2617
if (tdcs_pages[i]) {
2618
__free_page(tdcs_pages[i]);
2619
tdcs_pages[i] = NULL;
2620
}
2621
}
2622
if (!kvm_tdx->td.tdcs_pages)
2623
kfree(tdcs_pages);
2624
2625
tdx_mmu_release_hkid(kvm);
2626
tdx_reclaim_td_control_pages(kvm);
2627
2628
return ret;
2629
2630
free_packages:
2631
cpus_read_unlock();
2632
free_cpumask_var(packages);
2633
2634
free_tdcs:
2635
for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2636
if (tdcs_pages[i])
2637
__free_page(tdcs_pages[i]);
2638
}
2639
kfree(tdcs_pages);
2640
kvm_tdx->td.tdcs_pages = NULL;
2641
2642
free_tdr:
2643
if (tdr_page)
2644
__free_page(tdr_page);
2645
kvm_tdx->td.tdr_page = 0;
2646
2647
free_hkid:
2648
tdx_hkid_free(kvm_tdx);
2649
2650
return ret;
2651
}
2652
2653
static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2654
u64 *data)
2655
{
2656
u64 err;
2657
2658
err = tdh_mng_rd(&tdx->td, field_id, data);
2659
2660
return err;
2661
}
2662
2663
#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2664
#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2665
2666
static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2667
bool sub_leaf_set, int *entry_index,
2668
struct kvm_cpuid_entry2 *out)
2669
{
2670
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2671
u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2672
u64 ebx_eax, edx_ecx;
2673
u64 err = 0;
2674
2675
if (sub_leaf > 0b1111111)
2676
return -EINVAL;
2677
2678
if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2679
return -EINVAL;
2680
2681
if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2682
sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2683
return -EINVAL;
2684
2685
	/*
	 * bit 23:17, RESERVED: reserved, must be 0;
	 * bit 16, LEAF_31: leaf number bit 31;
	 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
	 *           implicitly 0;
	 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
	 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
	 *          SUBLEAF_6_0 is all-1s.
	 *          Sub-leaf bits 31:7 are implicitly 0;
	 * bit 0, ELEMENT_I: element index within the field;
	 */
2696
field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2697
field_id |= (leaf & 0x7f) << 9;
2698
if (sub_leaf_set)
2699
field_id |= (sub_leaf & 0x7f) << 1;
2700
else
2701
field_id |= 0x1fe;
2702
2703
err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2704
if (err) //TODO check for specific errors
2705
goto err_out;
2706
2707
out->eax = (u32) ebx_eax;
2708
out->ebx = (u32) (ebx_eax >> 32);
2709
2710
field_id++;
2711
err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2712
	/*
	 * It would be odd for reading edx_ecx to fail when reading ebx_eax
	 * succeeded.
	 */
2716
if (WARN_ON_ONCE(err))
2717
goto err_out;
2718
2719
out->ecx = (u32) edx_ecx;
2720
out->edx = (u32) (edx_ecx >> 32);
2721
2722
out->function = leaf;
2723
out->index = sub_leaf;
2724
out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2725
2726
/*
2727
* Work around missing support on old TDX modules, fetch
2728
* guest maxpa from gfn_direct_bits.
2729
*/
2730
if (leaf == 0x80000008) {
2731
gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2732
unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2733
2734
out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2735
}
2736
2737
(*entry_index)++;
2738
2739
return 0;
2740
2741
err_out:
2742
out->eax = 0;
2743
out->ebx = 0;
2744
out->ecx = 0;
2745
out->edx = 0;
2746
2747
return -EIO;
2748
}
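
/*
 * Worked example (comment only) of the field_id encoding above, assuming
 * leaf 0x80000008 with no subleaf:
 *
 *	field_id  = TD_MD_FIELD_ID_CPUID_VALUES;
 *	field_id |= 1 << 16;		// LEAF_31: bit 31 of the leaf is set
 *	field_id |= 0x08 << 9;		// LEAF_6_0: leaf bits 6:0
 *	field_id |= 0x1fe;		// SUBLEAF_NA + all-ones SUBLEAF_6_0
 *
 * i.e. TD_MD_FIELD_ID_CPUID_VALUES | 0x111fe reads EBX:EAX for that leaf, and
 * field_id + 1 reads EDX:ECX.
 */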
2749
2750
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2751
{
2752
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2753
struct kvm_tdx_init_vm *init_vm;
2754
struct td_params *td_params = NULL;
2755
int ret;
2756
2757
BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2758
BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2759
2760
if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2761
return -EINVAL;
2762
2763
if (cmd->flags)
2764
return -EINVAL;
2765
2766
init_vm = kmalloc(sizeof(*init_vm) +
2767
sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2768
GFP_KERNEL);
2769
if (!init_vm)
2770
return -ENOMEM;
2771
2772
if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2773
ret = -EFAULT;
2774
goto out;
2775
}
2776
2777
if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2778
ret = -E2BIG;
2779
goto out;
2780
}
2781
2782
if (copy_from_user(init_vm->cpuid.entries,
2783
u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2784
flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2785
ret = -EFAULT;
2786
goto out;
2787
}
2788
2789
if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2790
ret = -EINVAL;
2791
goto out;
2792
}
2793
2794
if (init_vm->cpuid.padding) {
2795
ret = -EINVAL;
2796
goto out;
2797
}
2798
2799
td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2800
if (!td_params) {
2801
ret = -ENOMEM;
2802
goto out;
2803
}
2804
2805
ret = setup_tdparams(kvm, td_params, init_vm);
2806
if (ret)
2807
goto out;
2808
2809
ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2810
if (ret)
2811
goto out;
2812
2813
kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2814
kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2815
kvm_tdx->attributes = td_params->attributes;
2816
kvm_tdx->xfam = td_params->xfam;
2817
2818
if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2819
kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2820
else
2821
kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2822
2823
kvm_tdx->state = TD_STATE_INITIALIZED;
2824
out:
2825
/* kfree() accepts NULL. */
2826
kfree(init_vm);
2827
kfree(td_params);
2828
2829
return ret;
2830
}
2831
2832
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2833
{
2834
	/*
	 * flush_tlb_current() is invoked the first time the vCPU runs or when
	 * the root of the shared EPT is invalidated.
	 * KVM only needs to flush the shared EPT because the TDX module
	 * handles TLB invalidation for the private EPT in tdh_vp_enter().
	 *
	 * A single-context invalidation for the shared EPT can be performed
	 * here. However, this single-context invalidation requires the private
	 * EPTP rather than the shared EPTP to flush the shared EPT, as the
	 * shared EPT uses the private EPTP as its ASID for TLB invalidation.
	 *
	 * To avoid reading back the private EPTP, perform a global
	 * invalidation for the shared EPT instead to keep this function
	 * simple.
	 */
2848
ept_sync_global();
2849
}
2850
2851
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2852
{
2853
/*
2854
* TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2855
* ensure that private EPT will be flushed on the next TD enter. No need
2856
* to call tdx_track() here again even when this callback is a result of
2857
* zapping private EPT.
2858
*
2859
* Due to the lack of the context to determine which EPT has been
2860
* affected by zapping, invoke invept() directly here for both shared
2861
* EPT and private EPT for simplicity, though it's not necessary for
2862
* private EPT.
2863
*/
2864
ept_sync_global();
2865
}
2866
2867
static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2868
{
2869
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2870
2871
guard(mutex)(&kvm->slots_lock);
2872
2873
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2874
return -EINVAL;
2875
/*
2876
* Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
2877
* TDH.MEM.PAGE.ADD().
2878
*/
2879
if (atomic64_read(&kvm_tdx->nr_premapped))
2880
return -EINVAL;
2881
2882
cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2883
if (tdx_operand_busy(cmd->hw_error))
2884
return -EBUSY;
2885
if (KVM_BUG_ON(cmd->hw_error, kvm)) {
2886
pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
2887
return -EIO;
2888
}
2889
2890
kvm_tdx->state = TD_STATE_RUNNABLE;
2891
/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
2892
smp_wmb();
2893
kvm->arch.pre_fault_allowed = true;
2894
return 0;
2895
}
2896
2897
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2898
{
2899
struct kvm_tdx_cmd tdx_cmd;
2900
int r;
2901
2902
if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
2903
return -EFAULT;
2904
2905
/*
2906
* Userspace should never set hw_error. It is used to fill
2907
* hardware-defined error by the kernel.
2908
*/
2909
if (tdx_cmd.hw_error)
2910
return -EINVAL;
2911
2912
mutex_lock(&kvm->lock);
2913
2914
switch (tdx_cmd.id) {
2915
case KVM_TDX_CAPABILITIES:
2916
r = tdx_get_capabilities(&tdx_cmd);
2917
break;
2918
case KVM_TDX_INIT_VM:
2919
r = tdx_td_init(kvm, &tdx_cmd);
2920
break;
2921
case KVM_TDX_FINALIZE_VM:
2922
r = tdx_td_finalize(kvm, &tdx_cmd);
2923
break;
2924
default:
2925
r = -EINVAL;
2926
goto out;
2927
}
2928
2929
if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2930
r = -EFAULT;
2931
2932
out:
2933
mutex_unlock(&kvm->lock);
2934
return r;
2935
}
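
/*
 * Illustrative sketch (comment only, not authoritative ABI documentation):
 * the rough ordering of the TDX-specific commands handled above and in
 * tdx_vcpu_ioctl(), all issued via KVM_MEMORY_ENCRYPT_OP:
 *
 *	KVM_TDX_CAPABILITIES	(VM fd)    - query supported attributes/CPUID
 *	KVM_TDX_INIT_VM		(VM fd)    - requires TD_STATE_UNINITIALIZED
 *	KVM_TDX_INIT_VCPU	(vCPU fds) - after creating vCPUs
 *	KVM_TDX_INIT_MEM_REGION	(vCPU fd)  - optional, before finalize
 *	KVM_TDX_FINALIZE_VM	(VM fd)    - flips the TD to TD_STATE_RUNNABLE
 *
 * Once the TD is runnable, the vCPU-scoped commands are rejected with -EINVAL.
 */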
2936
2937
/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
2938
static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2939
{
2940
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2941
struct vcpu_tdx *tdx = to_tdx(vcpu);
2942
struct page *page;
2943
int ret, i;
2944
u64 err;
2945
2946
page = alloc_page(GFP_KERNEL);
2947
if (!page)
2948
return -ENOMEM;
2949
tdx->vp.tdvpr_page = page;
2950
2951
/*
2952
* page_to_phys() does not work in 'noinstr' code, like guest
2953
* entry via tdh_vp_enter(). Precalculate and store it instead
2954
* of doing it at runtime later.
2955
*/
2956
tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
2957
2958
tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2959
GFP_KERNEL);
2960
if (!tdx->vp.tdcx_pages) {
2961
ret = -ENOMEM;
2962
goto free_tdvpr;
2963
}
2964
2965
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2966
page = alloc_page(GFP_KERNEL);
2967
if (!page) {
2968
ret = -ENOMEM;
2969
goto free_tdcx;
2970
}
2971
tdx->vp.tdcx_pages[i] = page;
2972
}
2973
2974
err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2975
if (KVM_BUG_ON(err, vcpu->kvm)) {
2976
ret = -EIO;
2977
pr_tdx_error(TDH_VP_CREATE, err);
2978
goto free_tdcx;
2979
}
2980
2981
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2982
err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2983
if (KVM_BUG_ON(err, vcpu->kvm)) {
2984
pr_tdx_error(TDH_VP_ADDCX, err);
2985
/*
2986
* Pages already added are reclaimed by the vcpu_free
2987
* method, but the rest are freed here.
2988
*/
2989
for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2990
__free_page(tdx->vp.tdcx_pages[i]);
2991
tdx->vp.tdcx_pages[i] = NULL;
2992
}
2993
return -EIO;
2994
}
2995
}
2996
2997
err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2998
if (KVM_BUG_ON(err, vcpu->kvm)) {
2999
pr_tdx_error(TDH_VP_INIT, err);
3000
return -EIO;
3001
}
3002
3003
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3004
3005
return 0;
3006
3007
free_tdcx:
3008
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
3009
if (tdx->vp.tdcx_pages[i])
3010
__free_page(tdx->vp.tdcx_pages[i]);
3011
tdx->vp.tdcx_pages[i] = NULL;
3012
}
3013
kfree(tdx->vp.tdcx_pages);
3014
tdx->vp.tdcx_pages = NULL;
3015
3016
free_tdvpr:
3017
if (tdx->vp.tdvpr_page)
3018
__free_page(tdx->vp.tdvpr_page);
3019
tdx->vp.tdvpr_page = 0;
3020
tdx->vp.tdvpr_pa = 0;
3021
3022
return ret;
3023
}
3024
3025
/* Sometimes reads multiple subleafs. Return how many entries were written. */
3026
static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
3027
struct kvm_cpuid_entry2 *output_e)
3028
{
3029
int sub_leaf = 0;
3030
int ret;
3031
3032
/* First try without a subleaf */
3033
ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
3034
3035
/* If success, or invalid leaf, just give up */
3036
if (ret != -EIO)
3037
return ret;
3038
3039
/*
3040
* If the try without a subleaf failed, try reading subleafs until
3041
* failure. The TDX module only supports 6 bits of subleaf index.
3042
*/
3043
while (1) {
3044
/* Keep reading subleafs until there is a failure. */
3045
if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3046
return !sub_leaf;
3047
3048
sub_leaf++;
3049
output_e++;
3050
}
3051
3052
return 0;
3053
}
3054
3055
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3056
{
3057
struct kvm_cpuid2 __user *output, *td_cpuid;
3058
int r = 0, i = 0, leaf;
3059
u32 level;
3060
3061
output = u64_to_user_ptr(cmd->data);
3062
td_cpuid = kzalloc(sizeof(*td_cpuid) +
3063
sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3064
GFP_KERNEL);
3065
if (!td_cpuid)
3066
return -ENOMEM;
3067
3068
if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3069
r = -EFAULT;
3070
goto out;
3071
}
3072
3073
/* Read max CPUID for normal range */
3074
if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3075
r = -EIO;
3076
goto out;
3077
}
3078
level = td_cpuid->entries[0].eax;
3079
3080
for (leaf = 1; leaf <= level; leaf++)
3081
tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3082
3083
/* Read max CPUID for extended range */
3084
if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3085
r = -EIO;
3086
goto out;
3087
}
3088
level = td_cpuid->entries[i - 1].eax;
3089
3090
for (leaf = 0x80000001; leaf <= level; leaf++)
3091
tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3092
3093
if (td_cpuid->nent < i)
3094
r = -E2BIG;
3095
td_cpuid->nent = i;
3096
3097
if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3098
r = -EFAULT;
3099
goto out;
3100
}
3101
3102
if (r == -E2BIG)
3103
goto out;
3104
3105
if (copy_to_user(output->entries, td_cpuid->entries,
3106
td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3107
r = -EFAULT;
3108
3109
out:
3110
kfree(td_cpuid);
3111
3112
return r;
3113
}
3114
3115
static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3116
{
3117
u64 apic_base;
3118
struct vcpu_tdx *tdx = to_tdx(vcpu);
3119
int ret;
3120
3121
if (cmd->flags)
3122
return -EINVAL;
3123
3124
if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3125
return -EINVAL;
3126
3127
/*
3128
* TDX requires X2APIC, userspace is responsible for configuring guest
3129
* CPUID accordingly.
3130
*/
3131
apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3132
(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3133
if (kvm_apic_set_base(vcpu, apic_base, true))
3134
return -EINVAL;
3135
3136
ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3137
if (ret)
3138
return ret;
3139
3140
td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3141
td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3142
td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3143
3144
tdx->state = VCPU_TD_STATE_INITIALIZED;
3145
3146
return 0;
3147
}
3148
3149
void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3150
{
3151
/*
3152
* Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3153
* INIT events.
3154
*
3155
* Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3156
* userspace needs to define the vCPU model before KVM can initialize
3157
* vCPU state, e.g. to enable x2APIC.
3158
*/
3159
WARN_ON_ONCE(init_event);
3160
}
3161
3162
struct tdx_gmem_post_populate_arg {
3163
struct kvm_vcpu *vcpu;
3164
__u32 flags;
3165
};
3166
3167
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3168
void __user *src, int order, void *_arg)
3169
{
3170
u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
3171
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3172
struct tdx_gmem_post_populate_arg *arg = _arg;
3173
struct kvm_vcpu *vcpu = arg->vcpu;
3174
gpa_t gpa = gfn_to_gpa(gfn);
3175
u8 level = PG_LEVEL_4K;
3176
struct page *src_page;
3177
int ret, i;
3178
u64 err, entry, level_state;
3179
3180
/*
3181
* Get the source page if it has been faulted in. Return failure if the
3182
* source page has been swapped out or unmapped in primary memory.
3183
*/
3184
ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3185
if (ret < 0)
3186
return ret;
3187
if (ret != 1)
3188
return -ENOMEM;
3189
3190
ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
3191
if (ret < 0)
3192
goto out;
3193
3194
/*
3195
* The private mem cannot be zapped after kvm_tdp_map_page()
3196
* because all paths are covered by slots_lock and the
3197
* filemap invalidate lock. Check that they are indeed enough.
3198
*/
3199
if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
3200
scoped_guard(read_lock, &kvm->mmu_lock) {
3201
if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
3202
ret = -EIO;
3203
goto out;
3204
}
3205
}
3206
}
3207
3208
ret = 0;
3209
err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
3210
src_page, &entry, &level_state);
3211
if (err) {
3212
ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
3213
goto out;
3214
}
3215
3216
if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
3217
atomic64_dec(&kvm_tdx->nr_premapped);
3218
3219
if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
3220
for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3221
err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
3222
&level_state);
3223
if (err) {
3224
ret = -EIO;
3225
break;
3226
}
3227
}
3228
}
3229
3230
out:
3231
put_page(src_page);
3232
return ret;
3233
}
3234
3235
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3236
{
3237
struct vcpu_tdx *tdx = to_tdx(vcpu);
3238
struct kvm *kvm = vcpu->kvm;
3239
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3240
struct kvm_tdx_init_mem_region region;
3241
struct tdx_gmem_post_populate_arg arg;
3242
long gmem_ret;
3243
int ret;
3244
3245
if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3246
return -EINVAL;
3247
3248
guard(mutex)(&kvm->slots_lock);
3249
3250
/* Once TD is finalized, the initial guest memory is fixed. */
3251
if (kvm_tdx->state == TD_STATE_RUNNABLE)
3252
return -EINVAL;
3253
3254
if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3255
return -EINVAL;
3256
3257
if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3258
return -EFAULT;
3259
3260
if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3261
!region.nr_pages ||
3262
region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3263
!vt_is_tdx_private_gpa(kvm, region.gpa) ||
3264
!vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3265
return -EINVAL;
3266
3267
kvm_mmu_reload(vcpu);
3268
ret = 0;
3269
while (region.nr_pages) {
3270
if (signal_pending(current)) {
3271
ret = -EINTR;
3272
break;
3273
}
3274
3275
arg = (struct tdx_gmem_post_populate_arg) {
3276
.vcpu = vcpu,
3277
.flags = cmd->flags,
3278
};
3279
gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3280
u64_to_user_ptr(region.source_addr),
3281
1, tdx_gmem_post_populate, &arg);
3282
if (gmem_ret < 0) {
3283
ret = gmem_ret;
3284
break;
3285
}
3286
3287
if (gmem_ret != 1) {
3288
ret = -EIO;
3289
break;
3290
}
3291
3292
region.source_addr += PAGE_SIZE;
3293
region.gpa += PAGE_SIZE;
3294
region.nr_pages--;
3295
3296
cond_resched();
3297
}
3298
3299
if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3300
ret = -EFAULT;
3301
return ret;
3302
}
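
/*
 * Illustrative userspace sketch (comment only, not part of the build):
 * populating initial private memory through the vCPU-scoped
 * KVM_TDX_INIT_MEM_REGION command handled above. src/gpa/npages are
 * placeholders; gpa must be a page-aligned private GPA and the source buffer
 * must be page-aligned, faulted-in memory.
 *
 *	struct kvm_tdx_init_mem_region region = {
 *		.source_addr = (__u64)(unsigned long)src,
 *		.gpa = gpa,
 *		.nr_pages = npages,
 *	};
 *	struct kvm_tdx_cmd cmd = {
 *		.id = KVM_TDX_INIT_MEM_REGION,
 *		.flags = KVM_TDX_MEASURE_MEMORY_REGION,	// also extend the measurement
 *		.data = (__u64)(unsigned long)&region,
 *	};
 *	ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);	// issued on the vCPU fd
 */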
3303
3304
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3305
{
3306
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3307
struct kvm_tdx_cmd cmd;
3308
int ret;
3309
3310
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3311
return -EINVAL;
3312
3313
if (copy_from_user(&cmd, argp, sizeof(cmd)))
3314
return -EFAULT;
3315
3316
if (cmd.hw_error)
3317
return -EINVAL;
3318
3319
switch (cmd.id) {
3320
case KVM_TDX_INIT_VCPU:
3321
ret = tdx_vcpu_init(vcpu, &cmd);
3322
break;
3323
case KVM_TDX_INIT_MEM_REGION:
3324
ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3325
break;
3326
case KVM_TDX_GET_CPUID:
3327
ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3328
break;
3329
default:
3330
ret = -EINVAL;
3331
break;
3332
}
3333
3334
return ret;
3335
}
3336
3337
int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
3338
{
3339
if (!is_private)
3340
return 0;
3341
3342
return PG_LEVEL_4K;
3343
}
3344
3345
static int tdx_online_cpu(unsigned int cpu)
3346
{
3347
unsigned long flags;
3348
int r;
3349
3350
/* Sanity check CPU is already in post-VMXON */
3351
WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3352
3353
local_irq_save(flags);
3354
r = tdx_cpu_enable();
3355
local_irq_restore(flags);
3356
3357
return r;
3358
}
3359
3360
static int tdx_offline_cpu(unsigned int cpu)
3361
{
3362
int i;
3363
3364
/* No TD is running. Allow any cpu to be offline. */
3365
if (!atomic_read(&nr_configured_hkid))
3366
return 0;
3367
3368
	/*
	 * In order to reclaim a TDX HKID (i.e. when deleting a guest TD), KVM
	 * needs to call TDH.PHYMEM.PAGE.WBINVD on all packages to program all
	 * memory controllers with pconfig. If there are active TDX HKIDs,
	 * refuse to offline the last online cpu of a package.
	 */
3374
for_each_online_cpu(i) {
3375
/*
3376
* Found another online cpu on the same package.
3377
* Allow to offline.
3378
*/
3379
if (i != cpu && topology_physical_package_id(i) ==
3380
topology_physical_package_id(cpu))
3381
return 0;
3382
}
3383
3384
	/*
	 * This is the last cpu of this package. Don't offline it.
	 *
	 * Because it's hard for a human operator to understand the
	 * reason, warn about it.
	 */
3390
#define MSG_ALLPKG_ONLINE \
3391
"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3392
pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3393
return -EBUSY;
3394
}
3395
3396
static void __do_tdx_cleanup(void)
3397
{
3398
/*
3399
* Once TDX module is initialized, it cannot be disabled and
3400
* re-initialized again w/o runtime update (which isn't
3401
* supported by kernel). Only need to remove the cpuhp here.
3402
* The TDX host core code tracks TDX status and can handle
3403
* 'multiple enabling' scenario.
3404
*/
3405
WARN_ON_ONCE(!tdx_cpuhp_state);
3406
cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3407
tdx_cpuhp_state = 0;
3408
}
3409
3410
static void __tdx_cleanup(void)
3411
{
3412
cpus_read_lock();
3413
__do_tdx_cleanup();
3414
cpus_read_unlock();
3415
}
3416
3417
static int __init __do_tdx_bringup(void)
3418
{
3419
int r;
3420
3421
/*
3422
* TDX-specific cpuhp callback to call tdx_cpu_enable() on all
3423
* online CPUs before calling tdx_enable(), and on any new
3424
* going-online CPU to make sure it is ready for TDX guest.
3425
*/
3426
r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3427
"kvm/cpu/tdx:online",
3428
tdx_online_cpu, tdx_offline_cpu);
3429
if (r < 0)
3430
return r;
3431
3432
tdx_cpuhp_state = r;
3433
3434
r = tdx_enable();
3435
if (r)
3436
__do_tdx_cleanup();
3437
3438
return r;
3439
}
3440
3441
static int __init __tdx_bringup(void)
3442
{
3443
const struct tdx_sys_info_td_conf *td_conf;
3444
int r, i;
3445
3446
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3447
/*
3448
* Check if MSRs (tdx_uret_msrs) can be saved/restored
3449
* before returning to user space.
3450
*
3451
* this_cpu_ptr(user_return_msrs)->registered isn't checked
3452
* because the registration is done at vcpu runtime by
3453
* tdx_user_return_msr_update_cache().
3454
*/
3455
tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3456
if (tdx_uret_msrs[i].slot == -1) {
3457
/* If any MSR isn't supported, it is a KVM bug */
3458
pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3459
tdx_uret_msrs[i].msr);
3460
return -EIO;
3461
}
3462
}
3463
3464
/*
3465
* Enabling TDX requires enabling hardware virtualization first,
3466
* as making SEAMCALLs requires CPU being in post-VMXON state.
3467
*/
3468
r = kvm_enable_virtualization();
3469
if (r)
3470
return r;
3471
3472
cpus_read_lock();
3473
r = __do_tdx_bringup();
3474
cpus_read_unlock();
3475
3476
if (r)
3477
goto tdx_bringup_err;
3478
3479
r = -EINVAL;
3480
/* Get TDX global information for later use */
3481
tdx_sysinfo = tdx_get_sysinfo();
3482
if (WARN_ON_ONCE(!tdx_sysinfo))
3483
goto get_sysinfo_err;
3484
3485
/* Check TDX module and KVM capabilities */
3486
if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3487
!tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3488
goto get_sysinfo_err;
3489
3490
if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3491
goto get_sysinfo_err;
3492
3493
/*
3494
* TDX has its own limit of maximum vCPUs it can support for all
3495
* TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
3496
* query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
3497
* extension on per-VM basis.
3498
*
3499
* TDX module reports such limit via the MAX_VCPU_PER_TD global
3500
* metadata. Different modules may report different values.
3501
* Some old module may also not support this metadata (in which
3502
* case this limit is U16_MAX).
3503
*
3504
* In practice, the reported value reflects the maximum logical
3505
* CPUs that ALL the platforms that the module supports can
3506
* possibly have.
3507
*
3508
* Simply forwarding the MAX_VCPU_PER_TD to userspace could
3509
	 * result in an unpredictable ABI. KVM instead always advertises
3510
* the number of logical CPUs the platform has as the maximum
3511
* vCPUs for TDX guests.
3512
*
3513
* Make sure MAX_VCPU_PER_TD reported by TDX module is not
3514
* smaller than the number of logical CPUs, otherwise KVM will
3515
* report an unsupported value to userspace.
3516
*
3517
* Note, a platform with TDX enabled in the BIOS cannot support
3518
* physical CPU hotplug, and TDX requires the BIOS has marked
3519
* all logical CPUs in MADT table as enabled. Just use
3520
* num_present_cpus() for the number of logical CPUs.
3521
*/
3522
td_conf = &tdx_sysinfo->td_conf;
3523
if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3524
pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3525
td_conf->max_vcpus_per_td, num_present_cpus());
3526
goto get_sysinfo_err;
3527
}
3528
3529
if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
3530
goto get_sysinfo_err;
3531
3532
/*
3533
* Leave hardware virtualization enabled after TDX is enabled
3534
* successfully. TDX CPU hotplug depends on this.
3535
*/
3536
return 0;
3537
3538
get_sysinfo_err:
3539
__tdx_cleanup();
3540
tdx_bringup_err:
3541
kvm_disable_virtualization();
3542
return r;
3543
}
3544
3545
void tdx_cleanup(void)
3546
{
3547
if (enable_tdx) {
3548
misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
3549
__tdx_cleanup();
3550
kvm_disable_virtualization();
3551
}
3552
}
3553
3554
int __init tdx_bringup(void)
3555
{
3556
int r, i;
3557
3558
/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
3559
for_each_possible_cpu(i)
3560
INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
3561
3562
if (!enable_tdx)
3563
return 0;
3564
3565
if (!enable_ept) {
3566
pr_err("EPT is required for TDX\n");
3567
goto success_disable_tdx;
3568
}
3569
3570
if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
3571
pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
3572
goto success_disable_tdx;
3573
}
3574
3575
if (!enable_apicv) {
3576
pr_err("APICv is required for TDX\n");
3577
goto success_disable_tdx;
3578
}
3579
3580
if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
3581
pr_err("tdx: OSXSAVE is required for TDX\n");
3582
goto success_disable_tdx;
3583
}
3584
3585
if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
3586
pr_err("tdx: MOVDIR64B is required for TDX\n");
3587
goto success_disable_tdx;
3588
}
3589
3590
if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
3591
pr_err("Self-snoop is required for TDX\n");
3592
goto success_disable_tdx;
3593
}
3594
3595
if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
3596
pr_err("tdx: no TDX private KeyIDs available\n");
3597
goto success_disable_tdx;
3598
}
3599
3600
if (!enable_virt_at_load) {
3601
pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
3602
goto success_disable_tdx;
3603
}
3604
3605
/*
3606
* Ideally KVM should probe whether TDX module has been loaded
3607
* first and then try to bring it up. But TDX needs to use SEAMCALL
3608
* to probe whether the module is loaded (there is no CPUID or MSR
3609
* for that), and making SEAMCALL requires enabling virtualization
3610
	 * first, just like the remaining steps of bringing up the TDX module.
3611
*
3612
* So, for simplicity do everything in __tdx_bringup(); the first
3613
* SEAMCALL will return -ENODEV when the module is not loaded. The
3614
* only complication is having to make sure that initialization
3615
* SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
3616
* cases.
3617
*/
3618
r = __tdx_bringup();
3619
if (r) {
3620
/*
3621
* Disable TDX only but don't fail to load module if the TDX
3622
* module could not be loaded. No need to print message saying
3623
* "module is not loaded" because it was printed when the first
3624
* SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
3625
* vm_size, as kvm_x86_ops have already been finalized (and are
3626
* intentionally not exported). The S-EPT code is unreachable,
3627
* and allocating a few more bytes per VM in a should-be-rare
3628
* failure scenario is a non-issue.
3629
*/
3630
if (r == -ENODEV)
3631
goto success_disable_tdx;
3632
3633
enable_tdx = 0;
3634
}
3635
3636
return r;
3637
3638
success_disable_tdx:
3639
enable_tdx = 0;
3640
return 0;
3641
}
3642
3643
void __init tdx_hardware_setup(void)
3644
{
3645
KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
3646
3647
/*
3648
* Note, if the TDX module can't be loaded, KVM TDX support will be
3649
* disabled but KVM will continue loading (see tdx_bringup()).
3650
*/
3651
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
3652
3653
vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
3654
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
3655
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
3656
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
3657
vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
3658
}