CoCalc -- tdp

GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/mmu/tdp_mmu.c
²⁹⁵²⁴ views
1
// SPDX-License-Identifier: GPL-2.0
2
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3

4
#include "mmu.h"
5
#include "mmu_internal.h"
6
#include "mmutrace.h"
7
#include "tdp_iter.h"
8
#include "tdp_mmu.h"
9
#include "spte.h"
10

11
#include <asm/cmpxchg.h>
12
#include <trace/events/kvm.h>
13

14
/* Initializes the TDP MMU for the VM, if enabled. */
15
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16
{
17
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
19
}
20

21
/* Arbitrarily returns true so that this may be used in if statements. */
22
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
23
							     bool shared)
24
{
25
	if (shared)
26
		lockdep_assert_held_read(&kvm->mmu_lock);
27
	else
28
		lockdep_assert_held_write(&kvm->mmu_lock);
29

30
	return true;
31
}
32

33
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
34
{
35
	/*
36
	 * Invalidate all roots, which besides the obvious, schedules all roots
37
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
38
	 * ultimately frees all roots.
39
	 */
40
	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
41
	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
42

43
#ifdef CONFIG_KVM_PROVE_MMU
44
	KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
45
#endif
46
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
47

48
	/*
49
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
50
	 * can run before the VM is torn down.  Putting the last reference to
51
	 * zapped roots will create new callbacks.
52
	 */
53
	rcu_barrier();
54
}
55

56
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
57
{
58
	free_page((unsigned long)sp->external_spt);
59
	free_page((unsigned long)sp->spt);
60
	kmem_cache_free(mmu_page_header_cache, sp);
61
}
62

63
/*
64
 * This is called through call_rcu in order to free TDP page table memory
65
 * safely with respect to other kernel threads that may be operating on
66
 * the memory.
67
 * By only accessing TDP MMU page table memory in an RCU read critical
68
 * section, and freeing it after a grace period, lockless access to that
69
 * memory won't use it after it is freed.
70
 */
71
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72
{
73
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74
					       rcu_head);
75

76
	tdp_mmu_free_sp(sp);
77
}
78

79
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
80
{
81
	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
82
		return;
83

84
	/*
85
	 * The TDP MMU itself holds a reference to each root until the root is
86
	 * explicitly invalidated, i.e. the final reference should be never be
87
	 * put for a valid root.
88
	 */
89
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
90

91
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
92
	list_del_rcu(&root->link);
93
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
94
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
95
}
96

97
static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
98
			       enum kvm_tdp_mmu_root_types types)
99
{
100
	if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
101
		return false;
102

103
	if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
104
		return false;
105

106
	if (likely(!is_mirror_sp(root)))
107
		return types & KVM_DIRECT_ROOTS;
108
	return types & KVM_MIRROR_ROOTS;
109
}
110

111
/*
112
 * Returns the next root after @prev_root (or the first root if @prev_root is
113
 * NULL) that matches with @types.  A reference to the returned root is
114
 * acquired, and the reference to @prev_root is released (the caller obviously
115
 * must hold a reference to @prev_root if it's non-NULL).
116
 *
117
 * Roots that doesn't match with @types are skipped.
118
 *
119
 * Returns NULL if the end of tdp_mmu_roots was reached.
120
 */
121
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
122
					      struct kvm_mmu_page *prev_root,
123
					      enum kvm_tdp_mmu_root_types types)
124
{
125
	struct kvm_mmu_page *next_root;
126

127
	/*
128
	 * While the roots themselves are RCU-protected, fields such as
129
	 * role.invalid are protected by mmu_lock.
130
	 */
131
	lockdep_assert_held(&kvm->mmu_lock);
132

133
	rcu_read_lock();
134

135
	if (prev_root)
136
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
137
						  &prev_root->link,
138
						  typeof(*prev_root), link);
139
	else
140
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
141
						   typeof(*next_root), link);
142

143
	while (next_root) {
144
		if (tdp_mmu_root_match(next_root, types) &&
145
		    kvm_tdp_mmu_get_root(next_root))
146
			break;
147

148
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
149
				&next_root->link, typeof(*next_root), link);
150
	}
151

152
	rcu_read_unlock();
153

154
	if (prev_root)
155
		kvm_tdp_mmu_put_root(kvm, prev_root);
156

157
	return next_root;
158
}
159

160
/*
161
 * Note: this iterator gets and puts references to the roots it iterates over.
162
 * This makes it safe to release the MMU lock and yield within the loop, but
163
 * if exiting the loop early, the caller must drop the reference to the most
164
 * recent root. (Unless keeping a live reference is desirable.)
165
 *
166
 * If shared is set, this function is operating under the MMU lock in read
167
 * mode.
168
 */
169
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types)	\
170
	for (_root = tdp_mmu_next_root(_kvm, NULL, _types);		\
171
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;		\
172
	     _root = tdp_mmu_next_root(_kvm, _root, _types))		\
173
		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
174
		} else
175

176
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
177
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
178

179
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
180
	for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS);		\
181
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
182
	     _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
183

184
/*
185
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
186
 * the implication being that any flow that holds mmu_lock for read is
187
 * inherently yield-friendly and should use the yield-safe variant above.
188
 * Holding mmu_lock for write obviates the need for RCU protection as the list
189
 * is guaranteed to be stable.
190
 */
191
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types)			\
192
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
193
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
194
		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
195
		     !tdp_mmu_root_match((_root), (_types)))) {			\
196
		} else
197

198
/*
199
 * Iterate over all TDP MMU roots in an RCU read-side critical section.
200
 * It is safe to iterate over the SPTEs under the root, but their values will
201
 * be unstable, so all writes must be atomic. As this routine is meant to be
202
 * used without holding the mmu_lock at all, any bits that are flipped must
203
 * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
204
 */
205
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types)			\
206
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
207
		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
208
		    !tdp_mmu_root_match((_root), (_types))) {			\
209
		} else
210

211
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
212
	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
213

214
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
215
{
216
	struct kvm_mmu_page *sp;
217

218
	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
219
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
220

221
	return sp;
222
}
223

224
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
225
			    gfn_t gfn, union kvm_mmu_page_role role)
226
{
227
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
228

229
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
230

231
	sp->role = role;
232
	sp->gfn = gfn;
233
	sp->ptep = sptep;
234
	sp->tdp_mmu_page = true;
235

236
	trace_kvm_mmu_get_page(sp, true);
237
}
238

239
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
240
				  struct tdp_iter *iter)
241
{
242
	struct kvm_mmu_page *parent_sp;
243
	union kvm_mmu_page_role role;
244

245
	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
246

247
	role = parent_sp->role;
248
	role.level--;
249

250
	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
251
}
252

253
void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
254
{
255
	struct kvm_mmu *mmu = vcpu->arch.mmu;
256
	union kvm_mmu_page_role role = mmu->root_role;
257
	int as_id = kvm_mmu_role_as_id(role);
258
	struct kvm *kvm = vcpu->kvm;
259
	struct kvm_mmu_page *root;
260

261
	if (mirror)
262
		role.is_mirror = true;
263

264
	/*
265
	 * Check for an existing root before acquiring the pages lock to avoid
266
	 * unnecessary serialization if multiple vCPUs are loading a new root.
267
	 * E.g. when bringing up secondary vCPUs, KVM will already have created
268
	 * a valid root on behalf of the primary vCPU.
269
	 */
270
	read_lock(&kvm->mmu_lock);
271

272
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
273
		if (root->role.word == role.word)
274
			goto out_read_unlock;
275
	}
276

277
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
278

279
	/*
280
	 * Recheck for an existing root after acquiring the pages lock, another
281
	 * vCPU may have raced ahead and created a new usable root.  Manually
282
	 * walk the list of roots as the standard macros assume that the pages
283
	 * lock is *not* held.  WARN if grabbing a reference to a usable root
284
	 * fails, as the last reference to a root can only be put *after* the
285
	 * root has been invalidated, which requires holding mmu_lock for write.
286
	 */
287
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
288
		if (root->role.word == role.word &&
289
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
290
			goto out_spin_unlock;
291
	}
292

293
	root = tdp_mmu_alloc_sp(vcpu);
294
	tdp_mmu_init_sp(root, NULL, 0, role);
295

296
	/*
297
	 * TDP MMU roots are kept until they are explicitly invalidated, either
298
	 * by a memslot update or by the destruction of the VM.  Initialize the
299
	 * refcount to two; one reference for the vCPU, and one reference for
300
	 * the TDP MMU itself, which is held until the root is invalidated and
301
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
302
	 */
303
	refcount_set(&root->tdp_mmu_root_count, 2);
304
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
305

306
out_spin_unlock:
307
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
308
out_read_unlock:
309
	read_unlock(&kvm->mmu_lock);
310
	/*
311
	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
312
	 * and actually consuming the root if it's invalidated after dropping
313
	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
314
	 */
315
	if (mirror) {
316
		mmu->mirror_root_hpa = __pa(root->spt);
317
	} else {
318
		mmu->root.hpa = __pa(root->spt);
319
		mmu->root.pgd = 0;
320
	}
321
}
322

323
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
324
				u64 old_spte, u64 new_spte, int level,
325
				bool shared);
326

327
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
328
{
329
	kvm_account_pgtable_pages((void *)sp->spt, +1);
330
#ifdef CONFIG_KVM_PROVE_MMU
331
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
332
#endif
333
}
334

335
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
336
{
337
	kvm_account_pgtable_pages((void *)sp->spt, -1);
338
#ifdef CONFIG_KVM_PROVE_MMU
339
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
340
#endif
341
}
342

343
/**
344
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
345
 *
346
 * @kvm: kvm instance
347
 * @sp: the page to be removed
348
 */
349
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
350
{
351
	tdp_unaccount_mmu_page(kvm, sp);
352

353
	if (!sp->nx_huge_page_disallowed)
354
		return;
355

356
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
357
	sp->nx_huge_page_disallowed = false;
358
	untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
359
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
360
}
361

362
static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
363
				 int level)
364
{
365
	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
366
	int ret;
367

368
	/*
369
	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
370
	 * PTs are removed in a special order, involving free_external_spt().
371
	 * But remove_external_spte() will be called on non-leaf PTEs via
372
	 * __tdp_mmu_zap_root(), so avoid the error the former would return
373
	 * in this case.
374
	 */
375
	if (!is_last_spte(old_spte, level))
376
		return;
377

378
	/* Zapping leaf spte is allowed only when write lock is held. */
379
	lockdep_assert_held_write(&kvm->mmu_lock);
380
	/* Because write lock is held, operation should success. */
381
	ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
382
	KVM_BUG_ON(ret, kvm);
383
}
384

385
/**
386
 * handle_removed_pt() - handle a page table removed from the TDP structure
387
 *
388
 * @kvm: kvm instance
389
 * @pt: the page removed from the paging structure
390
 * @shared: This operation may not be running under the exclusive use
391
 *	    of the MMU lock and the operation must synchronize with other
392
 *	    threads that might be modifying SPTEs.
393
 *
394
 * Given a page table that has been removed from the TDP paging structure,
395
 * iterates through the page table to clear SPTEs and free child page tables.
396
 *
397
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
398
 * protection. Since this thread removed it from the paging structure,
399
 * this thread will be responsible for ensuring the page is freed. Hence the
400
 * early rcu_dereferences in the function.
401
 */
402
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
403
{
404
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
405
	int level = sp->role.level;
406
	gfn_t base_gfn = sp->gfn;
407
	int i;
408

409
	trace_kvm_mmu_prepare_zap_page(sp);
410

411
	tdp_mmu_unlink_sp(kvm, sp);
412

413
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
414
		tdp_ptep_t sptep = pt + i;
415
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
416
		u64 old_spte;
417

418
		if (shared) {
419
			/*
420
			 * Set the SPTE to a nonpresent value that other
421
			 * threads will not overwrite. If the SPTE was
422
			 * already marked as frozen then another thread
423
			 * handling a page fault could overwrite it, so
424
			 * set the SPTE until it is set from some other
425
			 * value to the frozen SPTE value.
426
			 */
427
			for (;;) {
428
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
429
				if (!is_frozen_spte(old_spte))
430
					break;
431
				cpu_relax();
432
			}
433
		} else {
434
			/*
435
			 * If the SPTE is not MMU-present, there is no backing
436
			 * page associated with the SPTE and so no side effects
437
			 * that need to be recorded, and exclusive ownership of
438
			 * mmu_lock ensures the SPTE can't be made present.
439
			 * Note, zapping MMIO SPTEs is also unnecessary as they
440
			 * are guarded by the memslots generation, not by being
441
			 * unreachable.
442
			 */
443
			old_spte = kvm_tdp_mmu_read_spte(sptep);
444
			if (!is_shadow_present_pte(old_spte))
445
				continue;
446

447
			/*
448
			 * Use the common helper instead of a raw WRITE_ONCE as
449
			 * the SPTE needs to be updated atomically if it can be
450
			 * modified by a different vCPU outside of mmu_lock.
451
			 * Even though the parent SPTE is !PRESENT, the TLB
452
			 * hasn't yet been flushed, and both Intel and AMD
453
			 * document that A/D assists can use upper-level PxE
454
			 * entries that are cached in the TLB, i.e. the CPU can
455
			 * still access the page and mark it dirty.
456
			 *
457
			 * No retry is needed in the atomic update path as the
458
			 * sole concern is dropping a Dirty bit, i.e. no other
459
			 * task can zap/remove the SPTE as mmu_lock is held for
460
			 * write.  Marking the SPTE as a frozen SPTE is not
461
			 * strictly necessary for the same reason, but using
462
			 * the frozen SPTE value keeps the shared/exclusive
463
			 * paths consistent and allows the handle_changed_spte()
464
			 * call below to hardcode the new value to FROZEN_SPTE.
465
			 *
466
			 * Note, even though dropping a Dirty bit is the only
467
			 * scenario where a non-atomic update could result in a
468
			 * functional bug, simply checking the Dirty bit isn't
469
			 * sufficient as a fast page fault could read the upper
470
			 * level SPTE before it is zapped, and then make this
471
			 * target SPTE writable, resume the guest, and set the
472
			 * Dirty bit between reading the SPTE above and writing
473
			 * it here.
474
			 */
475
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
476
							  FROZEN_SPTE, level);
477
		}
478
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
479
				    old_spte, FROZEN_SPTE, level, shared);
480

481
		if (is_mirror_sp(sp)) {
482
			KVM_BUG_ON(shared, kvm);
483
			remove_external_spte(kvm, gfn, old_spte, level);
484
		}
485
	}
486

487
	if (is_mirror_sp(sp) &&
488
	    WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
489
						    sp->external_spt))) {
490
		/*
491
		 * Failed to free page table page in mirror page table and
492
		 * there is nothing to do further.
493
		 * Intentionally leak the page to prevent the kernel from
494
		 * accessing the encrypted page.
495
		 */
496
		sp->external_spt = NULL;
497
	}
498

499
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
500
}
501

502
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
503
{
504
	if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
505
		struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
506

507
		WARN_ON_ONCE(sp->role.level + 1 != level);
508
		WARN_ON_ONCE(sp->gfn != gfn);
509
		return sp->external_spt;
510
	}
511

512
	return NULL;
513
}
514

515
static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
516
						 gfn_t gfn, u64 old_spte,
517
						 u64 new_spte, int level)
518
{
519
	bool was_present = is_shadow_present_pte(old_spte);
520
	bool is_present = is_shadow_present_pte(new_spte);
521
	bool is_leaf = is_present && is_last_spte(new_spte, level);
522
	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
523
	int ret = 0;
524

525
	KVM_BUG_ON(was_present, kvm);
526

527
	lockdep_assert_held(&kvm->mmu_lock);
528
	/*
529
	 * We need to lock out other updates to the SPTE until the external
530
	 * page table has been modified. Use FROZEN_SPTE similar to
531
	 * the zapping case.
532
	 */
533
	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
534
		return -EBUSY;
535

536
	/*
537
	 * Use different call to either set up middle level
538
	 * external page table, or leaf.
539
	 */
540
	if (is_leaf) {
541
		ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
542
	} else {
543
		void *external_spt = get_external_spt(gfn, new_spte, level);
544

545
		KVM_BUG_ON(!external_spt, kvm);
546
		ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
547
	}
548
	if (ret)
549
		__kvm_tdp_mmu_write_spte(sptep, old_spte);
550
	else
551
		__kvm_tdp_mmu_write_spte(sptep, new_spte);
552
	return ret;
553
}
554

555
/**
556
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
557
 * @kvm: kvm instance
558
 * @as_id: the address space of the paging structure the SPTE was a part of
559
 * @gfn: the base GFN that was mapped by the SPTE
560
 * @old_spte: The value of the SPTE before the change
561
 * @new_spte: The value of the SPTE after the change
562
 * @level: the level of the PT the SPTE is part of in the paging structure
563
 * @shared: This operation may not be running under the exclusive use of
564
 *	    the MMU lock and the operation must synchronize with other
565
 *	    threads that might be modifying SPTEs.
566
 *
567
 * Handle bookkeeping that might result from the modification of a SPTE.  Note,
568
 * dirty logging updates are handled in common code, not here (see make_spte()
569
 * and fast_pf_fix_direct_spte()).
570
 */
571
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
572
				u64 old_spte, u64 new_spte, int level,
573
				bool shared)
574
{
575
	bool was_present = is_shadow_present_pte(old_spte);
576
	bool is_present = is_shadow_present_pte(new_spte);
577
	bool was_leaf = was_present && is_last_spte(old_spte, level);
578
	bool is_leaf = is_present && is_last_spte(new_spte, level);
579
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
580

581
	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
582
	WARN_ON_ONCE(level < PG_LEVEL_4K);
583
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
584

585
	/*
586
	 * If this warning were to trigger it would indicate that there was a
587
	 * missing MMU notifier or a race with some notifier handler.
588
	 * A present, leaf SPTE should never be directly replaced with another
589
	 * present leaf SPTE pointing to a different PFN. A notifier handler
590
	 * should be zapping the SPTE before the main MM's page table is
591
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
592
	 * thread before replacement.
593
	 */
594
	if (was_leaf && is_leaf && pfn_changed) {
595
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
596
		       "SPTE with another present leaf SPTE mapping a\n"
597
		       "different PFN!\n"
598
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
599
		       as_id, gfn, old_spte, new_spte, level);
600

601
		/*
602
		 * Crash the host to prevent error propagation and guest data
603
		 * corruption.
604
		 */
605
		BUG();
606
	}
607

608
	if (old_spte == new_spte)
609
		return;
610

611
	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
612

613
	if (is_leaf)
614
		check_spte_writable_invariants(new_spte);
615

616
	/*
617
	 * The only times a SPTE should be changed from a non-present to
618
	 * non-present state is when an MMIO entry is installed/modified/
619
	 * removed. In that case, there is nothing to do here.
620
	 */
621
	if (!was_present && !is_present) {
622
		/*
623
		 * If this change does not involve a MMIO SPTE or frozen SPTE,
624
		 * it is unexpected. Log the change, though it should not
625
		 * impact the guest since both the former and current SPTEs
626
		 * are nonpresent.
627
		 */
628
		if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
629
				 !is_mmio_spte(kvm, new_spte) &&
630
				 !is_frozen_spte(new_spte)))
631
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
632
			       "should not be replaced with another,\n"
633
			       "different nonpresent SPTE, unless one or both\n"
634
			       "are MMIO SPTEs, or the new SPTE is\n"
635
			       "a temporary frozen SPTE.\n"
636
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
637
			       as_id, gfn, old_spte, new_spte, level);
638
		return;
639
	}
640

641
	if (is_leaf != was_leaf)
642
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
643

644
	/*
645
	 * Recursively handle child PTs if the change removed a subtree from
646
	 * the paging structure.  Note the WARN on the PFN changing without the
647
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
648
	 * pages are kernel allocations and should never be migrated.
649
	 */
650
	if (was_present && !was_leaf &&
651
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
652
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
653
}
654

655
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
656
							 struct tdp_iter *iter,
657
							 u64 new_spte)
658
{
659
	/*
660
	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
661
	 * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
662
	 * and pre-checking before inserting a new SPTE is advantageous as it
663
	 * avoids unnecessary work.
664
	 */
665
	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
666

667
	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
668
		int ret;
669

670
		/*
671
		 * Users of atomic zapping don't operate on mirror roots,
672
		 * so don't handle it and bug the VM if it's seen.
673
		 */
674
		if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
675
			return -EBUSY;
676

677
		ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
678
						iter->old_spte, new_spte, iter->level);
679
		if (ret)
680
			return ret;
681
	} else {
682
		u64 *sptep = rcu_dereference(iter->sptep);
683

684
		/*
685
		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
686
		 * and does not hold the mmu_lock.  On failure, i.e. if a
687
		 * different logical CPU modified the SPTE, try_cmpxchg64()
688
		 * updates iter->old_spte with the current value, so the caller
689
		 * operates on fresh data, e.g. if it retries
690
		 * tdp_mmu_set_spte_atomic()
691
		 */
692
		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
693
			return -EBUSY;
694
	}
695

696
	return 0;
697
}
698

699
/*
700
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
701
 * and handle the associated bookkeeping.  Do not mark the page dirty
702
 * in KVM's dirty bitmaps.
703
 *
704
 * If setting the SPTE fails because it has changed, iter->old_spte will be
705
 * refreshed to the current value of the spte.
706
 *
707
 * @kvm: kvm instance
708
 * @iter: a tdp_iter instance currently on the SPTE that should be set
709
 * @new_spte: The value the SPTE should be set to
710
 * Return:
711
 * * 0      - If the SPTE was set.
712
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
713
 *            no side-effects other than setting iter->old_spte to the last
714
 *            known value of the spte.
715
 */
716
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
717
						       struct tdp_iter *iter,
718
						       u64 new_spte)
719
{
720
	int ret;
721

722
	lockdep_assert_held_read(&kvm->mmu_lock);
723

724
	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
725
	if (ret)
726
		return ret;
727

728
	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
729
			    new_spte, iter->level, true);
730

731
	return 0;
732
}
733

734
/*
735
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
736
 * @kvm:	      KVM instance
737
 * @as_id:	      Address space ID, i.e. regular vs. SMM
738
 * @sptep:	      Pointer to the SPTE
739
 * @old_spte:	      The current value of the SPTE
740
 * @new_spte:	      The new value that will be set for the SPTE
741
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
742
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
743
 *
744
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
745
 * SPTE had voldatile bits.
746
 */
747
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
748
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
749
{
750
	lockdep_assert_held_write(&kvm->mmu_lock);
751

752
	/*
753
	 * No thread should be using this function to set SPTEs to or from the
754
	 * temporary frozen SPTE value.
755
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
756
	 * should be used. If operating under the MMU lock in write mode, the
757
	 * use of the frozen SPTE should not be necessary.
758
	 */
759
	WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
760

761
	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
762

763
	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
764

765
	/*
766
	 * Users that do non-atomic setting of PTEs don't operate on mirror
767
	 * roots, so don't handle it and bug the VM if it's seen.
768
	 */
769
	if (is_mirror_sptep(sptep)) {
770
		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
771
		remove_external_spte(kvm, gfn, old_spte, level);
772
	}
773

774
	return old_spte;
775
}
776

777
static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
778
					 u64 new_spte)
779
{
780
	WARN_ON_ONCE(iter->yielded);
781
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
782
					  iter->old_spte, new_spte,
783
					  iter->gfn, iter->level);
784
}
785

786
#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)	\
787
	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
788

789
#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)	\
790
	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)		\
791
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
792
		    !is_last_spte(_iter.old_spte, _iter.level))		\
793
			continue;					\
794
		else
795

796
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
797
							  struct tdp_iter *iter)
798
{
799
	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
800
		return false;
801

802
	/* Ensure forward progress has been made before yielding. */
803
	return iter->next_last_level_gfn != iter->yielded_gfn;
804
}
805

806
/*
807
 * Yield if the MMU lock is contended or this thread needs to return control
808
 * to the scheduler.
809
 *
810
 * If this function should yield and flush is set, it will perform a remote
811
 * TLB flush before yielding.
812
 *
813
 * If this function yields, iter->yielded is set and the caller must skip to
814
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
815
 * over the paging structures to allow the iterator to continue its traversal
816
 * from the paging structure root.
817
 *
818
 * Returns true if this function yielded.
819
 */
820
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
821
							  struct tdp_iter *iter,
822
							  bool flush, bool shared)
823
{
824
	KVM_MMU_WARN_ON(iter->yielded);
825

826
	if (!tdp_mmu_iter_need_resched(kvm, iter))
827
		return false;
828

829
	if (flush)
830
		kvm_flush_remote_tlbs(kvm);
831

832
	rcu_read_unlock();
833

834
	if (shared)
835
		cond_resched_rwlock_read(&kvm->mmu_lock);
836
	else
837
		cond_resched_rwlock_write(&kvm->mmu_lock);
838

839
	rcu_read_lock();
840

841
	WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
842

843
	iter->yielded = true;
844
	return true;
845
}
846

847
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
848
{
849
	/*
850
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
851
	 * a gpa range that would exceed the max gfn, and KVM does not create
852
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
853
	 * the slow emulation path every time.
854
	 */
855
	return kvm_mmu_max_gfn() + 1;
856
}
857

858
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
859
			       bool shared, int zap_level)
860
{
861
	struct tdp_iter iter;
862

863
	for_each_tdp_pte_min_level_all(iter, root, zap_level) {
864
retry:
865
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
866
			continue;
867

868
		if (!is_shadow_present_pte(iter.old_spte))
869
			continue;
870

871
		if (iter.level > zap_level)
872
			continue;
873

874
		if (!shared)
875
			tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
876
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
877
			goto retry;
878
	}
879
}
880

881
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
882
			     bool shared)
883
{
884

885
	/*
886
	 * The root must have an elevated refcount so that it's reachable via
887
	 * mmu_notifier callbacks, which allows this path to yield and drop
888
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
889
	 * must drop all references to relevant pages prior to completing the
890
	 * callback.  Dropping mmu_lock with an unreachable root would result
891
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
892
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
893
	 * dirty accessed bits to the SPTE's associated struct page.
894
	 */
895
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
896

897
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
898

899
	rcu_read_lock();
900

901
	/*
902
	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
903
	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
904
	 * preempt models) or mmu_lock contention (full or real-time models).
905
	 * Zapping at finer granularity marginally increases the total time of
906
	 * the zap, but in most cases the zap itself isn't latency sensitive.
907
	 *
908
	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
909
	 * in order to mimic the page fault path, which can replace a 1GiB page
910
	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
911
	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
912
	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
913
	 * inducing RCU stalls, without relying on a relatively rare event
914
	 * (zapping roots is orders of magnitude more common).  Note, because
915
	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
916
	 * in the iterator itself is unnecessary.
917
	 */
918
	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
919
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
920
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
921
	}
922
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
923
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
924

925
	rcu_read_unlock();
926
}
927

928
bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
929
					   struct kvm_mmu_page *sp)
930
{
931
	struct tdp_iter iter = {
932
		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
933
		.sptep = sp->ptep,
934
		.level = sp->role.level + 1,
935
		.gfn = sp->gfn,
936
		.as_id = kvm_mmu_page_as_id(sp),
937
	};
938

939
	lockdep_assert_held_read(&kvm->mmu_lock);
940

941
	if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
942
		return false;
943

944
	/*
945
	 * Root shadow pages don't have a parent page table and thus no
946
	 * associated entry, but they can never be possible NX huge pages.
947
	 */
948
	if (WARN_ON_ONCE(!sp->ptep))
949
		return false;
950

951
	/*
952
	 * Since mmu_lock is held in read mode, it's possible another task has
953
	 * already modified the SPTE. Zap the SPTE if and only if the SPTE
954
	 * points at the SP's page table, as checking shadow-present isn't
955
	 * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
956
	 * another SP. Note, spte_to_child_pt() also checks that the SPTE is
957
	 * shadow-present, i.e. guards against zapping a frozen SPTE.
958
	 */
959
	if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
960
		return false;
961

962
	/*
963
	 * If a different task modified the SPTE, then it should be impossible
964
	 * for the SPTE to still be used for the to-be-zapped SP. Non-leaf
965
	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
966
	 * creating non-leaf SPTEs, and all other bits are immutable for non-
967
	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
968
	 * zapping and replacement.
969
	 */
970
	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
971
		WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
972
		return false;
973
	}
974

975
	return true;
976
}
977

978
/*
979
 * If can_yield is true, will release the MMU lock and reschedule if the
980
 * scheduler needs the CPU or there is contention on the MMU lock. If this
981
 * function cannot yield, it will not release the MMU lock or reschedule and
982
 * the caller must ensure it does not supply too large a GFN range, or the
983
 * operation can cause a soft lockup.
984
 */
985
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
986
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
987
{
988
	struct tdp_iter iter;
989

990
	end = min(end, tdp_mmu_max_gfn_exclusive());
991

992
	lockdep_assert_held_write(&kvm->mmu_lock);
993

994
	rcu_read_lock();
995

996
	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
997
		if (can_yield &&
998
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
999
			flush = false;
1000
			continue;
1001
		}
1002

1003
		if (!is_shadow_present_pte(iter.old_spte) ||
1004
		    !is_last_spte(iter.old_spte, iter.level))
1005
			continue;
1006

1007
		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
1008

1009
		/*
1010
		 * Zappings SPTEs in invalid roots doesn't require a TLB flush,
1011
		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
1012
		 */
1013
		if (!root->role.invalid)
1014
			flush = true;
1015
	}
1016

1017
	rcu_read_unlock();
1018

1019
	/*
1020
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
1021
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
1022
	 */
1023
	return flush;
1024
}
1025

1026
/*
1027
 * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots.
1028
 * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
1029
 * one or more SPTEs were zapped since the MMU lock was last acquired.
1030
 */
1031
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
1032
{
1033
	struct kvm_mmu_page *root;
1034

1035
	lockdep_assert_held_write(&kvm->mmu_lock);
1036
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
1037
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
1038

1039
	return flush;
1040
}
1041

1042
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
1043
{
1044
	struct kvm_mmu_page *root;
1045

1046
	/*
1047
	 * Zap all direct roots, including invalid direct roots, as all direct
1048
	 * SPTEs must be dropped before returning to the caller. For TDX, mirror
1049
	 * roots don't need handling in response to the mmu notifier (the caller).
1050
	 *
1051
	 * Zap directly even if the root is also being zapped by a concurrent
1052
	 * "fast zap".  Walking zapped top-level SPTEs isn't all that expensive
1053
	 * and mmu_lock is already held, which means the other thread has yielded.
1054
	 *
1055
	 * A TLB flush is unnecessary, KVM zaps everything if and only the VM
1056
	 * is being destroyed or the userspace VMM has exited.  In both cases,
1057
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1058
	 */
1059
	lockdep_assert_held_write(&kvm->mmu_lock);
1060
	__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
1061
					   KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
1062
		tdp_mmu_zap_root(kvm, root, false);
1063
}
1064

1065
/*
1066
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1067
 * zap" completes.
1068
 */
1069
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
1070
{
1071
	struct kvm_mmu_page *root;
1072

1073
	if (shared)
1074
		read_lock(&kvm->mmu_lock);
1075
	else
1076
		write_lock(&kvm->mmu_lock);
1077

1078
	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1079
		if (!root->tdp_mmu_scheduled_root_to_zap)
1080
			continue;
1081

1082
		root->tdp_mmu_scheduled_root_to_zap = false;
1083
		KVM_BUG_ON(!root->role.invalid, kvm);
1084

1085
		/*
1086
		 * A TLB flush is not necessary as KVM performs a local TLB
1087
		 * flush when allocating a new root (see kvm_mmu_load()), and
1088
		 * when migrating a vCPU to a different pCPU.  Note, the local
1089
		 * TLB flush on reuse also invalidates paging-structure-cache
1090
		 * entries, i.e. TLB entries for intermediate paging structures,
1091
		 * that may be zapped, as such entries are associated with the
1092
		 * ASID on both VMX and SVM.
1093
		 */
1094
		tdp_mmu_zap_root(kvm, root, shared);
1095

1096
		/*
1097
		 * The referenced needs to be put *after* zapping the root, as
1098
		 * the root must be reachable by mmu_notifiers while it's being
1099
		 * zapped
1100
		 */
1101
		kvm_tdp_mmu_put_root(kvm, root);
1102
	}
1103

1104
	if (shared)
1105
		read_unlock(&kvm->mmu_lock);
1106
	else
1107
		write_unlock(&kvm->mmu_lock);
1108
}
1109

1110
/*
1111
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1112
 * is about to be zapped, e.g. in response to a memslots update.  The actual
1113
 * zapping is done separately so that it happens with mmu_lock with read,
1114
 * whereas invalidating roots must be done with mmu_lock held for write (unless
1115
 * the VM is being destroyed).
1116
 *
1117
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
1118
 * See kvm_tdp_mmu_alloc_root().
1119
 */
1120
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
1121
				  enum kvm_tdp_mmu_root_types root_types)
1122
{
1123
	struct kvm_mmu_page *root;
1124

1125
	/*
1126
	 * Invalidating invalid roots doesn't make sense, prevent developers from
1127
	 * having to think about it.
1128
	 */
1129
	if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
1130
		root_types &= ~KVM_INVALID_ROOTS;
1131

1132
	/*
1133
	 * mmu_lock must be held for write to ensure that a root doesn't become
1134
	 * invalid while there are active readers (invalidating a root while
1135
	 * there are active readers may or may not be problematic in practice,
1136
	 * but it's uncharted territory and not supported).
1137
	 *
1138
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
1139
	 * being destroyed after all references have been put, or if no vCPUs
1140
	 * have been created (which means there are no roots), i.e. the VM is
1141
	 * being destroyed in an error path of KVM_CREATE_VM.
1142
	 */
1143
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
1144
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
1145
		lockdep_assert_held_write(&kvm->mmu_lock);
1146

1147
	/*
1148
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
1149
	 * be other references to @kvm, i.e. nothing else can invalidate roots
1150
	 * or get/put references to roots.
1151
	 */
1152
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1153
		if (!tdp_mmu_root_match(root, root_types))
1154
			continue;
1155

1156
		/*
1157
		 * Note, invalid roots can outlive a memslot update!  Invalid
1158
		 * roots must be *zapped* before the memslot update completes,
1159
		 * but a different task can acquire a reference and keep the
1160
		 * root alive after its been zapped.
1161
		 */
1162
		if (!root->role.invalid) {
1163
			root->tdp_mmu_scheduled_root_to_zap = true;
1164
			root->role.invalid = true;
1165
		}
1166
	}
1167
}
1168

1169
/*
1170
 * Installs a last-level SPTE to handle a TDP page fault.
1171
 * (NPT/EPT violation/misconfiguration)
1172
 */
1173
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1174
					  struct kvm_page_fault *fault,
1175
					  struct tdp_iter *iter)
1176
{
1177
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1178
	u64 new_spte;
1179
	int ret = RET_PF_FIXED;
1180
	bool wrprot = false;
1181

1182
	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1183
		return RET_PF_RETRY;
1184

1185
	if (is_shadow_present_pte(iter->old_spte) &&
1186
	    (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
1187
	    is_last_spte(iter->old_spte, iter->level)) {
1188
		WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
1189
		return RET_PF_SPURIOUS;
1190
	}
1191

1192
	if (unlikely(!fault->slot))
1193
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1194
	else
1195
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1196
				   fault->pfn, iter->old_spte, fault->prefetch,
1197
				   false, fault->map_writable, &new_spte);
1198

1199
	if (new_spte == iter->old_spte)
1200
		ret = RET_PF_SPURIOUS;
1201
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1202
		return RET_PF_RETRY;
1203
	else if (is_shadow_present_pte(iter->old_spte) &&
1204
		 (!is_last_spte(iter->old_spte, iter->level) ||
1205
		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
1206
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1207

1208
	/*
1209
	 * If the page fault was caused by a write but the page is write
1210
	 * protected, emulation is needed. If the emulation was skipped,
1211
	 * the vCPU would have the same fault again.
1212
	 */
1213
	if (wrprot && fault->write)
1214
		ret = RET_PF_WRITE_PROTECTED;
1215

1216
	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1217
	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
1218
		vcpu->stat.pf_mmio_spte_created++;
1219
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1220
				     new_spte);
1221
		ret = RET_PF_EMULATE;
1222
	} else {
1223
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1224
				       rcu_dereference(iter->sptep));
1225
	}
1226

1227
	return ret;
1228
}
1229

1230
/*
1231
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1232
 * provided page table.
1233
 *
1234
 * @kvm: kvm instance
1235
 * @iter: a tdp_iter instance currently on the SPTE that should be set
1236
 * @sp: The new TDP page table to install.
1237
 * @shared: This operation is running under the MMU lock in read mode.
1238
 *
1239
 * Returns: 0 if the new page table was installed. Non-0 if the page table
1240
 *          could not be installed (e.g. the atomic compare-exchange failed).
1241
 */
1242
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1243
			   struct kvm_mmu_page *sp, bool shared)
1244
{
1245
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
1246
	int ret = 0;
1247

1248
	if (shared) {
1249
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1250
		if (ret)
1251
			return ret;
1252
	} else {
1253
		tdp_mmu_iter_set_spte(kvm, iter, spte);
1254
	}
1255

1256
	tdp_account_mmu_page(kvm, sp);
1257

1258
	return 0;
1259
}
1260

1261
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1262
				   struct kvm_mmu_page *sp, bool shared);
1263

1264
/*
1265
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1266
 * page tables and SPTEs to translate the faulting guest physical address.
1267
 */
1268
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1269
{
1270
	struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
1271
	struct kvm *kvm = vcpu->kvm;
1272
	struct tdp_iter iter;
1273
	struct kvm_mmu_page *sp;
1274
	int ret = RET_PF_RETRY;
1275

1276
	kvm_mmu_hugepage_adjust(vcpu, fault);
1277

1278
	trace_kvm_mmu_spte_requested(fault);
1279

1280
	rcu_read_lock();
1281

1282
	for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
1283
		int r;
1284

1285
		if (fault->nx_huge_page_workaround_enabled)
1286
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1287

1288
		/*
1289
		 * If SPTE has been frozen by another thread, just give up and
1290
		 * retry, avoiding unnecessary page table allocation and free.
1291
		 */
1292
		if (is_frozen_spte(iter.old_spte))
1293
			goto retry;
1294

1295
		if (iter.level == fault->goal_level)
1296
			goto map_target_level;
1297

1298
		/* Step down into the lower level page table if it exists. */
1299
		if (is_shadow_present_pte(iter.old_spte) &&
1300
		    !is_large_pte(iter.old_spte))
1301
			continue;
1302

1303
		/*
1304
		 * The SPTE is either non-present or points to a huge page that
1305
		 * needs to be split.
1306
		 */
1307
		sp = tdp_mmu_alloc_sp(vcpu);
1308
		tdp_mmu_init_child_sp(sp, &iter);
1309
		if (is_mirror_sp(sp))
1310
			kvm_mmu_alloc_external_spt(vcpu, sp);
1311

1312
		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1313

1314
		if (is_shadow_present_pte(iter.old_spte)) {
1315
			/* Don't support large page for mirrored roots (TDX) */
1316
			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
1317
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1318
		} else {
1319
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1320
		}
1321

1322
		/*
1323
		 * Force the guest to retry if installing an upper level SPTE
1324
		 * failed, e.g. because a different task modified the SPTE.
1325
		 */
1326
		if (r) {
1327
			tdp_mmu_free_sp(sp);
1328
			goto retry;
1329
		}
1330

1331
		if (fault->huge_page_disallowed &&
1332
		    fault->req_level >= iter.level) {
1333
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1334
			if (sp->nx_huge_page_disallowed)
1335
				track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
1336
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1337
		}
1338
	}
1339

1340
	/*
1341
	 * The walk aborted before reaching the target level, e.g. because the
1342
	 * iterator detected an upper level SPTE was frozen during traversal.
1343
	 */
1344
	WARN_ON_ONCE(iter.level == fault->goal_level);
1345
	goto retry;
1346

1347
map_target_level:
1348
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1349

1350
retry:
1351
	rcu_read_unlock();
1352
	return ret;
1353
}
1354

1355
/* Used by mmu notifier via kvm_unmap_gfn_range() */
1356
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1357
				 bool flush)
1358
{
1359
	enum kvm_tdp_mmu_root_types types;
1360
	struct kvm_mmu_page *root;
1361

1362
	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
1363

1364
	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
1365
		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1366
					  range->may_block, flush);
1367

1368
	return flush;
1369
}
1370

1371
/*
1372
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1373
 * if any of the GFNs in the range have been accessed.
1374
 *
1375
 * No need to mark the corresponding PFN as accessed as this call is coming
1376
 * from the clear_young() or clear_flush_young() notifier, which uses the
1377
 * return value to determine if the page has been accessed.
1378
 */
1379
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
1380
{
1381
	u64 new_spte;
1382

1383
	if (spte_ad_enabled(iter->old_spte)) {
1384
		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
1385
								shadow_accessed_mask);
1386
		new_spte = iter->old_spte & ~shadow_accessed_mask;
1387
	} else {
1388
		new_spte = mark_spte_for_access_track(iter->old_spte);
1389
		/*
1390
		 * It is safe for the following cmpxchg to fail. Leave the
1391
		 * Accessed bit set, as the spte is most likely young anyway.
1392
		 */
1393
		if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
1394
			return;
1395
	}
1396

1397
	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1398
				       iter->old_spte, new_spte);
1399
}
1400

1401
static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
1402
					struct kvm_gfn_range *range,
1403
					bool test_only)
1404
{
1405
	enum kvm_tdp_mmu_root_types types;
1406
	struct kvm_mmu_page *root;
1407
	struct tdp_iter iter;
1408
	bool ret = false;
1409

1410
	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
1411

1412
	/*
1413
	 * Don't support rescheduling, none of the MMU notifiers that funnel
1414
	 * into this helper allow blocking; it'd be dead, wasteful code.  Note,
1415
	 * this helper must NOT be used to unmap GFNs, as it processes only
1416
	 * valid roots!
1417
	 */
1418
	WARN_ON(types & ~KVM_VALID_ROOTS);
1419

1420
	guard(rcu)();
1421
	for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
1422
		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
1423
			if (!is_accessed_spte(iter.old_spte))
1424
				continue;
1425

1426
			if (test_only)
1427
				return true;
1428

1429
			ret = true;
1430
			kvm_tdp_mmu_age_spte(kvm, &iter);
1431
		}
1432
	}
1433

1434
	return ret;
1435
}
1436

1437
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1438
{
1439
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
1440
}
1441

1442
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1443
{
1444
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
1445
}
1446

1447
/*
1448
 * Remove write access from all SPTEs at or above min_level that map GFNs
1449
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1450
 * be flushed.
1451
 */
1452
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1453
			     gfn_t start, gfn_t end, int min_level)
1454
{
1455
	struct tdp_iter iter;
1456
	u64 new_spte;
1457
	bool spte_set = false;
1458

1459
	rcu_read_lock();
1460

1461
	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1462

1463
	for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
1464
retry:
1465
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1466
			continue;
1467

1468
		if (!is_shadow_present_pte(iter.old_spte) ||
1469
		    !is_last_spte(iter.old_spte, iter.level) ||
1470
		    !(iter.old_spte & PT_WRITABLE_MASK))
1471
			continue;
1472

1473
		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1474

1475
		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1476
			goto retry;
1477

1478
		spte_set = true;
1479
	}
1480

1481
	rcu_read_unlock();
1482
	return spte_set;
1483
}
1484

1485
/*
1486
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1487
 * only affect leaf SPTEs down to min_level.
1488
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1489
 */
1490
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1491
			     const struct kvm_memory_slot *slot, int min_level)
1492
{
1493
	struct kvm_mmu_page *root;
1494
	bool spte_set = false;
1495

1496
	lockdep_assert_held_read(&kvm->mmu_lock);
1497

1498
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1499
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1500
			     slot->base_gfn + slot->npages, min_level);
1501

1502
	return spte_set;
1503
}
1504

1505
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
1506
{
1507
	struct kvm_mmu_page *sp;
1508

1509
	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
1510
	if (!sp)
1511
		return NULL;
1512

1513
	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
1514
	if (!sp->spt) {
1515
		kmem_cache_free(mmu_page_header_cache, sp);
1516
		return NULL;
1517
	}
1518

1519
	return sp;
1520
}
1521

1522
/* Note, the caller is responsible for initializing @sp. */
1523
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1524
				   struct kvm_mmu_page *sp, bool shared)
1525
{
1526
	const u64 huge_spte = iter->old_spte;
1527
	const int level = iter->level;
1528
	int ret, i;
1529

1530
	/*
1531
	 * No need for atomics when writing to sp->spt since the page table has
1532
	 * not been linked in yet and thus is not reachable from any other CPU.
1533
	 */
1534
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1535
		sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
1536

1537
	/*
1538
	 * Replace the huge spte with a pointer to the populated lower level
1539
	 * page table. Since we are making this change without a TLB flush vCPUs
1540
	 * will see a mix of the split mappings and the original huge mapping,
1541
	 * depending on what's currently in their TLB. This is fine from a
1542
	 * correctness standpoint since the translation will be the same either
1543
	 * way.
1544
	 */
1545
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1546
	if (ret)
1547
		goto out;
1548

1549
	/*
1550
	 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
1551
	 * are overwriting from the page stats. But we have to manually update
1552
	 * the page stats with the new present child pages.
1553
	 */
1554
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1555

1556
out:
1557
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1558
	return ret;
1559
}
1560

1561
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1562
					 struct kvm_mmu_page *root,
1563
					 gfn_t start, gfn_t end,
1564
					 int target_level, bool shared)
1565
{
1566
	struct kvm_mmu_page *sp = NULL;
1567
	struct tdp_iter iter;
1568

1569
	rcu_read_lock();
1570

1571
	/*
1572
	 * Traverse the page table splitting all huge pages above the target
1573
	 * level into one lower level. For example, if we encounter a 1GB page
1574
	 * we split it into 512 2MB pages.
1575
	 *
1576
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1577
	 * to visit an SPTE before ever visiting its children, which means we
1578
	 * will correctly recursively split huge pages that are more than one
1579
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1580
	 * and then splitting each of those to 512 4KB pages).
1581
	 */
1582
	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
1583
retry:
1584
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1585
			continue;
1586

1587
		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1588
			continue;
1589

1590
		if (!sp) {
1591
			rcu_read_unlock();
1592

1593
			if (shared)
1594
				read_unlock(&kvm->mmu_lock);
1595
			else
1596
				write_unlock(&kvm->mmu_lock);
1597

1598
			sp = tdp_mmu_alloc_sp_for_split();
1599

1600
			if (shared)
1601
				read_lock(&kvm->mmu_lock);
1602
			else
1603
				write_lock(&kvm->mmu_lock);
1604

1605
			if (!sp) {
1606
				trace_kvm_mmu_split_huge_page(iter.gfn,
1607
							      iter.old_spte,
1608
							      iter.level, -ENOMEM);
1609
				return -ENOMEM;
1610
			}
1611

1612
			rcu_read_lock();
1613

1614
			iter.yielded = true;
1615
			continue;
1616
		}
1617

1618
		tdp_mmu_init_child_sp(sp, &iter);
1619

1620
		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1621
			goto retry;
1622

1623
		sp = NULL;
1624
	}
1625

1626
	rcu_read_unlock();
1627

1628
	/*
1629
	 * It's possible to exit the loop having never used the last sp if, for
1630
	 * example, a vCPU doing HugePage NX splitting wins the race and
1631
	 * installs its own sp in place of the last sp we tried to split.
1632
	 */
1633
	if (sp)
1634
		tdp_mmu_free_sp(sp);
1635

1636
	return 0;
1637
}
1638

1639

1640
/*
1641
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1642
 */
1643
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1644
				      const struct kvm_memory_slot *slot,
1645
				      gfn_t start, gfn_t end,
1646
				      int target_level, bool shared)
1647
{
1648
	struct kvm_mmu_page *root;
1649
	int r = 0;
1650

1651
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1652
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
1653
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1654
		if (r) {
1655
			kvm_tdp_mmu_put_root(kvm, root);
1656
			break;
1657
		}
1658
	}
1659
}
1660

1661
static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
1662
{
1663
	/*
1664
	 * All TDP MMU shadow pages share the same role as their root, aside
1665
	 * from level, so it is valid to key off any shadow page to determine if
1666
	 * write protection is needed for an entire tree.
1667
	 */
1668
	return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
1669
}
1670

1671
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1672
				  gfn_t start, gfn_t end)
1673
{
1674
	const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
1675
			 PT_WRITABLE_MASK : shadow_dirty_mask;
1676
	struct tdp_iter iter;
1677

1678
	rcu_read_lock();
1679

1680
	tdp_root_for_each_pte(iter, kvm, root, start, end) {
1681
retry:
1682
		if (!is_shadow_present_pte(iter.old_spte) ||
1683
		    !is_last_spte(iter.old_spte, iter.level))
1684
			continue;
1685

1686
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1687
			continue;
1688

1689
		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1690
				spte_ad_need_write_protect(iter.old_spte));
1691

1692
		if (!(iter.old_spte & dbit))
1693
			continue;
1694

1695
		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1696
			goto retry;
1697
	}
1698

1699
	rcu_read_unlock();
1700
}
1701

1702
/*
1703
 * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
1704
 * memslot.
1705
 */
1706
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1707
				  const struct kvm_memory_slot *slot)
1708
{
1709
	struct kvm_mmu_page *root;
1710

1711
	lockdep_assert_held_read(&kvm->mmu_lock);
1712
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1713
		clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1714
				      slot->base_gfn + slot->npages);
1715
}
1716

1717
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1718
				  gfn_t gfn, unsigned long mask, bool wrprot)
1719
{
1720
	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
1721
			  PT_WRITABLE_MASK : shadow_dirty_mask;
1722
	struct tdp_iter iter;
1723

1724
	lockdep_assert_held_write(&kvm->mmu_lock);
1725

1726
	rcu_read_lock();
1727

1728
	tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
1729
				    gfn + BITS_PER_LONG) {
1730
		if (!mask)
1731
			break;
1732

1733
		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1734
				spte_ad_need_write_protect(iter.old_spte));
1735

1736
		if (iter.level > PG_LEVEL_4K ||
1737
		    !(mask & (1UL << (iter.gfn - gfn))))
1738
			continue;
1739

1740
		mask &= ~(1UL << (iter.gfn - gfn));
1741

1742
		if (!(iter.old_spte & dbit))
1743
			continue;
1744

1745
		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1746
							iter.old_spte, dbit,
1747
							iter.level);
1748

1749
		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1750
					       iter.old_spte,
1751
					       iter.old_spte & ~dbit);
1752
	}
1753

1754
	rcu_read_unlock();
1755
}
1756

1757
/*
1758
 * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
1759
 * which a bit is set in mask, starting at gfn. The given memslot is expected to
1760
 * contain all the GFNs represented by set bits in the mask.
1761
 */
1762
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1763
				       struct kvm_memory_slot *slot,
1764
				       gfn_t gfn, unsigned long mask,
1765
				       bool wrprot)
1766
{
1767
	struct kvm_mmu_page *root;
1768

1769
	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1770
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1771
}
1772

1773
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
1774
				  struct tdp_iter *parent,
1775
				  u64 *huge_spte)
1776
{
1777
	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
1778
	gfn_t start = parent->gfn;
1779
	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
1780
	struct tdp_iter iter;
1781

1782
	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
1783
		/*
1784
		 * Use the parent iterator when checking for forward progress so
1785
		 * that KVM doesn't get stuck continuously trying to yield (i.e.
1786
		 * returning -EAGAIN here and then failing the forward progress
1787
		 * check in the caller ad nauseam).
1788
		 */
1789
		if (tdp_mmu_iter_need_resched(kvm, parent))
1790
			return -EAGAIN;
1791

1792
		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
1793
		return 0;
1794
	}
1795

1796
	return -ENOENT;
1797
}
1798

1799
static void recover_huge_pages_range(struct kvm *kvm,
1800
				     struct kvm_mmu_page *root,
1801
				     const struct kvm_memory_slot *slot)
1802
{
1803
	gfn_t start = slot->base_gfn;
1804
	gfn_t end = start + slot->npages;
1805
	struct tdp_iter iter;
1806
	int max_mapping_level;
1807
	bool flush = false;
1808
	u64 huge_spte;
1809
	int r;
1810

1811
	if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
1812
		return;
1813

1814
	rcu_read_lock();
1815

1816
	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
1817
retry:
1818
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1819
			flush = false;
1820
			continue;
1821
		}
1822

1823
		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1824
		    !is_shadow_present_pte(iter.old_spte))
1825
			continue;
1826

1827
		/*
1828
		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
1829
		 * a large page size, then its parent would have been zapped
1830
		 * instead of stepping down.
1831
		 */
1832
		if (is_last_spte(iter.old_spte, iter.level))
1833
			continue;
1834

1835
		/*
1836
		 * If iter.gfn resides outside of the slot, i.e. the page for
1837
		 * the current level overlaps but is not contained by the slot,
1838
		 * then the SPTE can't be made huge.  More importantly, trying
1839
		 * to query that info from slot->arch.lpage_info will cause an
1840
		 * out-of-bounds access.
1841
		 */
1842
		if (iter.gfn < start || iter.gfn >= end)
1843
			continue;
1844

1845
		max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
1846
		if (max_mapping_level < iter.level)
1847
			continue;
1848

1849
		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
1850
		if (r == -EAGAIN)
1851
			goto retry;
1852
		else if (r)
1853
			continue;
1854

1855
		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
1856
			goto retry;
1857

1858
		flush = true;
1859
	}
1860

1861
	if (flush)
1862
		kvm_flush_remote_tlbs_memslot(kvm, slot);
1863

1864
	rcu_read_unlock();
1865
}
1866

1867
/*
1868
 * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
1869
 * huge SPTEs where possible.
1870
 */
1871
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
1872
				    const struct kvm_memory_slot *slot)
1873
{
1874
	struct kvm_mmu_page *root;
1875

1876
	lockdep_assert_held_read(&kvm->mmu_lock);
1877
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1878
		recover_huge_pages_range(kvm, root, slot);
1879
}
1880

1881
/*
1882
 * Removes write access on the last level SPTE mapping this GFN and unsets the
1883
 * MMU-writable bit to ensure future writes continue to be intercepted.
1884
 * Returns true if an SPTE was set and a TLB flush is needed.
1885
 */
1886
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1887
			      gfn_t gfn, int min_level)
1888
{
1889
	struct tdp_iter iter;
1890
	u64 new_spte;
1891
	bool spte_set = false;
1892

1893
	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1894

1895
	rcu_read_lock();
1896

1897
	for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
1898
		if (!is_shadow_present_pte(iter.old_spte) ||
1899
		    !is_last_spte(iter.old_spte, iter.level))
1900
			continue;
1901

1902
		new_spte = iter.old_spte &
1903
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1904

1905
		if (new_spte == iter.old_spte)
1906
			break;
1907

1908
		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1909
		spte_set = true;
1910
	}
1911

1912
	rcu_read_unlock();
1913

1914
	return spte_set;
1915
}
1916

1917
/*
1918
 * Removes write access on the last level SPTE mapping this GFN and unsets the
1919
 * MMU-writable bit to ensure future writes continue to be intercepted.
1920
 * Returns true if an SPTE was set and a TLB flush is needed.
1921
 */
1922
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1923
				   struct kvm_memory_slot *slot, gfn_t gfn,
1924
				   int min_level)
1925
{
1926
	struct kvm_mmu_page *root;
1927
	bool spte_set = false;
1928

1929
	lockdep_assert_held_write(&kvm->mmu_lock);
1930
	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1931
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1932

1933
	return spte_set;
1934
}
1935

1936
/*
1937
 * Return the level of the lowest level SPTE added to sptes.
1938
 * That SPTE may be non-present.
1939
 *
1940
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1941
 */
1942
static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1943
				  struct kvm_mmu_page *root)
1944
{
1945
	struct tdp_iter iter;
1946
	gfn_t gfn = addr >> PAGE_SHIFT;
1947
	int leaf = -1;
1948

1949
	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1950
		leaf = iter.level;
1951
		sptes[leaf] = iter.old_spte;
1952
	}
1953

1954
	return leaf;
1955
}
1956

1957
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1958
			 int *root_level)
1959
{
1960
	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
1961
	*root_level = vcpu->arch.mmu->root_role.level;
1962

1963
	return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root);
1964
}
1965

1966
bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
1967
{
1968
	struct kvm *kvm = vcpu->kvm;
1969
	bool is_direct = kvm_is_addr_direct(kvm, gpa);
1970
	hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa :
1971
				 vcpu->arch.mmu->mirror_root_hpa;
1972
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
1973
	int leaf;
1974

1975
	lockdep_assert_held(&kvm->mmu_lock);
1976
	rcu_read_lock();
1977
	leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root));
1978
	rcu_read_unlock();
1979
	if (leaf < 0)
1980
		return false;
1981

1982
	spte = sptes[leaf];
1983
	return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
1984
}
1985
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped);
1986

1987
/*
1988
 * Returns the last level spte pointer of the shadow page walk for the given
1989
 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1990
 * walk could be performed, returns NULL and *spte does not contain valid data.
1991
 *
1992
 * Contract:
1993
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1994
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1995
 *
1996
 * WARNING: This function is only intended to be called during fast_page_fault.
1997
 */
1998
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
1999
					u64 *spte)
2000
{
2001
	/* Fast pf is not supported for mirrored roots  */
2002
	struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
2003
	struct tdp_iter iter;
2004
	tdp_ptep_t sptep = NULL;
2005

2006
	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
2007
		*spte = iter.old_spte;
2008
		sptep = iter.sptep;
2009
	}
2010

2011
	/*
2012
	 * Perform the rcu_dereference to get the raw spte pointer value since
2013
	 * we are passing it up to fast_page_fault, which is shared with the
2014
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
2015
	 * annotation.
2016
	 *
2017
	 * This is safe since fast_page_fault obeys the contracts of this
2018
	 * function as well as all TDP MMU contracts around modifying SPTEs
2019
	 * outside of mmu_lock.
2020
	 */
2021
	return rcu_dereference(sptep);
2022
}
2023

2024
Product

Resources

Company