GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/vmx/nested.c
1
// SPDX-License-Identifier: GPL-2.0
2
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4
#include <linux/objtool.h>
5
#include <linux/percpu.h>
6
7
#include <asm/debugreg.h>
8
#include <asm/mmu_context.h>
9
#include <asm/msr.h>
10
11
#include "x86.h"
12
#include "cpuid.h"
13
#include "hyperv.h"
14
#include "mmu.h"
15
#include "nested.h"
16
#include "pmu.h"
17
#include "posted_intr.h"
18
#include "sgx.h"
19
#include "trace.h"
20
#include "vmx.h"
21
#include "smm.h"
22
23
static bool __read_mostly enable_shadow_vmcs = 1;
24
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
25
26
static bool __read_mostly nested_early_check = 0;
27
module_param(nested_early_check, bool, S_IRUGO);
28
29
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
30
31
/*
32
* Hyper-V requires all of these, so mark them as supported even though
33
* they are just treated the same as all-context.
34
*/
35
#define VMX_VPID_EXTENT_SUPPORTED_MASK \
36
(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
37
VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
38
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
39
VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
40
41
#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
42
43
enum {
44
VMX_VMREAD_BITMAP,
45
VMX_VMWRITE_BITMAP,
46
VMX_BITMAP_NR
47
};
48
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
49
50
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
51
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
52
53
struct shadow_vmcs_field {
54
u16 encoding;
55
u16 offset;
56
};
57
static struct shadow_vmcs_field shadow_read_only_fields[] = {
58
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
59
#include "vmcs_shadow_fields.h"
60
};
61
static int max_shadow_read_only_fields =
62
ARRAY_SIZE(shadow_read_only_fields);
63
64
static struct shadow_vmcs_field shadow_read_write_fields[] = {
65
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
66
#include "vmcs_shadow_fields.h"
67
};
68
static int max_shadow_read_write_fields =
69
ARRAY_SIZE(shadow_read_write_fields);
70
71
static void init_vmcs_shadow_fields(void)
72
{
73
int i, j;
74
75
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
76
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
77
78
for (i = j = 0; i < max_shadow_read_only_fields; i++) {
79
struct shadow_vmcs_field entry = shadow_read_only_fields[i];
80
u16 field = entry.encoding;
81
82
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
83
(i + 1 == max_shadow_read_only_fields ||
84
shadow_read_only_fields[i + 1].encoding != field + 1))
85
pr_err("Missing field from shadow_read_only_field %x\n",
86
field + 1);
87
88
clear_bit(field, vmx_vmread_bitmap);
89
if (field & 1)
90
#ifdef CONFIG_X86_64
91
continue;
92
#else
93
entry.offset += sizeof(u32);
94
#endif
95
shadow_read_only_fields[j++] = entry;
96
}
97
max_shadow_read_only_fields = j;
98
99
for (i = j = 0; i < max_shadow_read_write_fields; i++) {
100
struct shadow_vmcs_field entry = shadow_read_write_fields[i];
101
u16 field = entry.encoding;
102
103
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
104
(i + 1 == max_shadow_read_write_fields ||
105
shadow_read_write_fields[i + 1].encoding != field + 1))
106
pr_err("Missing field from shadow_read_write_field %x\n",
107
field + 1);
108
109
WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
110
field <= GUEST_TR_AR_BYTES,
111
"Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
112
113
/*
114
* PML and the preemption timer can be emulated, but the
115
* processor cannot vmwrite to fields that don't exist
116
* on bare metal.
117
*/
118
switch (field) {
119
case GUEST_PML_INDEX:
120
if (!cpu_has_vmx_pml())
121
continue;
122
break;
123
case VMX_PREEMPTION_TIMER_VALUE:
124
if (!cpu_has_vmx_preemption_timer())
125
continue;
126
break;
127
case GUEST_INTR_STATUS:
128
if (!cpu_has_vmx_apicv())
129
continue;
130
break;
131
default:
132
break;
133
}
134
135
clear_bit(field, vmx_vmwrite_bitmap);
136
clear_bit(field, vmx_vmread_bitmap);
137
if (field & 1)
138
#ifdef CONFIG_X86_64
139
continue;
140
#else
141
entry.offset += sizeof(u32);
142
#endif
143
shadow_read_write_fields[j++] = entry;
144
}
145
max_shadow_read_write_fields = j;
146
}
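
/*
 * A short note on the encoding convention the loops above depend on
 * (example_high_field() is a hypothetical helper, shown only to make the
 * arithmetic explicit): the "high" half of a 64-bit VMCS field uses the
 * base encoding plus one, e.g. IO_BITMAP_A (0x2000) pairs with
 * IO_BITMAP_A_HIGH (0x2001). That is why a missing "field + 1" entry is
 * flagged, why odd encodings are skipped on 64-bit builds, and why 32-bit
 * builds instead bump the vmcs12 offset by sizeof(u32).
 */
static inline u16 example_high_field(u16 encoding)
{
	return encoding + 1;
}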
147
148
/*
149
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
150
* set the success or error code of an emulated VMX instruction (as specified
151
* by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
152
* instruction.
153
*/
154
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
155
{
156
vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
157
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
158
X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
159
return kvm_skip_emulated_instruction(vcpu);
160
}
161
162
static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
163
{
164
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
165
& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
166
X86_EFLAGS_SF | X86_EFLAGS_OF))
167
| X86_EFLAGS_CF);
168
return kvm_skip_emulated_instruction(vcpu);
169
}
170
171
static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
172
u32 vm_instruction_error)
173
{
174
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
175
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
176
X86_EFLAGS_SF | X86_EFLAGS_OF))
177
| X86_EFLAGS_ZF);
178
get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
179
/*
180
* We don't need to force sync to shadow VMCS because
181
* VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
182
* fields and thus must be synced.
183
*/
184
if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
185
to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
186
187
return kvm_skip_emulated_instruction(vcpu);
188
}
189
190
static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
191
{
192
struct vcpu_vmx *vmx = to_vmx(vcpu);
193
194
/*
195
* failValid writes the error number to the current VMCS, which
196
* can't be done if there isn't a current VMCS.
197
*/
198
if (vmx->nested.current_vmptr == INVALID_GPA &&
199
!nested_vmx_is_evmptr12_valid(vmx))
200
return nested_vmx_failInvalid(vcpu);
201
202
return nested_vmx_failValid(vcpu, vm_instruction_error);
203
}
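
/*
 * For reference, the flag convention implemented by the helpers above (per
 * the SDM's "Conventions" section for VMX instructions): VMsucceed clears
 * CF, PF, AF, ZF, SF and OF; VMfailInvalid sets only CF; and VMfailValid
 * sets only ZF and additionally records a VM-instruction error number in
 * the current VMCS.
 */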
204
205
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
206
{
207
/* TODO: don't simply reset the guest here. */
208
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
209
pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
210
}
211
212
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
213
{
214
return fixed_bits_valid(control, low, high);
215
}
216
217
static inline u64 vmx_control_msr(u32 low, u32 high)
218
{
219
return low | ((u64)high << 32);
220
}
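
/*
 * Illustrative sketch only (example_control_allowed() is a hypothetical
 * helper, not used elsewhere): a VMX capability MSR packs the allowed
 * 0-settings in its low 32 bits (a 1 there means the control bit must be 1)
 * and the allowed 1-settings in its high 32 bits (a 0 there means the
 * control bit must be 0), which is what vmx_control_msr() reassembles and
 * vmx_control_verify() checks via fixed_bits_valid().
 */
static inline bool example_control_allowed(u32 control, u64 cap_msr)
{
	u32 must_be_one = (u32)cap_msr;		/* allowed 0-settings */
	u32 may_be_one  = (u32)(cap_msr >> 32);	/* allowed 1-settings */

	return ((control & must_be_one) == must_be_one) &&
	       !(control & ~may_be_one);
}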
221
222
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
223
{
224
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
225
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
226
vmx->nested.need_vmcs12_to_shadow_sync = false;
227
}
228
229
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
230
{
231
#ifdef CONFIG_KVM_HYPERV
232
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
233
struct vcpu_vmx *vmx = to_vmx(vcpu);
234
235
kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
236
vmx->nested.hv_evmcs = NULL;
237
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
238
239
if (hv_vcpu) {
240
hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
241
hv_vcpu->nested.vm_id = 0;
242
hv_vcpu->nested.vp_id = 0;
243
}
244
#endif
245
}
246
247
static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
248
{
249
#ifdef CONFIG_KVM_HYPERV
250
struct vcpu_vmx *vmx = to_vmx(vcpu);
251
/*
252
* When Enlightened VMEntry is enabled on the calling CPU we treat
253
* memory area pointed to by vmptr as Enlightened VMCS (as there's no good
254
* way to distinguish it from VMCS12) and we must not corrupt it by
255
* writing to the non-existent 'launch_state' field. The area doesn't
256
* have to be the currently active EVMCS on the calling CPU and there's
257
* nothing KVM has to do to transition it from 'active' to 'non-active'
258
* state. It is possible that the area will stay mapped as
259
* vmx->nested.hv_evmcs but this shouldn't be a problem.
260
*/
261
if (!guest_cpu_cap_has_evmcs(vcpu) ||
262
!evmptr_is_valid(nested_get_evmptr(vcpu)))
263
return false;
264
265
if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
266
nested_release_evmcs(vcpu);
267
268
return true;
269
#else
270
return false;
271
#endif
272
}
273
274
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
275
struct loaded_vmcs *prev)
276
{
277
struct vmcs_host_state *dest, *src;
278
279
if (unlikely(!vmx->vt.guest_state_loaded))
280
return;
281
282
src = &prev->host_state;
283
dest = &vmx->loaded_vmcs->host_state;
284
285
vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
286
dest->ldt_sel = src->ldt_sel;
287
#ifdef CONFIG_X86_64
288
dest->ds_sel = src->ds_sel;
289
dest->es_sel = src->es_sel;
290
#endif
291
}
292
293
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
294
{
295
struct vcpu_vmx *vmx = to_vmx(vcpu);
296
struct loaded_vmcs *prev;
297
int cpu;
298
299
if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
300
return;
301
302
cpu = get_cpu();
303
prev = vmx->loaded_vmcs;
304
vmx->loaded_vmcs = vmcs;
305
vmx_vcpu_load_vmcs(vcpu, cpu);
306
vmx_sync_vmcs_host_state(vmx, prev);
307
put_cpu();
308
309
vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
310
311
/*
312
* All lazily updated registers will be reloaded from VMCS12 on both
313
* vmentry and vmexit.
314
*/
315
vcpu->arch.regs_dirty = 0;
316
}
317
318
static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
319
{
320
struct vcpu_vmx *vmx = to_vmx(vcpu);
321
322
kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
323
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
324
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
325
vmx->nested.pi_desc = NULL;
326
}
327
328
/*
329
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
330
* just stops using VMX.
331
*/
332
static void free_nested(struct kvm_vcpu *vcpu)
333
{
334
struct vcpu_vmx *vmx = to_vmx(vcpu);
335
336
if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
337
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
338
339
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
340
return;
341
342
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
343
344
vmx->nested.vmxon = false;
345
vmx->nested.smm.vmxon = false;
346
vmx->nested.vmxon_ptr = INVALID_GPA;
347
free_vpid(vmx->nested.vpid02);
348
vmx->nested.posted_intr_nv = -1;
349
vmx->nested.current_vmptr = INVALID_GPA;
350
if (enable_shadow_vmcs) {
351
vmx_disable_shadow_vmcs(vmx);
352
vmcs_clear(vmx->vmcs01.shadow_vmcs);
353
free_vmcs(vmx->vmcs01.shadow_vmcs);
354
vmx->vmcs01.shadow_vmcs = NULL;
355
}
356
kfree(vmx->nested.cached_vmcs12);
357
vmx->nested.cached_vmcs12 = NULL;
358
kfree(vmx->nested.cached_shadow_vmcs12);
359
vmx->nested.cached_shadow_vmcs12 = NULL;
360
361
nested_put_vmcs12_pages(vcpu);
362
363
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
364
365
nested_release_evmcs(vcpu);
366
367
free_loaded_vmcs(&vmx->nested.vmcs02);
368
}
369
370
/*
371
* Ensure that the current vmcs of the logical processor is the
372
* vmcs01 of the vcpu before calling free_nested().
373
*/
374
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
375
{
376
vcpu_load(vcpu);
377
vmx_leave_nested(vcpu);
378
vcpu_put(vcpu);
379
}
380
381
#define EPTP_PA_MASK GENMASK_ULL(51, 12)
382
383
static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
384
{
385
return VALID_PAGE(root_hpa) &&
386
((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
387
}
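
/*
 * Illustration (example_same_ep4ta() is a hypothetical helper): two EPTPs
 * that differ only in their low attribute bits (memory type, page-walk
 * length, accessed/dirty enable) still name the same EPT PML4 table, so
 * only the address bits covered by EPTP_PA_MASK are compared above.
 */
static inline bool example_same_ep4ta(u64 eptp_a, u64 eptp_b)
{
	return (eptp_a & EPTP_PA_MASK) == (eptp_b & EPTP_PA_MASK);
}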
388
389
static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
390
gpa_t addr)
391
{
392
unsigned long roots = 0;
393
uint i;
394
struct kvm_mmu_root_info *cached_root;
395
396
WARN_ON_ONCE(!mmu_is_nested(vcpu));
397
398
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
399
cached_root = &vcpu->arch.mmu->prev_roots[i];
400
401
if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
402
eptp))
403
roots |= KVM_MMU_ROOT_PREVIOUS(i);
404
}
405
if (roots)
406
kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
407
}
408
409
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
410
struct x86_exception *fault)
411
{
412
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
413
struct vcpu_vmx *vmx = to_vmx(vcpu);
414
unsigned long exit_qualification;
415
u32 vm_exit_reason;
416
417
if (vmx->nested.pml_full) {
418
vm_exit_reason = EXIT_REASON_PML_FULL;
419
vmx->nested.pml_full = false;
420
421
/*
422
* It should be impossible to trigger a nested PML Full VM-Exit
423
* for anything other than an EPT Violation from L2. KVM *can*
424
* trigger nEPT page fault injection in response to an EPT
425
* Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
426
* tables also changed, but KVM should not treat EPT Misconfig
427
* VM-Exits as writes.
428
*/
429
WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
430
431
/*
432
* PML Full and EPT Violation VM-Exits both use bit 12 to report
433
* "NMI unblocking due to IRET", i.e. the bit can be propagated
434
* as-is from the original EXIT_QUALIFICATION.
435
*/
436
exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
437
} else {
438
if (fault->error_code & PFERR_RSVD_MASK) {
439
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
440
exit_qualification = 0;
441
} else {
442
exit_qualification = fault->exit_qualification;
443
exit_qualification |= vmx_get_exit_qual(vcpu) &
444
(EPT_VIOLATION_GVA_IS_VALID |
445
EPT_VIOLATION_GVA_TRANSLATED);
446
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
447
}
448
449
/*
450
* Although the caller (kvm_inject_emulated_page_fault) would
451
* have already synced the faulting address in the shadow EPT
452
* tables for the current EPTP12, we also need to sync it for
453
* any other cached EPTP02s based on the same EP4TA, since the
454
* TLB associates mappings to the EP4TA rather than the full EPTP.
455
*/
456
nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
457
fault->address);
458
}
459
460
nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
461
vmcs12->guest_physical_address = fault->address;
462
}
463
464
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
465
{
466
struct vcpu_vmx *vmx = to_vmx(vcpu);
467
bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
468
int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
469
470
kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
471
nested_ept_ad_enabled(vcpu),
472
nested_ept_get_eptp(vcpu));
473
}
474
475
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
476
{
477
WARN_ON(mmu_is_nested(vcpu));
478
479
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
480
nested_ept_new_eptp(vcpu);
481
vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
482
vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
483
vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
484
485
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
486
}
487
488
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
489
{
490
vcpu->arch.mmu = &vcpu->arch.root_mmu;
491
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
492
}
493
494
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
495
u16 error_code)
496
{
497
bool inequality, bit;
498
499
bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
500
inequality =
501
(error_code & vmcs12->page_fault_error_code_mask) !=
502
vmcs12->page_fault_error_code_match;
503
return inequality ^ bit;
504
}
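
/*
 * The mask/match rule above, restated as plain logic (example_pf_to_l1()
 * is a hypothetical helper): a #PF is reflected to L1 when the #PF bit in
 * the exception bitmap and the "masked error code equals the match value"
 * test agree, i.e. intercepted #PFs go to L1 only on a match, while
 * non-intercepted #PFs go to L1 only on a mismatch.
 */
static inline bool example_pf_to_l1(bool pf_bit_set, bool pfec_matches)
{
	return pf_bit_set == pfec_matches;
}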
505
506
static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
507
u32 error_code)
508
{
509
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
510
511
/*
512
* Drop bits 31:16 of the error code when performing the #PF mask+match
513
* check. All VMCS fields involved are 32 bits, but Intel CPUs never
514
* set bits 31:16 and VMX disallows setting bits 31:16 in the injected
515
* error code. Including the to-be-dropped bits in the check might
516
* result in an "impossible" or missed exit from L1's perspective.
517
*/
518
if (vector == PF_VECTOR)
519
return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
520
521
return (vmcs12->exception_bitmap & (1u << vector));
522
}
523
524
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
525
struct vmcs12 *vmcs12)
526
{
527
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
528
return 0;
529
530
if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
531
CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
532
return -EINVAL;
533
534
return 0;
535
}
536
537
static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
538
struct vmcs12 *vmcs12)
539
{
540
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
541
return 0;
542
543
if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
544
return -EINVAL;
545
546
return 0;
547
}
548
549
static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
550
struct vmcs12 *vmcs12)
551
{
552
if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
553
return 0;
554
555
if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
556
return -EINVAL;
557
558
return 0;
559
}
560
561
/*
562
* For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
563
* itself utilizing x2APIC. All MSRs were previously set to be intercepted,
564
* only the "disable intercept" case needs to be handled.
565
*/
566
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
567
unsigned long *msr_bitmap_l0,
568
u32 msr, int type)
569
{
570
if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
571
vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
572
573
if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
574
vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
575
}
576
577
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
578
{
579
int msr;
580
581
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
582
unsigned word = msr / BITS_PER_LONG;
583
584
msr_bitmap[word] = ~0;
585
msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
586
}
587
}
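
/*
 * Layout note behind the indexing above (the example_* helpers are
 * hypothetical, shown only to spell out the offsets): in a VMX MSR bitmap,
 * read intercepts for MSRs 0x0 - 0x1fff start at byte offset 0 and the
 * corresponding write intercepts start at byte offset 0x800, so a low MSR's
 * read and write bits live 0x800 bytes (0x800 / sizeof(long) longs) apart.
 */
static inline unsigned int example_x2apic_read_word(u32 msr)
{
	return msr / BITS_PER_LONG;
}

static inline unsigned int example_x2apic_write_word(u32 msr)
{
	return msr / BITS_PER_LONG + 0x800 / sizeof(long);
}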
588
589
#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
590
static inline \
591
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
592
unsigned long *msr_bitmap_l1, \
593
unsigned long *msr_bitmap_l0, u32 msr) \
594
{ \
595
if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
596
vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
597
vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
598
else \
599
vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
600
}
601
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
602
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
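
/*
 * The generated helpers above implement a simple merge rule, restated here
 * (example_l02_intercepts() is hypothetical): an MSR access is intercepted
 * in vmcs02 if either KVM's own bitmap (vmcs01) or L1's bitmap intercepts
 * it; it is passed through only when both agree on pass-through.
 */
static inline bool example_l02_intercepts(bool l0_intercepts, bool l1_intercepts)
{
	return l0_intercepts || l1_intercepts;
}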
603
604
static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
605
unsigned long *msr_bitmap_l1,
606
unsigned long *msr_bitmap_l0,
607
u32 msr, int types)
608
{
609
if (types & MSR_TYPE_R)
610
nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
611
msr_bitmap_l0, msr);
612
if (types & MSR_TYPE_W)
613
nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
614
msr_bitmap_l0, msr);
615
}
616
617
/*
618
* Merge L0's and L1's MSR bitmap, return false to indicate that
619
* we do not use the hardware.
620
*/
621
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
622
struct vmcs12 *vmcs12)
623
{
624
struct vcpu_vmx *vmx = to_vmx(vcpu);
625
int msr;
626
unsigned long *msr_bitmap_l1;
627
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
628
struct kvm_host_map map;
629
630
/* Nothing to do if the MSR bitmap is not in use. */
631
if (!cpu_has_vmx_msr_bitmap() ||
632
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
633
return false;
634
635
/*
636
* MSR bitmap update can be skipped when:
637
* - MSR bitmap for L1 hasn't changed.
638
* - Nested hypervisor (L1) is attempting to launch the same L2 as
639
* before.
640
* - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
641
* and tells KVM (L0) there were no changes in MSR bitmap for L2.
642
*/
643
if (!vmx->nested.force_msr_bitmap_recalc) {
644
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
645
646
if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
647
evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
648
return true;
649
}
650
651
if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
652
return false;
653
654
msr_bitmap_l1 = (unsigned long *)map.hva;
655
656
/*
657
* To keep the control flow simple, pay eight 8-byte writes (sixteen
658
* 4-byte writes on 32-bit systems) up front to enable intercepts for
659
* the x2APIC MSR range and selectively toggle those relevant to L2.
660
*/
661
enable_x2apic_msr_intercepts(msr_bitmap_l0);
662
663
if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
664
if (nested_cpu_has_apic_reg_virt(vmcs12)) {
665
/*
666
* L0 need not intercept reads for MSRs between 0x800
667
* and 0x8ff, it just lets the processor take the value
668
* from the virtual-APIC page; take those 256 bits
669
* directly from the L1 bitmap.
670
*/
671
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
672
unsigned word = msr / BITS_PER_LONG;
673
674
msr_bitmap_l0[word] = msr_bitmap_l1[word];
675
}
676
}
677
678
nested_vmx_disable_intercept_for_x2apic_msr(
679
msr_bitmap_l1, msr_bitmap_l0,
680
X2APIC_MSR(APIC_TASKPRI),
681
MSR_TYPE_R | MSR_TYPE_W);
682
683
if (nested_cpu_has_vid(vmcs12)) {
684
nested_vmx_disable_intercept_for_x2apic_msr(
685
msr_bitmap_l1, msr_bitmap_l0,
686
X2APIC_MSR(APIC_EOI),
687
MSR_TYPE_W);
688
nested_vmx_disable_intercept_for_x2apic_msr(
689
msr_bitmap_l1, msr_bitmap_l0,
690
X2APIC_MSR(APIC_SELF_IPI),
691
MSR_TYPE_W);
692
}
693
}
694
695
/*
696
* Always check vmcs01's bitmap to honor userspace MSR filters and any
697
* other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
698
*/
699
#ifdef CONFIG_X86_64
700
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
701
MSR_FS_BASE, MSR_TYPE_RW);
702
703
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
704
MSR_GS_BASE, MSR_TYPE_RW);
705
706
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
707
MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
708
#endif
709
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
710
MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
711
712
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
713
MSR_IA32_PRED_CMD, MSR_TYPE_W);
714
715
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
716
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
717
718
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
719
MSR_IA32_APERF, MSR_TYPE_R);
720
721
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
722
MSR_IA32_MPERF, MSR_TYPE_R);
723
724
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
725
MSR_IA32_U_CET, MSR_TYPE_RW);
726
727
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
728
MSR_IA32_S_CET, MSR_TYPE_RW);
729
730
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
731
MSR_IA32_PL0_SSP, MSR_TYPE_RW);
732
733
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
734
MSR_IA32_PL1_SSP, MSR_TYPE_RW);
735
736
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
737
MSR_IA32_PL2_SSP, MSR_TYPE_RW);
738
739
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
740
MSR_IA32_PL3_SSP, MSR_TYPE_RW);
741
742
kvm_vcpu_unmap(vcpu, &map);
743
744
vmx->nested.force_msr_bitmap_recalc = false;
745
746
return true;
747
}
748
749
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
750
struct vmcs12 *vmcs12)
751
{
752
struct vcpu_vmx *vmx = to_vmx(vcpu);
753
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
754
755
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
756
vmcs12->vmcs_link_pointer == INVALID_GPA)
757
return;
758
759
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
760
kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
761
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
762
return;
763
764
kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
765
VMCS12_SIZE);
766
}
767
768
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
769
struct vmcs12 *vmcs12)
770
{
771
struct vcpu_vmx *vmx = to_vmx(vcpu);
772
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
773
774
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
775
vmcs12->vmcs_link_pointer == INVALID_GPA)
776
return;
777
778
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
779
kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
780
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
781
return;
782
783
kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
784
VMCS12_SIZE);
785
}
786
787
/*
788
* In nested virtualization, check if L1 has set
789
* VM_EXIT_ACK_INTR_ON_EXIT
790
*/
791
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
792
{
793
return get_vmcs12(vcpu)->vm_exit_controls &
794
VM_EXIT_ACK_INTR_ON_EXIT;
795
}
796
797
static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
798
struct vmcs12 *vmcs12)
799
{
800
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
801
CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
802
return -EINVAL;
803
else
804
return 0;
805
}
806
807
static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
808
struct vmcs12 *vmcs12)
809
{
810
if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
811
!nested_cpu_has_apic_reg_virt(vmcs12) &&
812
!nested_cpu_has_vid(vmcs12) &&
813
!nested_cpu_has_posted_intr(vmcs12))
814
return 0;
815
816
/*
817
* If virtualize x2apic mode is enabled,
818
* virtualize apic access must be disabled.
819
*/
820
if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
821
nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
822
return -EINVAL;
823
824
/*
825
* If virtual interrupt delivery is enabled,
826
* we must exit on external interrupts.
827
*/
828
if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
829
return -EINVAL;
830
831
/*
832
* bits 15:8 should be zero in posted_intr_nv,
833
* the descriptor address has already been checked
834
* in nested_get_vmcs12_pages.
835
*
836
* bits 5:0 of posted_intr_desc_addr should be zero.
837
*/
838
if (nested_cpu_has_posted_intr(vmcs12) &&
839
(CC(!nested_cpu_has_vid(vmcs12)) ||
840
CC(!nested_exit_intr_ack_set(vcpu)) ||
841
CC((vmcs12->posted_intr_nv & 0xff00)) ||
842
CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
843
return -EINVAL;
844
845
/* tpr shadow is needed by all apicv features. */
846
if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
847
return -EINVAL;
848
849
return 0;
850
}
851
852
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
853
{
854
struct vcpu_vmx *vmx = to_vmx(vcpu);
855
u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
856
vmx->nested.msrs.misc_high);
857
858
return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
859
}
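
/*
 * Worked example of the limit above (example_max_msr_list() is a
 * hypothetical helper; VMX_MISC_MSR_LIST_MULTIPLIER is 512): if the MSR-list
 * field in IA32_VMX_MISC reports N, the recommended maximum is
 * 512 * (N + 1) entries, so N == 0 already allows 512 MSRs per list.
 */
static inline u32 example_max_msr_list(u32 msr_list_field)
{
	return (msr_list_field + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}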
860
861
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
862
u32 count, u64 addr)
863
{
864
if (count == 0)
865
return 0;
866
867
/*
868
* Exceeding the limit results in architecturally _undefined_ behavior,
869
* i.e. KVM is allowed to do literally anything in response to a bad
870
* limit. Immediately generate a consistency check so that code that
871
* consumes the count doesn't need to worry about extreme edge cases.
872
*/
873
if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
874
return -EINVAL;
875
876
if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
877
!kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
878
return -EINVAL;
879
880
return 0;
881
}
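
/*
 * Size assumption behind the range check above (example_msr_list_end() is a
 * hypothetical helper): each struct vmx_msr_entry is 16 bytes (u32 index,
 * u32 reserved, u64 value), so a list of `count` entries occupies
 * [addr, addr + count * 16 - 1] and must start on a 16-byte boundary.
 */
static inline u64 example_msr_list_end(u64 addr, u32 count)
{
	return addr + count * sizeof(struct vmx_msr_entry) - 1;
}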
882
883
static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
884
struct vmcs12 *vmcs12)
885
{
886
if (CC(nested_vmx_check_msr_switch(vcpu,
887
vmcs12->vm_exit_msr_load_count,
888
vmcs12->vm_exit_msr_load_addr)) ||
889
CC(nested_vmx_check_msr_switch(vcpu,
890
vmcs12->vm_exit_msr_store_count,
891
vmcs12->vm_exit_msr_store_addr)))
892
return -EINVAL;
893
894
return 0;
895
}
896
897
static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
898
struct vmcs12 *vmcs12)
899
{
900
if (CC(nested_vmx_check_msr_switch(vcpu,
901
vmcs12->vm_entry_msr_load_count,
902
vmcs12->vm_entry_msr_load_addr)))
903
return -EINVAL;
904
905
return 0;
906
}
907
908
static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
909
struct vmcs12 *vmcs12)
910
{
911
if (!nested_cpu_has_pml(vmcs12))
912
return 0;
913
914
if (CC(!nested_cpu_has_ept(vmcs12)) ||
915
CC(!page_address_valid(vcpu, vmcs12->pml_address)))
916
return -EINVAL;
917
918
return 0;
919
}
920
921
static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
922
struct vmcs12 *vmcs12)
923
{
924
if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
925
!nested_cpu_has_ept(vmcs12)))
926
return -EINVAL;
927
return 0;
928
}
929
930
static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
931
struct vmcs12 *vmcs12)
932
{
933
if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
934
!nested_cpu_has_ept(vmcs12)))
935
return -EINVAL;
936
return 0;
937
}
938
939
static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
940
struct vmcs12 *vmcs12)
941
{
942
if (!nested_cpu_has_shadow_vmcs(vmcs12))
943
return 0;
944
945
if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
946
CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
947
return -EINVAL;
948
949
return 0;
950
}
951
952
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
953
struct vmx_msr_entry *e)
954
{
955
/* x2APIC MSR accesses are not allowed */
956
if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
957
return -EINVAL;
958
if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
959
CC(e->index == MSR_IA32_UCODE_REV))
960
return -EINVAL;
961
if (CC(e->reserved != 0))
962
return -EINVAL;
963
return 0;
964
}
965
966
static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
967
struct vmx_msr_entry *e)
968
{
969
if (CC(e->index == MSR_FS_BASE) ||
970
CC(e->index == MSR_GS_BASE) ||
971
CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
972
nested_vmx_msr_check_common(vcpu, e))
973
return -EINVAL;
974
return 0;
975
}
976
977
static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
978
struct vmx_msr_entry *e)
979
{
980
if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
981
nested_vmx_msr_check_common(vcpu, e))
982
return -EINVAL;
983
return 0;
984
}
985
986
/*
987
* Load guest's/host's msr at nested entry/exit.
988
* return 0 for success, entry index for failure.
989
*
990
* One of the failure modes for MSR load/store is when a list exceeds the
991
* virtual hardware's capacity. To maintain compatibility with hardware inasmuch
992
* as possible, process all valid entries before failing rather than precheck
993
* for a capacity violation.
994
*/
995
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
996
{
997
u32 i;
998
struct vmx_msr_entry e;
999
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1000
1001
for (i = 0; i < count; i++) {
1002
if (WARN_ON_ONCE(i >= max_msr_list_size))
1003
goto fail;
1004
1005
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
1006
&e, sizeof(e))) {
1007
pr_debug_ratelimited(
1008
"%s cannot read MSR entry (%u, 0x%08llx)\n",
1009
__func__, i, gpa + i * sizeof(e));
1010
goto fail;
1011
}
1012
if (nested_vmx_load_msr_check(vcpu, &e)) {
1013
pr_debug_ratelimited(
1014
"%s check failed (%u, 0x%x, 0x%x)\n",
1015
__func__, i, e.index, e.reserved);
1016
goto fail;
1017
}
1018
if (kvm_emulate_msr_write(vcpu, e.index, e.value)) {
1019
pr_debug_ratelimited(
1020
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1021
__func__, i, e.index, e.value);
1022
goto fail;
1023
}
1024
}
1025
return 0;
1026
fail:
1027
/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
1028
return i + 1;
1029
}
1030
1031
static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
1032
u32 msr_index,
1033
u64 *data)
1034
{
1035
struct vcpu_vmx *vmx = to_vmx(vcpu);
1036
1037
/*
1038
* If the L0 hypervisor stored a more accurate value for the TSC that
1039
* does not include the time taken for emulation of the L2->L1
1040
* VM-exit in L0, use the more accurate value.
1041
*/
1042
if (msr_index == MSR_IA32_TSC) {
1043
int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
1044
MSR_IA32_TSC);
1045
1046
if (i >= 0) {
1047
u64 val = vmx->msr_autostore.guest.val[i].value;
1048
1049
*data = kvm_read_l1_tsc(vcpu, val);
1050
return true;
1051
}
1052
}
1053
1054
if (kvm_emulate_msr_read(vcpu, msr_index, data)) {
1055
pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
1056
msr_index);
1057
return false;
1058
}
1059
return true;
1060
}
1061
1062
static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
1063
struct vmx_msr_entry *e)
1064
{
1065
if (kvm_vcpu_read_guest(vcpu,
1066
gpa + i * sizeof(*e),
1067
e, 2 * sizeof(u32))) {
1068
pr_debug_ratelimited(
1069
"%s cannot read MSR entry (%u, 0x%08llx)\n",
1070
__func__, i, gpa + i * sizeof(*e));
1071
return false;
1072
}
1073
if (nested_vmx_store_msr_check(vcpu, e)) {
1074
pr_debug_ratelimited(
1075
"%s check failed (%u, 0x%x, 0x%x)\n",
1076
__func__, i, e->index, e->reserved);
1077
return false;
1078
}
1079
return true;
1080
}
1081
1082
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
1083
{
1084
u64 data;
1085
u32 i;
1086
struct vmx_msr_entry e;
1087
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1088
1089
for (i = 0; i < count; i++) {
1090
if (WARN_ON_ONCE(i >= max_msr_list_size))
1091
return -EINVAL;
1092
1093
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1094
return -EINVAL;
1095
1096
if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1097
return -EINVAL;
1098
1099
if (kvm_vcpu_write_guest(vcpu,
1100
gpa + i * sizeof(e) +
1101
offsetof(struct vmx_msr_entry, value),
1102
&data, sizeof(data))) {
1103
pr_debug_ratelimited(
1104
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1105
__func__, i, e.index, data);
1106
return -EINVAL;
1107
}
1108
}
1109
return 0;
1110
}
1111
1112
static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1113
{
1114
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1115
u32 count = vmcs12->vm_exit_msr_store_count;
1116
u64 gpa = vmcs12->vm_exit_msr_store_addr;
1117
struct vmx_msr_entry e;
1118
u32 i;
1119
1120
for (i = 0; i < count; i++) {
1121
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1122
return false;
1123
1124
if (e.index == msr_index)
1125
return true;
1126
}
1127
return false;
1128
}
1129
1130
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1131
u32 msr_index)
1132
{
1133
struct vcpu_vmx *vmx = to_vmx(vcpu);
1134
struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1135
bool in_vmcs12_store_list;
1136
int msr_autostore_slot;
1137
bool in_autostore_list;
1138
int last;
1139
1140
msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1141
in_autostore_list = msr_autostore_slot >= 0;
1142
in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1143
1144
if (in_vmcs12_store_list && !in_autostore_list) {
1145
if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
1146
/*
1147
* Emulated VMEntry does not fail here. Instead a less
1148
* accurate value will be returned by
1149
* nested_vmx_get_vmexit_msr_value() by reading KVM's
1150
* internal MSR state instead of reading the value from
1151
* the vmcs02 VMExit MSR-store area.
1152
*/
1153
pr_warn_ratelimited(
1154
"Not enough msr entries in msr_autostore. Can't add msr %x\n",
1155
msr_index);
1156
return;
1157
}
1158
last = autostore->nr++;
1159
autostore->val[last].index = msr_index;
1160
} else if (!in_vmcs12_store_list && in_autostore_list) {
1161
last = --autostore->nr;
1162
autostore->val[msr_autostore_slot] = autostore->val[last];
1163
}
1164
}
1165
1166
/*
1167
* Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
1168
* emulating VM-Entry into a guest with EPT enabled. On failure, the expected
1169
* Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1170
* @entry_failure_code.
1171
*/
1172
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1173
bool nested_ept, bool reload_pdptrs,
1174
enum vm_entry_failure_code *entry_failure_code)
1175
{
1176
if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
1177
*entry_failure_code = ENTRY_FAIL_DEFAULT;
1178
return -EINVAL;
1179
}
1180
1181
/*
1182
* If PAE paging and EPT are both on, CR3 is not used by the CPU and
1183
* must not be dereferenced.
1184
*/
1185
if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
1186
CC(!load_pdptrs(vcpu, cr3))) {
1187
*entry_failure_code = ENTRY_FAIL_PDPTE;
1188
return -EINVAL;
1189
}
1190
1191
vcpu->arch.cr3 = cr3;
1192
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
1193
1194
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
1195
kvm_init_mmu(vcpu);
1196
1197
if (!nested_ept)
1198
kvm_mmu_new_pgd(vcpu, cr3);
1199
1200
return 0;
1201
}
1202
1203
/*
1204
* Returns true if KVM is able to configure the CPU to tag TLB entries
1205
* populated by L2 differently than TLB entries populated
1206
* by L1.
1207
*
1208
* If L0 uses EPT, L1 and L2 run with different EPTP because
1209
* guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1210
* are tagged with different EPTP.
1211
*
1212
* If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1213
* with different VPID (L1 entries are tagged with vmx->vpid
1214
* while L2 entries are tagged with vmx->nested.vpid02).
1215
*/
1216
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1217
{
1218
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1219
1220
return enable_ept ||
1221
(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1222
}
1223
1224
static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1225
struct vmcs12 *vmcs12,
1226
bool is_vmenter)
1227
{
1228
struct vcpu_vmx *vmx = to_vmx(vcpu);
1229
1230
/* Handle pending Hyper-V TLB flush requests */
1231
kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
1232
1233
/*
1234
* If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
1235
* same VPID as the host, and so architecturally, linear and combined
1236
* mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
1237
* emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
1238
* and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
1239
* is required if VPID is disabled in KVM, as a TLB flush (there are no
1240
* VPIDs) still occurs from L1's perspective, and KVM may need to
1241
* synchronize the MMU in response to the guest TLB flush.
1242
*
1243
* Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
1244
* EPT is a special snowflake, as guest-physical mappings aren't
1245
* flushed on VPID invalidations, including VM-Enter or VM-Exit with
1246
* VPID disabled. As a result, KVM _never_ needs to sync nEPT
1247
* entries on VM-Enter because L1 can't rely on VM-Enter to flush
1248
* those mappings.
1249
*/
1250
if (!nested_cpu_has_vpid(vmcs12)) {
1251
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1252
return;
1253
}
1254
1255
/* L2 should never have a VPID if VPID is disabled. */
1256
WARN_ON(!enable_vpid);
1257
1258
/*
1259
* VPID is enabled and in use by vmcs12. If vpid12 is changing, then
1260
* emulate a guest TLB flush as KVM does not track vpid12 history nor
1261
* is the VPID incorporated into the MMU context. I.e. KVM must assume
1262
* that the new vpid12 has never been used and thus represents a new
1263
* guest ASID that cannot have entries in the TLB.
1264
*/
1265
if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1266
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1267
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1268
return;
1269
}
1270
1271
/*
1272
* If VPID is enabled, used by vmcs12, and vpid12 is not changing but
1273
* does not have a unique TLB tag (ASID), i.e. EPT is disabled and
1274
* KVM was unable to allocate a VPID for L2, flush the current context
1275
* as the effective ASID is common to both L1 and L2.
1276
*/
1277
if (!nested_has_guest_tlb_tag(vcpu))
1278
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1279
}
1280
1281
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1282
{
1283
superset &= mask;
1284
subset &= mask;
1285
1286
return (superset | subset) == superset;
1287
}
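
/*
 * A small worked example of is_bitwise_subset(), as used by the
 * vmx_restore_*() helpers below to verify that "must-be-1" bits stay set
 * and "must-be-0" bits stay clear (example_subset_check() is hypothetical
 * and exists only to show concrete values):
 */
static inline bool example_subset_check(void)
{
	/* 0x3 only uses bits already set in 0xb; 0x4 does not. */
	return is_bitwise_subset(0xb, 0x3, 0xf) &&
	       !is_bitwise_subset(0xb, 0x4, 0xf);
}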
1288
1289
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1290
{
1291
const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
1292
VMX_BASIC_INOUT |
1293
VMX_BASIC_TRUE_CTLS |
1294
VMX_BASIC_NO_HW_ERROR_CODE_CC;
1295
1296
const u64 reserved_bits = GENMASK_ULL(63, 57) |
1297
GENMASK_ULL(47, 45) |
1298
BIT_ULL(31);
1299
1300
u64 vmx_basic = vmcs_config.nested.basic;
1301
1302
BUILD_BUG_ON(feature_bits & reserved_bits);
1303
1304
/*
1305
* Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
1306
* inverted polarity), the incoming value must not set feature bits or
1307
* reserved bits that aren't allowed/supported by KVM. Fields, i.e.
1308
* multi-bit values, are explicitly checked below.
1309
*/
1310
if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
1311
return -EINVAL;
1312
1313
/*
1314
* KVM does not emulate a version of VMX that constrains physical
1315
* addresses of VMX structures (e.g. VMCS) to 32-bits.
1316
*/
1317
if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
1318
return -EINVAL;
1319
1320
if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1321
vmx_basic_vmcs_revision_id(data))
1322
return -EINVAL;
1323
1324
if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1325
return -EINVAL;
1326
1327
vmx->nested.msrs.basic = data;
1328
return 0;
1329
}
1330
1331
static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
1332
u32 **low, u32 **high)
1333
{
1334
switch (msr_index) {
1335
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1336
*low = &msrs->pinbased_ctls_low;
1337
*high = &msrs->pinbased_ctls_high;
1338
break;
1339
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1340
*low = &msrs->procbased_ctls_low;
1341
*high = &msrs->procbased_ctls_high;
1342
break;
1343
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1344
*low = &msrs->exit_ctls_low;
1345
*high = &msrs->exit_ctls_high;
1346
break;
1347
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1348
*low = &msrs->entry_ctls_low;
1349
*high = &msrs->entry_ctls_high;
1350
break;
1351
case MSR_IA32_VMX_PROCBASED_CTLS2:
1352
*low = &msrs->secondary_ctls_low;
1353
*high = &msrs->secondary_ctls_high;
1354
break;
1355
default:
1356
BUG();
1357
}
1358
}
1359
1360
static int
1361
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1362
{
1363
u32 *lowp, *highp;
1364
u64 supported;
1365
1366
vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
1367
1368
supported = vmx_control_msr(*lowp, *highp);
1369
1370
/* Check must-be-1 bits are still 1. */
1371
if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1372
return -EINVAL;
1373
1374
/* Check must-be-0 bits are still 0. */
1375
if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1376
return -EINVAL;
1377
1378
vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
1379
*lowp = data;
1380
*highp = data >> 32;
1381
return 0;
1382
}
1383
1384
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1385
{
1386
const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
1387
VMX_MISC_ACTIVITY_HLT |
1388
VMX_MISC_ACTIVITY_SHUTDOWN |
1389
VMX_MISC_ACTIVITY_WAIT_SIPI |
1390
VMX_MISC_INTEL_PT |
1391
VMX_MISC_RDMSR_IN_SMM |
1392
VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
1393
VMX_MISC_VMXOFF_BLOCK_SMI |
1394
VMX_MISC_ZERO_LEN_INS;
1395
1396
const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
1397
1398
u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
1399
vmcs_config.nested.misc_high);
1400
1401
BUILD_BUG_ON(feature_bits & reserved_bits);
1402
1403
/*
1404
* The incoming value must not set feature bits or reserved bits that
1405
* aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
1406
* explicitly checked below.
1407
*/
1408
if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
1409
return -EINVAL;
1410
1411
if ((vmx->nested.msrs.pinbased_ctls_high &
1412
PIN_BASED_VMX_PREEMPTION_TIMER) &&
1413
vmx_misc_preemption_timer_rate(data) !=
1414
vmx_misc_preemption_timer_rate(vmx_misc))
1415
return -EINVAL;
1416
1417
if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1418
return -EINVAL;
1419
1420
if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1421
return -EINVAL;
1422
1423
if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1424
return -EINVAL;
1425
1426
vmx->nested.msrs.misc_low = data;
1427
vmx->nested.msrs.misc_high = data >> 32;
1428
1429
return 0;
1430
}
1431
1432
static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1433
{
1434
u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
1435
vmcs_config.nested.vpid_caps);
1436
1437
/* Every bit is either reserved or a feature bit. */
1438
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1439
return -EINVAL;
1440
1441
vmx->nested.msrs.ept_caps = data;
1442
vmx->nested.msrs.vpid_caps = data >> 32;
1443
return 0;
1444
}
1445
1446
static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
1447
{
1448
switch (msr_index) {
1449
case MSR_IA32_VMX_CR0_FIXED0:
1450
return &msrs->cr0_fixed0;
1451
case MSR_IA32_VMX_CR4_FIXED0:
1452
return &msrs->cr4_fixed0;
1453
default:
1454
BUG();
1455
}
1456
}
1457
1458
static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1459
{
1460
const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
1461
1462
/*
1463
* 1 bits (which indicate bits that "must-be-1" during VMX operation)
1464
* must be 1 in the restored value.
1465
*/
1466
if (!is_bitwise_subset(data, *msr, -1ULL))
1467
return -EINVAL;
1468
1469
*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
1470
return 0;
1471
}
1472
1473
/*
1474
* Called when userspace is restoring VMX MSRs.
1475
*
1476
* Returns 0 on success, non-0 otherwise.
1477
*/
1478
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1479
{
1480
struct vcpu_vmx *vmx = to_vmx(vcpu);
1481
1482
/*
1483
* Don't allow changes to the VMX capability MSRs while the vCPU
1484
* is in VMX operation.
1485
*/
1486
if (vmx->nested.vmxon)
1487
return -EBUSY;
1488
1489
switch (msr_index) {
1490
case MSR_IA32_VMX_BASIC:
1491
return vmx_restore_vmx_basic(vmx, data);
1492
case MSR_IA32_VMX_PINBASED_CTLS:
1493
case MSR_IA32_VMX_PROCBASED_CTLS:
1494
case MSR_IA32_VMX_EXIT_CTLS:
1495
case MSR_IA32_VMX_ENTRY_CTLS:
1496
/*
1497
* The "non-true" VMX capability MSRs are generated from the
1498
* "true" MSRs, so we do not support restoring them directly.
1499
*
1500
* If userspace wants to emulate VMX_BASIC[55]=0, userspace
1501
* should restore the "true" MSRs with the must-be-1 bits
1502
* set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1503
* DEFAULT SETTINGS".
1504
*/
1505
return -EINVAL;
1506
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1507
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1508
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1509
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1510
case MSR_IA32_VMX_PROCBASED_CTLS2:
1511
return vmx_restore_control_msr(vmx, msr_index, data);
1512
case MSR_IA32_VMX_MISC:
1513
return vmx_restore_vmx_misc(vmx, data);
1514
case MSR_IA32_VMX_CR0_FIXED0:
1515
case MSR_IA32_VMX_CR4_FIXED0:
1516
return vmx_restore_fixed0_msr(vmx, msr_index, data);
1517
case MSR_IA32_VMX_CR0_FIXED1:
1518
case MSR_IA32_VMX_CR4_FIXED1:
1519
/*
1520
* These MSRs are generated based on the vCPU's CPUID, so we
1521
* do not support restoring them directly.
1522
*/
1523
return -EINVAL;
1524
case MSR_IA32_VMX_EPT_VPID_CAP:
1525
return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1526
case MSR_IA32_VMX_VMCS_ENUM:
1527
vmx->nested.msrs.vmcs_enum = data;
1528
return 0;
1529
case MSR_IA32_VMX_VMFUNC:
1530
if (data & ~vmcs_config.nested.vmfunc_controls)
1531
return -EINVAL;
1532
vmx->nested.msrs.vmfunc_controls = data;
1533
return 0;
1534
default:
1535
/*
1536
* The rest of the VMX capability MSRs do not support restore.
1537
*/
1538
return -EINVAL;
1539
}
1540
}
1541
1542
/* Returns 0 on success, non-0 otherwise. */
1543
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1544
{
1545
switch (msr_index) {
1546
case MSR_IA32_VMX_BASIC:
1547
*pdata = msrs->basic;
1548
break;
1549
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1550
case MSR_IA32_VMX_PINBASED_CTLS:
1551
*pdata = vmx_control_msr(
1552
msrs->pinbased_ctls_low,
1553
msrs->pinbased_ctls_high);
1554
if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1555
*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1556
break;
1557
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1558
case MSR_IA32_VMX_PROCBASED_CTLS:
1559
*pdata = vmx_control_msr(
1560
msrs->procbased_ctls_low,
1561
msrs->procbased_ctls_high);
1562
if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1563
*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1564
break;
1565
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1566
case MSR_IA32_VMX_EXIT_CTLS:
1567
*pdata = vmx_control_msr(
1568
msrs->exit_ctls_low,
1569
msrs->exit_ctls_high);
1570
if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1571
*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1572
break;
1573
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1574
case MSR_IA32_VMX_ENTRY_CTLS:
1575
*pdata = vmx_control_msr(
1576
msrs->entry_ctls_low,
1577
msrs->entry_ctls_high);
1578
if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1579
*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1580
break;
1581
case MSR_IA32_VMX_MISC:
1582
*pdata = vmx_control_msr(
1583
msrs->misc_low,
1584
msrs->misc_high);
1585
break;
1586
case MSR_IA32_VMX_CR0_FIXED0:
1587
*pdata = msrs->cr0_fixed0;
1588
break;
1589
case MSR_IA32_VMX_CR0_FIXED1:
1590
*pdata = msrs->cr0_fixed1;
1591
break;
1592
case MSR_IA32_VMX_CR4_FIXED0:
1593
*pdata = msrs->cr4_fixed0;
1594
break;
1595
case MSR_IA32_VMX_CR4_FIXED1:
1596
*pdata = msrs->cr4_fixed1;
1597
break;
1598
case MSR_IA32_VMX_VMCS_ENUM:
1599
*pdata = msrs->vmcs_enum;
1600
break;
1601
case MSR_IA32_VMX_PROCBASED_CTLS2:
1602
*pdata = vmx_control_msr(
1603
msrs->secondary_ctls_low,
1604
msrs->secondary_ctls_high);
1605
break;
1606
case MSR_IA32_VMX_EPT_VPID_CAP:
1607
*pdata = msrs->ept_caps |
1608
((u64)msrs->vpid_caps << 32);
1609
break;
1610
case MSR_IA32_VMX_VMFUNC:
1611
*pdata = msrs->vmfunc_controls;
1612
break;
1613
default:
1614
return 1;
1615
}
1616
1617
return 0;
1618
}
1619
1620
/*
1621
* Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1622
* been modified by the L1 guest. Note, "writable" in this context means
1623
* "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1624
* fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1625
* VM-exit information fields (which are actually writable if the vCPU is
1626
* configured to support "VMWRITE to any supported field in the VMCS").
1627
*/
1628
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1629
{
1630
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1631
struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1632
struct shadow_vmcs_field field;
1633
unsigned long val;
1634
int i;
1635
1636
if (WARN_ON(!shadow_vmcs))
1637
return;
1638
1639
preempt_disable();
1640
1641
vmcs_load(shadow_vmcs);
1642
1643
for (i = 0; i < max_shadow_read_write_fields; i++) {
1644
field = shadow_read_write_fields[i];
1645
val = __vmcs_readl(field.encoding);
1646
vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1647
}
1648
1649
vmcs_clear(shadow_vmcs);
1650
vmcs_load(vmx->loaded_vmcs->vmcs);
1651
1652
preempt_enable();
1653
}
1654
1655
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1656
{
1657
const struct shadow_vmcs_field *fields[] = {
1658
shadow_read_write_fields,
1659
shadow_read_only_fields
1660
};
1661
const int max_fields[] = {
1662
max_shadow_read_write_fields,
1663
max_shadow_read_only_fields
1664
};
1665
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1666
struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1667
struct shadow_vmcs_field field;
1668
unsigned long val;
1669
int i, q;
1670
1671
if (WARN_ON(!shadow_vmcs))
1672
return;
1673
1674
vmcs_load(shadow_vmcs);
1675
1676
for (q = 0; q < ARRAY_SIZE(fields); q++) {
1677
for (i = 0; i < max_fields[q]; i++) {
1678
field = fields[q][i];
1679
val = vmcs12_read_any(vmcs12, field.encoding,
1680
field.offset);
1681
__vmcs_writel(field.encoding, val);
1682
}
1683
}
1684
1685
vmcs_clear(shadow_vmcs);
1686
vmcs_load(vmx->loaded_vmcs->vmcs);
1687
}
1688
1689
static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
1690
{
1691
#ifdef CONFIG_KVM_HYPERV
1692
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1693
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
1694
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
1695
1696
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1697
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
		vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
		vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
		vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
		 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
		 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
		 */
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}
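
/*
 * Each bit in 'hv_clean_fields' that L1 leaves set indicates that the
 * corresponding group of eVMCS fields has not been modified since the last
 * VM-entry, which is why every copy above is guarded by a clean-field check:
 * groups marked clean can be skipped without re-reading guest memory.
 */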

static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
	 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
	 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
	 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
	 * evmcs->host_ssp = vmcs12->host_ssp;
	 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
	 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
	 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
	 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
	 * evmcs->guest_ssp = vmcs12->guest_ssp;
	 */

	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;

	evmcs->guest_es_base = vmcs12->guest_es_base;
	evmcs->guest_cs_base = vmcs12->guest_cs_base;
	evmcs->guest_ss_base = vmcs12->guest_ss_base;
	evmcs->guest_ds_base = vmcs12->guest_ds_base;
	evmcs->guest_fs_base = vmcs12->guest_fs_base;
	evmcs->guest_gs_base = vmcs12->guest_gs_base;
	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
	evmcs->guest_tr_base = vmcs12->guest_tr_base;
	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;

	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;

	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;

	evmcs->guest_pending_dbg_exceptions =
		vmcs12->guest_pending_dbg_exceptions;
	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;

	evmcs->guest_activity_state = vmcs12->guest_activity_state;
	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;

	evmcs->guest_cr0 = vmcs12->guest_cr0;
	evmcs->guest_cr3 = vmcs12->guest_cr3;
	evmcs->guest_cr4 = vmcs12->guest_cr4;
	evmcs->guest_dr7 = vmcs12->guest_dr7;

	evmcs->guest_physical_address = vmcs12->guest_physical_address;

	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;

	evmcs->exit_qualification = vmcs12->exit_qualification;

	evmcs->guest_linear_address = vmcs12->guest_linear_address;
	evmcs->guest_rsp = vmcs12->guest_rsp;
	evmcs->guest_rflags = vmcs12->guest_rflags;

	evmcs->guest_interruptibility_info =
		vmcs12->guest_interruptibility_info;
	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
	evmcs->vm_entry_exception_error_code =
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * This is the equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION), and thus the guest is expected to
		 * set the first u32 field of the eVMCS, which specifies the
		 * eVMCS VersionNumber, to that value.
		 *
		 * The guest should learn which eVMCS versions the host
		 * supports by examining CPUID.0x4000000A.EAX[0:15]. The host
		 * userspace VMM is expected to set this CPUID leaf according
		 * to the value returned in vmcs_version from
		 * nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to
		 * comply with its own invented interface: when Hyper-V uses
		 * eVMCS, it simply sets the first u32 field of the eVMCS to
		 * the revision_id specified in MSR_IA32_VMX_BASIC, instead of
		 * a supported eVMCS version number from
		 * CPUID.0x4000000A.EAX[0:15].
		 *
		 * To work around this Hyper-V bug, accept either a supported
		 * eVMCS version or the VMCS12 revision_id as valid values for
		 * the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
		 * reloaded from the guest's memory (read-only fields, fields
		 * not present in struct hv_enlightened_vmcs, ...). Make sure
		 * there are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
	 */
	if (from_launch || evmcs_gpa_changed) {
		vmx->nested.hv_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		vmx->nested.force_msr_bitmap_recalc = true;
	}

	return EVMPTRLD_SUCCEEDED;
#else
	return EVMPTRLD_DISABLED;
#endif
}

void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx))
		copy_vmcs12_to_enlightened(vmx);
	else
		copy_vmcs12_to_shadow(vmx);

	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
	struct vcpu_vmx *vmx =
		container_of(timer, struct vcpu_vmx, nested.preemption_timer);

	vmx->nested.preemption_timer_expired = true;
	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
	kvm_vcpu_kick(&vmx->vcpu);

	return HRTIMER_NORESTART;
}

static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
			    VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;

	if (!vmx->nested.has_preemption_timer_deadline) {
		vmx->nested.preemption_timer_deadline =
			vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
		vmx->nested.has_preemption_timer_deadline = true;
	}
	return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
}

static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
				       u64 preemption_timeout)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * A timer value of zero is architecturally guaranteed to cause
	 * a VMExit prior to executing any instructions in the guest.
	 */
	if (preemption_timeout == 0) {
		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
		return;
	}

	if (vcpu->arch.virtual_tsc_khz == 0)
		return;

	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
	preemption_timeout *= 1000000;
	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
	hrtimer_start(&vmx->nested.preemption_timer,
		      ktime_add_ns(ktime_get(), preemption_timeout),
		      HRTIMER_MODE_ABS_PINNED);
}
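
/*
 * Worked example for the conversion above (hypothetical numbers): with
 * vcpu->arch.virtual_tsc_khz = 1000000 (a 1 GHz guest TSC) and a vmcs12
 * timer value of 1000, the timeout becomes 1000 << 5 = 32000 TSC cycles,
 * and 32000 * 1000000 / 1000000 kHz = 32000 ns, i.e. the hrtimer is armed
 * 32 microseconds into the future.
 */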

static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
		return vmcs12->guest_ia32_efer;
	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
	else
		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
}

static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
	struct kvm *kvm = vmx->vcpu.kvm;

	/*
	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
	 * according to L0's settings (vmcs12 is irrelevant here). Host
	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
	 * will be set as needed prior to VMLAUNCH/VMRESUME.
	 */
	if (vmx->nested.vmcs02_initialized)
		return;
	vmx->nested.vmcs02_initialized = true;

	/*
	 * We don't care what the EPTP value is; we just need to guarantee
	 * it's valid so we don't get a false positive when doing early
	 * consistency checks.
	 */
	if (enable_ept && nested_early_check)
		vmcs_write64(EPT_POINTER,
			     construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));

	if (vmx->ve_info)
		vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));

	/* All VMFUNCs are currently emulated through L0 vmexits. */
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	if (cpu_has_vmx_posted_intr())
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));

	/*
	 * PML is emulated for L2, but never enabled in hardware as the MMU
	 * handles A/D emulation. Disabling PML for L2 also avoids having to
	 * deal with filtering out L2 GPAs from the buffer.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, 0);
		vmcs_write16(GUEST_PML_INDEX, -1);
	}

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);

	if (kvm_notify_vmexit_enabled(kvm))
		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

	/*
	 * Set the MSR load/store lists to match L0's settings. Only the
	 * addresses are constant (for vmcs02), the counts can change based
	 * on L2's behavior, e.g. switching to/from long mode.
	 */
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	vmx_set_constant_host_state(vmx);
}
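
/*
 * Note that prepare_vmcs02_constant_state() runs at most once per vmcs02
 * allocation (guarded by vmcs02_initialized above): vmcs02 is reused across
 * nested entries, so the fields written there never need to be refreshed,
 * unlike the vmcs12-derived state handled by prepare_vmcs02_early() and
 * prepare_vmcs02().
 */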

static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{
	prepare_vmcs02_constant_state(vmx);

	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host. Emulate this behavior by using vpid01 for L2
	 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
	 * and VM-Exit are architecturally required to flush VPID=0, but *only*
	 * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
	 * required flushes), but doing so would cause KVM to over-flush. E.g.
	 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
	 * and then runs L2 X again, then KVM can and should retain TLB entries
	 * for VPID12=1.
	 */
	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
		else
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
	}
}

static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
				 struct vmcs12 *vmcs12)
{
	u32 exec_control;
	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
		prepare_vmcs02_early_rare(vmx, vmcs12);

	/*
	 * PIN CONTROLS
	 */
	exec_control = __pin_controls_get(vmcs01);
	exec_control |= (vmcs12->pin_based_vm_exec_control &
			 ~PIN_BASED_VMX_PREEMPTION_TIMER);

	/* Posted interrupts setting is only taken from vmcs12. */
	vmx->nested.pi_pending = false;
	if (nested_cpu_has_posted_intr(vmcs12)) {
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
	} else {
		vmx->nested.posted_intr_nv = -1;
		exec_control &= ~PIN_BASED_POSTED_INTR;
	}
	pin_controls_set(vmx, exec_control);

	/*
	 * EXEC CONTROLS
	 */
	exec_control = __exec_controls_get(vmcs01); /* L0's desires */
	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	vmx->nested.l1_tpr_threshold = -1;
	if (exec_control & CPU_BASED_TPR_SHADOW)
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
	else
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif

	/*
	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
	 * for I/O port accesses.
	 */
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;

	/*
	 * This bit will be computed in nested_get_vmcs12_pages, because
	 * we do not have access to L1's MSR bitmap yet. For now, keep
	 * the same bit as before, hoping to avoid multiple VMWRITEs that
	 * only set/clear this bit.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;

	exec_controls_set(vmx, exec_control);

	/*
	 * SECONDARY EXEC CONTROLS
	 */
	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = __secondary_exec_controls_get(vmcs01);

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_ENABLE_RDTSCP |
				  SECONDARY_EXEC_ENABLE_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC |
				  SECONDARY_EXEC_DESC);

		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
			exec_control |= vmcs12->secondary_vm_exec_control;

		/* PML is emulated and never enabled in hardware for L2. */
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

		/* VMCS shadowing for L2 is emulated for now */
		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

		/*
		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
		 * will not have to rewrite the controls just for this bit.
		 */
		if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
			exec_control |= SECONDARY_EXEC_DESC;

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
			vmcs_write16(GUEST_INTR_STATUS,
				     vmcs12->guest_intr_status);

		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;

		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);

		secondary_exec_controls_set(vmx, exec_control);
	}

	/*
	 * ENTRY CONTROLS
	 *
	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
	 * on the related bits (if supported by the CPU) in the hope that
	 * we can avoid VMWrites during vmx_set_efer().
	 *
	 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
	 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
	 * do the same for L2.
	 */
	exec_control = __vm_entry_controls_get(vmcs01);
	exec_control |= (vmcs12->vm_entry_controls &
			 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
	exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
	if (cpu_has_load_ia32_efer()) {
		if (guest_efer & EFER_LMA)
			exec_control |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != kvm_host.efer)
			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
	}
	vm_entry_controls_set(vmx, exec_control);

	/*
	 * EXIT CONTROLS
	 *
	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
	 */
	exec_control = __vm_exit_controls_get(vmcs01);
	if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
		exec_control |= VM_EXIT_LOAD_IA32_EFER;
	else
		exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
	vm_exit_controls_set(vmx, exec_control);

	/*
	 * Interrupt/Exception Fields
	 */
	if (vmx->nested.nested_run_pending) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
}

static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet,
				u64 *ssp, u64 *ssp_tbl)
{
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
	    guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
		*s_cet = vmcs_readl(GUEST_S_CET);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		*ssp = vmcs_readl(GUEST_SSP);
		*ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
	}
}

static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet,
				 u64 ssp, u64 ssp_tbl)
{
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
	    guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
		vmcs_writel(GUEST_S_CET, s_cet);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		vmcs_writel(GUEST_SSP, ssp);
		vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl);
	}
}
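
/*
 * In the two helpers above, GUEST_S_CET is accessed whenever either IBT or
 * SHSTK is exposed to the guest, since IA32_S_CET carries controls for both
 * features, whereas GUEST_SSP and GUEST_INTR_SSP_TABLE are shadow-stack
 * specific and are therefore gated on SHSTK alone.
 */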

static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {

		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

		vmx_segment_cache_clear(vmx);
	}

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access L2's PDPTRs, so save them in order to
		 * construct vmcs12.
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0 and L1's desires, so
	 * we simply ask to exit on each and every L2 page fault. This is done
	 * by setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
		vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
				     vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success, -EINVAL on failure; the VM-entry failure code is
 * assigned to *entry_failure_code on failure.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
			!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
					       vmx_get_supported_debugctl(vcpu, false));
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
	}

	if (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
				     vmx->nested.pre_vmenter_ssp,
				     vmx->nested.pre_vmenter_ssp_tbl);

	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
	}

	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * Override the CR0/CR4 read shadows after setting the effective guest
	 * CR0/CR4. The common helpers also set the shadows, but they don't
	 * account for vmcs12's cr0/4_guest_host_mask.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 *
	 * However, when force loading the guest state (SMM exit or
	 * loading nested state after migration), it is possible to
	 * have invalid guest state now, which will be fixed later by
	 * restoring L2 register state.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Load the guest CR3, shadowed by either EPT or KVM's shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						 vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
	 * bits when it changes a field in the eVMCS. Mark all fields as clean
	 * here.
	 */
	if (nested_vmx_is_evmptr12_valid(vmx))
		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}

static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}
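
/*
 * The two checks above encode the SDM's control dependencies: the "virtual
 * NMIs" pin-based control may only be 1 if "NMI exiting" is also 1, and
 * "NMI-window exiting" may only be 1 if "virtual NMIs" is 1.
 */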

static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Page-walk levels validity. */
	switch (new_eptp & VMX_EPTP_PWL_MASK) {
	case VMX_EPTP_PWL_5:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
			return false;
		break;
	case VMX_EPTP_PWL_4:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Reserved bits should not be set */
	if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
		return false;

	/* AD, if set, should be supported */
	if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
			return false;
	}

	return true;
}
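
/*
 * For reference, the EPTP bits consulted above: bits 2:0 select the memory
 * type, bits 5:3 hold the page-walk length minus one, bit 6 enables
 * accessed/dirty flags, and bits 11:7 are treated as reserved by this check
 * (see the '(new_eptp >> 7) & 0x1f' test).
 */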

/*
 * Checks related to VM-Execution Control Fields
 */
static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
				   vmx->nested.msrs.pinbased_ctls_low,
				   vmx->nested.msrs.pinbased_ctls_high)) ||
	    CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
				   vmx->nested.msrs.procbased_ctls_low,
				   vmx->nested.msrs.procbased_ctls_high)))
		return -EINVAL;

	if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
	    CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
				   vmx->nested.msrs.secondary_ctls_low,
				   vmx->nested.msrs.secondary_ctls_high)))
		return -EINVAL;

	if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
	    nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
	    nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
	    nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
	    nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
	    nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
	    nested_vmx_check_nmi_controls(vmcs12) ||
	    nested_vmx_check_pml_controls(vcpu, vmcs12) ||
	    nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
	    nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
	    nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
	    CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
		return -EINVAL;

	if (!nested_cpu_has_preemption_timer(vmcs12) &&
	    nested_cpu_has_save_preemption_timer(vmcs12))
		return -EINVAL;

	if (nested_cpu_has_ept(vmcs12) &&
	    CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
		return -EINVAL;

	if (nested_cpu_has_vmfunc(vmcs12)) {
		if (CC(vmcs12->vm_function_control &
		       ~vmx->nested.msrs.vmfunc_controls))
			return -EINVAL;

		if (nested_cpu_has_eptp_switching(vmcs12)) {
			if (CC(!nested_cpu_has_ept(vmcs12)) ||
			    CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
				return -EINVAL;
		}
	}

	return 0;
}

/*
 * Checks related to VM-Exit Control Fields
 */
static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
				   vmx->nested.msrs.exit_ctls_low,
				   vmx->nested.msrs.exit_ctls_high)) ||
	    CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
		return -EINVAL;

	return 0;
}

/*
 * Checks related to VM-Entry Control Fields
 */
static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
				   vmx->nested.msrs.entry_ctls_low,
				   vmx->nested.msrs.entry_ctls_high)))
		return -EINVAL;

	/*
	 * From the Intel SDM, volume 3:
	 * Fields relevant to VM-entry event injection must be set properly.
	 * These fields are the VM-entry interruption-information field, the
	 * VM-entry exception error code, and the VM-entry instruction length.
	 */
	if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
		u32 intr_info = vmcs12->vm_entry_intr_info_field;
		u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
		u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
		bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
		bool urg = nested_cpu_has2(vmcs12,
					   SECONDARY_EXEC_UNRESTRICTED_GUEST);
		bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;

		/* VM-entry interruption-info field: interruption type */
		if (CC(intr_type == INTR_TYPE_RESERVED) ||
		    CC(intr_type == INTR_TYPE_OTHER_EVENT &&
		       !nested_cpu_supports_monitor_trap_flag(vcpu)))
			return -EINVAL;

		/* VM-entry interruption-info field: vector */
		if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
		    CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
		    CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
			return -EINVAL;

		/*
		 * Cannot deliver error code in real mode or if the interrupt
		 * type is not hardware exception. For other cases, do the
		 * consistency check only if the vCPU doesn't enumerate
		 * VMX_BASIC_NO_HW_ERROR_CODE_CC.
		 */
		if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) {
			if (CC(has_error_code))
				return -EINVAL;
		} else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) {
			if (CC(has_error_code != x86_exception_has_error_code(vector)))
				return -EINVAL;
		}

		/* VM-entry exception error code */
		if (CC(has_error_code &&
		       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
			return -EINVAL;

		/* VM-entry interruption-info field: reserved bits */
		if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
			return -EINVAL;

		/* VM-entry instruction length */
		switch (intr_type) {
		case INTR_TYPE_SOFT_EXCEPTION:
		case INTR_TYPE_SOFT_INTR:
		case INTR_TYPE_PRIV_SW_EXCEPTION:
			if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
			    CC(vmcs12->vm_entry_instruction_len == 0 &&
			    CC(!nested_cpu_has_zero_length_injection(vcpu))))
				return -EINVAL;
		}
	}

	if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
	    nested_check_vm_exit_controls(vcpu, vmcs12) ||
	    nested_check_vm_entry_controls(vcpu, vmcs12))
		return -EINVAL;

#ifdef CONFIG_KVM_HYPERV
	if (guest_cpu_cap_has_evmcs(vcpu))
		return nested_evmcs_check_controls(vmcs12);
#endif

	return 0;
}

static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
#ifdef CONFIG_X86_64
	if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
	       !!(vcpu->arch.efer & EFER_LMA)))
		return -EINVAL;
#endif
	return 0;
}

static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
{
	/*
	 * Check that the given linear address is canonical after a VM exit
	 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
	 */
	u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;

	return !__is_canonical_address(la, l1_address_bits_on_exit);
}
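
/*
 * E.g. with HOST_CR4.LA57 clear, a value such as 0xffff800000000000 is
 * canonical (bits 63:47 are all ones), while 0x0000800000000000 is not,
 * because bits 63:48 are not a sign-extension of bit 47.
 */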
3102
3103
static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet,
3104
u64 ssp, u64 ssp_tbl)
3105
{
3106
if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) ||
3107
CC(is_noncanonical_msr_address(ssp_tbl, vcpu)))
3108
return -EINVAL;
3109
3110
return 0;
3111
}
3112
3113
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
3114
struct vmcs12 *vmcs12)
3115
{
3116
bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
3117
3118
if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
3119
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
3120
CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
3121
return -EINVAL;
3122
3123
if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP)))
3124
return -EINVAL;
3125
3126
if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
3127
CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
3128
return -EINVAL;
3129
3130
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
3131
CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
3132
return -EINVAL;
3133
3134
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3135
CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3136
vmcs12->host_ia32_perf_global_ctrl)))
3137
return -EINVAL;
3138
3139
if (ia32e) {
3140
if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
3141
return -EINVAL;
3142
} else {
3143
if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
3144
CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
3145
CC((vmcs12->host_rip) >> 32))
3146
return -EINVAL;
3147
}
3148
3149
if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3150
CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3151
CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3152
CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3153
CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3154
CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3155
CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3156
CC(vmcs12->host_cs_selector == 0) ||
3157
CC(vmcs12->host_tr_selector == 0) ||
3158
CC(vmcs12->host_ss_selector == 0 && !ia32e))
3159
return -EINVAL;
3160
3161
if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
3162
CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
3163
CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
3164
CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
3165
CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
3166
CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
3167
return -EINVAL;
3168
3169
/*
3170
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
3171
* IA32_EFER MSR must be 0 in the field for that register. In addition,
3172
* the values of the LMA and LME bits in the field must each be that of
3173
* the host address-space size VM-exit control.
3174
*/
3175
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
3176
if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
3177
CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
3178
CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
3179
return -EINVAL;
3180
}
3181
3182
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) {
3183
if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet,
3184
vmcs12->host_ssp,
3185
vmcs12->host_ssp_tbl))
3186
return -EINVAL;
3187
3188
/*
3189
* IA32_S_CET and SSP must be canonical if the host will
3190
* enter 64-bit mode after VM-exit; otherwise, higher
3191
* 32-bits must be all 0s.
3192
*/
3193
if (ia32e) {
3194
if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) ||
3195
CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu)))
3196
return -EINVAL;
3197
} else {
3198
if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32))
3199
return -EINVAL;
3200
}
3201
}
3202
3203
return 0;
3204
}
3205
3206
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
3207
struct vmcs12 *vmcs12)
3208
{
3209
struct vcpu_vmx *vmx = to_vmx(vcpu);
3210
struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
3211
struct vmcs_hdr hdr;
3212
3213
if (vmcs12->vmcs_link_pointer == INVALID_GPA)
3214
return 0;
3215
3216
if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
3217
return -EINVAL;
3218
3219
if (ghc->gpa != vmcs12->vmcs_link_pointer &&
3220
CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
3221
vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
3222
return -EINVAL;
3223
3224
if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
3225
offsetof(struct vmcs12, hdr),
3226
sizeof(hdr))))
3227
return -EINVAL;
3228
3229
if (CC(hdr.revision_id != VMCS12_REVISION) ||
3230
CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
3231
return -EINVAL;
3232
3233
return 0;
3234
}
3235
3236
/*
3237
* Checks related to Guest Non-register State
3238
*/
3239
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
3240
{
3241
if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
3242
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
3243
vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
3244
return -EINVAL;
3245
3246
return 0;
3247
}
3248
3249
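/*
 * Consistency checks on vmcs12's guest-state area that KVM performs in
 * software prior to VM-Enter.  Returns -EINVAL on failure and reports the
 * failure class via @entry_failure_code.
 */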
static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
3250
struct vmcs12 *vmcs12,
3251
enum vm_entry_failure_code *entry_failure_code)
3252
{
3253
bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
3254
3255
*entry_failure_code = ENTRY_FAIL_DEFAULT;
3256
3257
if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
3258
CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
3259
return -EINVAL;
3260
3261
if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP)))
3262
return -EINVAL;
3263
3264
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
3265
(CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
3266
CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
3267
return -EINVAL;
3268
3269
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
3270
CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
3271
return -EINVAL;
3272
3273
if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
3274
*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
3275
return -EINVAL;
3276
}
3277
3278
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3279
CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3280
vmcs12->guest_ia32_perf_global_ctrl)))
3281
return -EINVAL;
3282
3283
if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
3284
return -EINVAL;
3285
3286
if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
3287
CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
3288
return -EINVAL;
3289
3290
/*
3291
* If the load IA32_EFER VM-entry control is 1, the following checks
3292
* are performed on the field for the IA32_EFER MSR:
3293
* - Bits reserved in the IA32_EFER MSR must be 0.
3294
* - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3295
* the IA-32e mode guest VM-entry control. It must also be identical
3296
* to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3297
* CR0.PG) is 1.
3298
*/
3299
if (to_vmx(vcpu)->nested.nested_run_pending &&
3300
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3301
if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3302
CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3303
CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3304
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
3305
return -EINVAL;
3306
}
3307
3308
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
3309
(CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3310
CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
3311
return -EINVAL;
3312
3313
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) {
3314
if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet,
3315
vmcs12->guest_ssp,
3316
vmcs12->guest_ssp_tbl))
3317
return -EINVAL;
3318
3319
/*
3320
* Guest SSP must have 63:N bits identical, rather than
3321
* be canonical (i.e., 63:N-1 bits identical), where N is
3322
* the CPU's maximum linear-address width. Similar to
3323
* is_noncanonical_msr_address(), use the host's
3324
* linear-address width.
3325
*/
3326
if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1)))
3327
return -EINVAL;
3328
}
3329
3330
if (nested_check_guest_non_reg_state(vmcs12))
3331
return -EINVAL;
3332
3333
return 0;
3334
}
3335
3336
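/*
 * If nested_early_check is enabled, use the CPU to perform the consistency
 * checks that KVM does not emulate: do a throwaway VM-Enter of vmcs02 with a
 * deliberately invalid GUEST_RFLAGS and report any resulting VM-Fail.
 */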
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
3337
{
3338
struct vcpu_vmx *vmx = to_vmx(vcpu);
3339
unsigned long cr3, cr4;
3340
bool vm_fail;
3341
3342
if (!nested_early_check)
3343
return 0;
3344
3345
if (vmx->msr_autoload.host.nr)
3346
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3347
if (vmx->msr_autoload.guest.nr)
3348
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3349
3350
preempt_disable();
3351
3352
vmx_prepare_switch_to_guest(vcpu);
3353
3354
/*
3355
* Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3356
* which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
3357
* be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3358
* there is no need to preserve other bits or save/restore the field.
3359
*/
3360
vmcs_writel(GUEST_RFLAGS, 0);
3361
3362
cr3 = __get_current_cr3_fast();
3363
if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3364
vmcs_writel(HOST_CR3, cr3);
3365
vmx->loaded_vmcs->host_state.cr3 = cr3;
3366
}
3367
3368
cr4 = cr4_read_shadow();
3369
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3370
vmcs_writel(HOST_CR4, cr4);
3371
vmx->loaded_vmcs->host_state.cr4 = cr4;
3372
}
3373
3374
vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3375
__vmx_vcpu_run_flags(vmx));
3376
3377
if (vmx->msr_autoload.host.nr)
3378
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3379
if (vmx->msr_autoload.guest.nr)
3380
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3381
3382
if (vm_fail) {
3383
u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3384
3385
preempt_enable();
3386
3387
trace_kvm_nested_vmenter_failed(
3388
"early hardware check VM-instruction error: ", error);
3389
WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3390
return 1;
3391
}
3392
3393
/*
3394
* VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3395
*/
3396
if (hw_breakpoint_active())
3397
set_debugreg(__this_cpu_read(cpu_dr7), 7);
3398
local_irq_enable();
3399
preempt_enable();
3400
3401
/*
3402
* A non-failing VMEntry means we somehow entered guest mode with
3403
* an illegal RIP, and that's just the tip of the iceberg. There
3404
* is no telling what memory has been modified or what state has
3405
* been exposed to unknown code. Hitting this all but guarantees
3406
* a (very critical) hardware issue.
3407
*/
3408
WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3409
VMX_EXIT_REASONS_FAILED_VMENTRY));
3410
3411
return 0;
3412
}
3413
3414
#ifdef CONFIG_KVM_HYPERV
3415
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
3416
{
3417
struct vcpu_vmx *vmx = to_vmx(vcpu);
3418
3419
/*
3420
* hv_evmcs may end up not being mapped after migration (when
3421
* L2 was running), map it here to make sure vmcs12 changes are
3422
* properly reflected.
3423
*/
3424
if (guest_cpu_cap_has_evmcs(vcpu) &&
3425
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
3426
enum nested_evmptrld_status evmptrld_status =
3427
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3428
3429
if (evmptrld_status == EVMPTRLD_VMFAIL ||
3430
evmptrld_status == EVMPTRLD_ERROR)
3431
return false;
3432
3433
/*
3434
* Post migration, vmcs12 always provides the most up-to-date
3435
* information; copy it to the eVMCS upon entry.
3436
*/
3437
vmx->nested.need_vmcs12_to_shadow_sync = true;
3438
}
3439
3440
return true;
3441
}
3442
#endif
3443
3444
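/*
 * Map the guest pages referenced by vmcs12 (APIC-access page, virtual-APIC
 * page, posted-interrupt descriptor) so that their host addresses can be
 * written into vmcs02, and enable the MSR bitmap only if a merged bitmap was
 * successfully prepared.
 */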
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3445
{
3446
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3447
struct vcpu_vmx *vmx = to_vmx(vcpu);
3448
struct kvm_host_map *map;
3449
3450
if (!vcpu->arch.pdptrs_from_userspace &&
3451
!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3452
/*
3453
* Reload the guest's PDPTRs since after a migration
3454
* the guest CR3 might be restored prior to setting the nested
3455
* state which can lead to a load of wrong PDPTRs.
3456
*/
3457
if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
3458
return false;
3459
}
3460
3461
3462
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3463
map = &vmx->nested.apic_access_page_map;
3464
3465
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
3466
vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
3467
} else {
3468
pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
3469
__func__);
3470
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3471
vcpu->run->internal.suberror =
3472
KVM_INTERNAL_ERROR_EMULATION;
3473
vcpu->run->internal.ndata = 0;
3474
return false;
3475
}
3476
}
3477
3478
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3479
map = &vmx->nested.virtual_apic_map;
3480
3481
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3482
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3483
} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3484
nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3485
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3486
/*
3487
* The processor will never use the TPR shadow, simply
3488
* clear the bit from the execution control. Such a
3489
* configuration is useless, but it happens in tests.
3490
* For any other configuration, failing the vm entry is
3491
* _not_ what the processor does but it's basically the
3492
* only possibility we have.
3493
*/
3494
exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3495
} else {
3496
/*
3497
* Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3498
* force VM-Entry to fail.
3499
*/
3500
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
3501
}
3502
}
3503
3504
if (nested_cpu_has_posted_intr(vmcs12)) {
3505
map = &vmx->nested.pi_desc_map;
3506
3507
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3508
vmx->nested.pi_desc =
3509
(struct pi_desc *)(((void *)map->hva) +
3510
offset_in_page(vmcs12->posted_intr_desc_addr));
3511
vmcs_write64(POSTED_INTR_DESC_ADDR,
3512
pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3513
} else {
3514
/*
3515
* Defer the KVM_INTERNAL_EXIT until KVM tries to
3516
* access the contents of the VMCS12 posted interrupt
3517
* descriptor. (Note that KVM may do this when it
3518
* should not, per the architectural specification.)
3519
*/
3520
vmx->nested.pi_desc = NULL;
3521
pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
3522
}
3523
}
3524
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3525
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3526
else
3527
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3528
3529
return true;
3530
}
3531
3532
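/*
 * KVM_REQ_GET_NESTED_STATE_PAGES handler: (re)map the eVMCS and the guest
 * pages referenced by vmcs12 once nested state has been restored.
 */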
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3533
{
3534
#ifdef CONFIG_KVM_HYPERV
3535
/*
3536
* Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
3537
* in 'struct kvm_vcpu_hv' in case eVMCS is in use; this is mandatory
3538
* to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
3539
* migration.
3540
*/
3541
if (!nested_get_evmcs_page(vcpu)) {
3542
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3543
__func__);
3544
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3545
vcpu->run->internal.suberror =
3546
KVM_INTERNAL_ERROR_EMULATION;
3547
vcpu->run->internal.ndata = 0;
3548
3549
return false;
3550
}
3551
#endif
3552
3553
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3554
return false;
3555
3556
return true;
3557
}
3558
3559
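/*
 * Emulate a PML (Page Modification Logging) entry write on behalf of L2:
 * record the dirtied GPA in the vmcs12 PML buffer, or flag a PML-full exit
 * to L1 if the guest's PML index has been exhausted.
 */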
static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3560
{
3561
struct vmcs12 *vmcs12;
3562
struct vcpu_vmx *vmx = to_vmx(vcpu);
3563
gpa_t dst;
3564
3565
if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3566
return 0;
3567
3568
if (WARN_ON_ONCE(vmx->nested.pml_full))
3569
return 1;
3570
3571
/*
3572
* Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3573
* set is already checked as part of A/D emulation.
3574
*/
3575
vmcs12 = get_vmcs12(vcpu);
3576
if (!nested_cpu_has_pml(vmcs12))
3577
return 0;
3578
3579
if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
3580
vmx->nested.pml_full = true;
3581
return 1;
3582
}
3583
3584
gpa &= ~0xFFFull;
3585
dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3586
3587
if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3588
offset_in_page(dst), sizeof(gpa)))
3589
return 0;
3590
3591
vmcs12->guest_pml_index--;
3592
3593
return 0;
3594
}
3595
3596
/*
3597
* Intel's VMX Instruction Reference specifies a common set of prerequisites
3598
* for running VMX instructions (except VMXON, whose prerequisites are
3599
* slightly different). It also specifies what exception to inject otherwise.
3600
* Note that many of these exceptions have priority over VM exits, so they
3601
* don't have to be checked again here.
3602
*/
3603
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3604
{
3605
if (!to_vmx(vcpu)->nested.vmxon) {
3606
kvm_queue_exception(vcpu, UD_VECTOR);
3607
return 0;
3608
}
3609
3610
if (vmx_get_cpl(vcpu)) {
3611
kvm_inject_gp(vcpu, 0);
3612
return 0;
3613
}
3614
3615
return 1;
3616
}
3617
3618
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3619
struct vmcs12 *vmcs12);
3620
3621
/*
3622
* If from_vmentry is false, this is being called from state restore (either RSM
3623
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
3624
*
3625
* Returns:
3626
* NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3627
* NVMX_VMENTRY_VMFAIL: Consistency check VMFail
3628
* NVMX_VMENTRY_VMEXIT: Consistency check VMExit
3629
* NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3630
*/
3631
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3632
bool from_vmentry)
3633
{
3634
struct vcpu_vmx *vmx = to_vmx(vcpu);
3635
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3636
enum vm_entry_failure_code entry_failure_code;
3637
union vmx_exit_reason exit_reason = {
3638
.basic = EXIT_REASON_INVALID_STATE,
3639
.failed_vmentry = 1,
3640
};
3641
u32 failed_index;
3642
3643
trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
3644
vmx->nested.current_vmptr,
3645
vmcs12->guest_rip,
3646
vmcs12->guest_intr_status,
3647
vmcs12->vm_entry_intr_info_field,
3648
vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
3649
vmcs12->ept_pointer,
3650
vmcs12->guest_cr3,
3651
KVM_ISA_VMX);
3652
3653
kvm_service_local_tlb_flush_requests(vcpu);
3654
3655
if (!vmx->nested.nested_run_pending ||
3656
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3657
vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
3658
if (kvm_mpx_supported() &&
3659
(!vmx->nested.nested_run_pending ||
3660
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
3661
vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3662
3663
if (!vmx->nested.nested_run_pending ||
3664
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
3665
vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet,
3666
&vmx->nested.pre_vmenter_ssp,
3667
&vmx->nested.pre_vmenter_ssp_tbl);
3668
3669
/*
3670
* Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3671
* nested early checks are disabled. In the event of a "late" VM-Fail,
3672
* i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3673
* software model to the pre-VMEntry host state. When EPT is disabled,
3674
* GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3675
* nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3676
* vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3677
* the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3678
* VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3679
* guaranteed to be overwritten with a shadow CR3 prior to re-entering
3680
* L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3681
* KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3682
* pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3683
* path would need to manually save/restore vmcs01.GUEST_CR3.
3684
*/
3685
if (!enable_ept && !nested_early_check)
3686
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3687
3688
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3689
3690
prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
3691
3692
if (from_vmentry) {
3693
if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3694
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3695
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3696
}
3697
3698
if (nested_vmx_check_vmentry_hw(vcpu)) {
3699
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3700
return NVMX_VMENTRY_VMFAIL;
3701
}
3702
3703
if (nested_vmx_check_guest_state(vcpu, vmcs12,
3704
&entry_failure_code)) {
3705
exit_reason.basic = EXIT_REASON_INVALID_STATE;
3706
vmcs12->exit_qualification = entry_failure_code;
3707
goto vmentry_fail_vmexit;
3708
}
3709
}
3710
3711
enter_guest_mode(vcpu);
3712
3713
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
3714
exit_reason.basic = EXIT_REASON_INVALID_STATE;
3715
vmcs12->exit_qualification = entry_failure_code;
3716
goto vmentry_fail_vmexit_guest_mode;
3717
}
3718
3719
if (from_vmentry) {
3720
failed_index = nested_vmx_load_msr(vcpu,
3721
vmcs12->vm_entry_msr_load_addr,
3722
vmcs12->vm_entry_msr_load_count);
3723
if (failed_index) {
3724
exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3725
vmcs12->exit_qualification = failed_index;
3726
goto vmentry_fail_vmexit_guest_mode;
3727
}
3728
} else {
3729
/*
3730
* The MMU is not initialized to point at the right entities yet and
3731
* "get pages" would need to read data from the guest (i.e. we will
3732
* need to perform gpa to hpa translation). Request a call
3733
* to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3734
* have already been set at vmentry time and should not be reset.
3735
*/
3736
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3737
}
3738
3739
/*
3740
* Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
3741
* when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
3742
* effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
3743
* unconditionally. Take care to pull data from vmcs01 as appropriate,
3744
* e.g. when checking for interrupt windows, as vmcs02 is now loaded.
3745
*/
3746
if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
3747
CPU_BASED_NMI_WINDOW_EXITING)) ||
3748
kvm_apic_has_pending_init_or_sipi(vcpu) ||
3749
kvm_apic_has_interrupt(vcpu))
3750
kvm_make_request(KVM_REQ_EVENT, vcpu);
3751
3752
/*
3753
* Do not start the preemption timer hrtimer until after we know
3754
* we are successful, so that only nested_vmx_vmexit needs to cancel
3755
* the timer.
3756
*/
3757
vmx->nested.preemption_timer_expired = false;
3758
if (nested_cpu_has_preemption_timer(vmcs12)) {
3759
u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3760
vmx_start_preemption_timer(vcpu, timer_value);
3761
}
3762
3763
/*
3764
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3765
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3766
* returned as far as L1 is concerned. It will only return (and set
3767
* the success flag) when L2 exits (see nested_vmx_vmexit()).
3768
*/
3769
return NVMX_VMENTRY_SUCCESS;
3770
3771
/*
3772
* A failed consistency check that leads to a VMExit during L1's
3773
* VMEnter to L2 is a variation of a normal VMexit, as explained in
3774
* 26.7 "VM-entry failures during or after loading guest state".
3775
*/
3776
vmentry_fail_vmexit_guest_mode:
3777
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3778
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3779
leave_guest_mode(vcpu);
3780
3781
vmentry_fail_vmexit:
3782
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3783
3784
if (!from_vmentry)
3785
return NVMX_VMENTRY_VMEXIT;
3786
3787
load_vmcs12_host_state(vcpu, vmcs12);
3788
vmcs12->vm_exit_reason = exit_reason.full;
3789
if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
3790
vmx->nested.need_vmcs12_to_shadow_sync = true;
3791
return NVMX_VMENTRY_VMEXIT;
3792
}
3793
3794
/*
3795
* nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3796
* for running an L2 nested guest.
3797
*/
3798
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3799
{
3800
struct vmcs12 *vmcs12;
3801
enum nvmx_vmentry_status status;
3802
struct vcpu_vmx *vmx = to_vmx(vcpu);
3803
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3804
enum nested_evmptrld_status evmptrld_status;
3805
3806
if (!nested_vmx_check_permission(vcpu))
3807
return 1;
3808
3809
evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3810
if (evmptrld_status == EVMPTRLD_ERROR) {
3811
kvm_queue_exception(vcpu, UD_VECTOR);
3812
return 1;
3813
}
3814
3815
kvm_pmu_branch_retired(vcpu);
3816
3817
if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
3818
return nested_vmx_failInvalid(vcpu);
3819
3820
if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
3821
vmx->nested.current_vmptr == INVALID_GPA))
3822
return nested_vmx_failInvalid(vcpu);
3823
3824
vmcs12 = get_vmcs12(vcpu);
3825
3826
/*
3827
* Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3828
* that there *is* a valid VMCS pointer, RFLAGS.CF is set
3829
* rather than RFLAGS.ZF, and no error number is stored to the
3830
* VM-instruction error field.
3831
*/
3832
if (CC(vmcs12->hdr.shadow_vmcs))
3833
return nested_vmx_failInvalid(vcpu);
3834
3835
if (nested_vmx_is_evmptr12_valid(vmx)) {
3836
struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
3837
3838
copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
3839
/* Enlightened VMCS doesn't have launch state */
3840
vmcs12->launch_state = !launch;
3841
} else if (enable_shadow_vmcs) {
3842
copy_shadow_to_vmcs12(vmx);
3843
}
3844
3845
/*
3846
* The nested entry process starts with enforcing various prerequisites
3847
* on vmcs12 as required by the Intel SDM, and acts appropriately when
3848
* they fail: As the SDM explains, some conditions should cause the
3849
* instruction to fail, while others will cause the instruction to seem
3850
* to succeed, but return an EXIT_REASON_INVALID_STATE.
3851
* To speed up the normal (success) code path, we should avoid checking
3852
* for misconfigurations that will be caught anyway by the processor
3853
* when using the merged vmcs02.
3854
*/
3855
if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
3856
return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3857
3858
if (CC(vmcs12->launch_state == launch))
3859
return nested_vmx_fail(vcpu,
3860
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3861
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3862
3863
if (nested_vmx_check_controls(vcpu, vmcs12))
3864
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3865
3866
if (nested_vmx_check_address_space_size(vcpu, vmcs12))
3867
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3868
3869
if (nested_vmx_check_host_state(vcpu, vmcs12))
3870
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3871
3872
/*
3873
* We're finally done with prerequisite checking, and can start with
3874
* the nested entry.
3875
*/
3876
vmx->nested.nested_run_pending = 1;
3877
vmx->nested.has_preemption_timer_deadline = false;
3878
status = nested_vmx_enter_non_root_mode(vcpu, true);
3879
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3880
goto vmentry_failed;
3881
3882
/* Hide L1D cache contents from the nested guest. */
3883
vmx->vcpu.arch.l1tf_flush_l1d = true;
3884
3885
/*
3886
* Must happen outside of nested_vmx_enter_non_root_mode() as it will
3887
* also be used as part of restoring nVMX state for
3888
* snapshot restore (migration).
3889
*
3890
* In this flow, it is assumed that vmcs12 cache was
3891
* transferred as part of captured nVMX state and should
3892
* therefore not be read from guest memory (which may not
3893
* exist on destination host yet).
3894
*/
3895
nested_cache_shadow_vmcs12(vcpu, vmcs12);
3896
3897
switch (vmcs12->guest_activity_state) {
3898
case GUEST_ACTIVITY_HLT:
3899
/*
3900
* If we're entering a halted L2 vcpu and the L2 vcpu won't be
3901
* awakened by event injection or by an NMI-window VM-exit or
3902
* by an interrupt-window VM-exit, halt the vcpu.
3903
*/
3904
if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3905
!nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3906
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3907
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3908
vmx->nested.nested_run_pending = 0;
3909
return kvm_emulate_halt_noskip(vcpu);
3910
}
3911
break;
3912
case GUEST_ACTIVITY_WAIT_SIPI:
3913
vmx->nested.nested_run_pending = 0;
3914
kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
3915
break;
3916
default:
3917
break;
3918
}
3919
3920
return 1;
3921
3922
vmentry_failed:
3923
vmx->nested.nested_run_pending = 0;
3924
if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3925
return 0;
3926
if (status == NVMX_VMENTRY_VMEXIT)
3927
return 1;
3928
WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3929
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3930
}
3931
3932
/*
3933
* On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3934
* because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3935
* This function returns the new value we should put in vmcs12.guest_cr0.
3936
* It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3937
* 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3938
* available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3939
* didn't trap the bit, because if L1 did, so would L0).
3940
* 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3941
* been modified by L2, and L1 knows it. So just leave the old value of
3942
* the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3943
* isn't relevant, because if L0 traps this bit it can set it to anything.
3944
* 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3945
* changed these bits, and therefore they need to be updated, but L0
3946
* didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3947
* put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3948
*/
3949
static inline unsigned long
3950
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3951
{
3952
return
3953
/*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3954
/*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3955
/*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3956
vcpu->arch.cr0_guest_owned_bits));
3957
}
3958
3959
static inline unsigned long
3960
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3961
{
3962
return
3963
/*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3964
/*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3965
/*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3966
vcpu->arch.cr4_guest_owned_bits));
3967
}
3968
3969
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3970
struct vmcs12 *vmcs12,
3971
u32 vm_exit_reason, u32 exit_intr_info)
3972
{
3973
u32 idt_vectoring;
3974
unsigned int nr;
3975
3976
/*
3977
* Per the SDM, VM-Exits due to double and triple faults are never
3978
* considered to occur during event delivery, even if the double/triple
3979
* fault is the result of an escalating vectoring issue.
3980
*
3981
* Note, the SDM qualifies the double fault behavior with "The original
3982
* event results in a double-fault exception". It's unclear why the
3983
* qualification exists since exits due to double fault can occur only
3984
* while vectoring a different exception (injected events are never
3985
* subject to interception), i.e. there's _always_ an original event.
3986
*
3987
* The SDM also uses NMI as a confusing example for the "original event
3988
* causes the VM exit directly" clause. NMI isn't special in any way,
3989
* the same rule applies to all events that cause an exit directly.
3990
* NMI is an odd choice for the example because NMIs can only occur on
3991
* instruction boundaries, i.e. they _can't_ occur during vectoring.
3992
*/
3993
if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
3994
((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
3995
is_double_fault(exit_intr_info))) {
3996
vmcs12->idt_vectoring_info_field = 0;
3997
} else if (vcpu->arch.exception.injected) {
3998
nr = vcpu->arch.exception.vector;
3999
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
4000
4001
if (kvm_exception_is_soft(nr)) {
4002
vmcs12->vm_exit_instruction_len =
4003
vcpu->arch.event_exit_inst_len;
4004
idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
4005
} else
4006
idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
4007
4008
if (vcpu->arch.exception.has_error_code) {
4009
idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
4010
vmcs12->idt_vectoring_error_code =
4011
vcpu->arch.exception.error_code;
4012
}
4013
4014
vmcs12->idt_vectoring_info_field = idt_vectoring;
4015
} else if (vcpu->arch.nmi_injected) {
4016
vmcs12->idt_vectoring_info_field =
4017
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
4018
} else if (vcpu->arch.interrupt.injected) {
4019
nr = vcpu->arch.interrupt.nr;
4020
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
4021
4022
if (vcpu->arch.interrupt.soft) {
4023
idt_vectoring |= INTR_TYPE_SOFT_INTR;
4024
vmcs12->vm_entry_instruction_len =
4025
vcpu->arch.event_exit_inst_len;
4026
} else
4027
idt_vectoring |= INTR_TYPE_EXT_INTR;
4028
4029
vmcs12->idt_vectoring_info_field = idt_vectoring;
4030
} else {
4031
vmcs12->idt_vectoring_info_field = 0;
4032
}
4033
}
4034
4035
4036
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
4037
{
4038
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4039
gfn_t gfn;
4040
4041
/*
4042
* Don't need to mark the APIC access page dirty; it is never
4043
* written to by the CPU during APIC virtualization.
4044
*/
4045
4046
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
4047
gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
4048
kvm_vcpu_mark_page_dirty(vcpu, gfn);
4049
}
4050
4051
if (nested_cpu_has_posted_intr(vmcs12)) {
4052
gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
4053
kvm_vcpu_mark_page_dirty(vcpu, gfn);
4054
}
4055
}
4056
4057
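/*
 * Process a posted interrupt that is pending for L2: if the descriptor's ON
 * bit is set, merge its PIR into L2's virtual-APIC page and bump
 * GUEST_INTR_STATUS (RVI) if the new vector has higher priority.
 */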
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4058
{
4059
struct vcpu_vmx *vmx = to_vmx(vcpu);
4060
int max_irr;
4061
void *vapic_page;
4062
u16 status;
4063
4064
if (!vmx->nested.pi_pending)
4065
return 0;
4066
4067
if (!vmx->nested.pi_desc)
4068
goto mmio_needed;
4069
4070
vmx->nested.pi_pending = false;
4071
4072
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
4073
return 0;
4074
4075
max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
4076
if (max_irr > 0) {
4077
vapic_page = vmx->nested.virtual_apic_map.hva;
4078
if (!vapic_page)
4079
goto mmio_needed;
4080
4081
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
4082
vapic_page, &max_irr);
4083
status = vmcs_read16(GUEST_INTR_STATUS);
4084
if ((u8)max_irr > ((u8)status & 0xff)) {
4085
status &= ~0xff;
4086
status |= (u8)max_irr;
4087
vmcs_write16(GUEST_INTR_STATUS, status);
4088
}
4089
}
4090
4091
nested_mark_vmcs12_pages_dirty(vcpu);
4092
return 0;
4093
4094
mmio_needed:
4095
kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
4096
return -ENXIO;
4097
}
4098
4099
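/*
 * Reflect the exception queued in vcpu->arch.exception_vmexit to L1 as a
 * synthesized EXIT_REASON_EXCEPTION_NMI VM-Exit, deriving the interruption
 * info and exit qualification from the queued exception.
 */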
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
4100
{
4101
struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
4102
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
4103
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4104
unsigned long exit_qual;
4105
4106
if (ex->has_payload) {
4107
exit_qual = ex->payload;
4108
} else if (ex->vector == PF_VECTOR) {
4109
exit_qual = vcpu->arch.cr2;
4110
} else if (ex->vector == DB_VECTOR) {
4111
exit_qual = vcpu->arch.dr6;
4112
exit_qual &= ~DR6_BT;
4113
exit_qual ^= DR6_ACTIVE_LOW;
4114
} else {
4115
exit_qual = 0;
4116
}
4117
4118
/*
4119
* Unlike AMD's Paged Real Mode, which reports an error code on #PF
4120
* VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
4121
* "has error code" flags on VM-Exit if the CPU is in Real Mode.
4122
*/
4123
if (ex->has_error_code && is_protmode(vcpu)) {
4124
/*
4125
* Intel CPUs do not generate error codes with bits 31:16 set,
4126
* and more importantly VMX disallows setting bits 31:16 in the
4127
* injected error code for VM-Entry. Drop the bits to mimic
4128
* hardware and avoid inducing failure on nested VM-Entry if L1
4129
* chooses to inject the exception back to L2. AMD CPUs _do_
4130
* generate "full" 32-bit error codes, so KVM allows userspace
4131
* to inject exception error codes with bits 31:16 set.
4132
*/
4133
vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
4134
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
4135
}
4136
4137
if (kvm_exception_is_soft(ex->vector))
4138
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
4139
else
4140
intr_info |= INTR_TYPE_HARD_EXCEPTION;
4141
4142
if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
4143
vmx_get_nmi_mask(vcpu))
4144
intr_info |= INTR_INFO_UNBLOCK_NMI;
4145
4146
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
4147
}
4148
4149
/*
4150
* Returns a non-zero payload if a debug trap is (likely) pending delivery. Infer the class
4151
* of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
4152
* Using the payload is flawed because code breakpoints (fault-like) and data
4153
* breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
4154
* this will return false positives if a to-be-injected code breakpoint #DB is
4155
* pending (from KVM's perspective, but not "pending" across an instruction
4156
* boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
4157
* too is trap-like.
4158
*
4159
* KVM "works" despite these flaws as ICEBP isn't currently supported by the
4160
* emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
4161
* #DB has already happened), and MTF isn't marked pending on code breakpoints
4162
* from the emulator (because such #DBs are fault-like and thus don't trigger
4163
* actions that fire on instruction retire).
4164
*/
4165
static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
4166
{
4167
if (!ex->pending || ex->vector != DB_VECTOR)
4168
return 0;
4169
4170
/* General Detect #DBs are always fault-like. */
4171
return ex->payload & ~DR6_BD;
4172
}
4173
4174
/*
4175
* Returns true if there's a pending #DB exception that is lower priority than
4176
* a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
4177
* KVM, but could theoretically be injected by userspace. Note, this code is
4178
* imperfect, see above.
4179
*/
4180
static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
4181
{
4182
return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
4183
}
4184
4185
/*
4186
* Certain VM-exits set the 'pending debug exceptions' field to indicate a
4187
* recognized #DB (data or single-step) that has yet to be delivered. Since KVM
4188
* represents these debug traps with a payload that is said to be compatible
4189
* with the 'pending debug exceptions' field, write the payload to the VMCS
4190
* field if a VM-exit is delivered before the debug trap.
4191
*/
4192
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
4193
{
4194
unsigned long pending_dbg;
4195
4196
pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
4197
if (pending_dbg)
4198
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
4199
}
4200
4201
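/* True if vmcs12 enables the preemption timer and the emulated timer fired. */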
static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
4202
{
4203
return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
4204
to_vmx(vcpu)->nested.preemption_timer_expired;
4205
}
4206
4207
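/*
 * Returns true if an event specific to VMX non-root operation needs to be
 * serviced, e.g. an expired preemption timer, a pending MTF VM-Exit, or a
 * virtual interrupt that can be delivered to L2 via RVI.
 */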
static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
4208
{
4209
struct vcpu_vmx *vmx = to_vmx(vcpu);
4210
void *vapic = vmx->nested.virtual_apic_map.hva;
4211
int max_irr, vppr;
4212
4213
if (nested_vmx_preemption_timer_pending(vcpu) ||
4214
vmx->nested.mtf_pending)
4215
return true;
4216
4217
/*
4218
* Virtual Interrupt Delivery doesn't require manual injection. Either
4219
* the interrupt is already in GUEST_RVI and will be recognized by the CPU
4220
* at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
4221
* the interrupt from the PIR to RVI prior to entering the guest.
4222
*/
4223
if (for_injection)
4224
return false;
4225
4226
if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4227
__vmx_interrupt_blocked(vcpu))
4228
return false;
4229
4230
if (!vapic)
4231
return false;
4232
4233
vppr = *((u32 *)(vapic + APIC_PROCPRI));
4234
4235
max_irr = vmx_get_rvi();
4236
if ((max_irr & 0xf0) > (vppr & 0xf0))
4237
return true;
4238
4239
if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
4240
pi_test_on(vmx->nested.pi_desc)) {
4241
max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
4242
if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
4243
return true;
4244
}
4245
4246
return false;
4247
}
4248
4249
/*
4250
* Per the Intel SDM's table "Priority Among Concurrent Events", with minor
4251
* edits to fill in missing examples, e.g. #DB due to split-lock accesses,
4252
* and less minor edits to splice in the priority of VMX Non-Root specific
4253
* events, e.g. MTF and NMI/INTR-window exiting.
4254
*
4255
* 1 Hardware Reset and Machine Checks
4256
* - RESET
4257
* - Machine Check
4258
*
4259
* 2 Trap on Task Switch
4260
* - T flag in TSS is set (on task switch)
4261
*
4262
* 3 External Hardware Interventions
4263
* - FLUSH
4264
* - STOPCLK
4265
* - SMI
4266
* - INIT
4267
*
4268
* 3.5 Monitor Trap Flag (MTF) VM-exit[1]
4269
*
4270
* 4 Traps on Previous Instruction
4271
* - Breakpoints
4272
* - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
4273
* breakpoint, or #DB due to a split-lock access)
4274
*
4275
* 4.3 VMX-preemption timer expired VM-exit[2]
4276
*
4277
* 4.6 NMI-window exiting VM-exit[3]
4278
*
4279
* 5 Nonmaskable Interrupts (NMI)
4280
*
4281
* 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
4282
*
4283
* 6 Maskable Hardware Interrupts
4284
*
4285
* 7 Code Breakpoint Fault
4286
*
4287
* 8 Faults from Fetching Next Instruction
4288
* - Code-Segment Limit Violation
4289
* - Code Page Fault
4290
* - Control protection exception (missing ENDBRANCH at target of indirect
4291
* call or jump)
4292
*
4293
* 9 Faults from Decoding Next Instruction
4294
* - Instruction length > 15 bytes
4295
* - Invalid Opcode
4296
* - Coprocessor Not Available
4297
*
4298
*10 Faults on Executing Instruction
4299
* - Overflow
4300
* - Bound error
4301
* - Invalid TSS
4302
* - Segment Not Present
4303
* - Stack fault
4304
* - General Protection
4305
* - Data Page Fault
4306
* - Alignment Check
4307
* - x86 FPU Floating-point exception
4308
* - SIMD floating-point exception
4309
* - Virtualization exception
4310
* - Control protection exception
4311
*
4312
* [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
4313
* INIT signals, and higher priority events take priority over MTF VM exits.
4314
* MTF VM exits take priority over debug-trap exceptions and lower priority
4315
* events.
4316
*
4317
* [2] Debug-trap exceptions and higher priority events take priority over VM exits
4318
* caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
4319
* timer take priority over VM exits caused by the "NMI-window exiting"
4320
* VM-execution control and lower priority events.
4321
*
4322
* [3] Debug-trap exceptions and higher priority events take priority over VM exits
4323
* caused by "NMI-window exiting". VM exits caused by this control take
4324
* priority over non-maskable interrupts (NMIs) and lower priority events.
4325
*
4326
* [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
4327
* the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
4328
* non-maskable interrupts (NMIs) and higher priority events take priority over
4329
* delivery of a virtual interrupt; delivery of a virtual interrupt takes
4330
* priority over external interrupts and lower priority events.
4331
*/
4332
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
4333
{
4334
struct kvm_lapic *apic = vcpu->arch.apic;
4335
struct vcpu_vmx *vmx = to_vmx(vcpu);
4336
/*
4337
* Only a pending nested run blocks a pending exception. If there is a
4338
* previously injected event, the pending exception occurred while said
4339
* event was being delivered and thus needs to be handled.
4340
*/
4341
bool block_nested_exceptions = vmx->nested.nested_run_pending;
4342
/*
4343
* Events that don't require injection, i.e. that are virtualized by
4344
* hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
4345
* to regain control in order to deliver the event, and hardware will
4346
* handle event ordering, e.g. with respect to injected exceptions.
4347
*
4348
* But, new events (not exceptions) are only recognized at instruction
4349
* boundaries. If an event needs reinjection, then KVM is handling a
4350
* VM-Exit that occurred _during_ instruction execution; new events,
4351
* irrespective of whether or not they're injected, are blocked until
4352
* the instruction completes.
4353
*/
4354
bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
4355
/*
4356
* Injected events are blocked by nested VM-Enter, as KVM is responsible
4357
* for managing priority between concurrent events, i.e. KVM needs to
4358
* wait until after VM-Enter completes to deliver injected events.
4359
*/
4360
bool block_nested_events = block_nested_exceptions ||
4361
block_non_injected_events;
4362
4363
if (lapic_in_kernel(vcpu) &&
4364
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
4365
if (block_nested_events)
4366
return -EBUSY;
4367
nested_vmx_update_pending_dbg(vcpu);
4368
clear_bit(KVM_APIC_INIT, &apic->pending_events);
4369
if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
4370
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
4371
4372
/* MTF is discarded if the vCPU is in WFS. */
4373
vmx->nested.mtf_pending = false;
4374
return 0;
4375
}
4376
4377
if (lapic_in_kernel(vcpu) &&
4378
test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
4379
if (block_nested_events)
4380
return -EBUSY;
4381
4382
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
4383
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
4384
nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
4385
apic->sipi_vector & 0xFFUL);
4386
return 0;
4387
}
4388
/* Fallthrough, the SIPI is completely ignored. */
4389
}
4390
4391
/*
4392
* Process exceptions that are higher priority than Monitor Trap Flag:
4393
* fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
4394
* could theoretically come in from userspace), and ICEBP (INT1).
4395
*
4396
* TODO: SMIs have higher priority than MTF and trap-like #DBs (except
4397
* for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
4398
* across SMI/RSM as it should; that needs to be addressed in order to
4399
* prioritize SMI over MTF and trap-like #DBs.
4400
*/
4401
if (vcpu->arch.exception_vmexit.pending &&
4402
!vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
4403
if (block_nested_exceptions)
4404
return -EBUSY;
4405
4406
nested_vmx_inject_exception_vmexit(vcpu);
4407
return 0;
4408
}
4409
4410
if (vcpu->arch.exception.pending &&
4411
!vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
4412
if (block_nested_exceptions)
4413
return -EBUSY;
4414
goto no_vmexit;
4415
}
4416
4417
if (vmx->nested.mtf_pending) {
4418
if (block_nested_events)
4419
return -EBUSY;
4420
nested_vmx_update_pending_dbg(vcpu);
4421
nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
4422
return 0;
4423
}
4424
4425
if (vcpu->arch.exception_vmexit.pending) {
4426
if (block_nested_exceptions)
4427
return -EBUSY;
4428
4429
nested_vmx_inject_exception_vmexit(vcpu);
4430
return 0;
4431
}
4432
4433
if (vcpu->arch.exception.pending) {
4434
if (block_nested_exceptions)
4435
return -EBUSY;
4436
goto no_vmexit;
4437
}
4438
4439
if (nested_vmx_preemption_timer_pending(vcpu)) {
4440
if (block_nested_events)
4441
return -EBUSY;
4442
nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
4443
return 0;
4444
}
4445
4446
if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
4447
if (block_nested_events)
4448
return -EBUSY;
4449
goto no_vmexit;
4450
}
4451
4452
if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
4453
if (block_nested_events)
4454
return -EBUSY;
4455
if (!nested_exit_on_nmi(vcpu))
4456
goto no_vmexit;
4457
4458
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
4459
NMI_VECTOR | INTR_TYPE_NMI_INTR |
4460
INTR_INFO_VALID_MASK, 0);
4461
/*
4462
* The NMI-triggered VM exit counts as injection:
4463
* clear this one and block further NMIs.
4464
*/
4465
vcpu->arch.nmi_pending = 0;
4466
vmx_set_nmi_mask(vcpu, true);
4467
return 0;
4468
}
4469
4470
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
4471
int irq;
4472
4473
if (!nested_exit_on_intr(vcpu)) {
4474
if (block_nested_events)
4475
return -EBUSY;
4476
4477
goto no_vmexit;
4478
}
4479
4480
if (!nested_exit_intr_ack_set(vcpu)) {
4481
if (block_nested_events)
4482
return -EBUSY;
4483
4484
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
4485
return 0;
4486
}
4487
4488
irq = kvm_cpu_get_extint(vcpu);
4489
if (irq != -1) {
4490
if (block_nested_events)
4491
return -EBUSY;
4492
4493
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4494
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
4495
return 0;
4496
}
4497
4498
irq = kvm_apic_has_interrupt(vcpu);
4499
if (WARN_ON_ONCE(irq < 0))
4500
goto no_vmexit;
4501
4502
/*
4503
* If the IRQ is L2's PI notification vector, process posted
4504
* interrupts for L2 instead of injecting VM-Exit, as the
4505
* detection/morphing architecturally occurs when the IRQ is
4506
* delivered to the CPU. Note, only interrupts that are routed
4507
* through the local APIC trigger posted interrupt processing,
4508
* and enabling posted interrupts requires ACK-on-exit.
4509
*/
4510
if (irq == vmx->nested.posted_intr_nv) {
4511
/*
4512
* Nested posted interrupts are delivered via RVI, i.e.
4513
* aren't injected by KVM, and so can be queued even if
4514
* manual event injection is disallowed.
4515
*/
4516
if (block_non_injected_events)
4517
return -EBUSY;
4518
4519
vmx->nested.pi_pending = true;
4520
kvm_apic_clear_irr(vcpu, irq);
4521
goto no_vmexit;
4522
}
4523
4524
if (block_nested_events)
4525
return -EBUSY;
4526
4527
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4528
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
4529
4530
/*
4531
* ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
4532
* be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
4533
* if APICv is active.
4534
*/
4535
kvm_apic_ack_interrupt(vcpu, irq);
4536
return 0;
4537
}
4538
4539
no_vmexit:
4540
return vmx_complete_nested_posted_interrupt(vcpu);
4541
}
4542
4543
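/*
 * Convert the time remaining on the emulated preemption-timer hrtimer back
 * into VMX-preemption timer units, i.e. guest TSC ticks shifted right by the
 * emulated timer rate.
 */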
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
4544
{
4545
ktime_t remaining =
4546
hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
4547
u64 value;
4548
4549
if (ktime_to_ns(remaining) <= 0)
4550
return 0;
4551
4552
value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
4553
do_div(value, 1000000);
4554
return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
4555
}
4556
4557
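/*
 * "Rare" vmcs12 guest-state fields, i.e. fields whose vmcs12 copy is allowed
 * to go stale while vmcs02 is in use and is refreshed on demand via
 * copy_vmcs02_to_vmcs12_rare().
 */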
static bool is_vmcs12_ext_field(unsigned long field)
4558
{
4559
switch (field) {
4560
case GUEST_ES_SELECTOR:
4561
case GUEST_CS_SELECTOR:
4562
case GUEST_SS_SELECTOR:
4563
case GUEST_DS_SELECTOR:
4564
case GUEST_FS_SELECTOR:
4565
case GUEST_GS_SELECTOR:
4566
case GUEST_LDTR_SELECTOR:
4567
case GUEST_TR_SELECTOR:
4568
case GUEST_ES_LIMIT:
4569
case GUEST_CS_LIMIT:
4570
case GUEST_SS_LIMIT:
4571
case GUEST_DS_LIMIT:
4572
case GUEST_FS_LIMIT:
4573
case GUEST_GS_LIMIT:
4574
case GUEST_LDTR_LIMIT:
4575
case GUEST_TR_LIMIT:
4576
case GUEST_GDTR_LIMIT:
4577
case GUEST_IDTR_LIMIT:
4578
case GUEST_ES_AR_BYTES:
4579
case GUEST_DS_AR_BYTES:
4580
case GUEST_FS_AR_BYTES:
4581
case GUEST_GS_AR_BYTES:
4582
case GUEST_LDTR_AR_BYTES:
4583
case GUEST_TR_AR_BYTES:
4584
case GUEST_ES_BASE:
4585
case GUEST_CS_BASE:
4586
case GUEST_SS_BASE:
4587
case GUEST_DS_BASE:
4588
case GUEST_FS_BASE:
4589
case GUEST_GS_BASE:
4590
case GUEST_LDTR_BASE:
4591
case GUEST_TR_BASE:
4592
case GUEST_GDTR_BASE:
4593
case GUEST_IDTR_BASE:
4594
case GUEST_PENDING_DBG_EXCEPTIONS:
4595
case GUEST_BNDCFGS:
4596
return true;
4597
default:
4598
break;
4599
}
4600
4601
return false;
4602
}
4603
4604
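/*
 * Copy the "rare" guest-state fields from the currently loaded VMCS (vmcs02)
 * into vmcs12 and clear the deferred-sync flag.
 */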
static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4605
struct vmcs12 *vmcs12)
4606
{
4607
struct vcpu_vmx *vmx = to_vmx(vcpu);
4608
4609
vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4610
vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4611
vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4612
vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4613
vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4614
vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4615
vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4616
vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4617
vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4618
vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4619
vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4620
vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4621
vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4622
vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4623
vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4624
vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4625
vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4626
vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4627
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
4628
vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4629
vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4630
vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4631
vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4632
vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4633
vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4634
vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4635
vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4636
vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4637
vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4638
vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4639
vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4640
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4641
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4642
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
4643
vmcs12->guest_pending_dbg_exceptions =
4644
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4645
4646
vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4647
}
4648
4649
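/*
 * If the rare fields are stale, temporarily load vmcs02 on this CPU so they
 * can be read, then switch back to vmcs01.
 */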
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4650
struct vmcs12 *vmcs12)
4651
{
4652
struct vcpu_vmx *vmx = to_vmx(vcpu);
4653
int cpu;
4654
4655
if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4656
return;
4657
4658
4659
WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4660
4661
cpu = get_cpu();
4662
vmx->loaded_vmcs = &vmx->nested.vmcs02;
4663
vmx_vcpu_load_vmcs(vcpu, cpu);
4664
4665
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4666
4667
vmx->loaded_vmcs = &vmx->vmcs01;
4668
vmx_vcpu_load_vmcs(vcpu, cpu);
4669
put_cpu();
4670
}
4671
4672
/*
4673
* Update the guest state fields of vmcs12 to reflect changes that
4674
* occurred while L2 was running. (The "IA-32e mode guest" bit of the
4675
* VM-entry controls is also updated, since this is really a guest
4676
* state bit.)
4677
*/
4678
static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4679
{
4680
struct vcpu_vmx *vmx = to_vmx(vcpu);
4681
4682
if (nested_vmx_is_evmptr12_valid(vmx))
4683
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4684
4685
vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4686
!nested_vmx_is_evmptr12_valid(vmx);
4687
4688
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4689
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4690
4691
vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4692
vmcs12->guest_rip = kvm_rip_read(vcpu);
4693
vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4694
4695
vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4696
vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
4697
4698
vmcs12->guest_interruptibility_info =
4699
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
4700
4701
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4702
vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
4703
else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4704
vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
4705
else
4706
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4707
4708
if (nested_cpu_has_preemption_timer(vmcs12) &&
4709
vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4710
!vmx->nested.nested_run_pending)
4711
vmcs12->vmx_preemption_timer_value =
4712
vmx_get_preemption_timer_value(vcpu);
4713
4714
/*
4715
* In some cases (usually, nested EPT), L2 is allowed to change its
4716
* own CR3 without exiting. If it has changed it, we must keep it.
4717
* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4718
* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4719
*
4720
* Additionally, restore L2's PDPTR to vmcs12.
4721
*/
4722
if (enable_ept) {
4723
vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
4724
if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4725
vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4726
vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4727
vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4728
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4729
}
4730
}
4731
4732
vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4733
4734
if (nested_cpu_has_vid(vmcs12))
4735
vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4736
4737
vmcs12->vm_entry_controls =
4738
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4739
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4740
4741
/*
4742
* Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
4743
* Writes to DEBUGCTL that aren't intercepted by L1 are immediately
4744
* propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
4745
* vmcs02 doesn't strictly track vmcs12.
4746
*/
4747
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
4748
vmcs12->guest_dr7 = vcpu->arch.dr7;
4749
4750
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4751
vmcs12->guest_ia32_efer = vcpu->arch.efer;
4752
4753
vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet,
4754
&vmcs12->guest_ssp,
4755
&vmcs12->guest_ssp_tbl);
4756
}
4757
4758
/*
4759
* prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4760
* and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4761
* and this function updates it to reflect the changes to the guest state while
4762
* L2 was running (and perhaps made some exits which were handled directly by L0
4763
* without going back to L1), and to reflect the exit reason.
4764
* Note that we do not have to copy here all VMCS fields, just those that
4765
* could have changed by the L2 guest or the exit - i.e., the guest-state and
4766
* exit-information fields only. Other fields are modified by L1 with VMWRITE,
4767
* which already writes to vmcs12 directly.
4768
*/
4769
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4770
u32 vm_exit_reason, u32 exit_intr_info,
4771
unsigned long exit_qualification, u32 exit_insn_len)
4772
{
4773
/* update exit information fields: */
4774
vmcs12->vm_exit_reason = vm_exit_reason;
4775
if (vmx_get_exit_reason(vcpu).enclave_mode)
4776
vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
4777
vmcs12->exit_qualification = exit_qualification;
4778
4779
/*
4780
* On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
4781
* and only EXIT_REASON and EXIT_QUALIFICATION are updated; all other
4782
* exit info fields are unmodified.
4783
*/
4784
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4785
vmcs12->launch_state = 1;
4786
4787
/* vm_entry_intr_info_field is cleared on exit. Emulate this
4788
* instead of reading the real value. */
4789
vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4790
4791
/*
4792
* Transfer the event that L0 or L1 may have wanted to inject into
4793
* L2 to IDT_VECTORING_INFO_FIELD.
4794
*/
4795
vmcs12_save_pending_event(vcpu, vmcs12,
4796
vm_exit_reason, exit_intr_info);
4797
4798
vmcs12->vm_exit_intr_info = exit_intr_info;
4799
vmcs12->vm_exit_instruction_len = exit_insn_len;
4800
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4801
4802
/*
4803
* According to the spec, there's no need to store the guest's
4804
* MSRs if the exit is due to a VM-entry failure that occurs
4805
* during or after loading the guest state. Since this exit
4806
* does not fall in that category, we need to save the MSRs.
4807
*/
4808
if (nested_vmx_store_msr(vcpu,
4809
vmcs12->vm_exit_msr_store_addr,
4810
vmcs12->vm_exit_msr_store_count))
4811
nested_vmx_abort(vcpu,
4812
VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4813
}
4814
}
4815
4816
/*
4817
* A part of what we need to do when the nested L2 guest exits and we want to
4818
* run its L1 parent is to reset L1's guest state to the host state specified
4819
* in vmcs12.
4820
* This function is to be called not only on normal nested exit, but also on
4821
* a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4822
* Failures During or After Loading Guest State").
4823
* This function should be called when the active VMCS is L1's (vmcs01).
4824
*/
4825
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4826
struct vmcs12 *vmcs12)
4827
{
4828
enum vm_entry_failure_code ignored;
4829
struct kvm_segment seg;
4830
4831
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4832
vcpu->arch.efer = vmcs12->host_ia32_efer;
4833
else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4834
vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4835
else
4836
vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4837
vmx_set_efer(vcpu, vcpu->arch.efer);
4838
4839
kvm_rsp_write(vcpu, vmcs12->host_rsp);
4840
kvm_rip_write(vcpu, vmcs12->host_rip);
4841
vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4842
vmx_set_interrupt_shadow(vcpu, 0);
4843
4844
/*
4845
* Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4846
* actually changed, because vmx_set_cr0 refers to efer set above.
4847
*
4848
* CR0_GUEST_HOST_MASK is already set in the original vmcs01
4849
* (KVM doesn't change it);
4850
*/
4851
vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4852
vmx_set_cr0(vcpu, vmcs12->host_cr0);
4853
4854
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
4855
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4856
vmx_set_cr4(vcpu, vmcs12->host_cr4);
4857
4858
nested_ept_uninit_mmu_context(vcpu);
4859
4860
/*
4861
* Only PDPTE load can fail as the value of cr3 was checked on entry and
4862
* couldn't have changed.
4863
*/
4864
if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
4865
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4866
4867
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
4868
4869
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4870
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4871
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4872
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4873
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
4874
vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4875
vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4876
4877
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
4878
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4879
vmcs_write64(GUEST_BNDCFGS, 0);
4880
4881
/*
4882
* Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set;
4883
* otherwise CET state should be retained across VM-exit, i.e.,
4884
* guest values should be propagated from vmcs12 to vmcs01.
4885
*/
4886
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE)
4887
vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp,
4888
vmcs12->host_ssp_tbl);
4889
else
4890
vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp,
4891
vmcs12->guest_ssp_tbl);
4892
4893
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4894
vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4895
vcpu->arch.pat = vmcs12->host_ia32_pat;
4896
}
4897
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
4898
kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
4899
WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4900
vmcs12->host_ia32_perf_global_ctrl));
4901
4902
/* Set L1 segment info according to Intel SDM
4903
27.5.2 Loading Host Segment and Descriptor-Table Registers */
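/*
 * For reference (illustrative summary of the values used below): type 11 is
 * an accessed execute/read code segment for CS (and a busy TSS for TR, where
 * the S bit is left clear), type 3 is an accessed read/write data segment,
 * and .g = 1 with a 0xFFFFFFFF limit yields a flat 4 GiB segment, per the
 * SDM's host-state loading rules cited above.
 */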
4904
seg = (struct kvm_segment) {
4905
.base = 0,
4906
.limit = 0xFFFFFFFF,
4907
.selector = vmcs12->host_cs_selector,
4908
.type = 11,
4909
.present = 1,
4910
.s = 1,
4911
.g = 1
4912
};
4913
if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4914
seg.l = 1;
4915
else
4916
seg.db = 1;
4917
__vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4918
seg = (struct kvm_segment) {
4919
.base = 0,
4920
.limit = 0xFFFFFFFF,
4921
.type = 3,
4922
.present = 1,
4923
.s = 1,
4924
.db = 1,
4925
.g = 1
4926
};
4927
seg.selector = vmcs12->host_ds_selector;
4928
__vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4929
seg.selector = vmcs12->host_es_selector;
4930
__vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4931
seg.selector = vmcs12->host_ss_selector;
4932
__vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4933
seg.selector = vmcs12->host_fs_selector;
4934
seg.base = vmcs12->host_fs_base;
4935
__vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4936
seg.selector = vmcs12->host_gs_selector;
4937
seg.base = vmcs12->host_gs_base;
4938
__vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4939
seg = (struct kvm_segment) {
4940
.base = vmcs12->host_tr_base,
4941
.limit = 0x67,
4942
.selector = vmcs12->host_tr_selector,
4943
.type = 11,
4944
.present = 1
4945
};
4946
__vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4947
4948
memset(&seg, 0, sizeof(seg));
4949
seg.unusable = 1;
4950
__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
4951
4952
kvm_set_dr(vcpu, 7, 0x400);
4953
vmx_guest_debugctl_write(vcpu, 0);
4954
4955
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4956
vmcs12->vm_exit_msr_load_count))
4957
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4958
4959
to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu);
4960
}
4961
4962
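/*
 * Note (summary of the fallback chain below): vmcs01's guest EFER is taken
 * from vmcs02's GUEST_IA32_EFER if the entry controls load EFER, else from
 * kvm_host.efer if the CPU has the dedicated EFER load controls, else from
 * the MSR autoload list, else from the user-return MSR slot, with
 * kvm_host.efer as the final fallback.
 */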
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4963
{
4964
struct vmx_uret_msr *efer_msr;
4965
unsigned int i;
4966
4967
if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4968
return vmcs_read64(GUEST_IA32_EFER);
4969
4970
if (cpu_has_load_ia32_efer())
4971
return kvm_host.efer;
4972
4973
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4974
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4975
return vmx->msr_autoload.guest.val[i].value;
4976
}
4977
4978
efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
4979
if (efer_msr)
4980
return efer_msr->data;
4981
4982
return kvm_host.efer;
4983
}
4984
4985
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4986
{
4987
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4988
struct vcpu_vmx *vmx = to_vmx(vcpu);
4989
struct vmx_msr_entry g, h;
4990
gpa_t gpa;
4991
u32 i, j;
4992
4993
vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4994
4995
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4996
/*
4997
* L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4998
* as vmcs01.GUEST_DR7 contains a userspace defined value
4999
* and vcpu->arch.dr7 is not squirreled away before the
5000
* nested VMENTER (not worth adding a variable in nested_vmx).
5001
*/
5002
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
5003
kvm_set_dr(vcpu, 7, DR7_FIXED_1);
5004
else
5005
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
5006
}
5007
5008
/* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
5009
vmx_reload_guest_debugctl(vcpu);
5010
5011
/*
5012
* Note that calling vmx_set_{efer,cr0,cr4} is important as they
5013
* handle a variety of side effects to KVM's software model.
5014
*/
5015
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
5016
5017
vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
5018
vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
5019
5020
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
5021
vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
5022
5023
nested_ept_uninit_mmu_context(vcpu);
5024
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
5025
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
5026
5027
/*
5028
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
5029
* from vmcs01 (if necessary). The PDPTRs are not loaded on
5030
* VMFail; like everything else, we just need to ensure our
5031
* software model is up-to-date.
5032
*/
5033
if (enable_ept && is_pae_paging(vcpu))
5034
ept_save_pdptrs(vcpu);
5035
5036
kvm_mmu_reset_context(vcpu);
5037
5038
/*
5039
* This nasty bit of open coding is a compromise between blindly
5040
* loading L1's MSRs using the exit load lists (incorrect emulation
5041
* of VMFail), leaving the nested VM's MSRs in the software model
5042
* (incorrect behavior) and snapshotting the modified MSRs (too
5043
* expensive since the lists are not bounded by hardware). For each
5044
* MSR that was (prematurely) loaded from the nested VMEntry load
5045
* list, reload it from the exit load list if it exists and differs
5046
* from the guest value. The intent is to stuff host state as
5047
* silently as possible, not to fully process the exit load list.
5048
*/
5049
for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
5050
gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
5051
if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
5052
pr_debug_ratelimited(
5053
"%s read MSR index failed (%u, 0x%08llx)\n",
5054
__func__, i, gpa);
5055
goto vmabort;
5056
}
5057
5058
for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
5059
gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
5060
if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
5061
pr_debug_ratelimited(
5062
"%s read MSR failed (%u, 0x%08llx)\n",
5063
__func__, j, gpa);
5064
goto vmabort;
5065
}
5066
if (h.index != g.index)
5067
continue;
5068
if (h.value == g.value)
5069
break;
5070
5071
if (nested_vmx_load_msr_check(vcpu, &h)) {
5072
pr_debug_ratelimited(
5073
"%s check failed (%u, 0x%x, 0x%x)\n",
5074
__func__, j, h.index, h.reserved);
5075
goto vmabort;
5076
}
5077
5078
if (kvm_emulate_msr_write(vcpu, h.index, h.value)) {
5079
pr_debug_ratelimited(
5080
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
5081
__func__, j, h.index, h.value);
5082
goto vmabort;
5083
}
5084
}
5085
}
5086
5087
return;
5088
5089
vmabort:
5090
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
5091
}
5092
5093
/*
5094
* Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
5095
* and modify vmcs12 to make it see what it would expect to see there if
5096
* L2 was its real guest. Must only be called when in L2 (is_guest_mode())
5097
*/
5098
void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
5099
u32 exit_intr_info, unsigned long exit_qualification,
5100
u32 exit_insn_len)
5101
{
5102
struct vcpu_vmx *vmx = to_vmx(vcpu);
5103
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5104
5105
/* Pending MTF traps are discarded on VM-Exit. */
5106
vmx->nested.mtf_pending = false;
5107
5108
/* trying to cancel vmlaunch/vmresume is a bug */
5109
WARN_ON_ONCE(vmx->nested.nested_run_pending);
5110
5111
#ifdef CONFIG_KVM_HYPERV
5112
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
5113
/*
5114
* KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
5115
* Enlightened VMCS after migration and we still need to
5116
* do that when something is forcing L2->L1 exit prior to
5117
* the first L2 run.
5118
*/
5119
(void)nested_get_evmcs_page(vcpu);
5120
}
5121
#endif
5122
5123
/* Service pending TLB flush requests for L2 before switching to L1. */
5124
kvm_service_local_tlb_flush_requests(vcpu);
5125
5126
/*
5127
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
5128
* now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
5129
* up-to-date before switching to L1.
5130
*/
5131
if (enable_ept && is_pae_paging(vcpu))
5132
vmx_ept_load_pdptrs(vcpu);
5133
5134
leave_guest_mode(vcpu);
5135
5136
if (nested_cpu_has_preemption_timer(vmcs12))
5137
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
5138
5139
if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
5140
vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
5141
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
5142
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
5143
}
5144
5145
if (likely(!vmx->fail)) {
5146
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5147
5148
if (vm_exit_reason != -1)
5149
prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
5150
exit_intr_info, exit_qualification,
5151
exit_insn_len);
5152
5153
/*
5154
* Must happen outside of sync_vmcs02_to_vmcs12() as it will
5155
* also be used to capture vmcs12 cache as part of
5156
* capturing nVMX state for snapshot (migration).
5157
*
5158
* Otherwise, this flush will dirty guest memory at a
5159
* point it is already assumed by user-space to be
5160
* immutable.
5161
*/
5162
nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
5163
} else {
5164
/*
5165
* The only expected VM-instruction error is "VM entry with
5166
* invalid control field(s)." Anything else indicates a
5167
* problem with L0. And we should never get here with a
5168
* VMFail of any type if early consistency checks are enabled.
5169
*/
5170
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
5171
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5172
WARN_ON_ONCE(nested_early_check);
5173
}
5174
5175
/*
5176
* Drop events/exceptions that were queued for re-injection to L2
5177
* (picked up via vmx_complete_interrupts()), as well as exceptions
5178
* that were pending for L2. Note, this must NOT be hoisted above
5179
* prepare_vmcs12(); events/exceptions queued for re-injection need to
5180
* be captured in vmcs12 (see vmcs12_save_pending_event()).
5181
*/
5182
vcpu->arch.nmi_injected = false;
5183
kvm_clear_exception_queue(vcpu);
5184
kvm_clear_interrupt_queue(vcpu);
5185
5186
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
5187
5188
kvm_nested_vmexit_handle_ibrs(vcpu);
5189
5190
/* Update any VMCS fields that might have changed while L2 ran */
5191
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
5192
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
5193
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
5194
if (kvm_caps.has_tsc_control)
5195
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
5196
5197
if (vmx->nested.l1_tpr_threshold != -1)
5198
vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
5199
5200
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
5201
vmx->nested.change_vmcs01_virtual_apic_mode = false;
5202
vmx_set_virtual_apic_mode(vcpu);
5203
}
5204
5205
if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
5206
vmx->nested.update_vmcs01_cpu_dirty_logging = false;
5207
vmx_update_cpu_dirty_logging(vcpu);
5208
}
5209
5210
nested_put_vmcs12_pages(vcpu);
5211
5212
if (vmx->nested.reload_vmcs01_apic_access_page) {
5213
vmx->nested.reload_vmcs01_apic_access_page = false;
5214
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5215
}
5216
5217
if (vmx->nested.update_vmcs01_apicv_status) {
5218
vmx->nested.update_vmcs01_apicv_status = false;
5219
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
5220
}
5221
5222
if (vmx->nested.update_vmcs01_hwapic_isr) {
5223
vmx->nested.update_vmcs01_hwapic_isr = false;
5224
kvm_apic_update_hwapic_isr(vcpu);
5225
}
5226
5227
if ((vm_exit_reason != -1) &&
5228
(enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
5229
vmx->nested.need_vmcs12_to_shadow_sync = true;
5230
5231
/* in case we halted in L2 */
5232
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
5233
5234
if (likely(!vmx->fail)) {
5235
if (vm_exit_reason != -1)
5236
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
5237
vmcs12->exit_qualification,
5238
vmcs12->idt_vectoring_info_field,
5239
vmcs12->vm_exit_intr_info,
5240
vmcs12->vm_exit_intr_error_code,
5241
KVM_ISA_VMX);
5242
5243
load_vmcs12_host_state(vcpu, vmcs12);
5244
5245
/*
5246
* Process events if an injectable IRQ or NMI is pending, even
5247
* if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
5248
* If an event became pending while L2 was active, KVM needs to
5249
* either inject the event or request an IRQ/NMI window. SMIs
5250
* don't need to be processed as SMM is mutually exclusive with
5251
* non-root mode. INIT/SIPI don't need to be checked as INIT
5252
* is blocked post-VMXON, and SIPIs are ignored.
5253
*/
5254
if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
5255
kvm_make_request(KVM_REQ_EVENT, vcpu);
5256
return;
5257
}
5258
5259
/*
5260
* After an early L2 VM-entry failure, we're now back
5261
* in L1 which thinks it just finished a VMLAUNCH or
5262
* VMRESUME instruction, so we need to set the failure
5263
* flag and the VM-instruction error field of the VMCS
5264
* accordingly, and skip the emulated instruction.
5265
*/
5266
(void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5267
5268
/*
5269
* Restore L1's host state to KVM's software model. We're here
5270
* because a consistency check was caught by hardware, which
5271
* means some amount of guest state has been propagated to KVM's
5272
* model and needs to be unwound to the host's state.
5273
*/
5274
nested_vmx_restore_host_state(vcpu);
5275
5276
vmx->fail = 0;
5277
}
5278
5279
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
5280
{
5281
kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5282
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
5283
}
5284
5285
/*
5286
* Decode the memory-address operand of a vmx instruction, as recorded on an
5287
* exit caused by such an instruction (run by a guest hypervisor).
5288
* On success, returns 0. When the operand is invalid, returns 1 and throws
5289
* #UD, #GP, or #SS.
5290
*/
5291
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
5292
u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
5293
{
5294
gva_t off;
5295
bool exn;
5296
struct kvm_segment s;
5297
5298
/*
5299
* According to Vol. 3B, "Information for VM Exits Due to Instruction
5300
* Execution", on an exit, vmx_instruction_info holds most of the
5301
* addressing components of the operand. Only the displacement part
5302
* is put in exit_qualification (see 3B, "Basic VM-Exit Information").
5303
* For how an actual address is calculated from all these components,
5304
* refer to Vol. 1, "Operand Addressing".
5305
*/
5306
int scaling = vmx_instruction_info & 3;
5307
int addr_size = (vmx_instruction_info >> 7) & 7;
5308
bool is_reg = vmx_instruction_info & (1u << 10);
5309
int seg_reg = (vmx_instruction_info >> 15) & 7;
5310
int index_reg = (vmx_instruction_info >> 18) & 0xf;
5311
bool index_is_valid = !(vmx_instruction_info & (1u << 22));
5312
int base_reg = (vmx_instruction_info >> 23) & 0xf;
5313
bool base_is_valid = !(vmx_instruction_info & (1u << 27));
5314
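/*
 * Illustrative example: a hypothetical vmx_instruction_info of 0x418100
 * decodes via the fields above as scaling = 0 (scale by 1), addr_size = 2
 * (64-bit), is_reg = false (memory operand), seg_reg = 3 (DS), index
 * invalid (bit 22 set), and base_reg = 0 (RAX) with the base valid (bit 27
 * clear).  The effective address is then DS.base + RAX + displacement,
 * with the displacement taken from exit_qualification below.
 */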
5315
if (is_reg) {
5316
kvm_queue_exception(vcpu, UD_VECTOR);
5317
return 1;
5318
}
5319
5320
/* Addr = segment_base + offset */
5321
/* offset = base + [index * scale] + displacement */
5322
off = exit_qualification; /* holds the displacement */
5323
if (addr_size == 1)
5324
off = (gva_t)sign_extend64(off, 31);
5325
else if (addr_size == 0)
5326
off = (gva_t)sign_extend64(off, 15);
5327
if (base_is_valid)
5328
off += kvm_register_read(vcpu, base_reg);
5329
if (index_is_valid)
5330
off += kvm_register_read(vcpu, index_reg) << scaling;
5331
vmx_get_segment(vcpu, &s, seg_reg);
5332
5333
/*
5334
* The effective address, i.e. @off, of a memory operand is truncated
5335
* based on the address size of the instruction. Note that this is
5336
* the *effective address*, i.e. the address prior to accounting for
5337
* the segment's base.
5338
*/
5339
if (addr_size == 1) /* 32 bit */
5340
off &= 0xffffffff;
5341
else if (addr_size == 0) /* 16 bit */
5342
off &= 0xffff;
5343
5344
/* Checks for #GP/#SS exceptions. */
5345
exn = false;
5346
if (is_long_mode(vcpu)) {
5347
/*
5348
* The virtual/linear address is never truncated in 64-bit
5349
* mode, e.g. a 32-bit address size can yield a 64-bit virtual
5350
* address when using FS/GS with a non-zero base.
5351
*/
5352
if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
5353
*ret = s.base + off;
5354
else
5355
*ret = off;
5356
5357
*ret = vmx_get_untagged_addr(vcpu, *ret, 0);
5358
/* Long mode: #GP(0)/#SS(0) if the memory address is in a
5359
* non-canonical form. This is the only check on the memory
5360
* destination for long mode!
5361
*/
5362
exn = is_noncanonical_address(*ret, vcpu, 0);
5363
} else {
5364
/*
5365
* When not in long mode, the virtual/linear address is
5366
* unconditionally truncated to 32 bits regardless of the
5367
* address size.
5368
*/
5369
*ret = (s.base + off) & 0xffffffff;
5370
5371
/* Protected mode: apply checks for segment validity in the
5372
* following order:
5373
* - segment type check (#GP(0) may be thrown)
5374
* - usability check (#GP(0)/#SS(0))
5375
* - limit check (#GP(0)/#SS(0))
5376
*/
5377
if (wr)
5378
/* #GP(0) if the destination operand is located in a
5379
* read-only data segment or any code segment.
5380
*/
5381
exn = ((s.type & 0xa) == 0 || (s.type & 8));
5382
else
5383
/* #GP(0) if the source operand is located in an
5384
* execute-only code segment
5385
*/
5386
exn = ((s.type & 0xa) == 8);
5387
if (exn) {
5388
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5389
return 1;
5390
}
5391
/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
5392
*/
5393
exn = (s.unusable != 0);
5394
5395
/*
5396
* Protected mode: #GP(0)/#SS(0) if the memory operand is
5397
* outside the segment limit. All CPUs that support VMX ignore
5398
* limit checks for flat segments, i.e. segments with base==0,
5399
* limit==0xffffffff and of type expand-up data or code.
5400
*/
5401
if (!(s.base == 0 && s.limit == 0xffffffff &&
5402
((s.type & 8) || !(s.type & 4))))
5403
exn = exn || ((u64)off + len - 1 > s.limit);
5404
}
5405
if (exn) {
5406
kvm_queue_exception_e(vcpu,
5407
seg_reg == VCPU_SREG_SS ?
5408
SS_VECTOR : GP_VECTOR,
5409
0);
5410
return 1;
5411
}
5412
5413
return 0;
5414
}
5415
5416
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
5417
int *ret)
5418
{
5419
gva_t gva;
5420
struct x86_exception e;
5421
int r;
5422
5423
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5424
vmcs_read32(VMX_INSTRUCTION_INFO), false,
5425
sizeof(*vmpointer), &gva)) {
5426
*ret = 1;
5427
return -EINVAL;
5428
}
5429
5430
r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
5431
if (r != X86EMUL_CONTINUE) {
5432
*ret = kvm_handle_memory_failure(vcpu, r, &e);
5433
return -EINVAL;
5434
}
5435
5436
return 0;
5437
}
5438
5439
/*
5440
* Allocate a shadow VMCS and associate it with the currently loaded
5441
* VMCS, unless such a shadow VMCS already exists. The newly allocated
5442
* VMCS is also VMCLEARed, so that it is ready for use.
5443
*/
5444
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
5445
{
5446
struct vcpu_vmx *vmx = to_vmx(vcpu);
5447
struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
5448
5449
/*
5450
* KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
5451
* when L1 executes VMXOFF or the vCPU is forced out of nested
5452
* operation. VMXON faults if the CPU is already post-VMXON, so it
5453
* should be impossible to already have an allocated shadow VMCS. KVM
5454
* doesn't support virtualization of VMCS shadowing, so vmcs01 should
5455
* always be the loaded VMCS.
5456
*/
5457
if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
5458
return loaded_vmcs->shadow_vmcs;
5459
5460
loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
5461
if (loaded_vmcs->shadow_vmcs)
5462
vmcs_clear(loaded_vmcs->shadow_vmcs);
5463
5464
return loaded_vmcs->shadow_vmcs;
5465
}
5466
5467
static int enter_vmx_operation(struct kvm_vcpu *vcpu)
5468
{
5469
struct vcpu_vmx *vmx = to_vmx(vcpu);
5470
int r;
5471
5472
r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
5473
if (r < 0)
5474
goto out_vmcs02;
5475
5476
vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
5477
if (!vmx->nested.cached_vmcs12)
5478
goto out_cached_vmcs12;
5479
5480
vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
5481
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
5482
if (!vmx->nested.cached_shadow_vmcs12)
5483
goto out_cached_shadow_vmcs12;
5484
5485
if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
5486
goto out_shadow_vmcs;
5487
5488
hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC,
5489
HRTIMER_MODE_ABS_PINNED);
5490
5491
vmx->nested.vpid02 = allocate_vpid();
5492
5493
vmx->nested.vmcs02_initialized = false;
5494
vmx->nested.vmxon = true;
5495
5496
if (vmx_pt_mode_is_host_guest()) {
5497
vmx->pt_desc.guest.ctl = 0;
5498
pt_update_intercept_for_msr(vcpu);
5499
}
5500
5501
return 0;
5502
5503
out_shadow_vmcs:
5504
kfree(vmx->nested.cached_shadow_vmcs12);
5505
5506
out_cached_shadow_vmcs12:
5507
kfree(vmx->nested.cached_vmcs12);
5508
5509
out_cached_vmcs12:
5510
free_loaded_vmcs(&vmx->nested.vmcs02);
5511
5512
out_vmcs02:
5513
return -ENOMEM;
5514
}
5515
5516
/* Emulate the VMXON instruction. */
5517
static int handle_vmxon(struct kvm_vcpu *vcpu)
5518
{
5519
int ret;
5520
gpa_t vmptr;
5521
uint32_t revision;
5522
struct vcpu_vmx *vmx = to_vmx(vcpu);
5523
const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
5524
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
5525
5526
/*
5527
* Manually check CR4.VMXE; KVM must force CR4.VMXE=1 to enter
5528
* the guest and so cannot rely on hardware to perform the check,
5529
* which has higher priority than VM-Exit (see Intel SDM's pseudocode
5530
* for VMXON).
5531
*
5532
* Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
5533
* and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
5534
* force any of the relevant guest state. For a restricted guest, KVM
5535
* does force CR0.PE=1, but only to also force VM86 in order to emulate
5536
* Real Mode, and so there's no need to check CR0.PE manually.
5537
*/
5538
if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
5539
kvm_queue_exception(vcpu, UD_VECTOR);
5540
return 1;
5541
}
5542
5543
/*
5544
* The CPL is checked for "not in VMX operation" and for "in VMX root",
5545
* and has higher priority than the VM-Fail due to being post-VMXON,
5546
* i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
5547
* VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
5548
* from L2 to L1, i.e. there's no need to check for the vCPU being in
5549
* VMX non-root.
5550
*
5551
* Forwarding the VM-Exit unconditionally, i.e. without performing the
5552
* #UD checks (see above), is functionally ok because KVM doesn't allow
5553
* L1 to run L2 with CR4.VMXE=0, and because KVM never modifies L2's
5554
* CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
5555
* missed by hardware due to shadowing CR0 and/or CR4.
5556
*/
5557
if (vmx_get_cpl(vcpu)) {
5558
kvm_inject_gp(vcpu, 0);
5559
return 1;
5560
}
5561
5562
if (vmx->nested.vmxon)
5563
return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
5564
5565
/*
5566
* Invalid CR0/CR4 generates #GP. These checks are performed if and
5567
* only if the vCPU isn't already in VMX operation, i.e. effectively
5568
* have lower priority than the VM-Fail above.
5569
*/
5570
if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
5571
!nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
5572
kvm_inject_gp(vcpu, 0);
5573
return 1;
5574
}
5575
5576
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5577
!= VMXON_NEEDED_FEATURES) {
5578
kvm_inject_gp(vcpu, 0);
5579
return 1;
5580
}
5581
5582
if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
5583
return ret;
5584
5585
/*
5586
* SDM 3: 24.11.5
5587
* The first 4 bytes of VMXON region contain the supported
5588
* VMCS revision identifier
5589
*
5590
* Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
5591
* which replaces physical address width with 32
5592
*/
5593
if (!page_address_valid(vcpu, vmptr))
5594
return nested_vmx_failInvalid(vcpu);
5595
5596
if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
5597
revision != VMCS12_REVISION)
5598
return nested_vmx_failInvalid(vcpu);
5599
5600
vmx->nested.vmxon_ptr = vmptr;
5601
ret = enter_vmx_operation(vcpu);
5602
if (ret)
5603
return ret;
5604
5605
return nested_vmx_succeed(vcpu);
5606
}
5607
5608
static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
5609
{
5610
struct vcpu_vmx *vmx = to_vmx(vcpu);
5611
5612
if (vmx->nested.current_vmptr == INVALID_GPA)
5613
return;
5614
5615
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
5616
5617
if (enable_shadow_vmcs) {
5618
/* copy to memory all shadowed fields in case
5619
they were modified */
5620
copy_shadow_to_vmcs12(vmx);
5621
vmx_disable_shadow_vmcs(vmx);
5622
}
5623
vmx->nested.posted_intr_nv = -1;
5624
5625
/* Flush VMCS12 to guest memory */
5626
kvm_vcpu_write_guest_page(vcpu,
5627
vmx->nested.current_vmptr >> PAGE_SHIFT,
5628
vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
5629
5630
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5631
5632
vmx->nested.current_vmptr = INVALID_GPA;
5633
}
5634
5635
/* Emulate the VMXOFF instruction */
5636
static int handle_vmxoff(struct kvm_vcpu *vcpu)
5637
{
5638
if (!nested_vmx_check_permission(vcpu))
5639
return 1;
5640
5641
free_nested(vcpu);
5642
5643
if (kvm_apic_has_pending_init_or_sipi(vcpu))
5644
kvm_make_request(KVM_REQ_EVENT, vcpu);
5645
5646
return nested_vmx_succeed(vcpu);
5647
}
5648
5649
/* Emulate the VMCLEAR instruction */
5650
static int handle_vmclear(struct kvm_vcpu *vcpu)
5651
{
5652
struct vcpu_vmx *vmx = to_vmx(vcpu);
5653
u32 zero = 0;
5654
gpa_t vmptr;
5655
int r;
5656
5657
if (!nested_vmx_check_permission(vcpu))
5658
return 1;
5659
5660
if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5661
return r;
5662
5663
if (!page_address_valid(vcpu, vmptr))
5664
return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5665
5666
if (vmptr == vmx->nested.vmxon_ptr)
5667
return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5668
5669
if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
5670
if (vmptr == vmx->nested.current_vmptr)
5671
nested_release_vmcs12(vcpu);
5672
5673
/*
5674
* Silently ignore memory errors on VMCLEAR, Intel's pseudocode
5675
* for VMCLEAR includes an "ensure that data for VMCS referenced
5676
* by the operand is in memory" clause that guards writes to
5677
* memory, i.e. doing nothing for I/O is architecturally valid.
5678
*
5679
* FIXME: Suppress failures if and only if no memslot is found,
5680
* i.e. exit to userspace if __copy_to_user() fails.
5681
*/
5682
(void)kvm_vcpu_write_guest(vcpu,
5683
vmptr + offsetof(struct vmcs12,
5684
launch_state),
5685
&zero, sizeof(zero));
5686
}
5687
5688
return nested_vmx_succeed(vcpu);
5689
}
5690
5691
/* Emulate the VMLAUNCH instruction */
5692
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5693
{
5694
return nested_vmx_run(vcpu, true);
5695
}
5696
5697
/* Emulate the VMRESUME instruction */
5698
static int handle_vmresume(struct kvm_vcpu *vcpu)
5699
{
5700
5701
return nested_vmx_run(vcpu, false);
5702
}
5703
5704
static int handle_vmread(struct kvm_vcpu *vcpu)
5705
{
5706
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5707
: get_vmcs12(vcpu);
5708
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5709
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5710
struct vcpu_vmx *vmx = to_vmx(vcpu);
5711
struct x86_exception e;
5712
unsigned long field;
5713
u64 value;
5714
gva_t gva = 0;
5715
short offset;
5716
int len, r;
5717
5718
if (!nested_vmx_check_permission(vcpu))
5719
return 1;
5720
5721
/* Decode instruction info and find the field to read */
5722
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5723
5724
if (!nested_vmx_is_evmptr12_valid(vmx)) {
5725
/*
5726
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5727
* any VMREAD sets the ALU flags for VMfailInvalid.
5728
*/
5729
if (vmx->nested.current_vmptr == INVALID_GPA ||
5730
(is_guest_mode(vcpu) &&
5731
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5732
return nested_vmx_failInvalid(vcpu);
5733
5734
offset = get_vmcs12_field_offset(field);
5735
if (offset < 0)
5736
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5737
5738
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5739
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5740
5741
/* Read the field, zero-extended to a u64 value */
5742
value = vmcs12_read_any(vmcs12, field, offset);
5743
} else {
5744
/*
5745
* Hyper-V TLFS (as of 6.0b) explicitly states that, while an
5746
* enlightened VMCS is active, VMREAD/VMWRITE instructions are
5747
* unsupported. Unfortunately, certain versions of Windows 11
5748
* don't comply with this requirement which is not enforced in
5749
* genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
5750
* workaround, as misbehaving guests will panic on VM-Fail.
5751
* Note, enlightened VMCS is incompatible with shadow VMCS so
5752
* all VMREADs from L2 should go to L1.
5753
*/
5754
if (WARN_ON_ONCE(is_guest_mode(vcpu)))
5755
return nested_vmx_failInvalid(vcpu);
5756
5757
offset = evmcs_field_offset(field, NULL);
5758
if (offset < 0)
5759
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5760
5761
/* Read the field, zero-extended to a u64 value */
5762
value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
5763
}
5764
5765
/*
5766
* Now copy part of this value to register or memory, as requested.
5767
* Note that the number of bits actually copied is 32 or 64 depending
5768
* on the guest's mode (32 or 64 bit), not on the given field's length.
5769
*/
5770
if (instr_info & BIT(10)) {
5771
kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
5772
} else {
5773
len = is_64_bit_mode(vcpu) ? 8 : 4;
5774
if (get_vmx_mem_address(vcpu, exit_qualification,
5775
instr_info, true, len, &gva))
5776
return 1;
5777
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
5778
r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5779
if (r != X86EMUL_CONTINUE)
5780
return kvm_handle_memory_failure(vcpu, r, &e);
5781
}
5782
5783
return nested_vmx_succeed(vcpu);
5784
}
5785
5786
static bool is_shadow_field_rw(unsigned long field)
5787
{
5788
switch (field) {
5789
#define SHADOW_FIELD_RW(x, y) case x:
5790
#include "vmcs_shadow_fields.h"
5791
return true;
5792
default:
5793
break;
5794
}
5795
return false;
5796
}
5797
5798
static bool is_shadow_field_ro(unsigned long field)
5799
{
5800
switch (field) {
5801
#define SHADOW_FIELD_RO(x, y) case x:
5802
#include "vmcs_shadow_fields.h"
5803
return true;
5804
default:
5805
break;
5806
}
5807
return false;
5808
}
5809
5810
static int handle_vmwrite(struct kvm_vcpu *vcpu)
5811
{
5812
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5813
: get_vmcs12(vcpu);
5814
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5815
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5816
struct vcpu_vmx *vmx = to_vmx(vcpu);
5817
struct x86_exception e;
5818
unsigned long field;
5819
short offset;
5820
gva_t gva;
5821
int len, r;
5822
5823
/*
5824
* The value to write might be 32 or 64 bits, depending on L1's long
5825
* mode, and eventually we need to write that into a field of several
5826
* possible lengths. The code below first zero-extends the value to 64
5827
* bit (value), and then copies only the appropriate number of
5828
* bits into the vmcs12 field.
5829
*/
5830
u64 value = 0;
5831
5832
if (!nested_vmx_check_permission(vcpu))
5833
return 1;
5834
5835
/*
5836
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5837
* any VMWRITE sets the ALU flags for VMfailInvalid.
5838
*/
5839
if (vmx->nested.current_vmptr == INVALID_GPA ||
5840
(is_guest_mode(vcpu) &&
5841
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5842
return nested_vmx_failInvalid(vcpu);
5843
5844
if (instr_info & BIT(10))
5845
value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
5846
else {
5847
len = is_64_bit_mode(vcpu) ? 8 : 4;
5848
if (get_vmx_mem_address(vcpu, exit_qualification,
5849
instr_info, false, len, &gva))
5850
return 1;
5851
r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5852
if (r != X86EMUL_CONTINUE)
5853
return kvm_handle_memory_failure(vcpu, r, &e);
5854
}
5855
5856
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5857
5858
offset = get_vmcs12_field_offset(field);
5859
if (offset < 0)
5860
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5861
5862
/*
5863
* If the vCPU supports "VMWRITE to any supported field in the
5864
* VMCS," then the "read-only" fields are actually read/write.
5865
*/
5866
if (vmcs_field_readonly(field) &&
5867
!nested_cpu_has_vmwrite_any_field(vcpu))
5868
return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5869
5870
/*
5871
* Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5872
* vmcs12, else we may clobber a field or consume a stale value.
5873
*/
5874
if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5875
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5876
5877
/*
5878
* Some Intel CPUs intentionally drop the reserved bits of the AR byte
5879
* fields on VMWRITE. Emulate this behavior to ensure consistent KVM
5880
* behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5881
* field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5882
* from L1 will return a different value than VMREAD from L2 (L1 sees
5883
* the stripped down value, L2 sees the full value as stored by KVM).
5884
*/
5885
if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
5886
value &= 0x1f0ff;
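/*
 * Note: 0x1f0ff keeps bits 7:0 (segment type, S, DPL, P) and bits 16:12
 * (AVL, L, D/B, G, unusable) of the access-rights field while clearing the
 * reserved bits 11:8, mirroring the hardware behavior described above.
 */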
5887
5888
vmcs12_write_any(vmcs12, field, offset, value);
5889
5890
/*
5891
* Do not track vmcs12 dirty-state if in guest-mode as we actually
5892
* dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
5893
* by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5894
* "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
5895
*/
5896
if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5897
/*
5898
* L1 can read these fields without exiting, ensure the
5899
* shadow VMCS is up-to-date.
5900
*/
5901
if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5902
preempt_disable();
5903
vmcs_load(vmx->vmcs01.shadow_vmcs);
5904
5905
__vmcs_writel(field, value);
5906
5907
vmcs_clear(vmx->vmcs01.shadow_vmcs);
5908
vmcs_load(vmx->loaded_vmcs->vmcs);
5909
preempt_enable();
5910
}
5911
vmx->nested.dirty_vmcs12 = true;
5912
}
5913
5914
return nested_vmx_succeed(vcpu);
5915
}
5916
5917
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5918
{
5919
vmx->nested.current_vmptr = vmptr;
5920
if (enable_shadow_vmcs) {
5921
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5922
vmcs_write64(VMCS_LINK_POINTER,
5923
__pa(vmx->vmcs01.shadow_vmcs));
5924
vmx->nested.need_vmcs12_to_shadow_sync = true;
5925
}
5926
vmx->nested.dirty_vmcs12 = true;
5927
vmx->nested.force_msr_bitmap_recalc = true;
5928
}
5929
5930
/* Emulate the VMPTRLD instruction */
5931
static int handle_vmptrld(struct kvm_vcpu *vcpu)
5932
{
5933
struct vcpu_vmx *vmx = to_vmx(vcpu);
5934
gpa_t vmptr;
5935
int r;
5936
5937
if (!nested_vmx_check_permission(vcpu))
5938
return 1;
5939
5940
if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5941
return r;
5942
5943
if (!page_address_valid(vcpu, vmptr))
5944
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5945
5946
if (vmptr == vmx->nested.vmxon_ptr)
5947
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
5948
5949
/* Forbid normal VMPTRLD if Enlightened version was used */
5950
if (nested_vmx_is_evmptr12_valid(vmx))
5951
return 1;
5952
5953
if (vmx->nested.current_vmptr != vmptr) {
5954
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
5955
struct vmcs_hdr hdr;
5956
5957
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
5958
/*
5959
* Reads from an unbacked page return all 1s,
5960
* which means that the 32 bits located at the
5961
* given physical address won't match the required
5962
* VMCS12_REVISION identifier.
5963
*/
5964
return nested_vmx_fail(vcpu,
5965
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5966
}
5967
5968
if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
5969
offsetof(struct vmcs12, hdr),
5970
sizeof(hdr))) {
5971
return nested_vmx_fail(vcpu,
5972
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5973
}
5974
5975
if (hdr.revision_id != VMCS12_REVISION ||
5976
(hdr.shadow_vmcs &&
5977
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5978
return nested_vmx_fail(vcpu,
5979
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5980
}
5981
5982
nested_release_vmcs12(vcpu);
5983
5984
/*
5985
* Load VMCS12 from guest memory since it is not already
5986
* cached.
5987
*/
5988
if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
5989
VMCS12_SIZE)) {
5990
return nested_vmx_fail(vcpu,
5991
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5992
}
5993
5994
set_current_vmptr(vmx, vmptr);
5995
}
5996
5997
return nested_vmx_succeed(vcpu);
5998
}
5999
6000
/* Emulate the VMPTRST instruction */
6001
static int handle_vmptrst(struct kvm_vcpu *vcpu)
6002
{
6003
unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6004
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6005
gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
6006
struct x86_exception e;
6007
gva_t gva;
6008
int r;
6009
6010
if (!nested_vmx_check_permission(vcpu))
6011
return 1;
6012
6013
if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu))))
6014
return 1;
6015
6016
if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
6017
true, sizeof(gpa_t), &gva))
6018
return 1;
6019
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
6020
r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
6021
sizeof(gpa_t), &e);
6022
if (r != X86EMUL_CONTINUE)
6023
return kvm_handle_memory_failure(vcpu, r, &e);
6024
6025
return nested_vmx_succeed(vcpu);
6026
}
6027
6028
/* Emulate the INVEPT instruction */
6029
static int handle_invept(struct kvm_vcpu *vcpu)
6030
{
6031
struct vcpu_vmx *vmx = to_vmx(vcpu);
6032
u32 vmx_instruction_info, types;
6033
unsigned long type, roots_to_free;
6034
struct kvm_mmu *mmu;
6035
gva_t gva;
6036
struct x86_exception e;
6037
struct {
6038
u64 eptp, gpa;
6039
} operand;
6040
int i, r, gpr_index;
6041
6042
if (!(vmx->nested.msrs.secondary_ctls_high &
6043
SECONDARY_EXEC_ENABLE_EPT) ||
6044
!(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
6045
kvm_queue_exception(vcpu, UD_VECTOR);
6046
return 1;
6047
}
6048
6049
if (!nested_vmx_check_permission(vcpu))
6050
return 1;
6051
6052
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6053
gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
6054
type = kvm_register_read(vcpu, gpr_index);
6055
6056
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
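/*
 * Note: after the shift, "types" has bit N set iff INVEPT type N is
 * advertised to L1; the "& 6" keeps only the single-context (type 1) and
 * global (type 2) variants, matching the VMX_EPT_EXTENT_* cases handled in
 * the switch below.
 */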
6057
6058
if (type >= 32 || !(types & (1 << type)))
6059
return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6060
6061
/* According to the Intel VMX instruction reference, the memory
6062
* operand is read even if it isn't needed (e.g., for type==global)
6063
*/
6064
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
6065
vmx_instruction_info, false, sizeof(operand), &gva))
6066
return 1;
6067
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
6068
if (r != X86EMUL_CONTINUE)
6069
return kvm_handle_memory_failure(vcpu, r, &e);
6070
6071
/*
6072
* Nested EPT roots are always held through guest_mmu,
6073
* not root_mmu.
6074
*/
6075
mmu = &vcpu->arch.guest_mmu;
6076
6077
switch (type) {
6078
case VMX_EPT_EXTENT_CONTEXT:
6079
if (!nested_vmx_check_eptp(vcpu, operand.eptp))
6080
return nested_vmx_fail(vcpu,
6081
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6082
6083
roots_to_free = 0;
6084
if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
6085
operand.eptp))
6086
roots_to_free |= KVM_MMU_ROOT_CURRENT;
6087
6088
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
6089
if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
6090
mmu->prev_roots[i].pgd,
6091
operand.eptp))
6092
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
6093
}
6094
break;
6095
case VMX_EPT_EXTENT_GLOBAL:
6096
roots_to_free = KVM_MMU_ROOTS_ALL;
6097
break;
6098
default:
6099
BUG();
6100
break;
6101
}
6102
6103
if (roots_to_free)
6104
kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
6105
6106
return nested_vmx_succeed(vcpu);
6107
}
6108
6109
static int handle_invvpid(struct kvm_vcpu *vcpu)
6110
{
6111
struct vcpu_vmx *vmx = to_vmx(vcpu);
6112
u32 vmx_instruction_info;
6113
unsigned long type, types;
6114
gva_t gva;
6115
struct x86_exception e;
6116
struct {
6117
u64 vpid;
6118
u64 gla;
6119
} operand;
6120
u16 vpid02;
6121
int r, gpr_index;
6122
6123
if (!(vmx->nested.msrs.secondary_ctls_high &
6124
SECONDARY_EXEC_ENABLE_VPID) ||
6125
!(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
6126
kvm_queue_exception(vcpu, UD_VECTOR);
6127
return 1;
6128
}
6129
6130
if (!nested_vmx_check_permission(vcpu))
6131
return 1;
6132
6133
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6134
gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
6135
type = kvm_register_read(vcpu, gpr_index);
6136
6137
types = (vmx->nested.msrs.vpid_caps &
6138
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
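/*
 * Note: after masking and shifting, bit N of "types" is set iff INVVPID
 * type N - individual address (0), single-context (1), all-context (2),
 * single-context retaining globals (3) - is exposed to L1, which is what
 * the type check below consults.
 */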
6139
6140
if (type >= 32 || !(types & (1 << type)))
6141
return nested_vmx_fail(vcpu,
6142
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6143
6144
/* According to the Intel VMX instruction reference, the memory
6145
* operand is read even if it isn't needed (e.g., for type==global)
6146
*/
6147
if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
6148
vmx_instruction_info, false, sizeof(operand), &gva))
6149
return 1;
6150
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
6151
if (r != X86EMUL_CONTINUE)
6152
return kvm_handle_memory_failure(vcpu, r, &e);
6153
6154
if (operand.vpid >> 16)
6155
return nested_vmx_fail(vcpu,
6156
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6157
6158
/*
6159
* Always flush the effective vpid02, i.e. never flush the current VPID
6160
* and never explicitly flush vpid01. INVVPID targets a VPID, not a
6161
* VMCS, and so whether or not the current vmcs12 has VPID enabled is
6162
* irrelevant (and there may not be a loaded vmcs12).
6163
*/
6164
vpid02 = nested_get_vpid02(vcpu);
6165
switch (type) {
6166
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
6167
/*
6168
* LAM doesn't apply to addresses that are inputs to TLB
6169
* invalidation.
6170
*/
6171
if (!operand.vpid ||
6172
is_noncanonical_invlpg_address(operand.gla, vcpu))
6173
return nested_vmx_fail(vcpu,
6174
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6175
vpid_sync_vcpu_addr(vpid02, operand.gla);
6176
break;
6177
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
6178
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
6179
if (!operand.vpid)
6180
return nested_vmx_fail(vcpu,
6181
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6182
vpid_sync_context(vpid02);
6183
break;
6184
case VMX_VPID_EXTENT_ALL_CONTEXT:
6185
vpid_sync_context(vpid02);
6186
break;
6187
default:
6188
WARN_ON_ONCE(1);
6189
return kvm_skip_emulated_instruction(vcpu);
6190
}
6191
6192
/*
6193
* Sync the shadow page tables if EPT is disabled, as L1 is invalidating
6194
* linear mappings for L2 (tagged with L2's VPID). Free all guest
6195
* roots as VPIDs are not tracked in the MMU role.
6196
*
6197
* Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
6198
* an MMU when EPT is disabled.
6199
*
6200
* TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
6201
*/
6202
if (!enable_ept)
6203
kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
6204
6205
return nested_vmx_succeed(vcpu);
6206
}
6207
6208
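/*
 * Note (illustrative summary): this implements VMFUNC leaf 0, EPTP
 * switching.  The index from RCX selects an 8-byte entry in the guest's
 * EPTP list page (vmcs12->eptp_list_address), bounded by
 * VMFUNC_EPTP_ENTRIES; if the new EPTP passes nested_vmx_check_eptp(), it
 * is installed and, absent VPID, a guest TLB flush is requested.
 */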
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
6209
struct vmcs12 *vmcs12)
6210
{
6211
u32 index = kvm_rcx_read(vcpu);
6212
u64 new_eptp;
6213
6214
if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
6215
return 1;
6216
if (index >= VMFUNC_EPTP_ENTRIES)
6217
return 1;
6218
6219
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
6220
&new_eptp, index * 8, 8))
6221
return 1;
6222
6223
/*
6224
* If the (L2) guest does a vmfunc to the currently
6225
* active ept pointer, we don't have to do anything else
6226
*/
6227
if (vmcs12->ept_pointer != new_eptp) {
6228
if (!nested_vmx_check_eptp(vcpu, new_eptp))
6229
return 1;
6230
6231
vmcs12->ept_pointer = new_eptp;
6232
nested_ept_new_eptp(vcpu);
6233
6234
if (!nested_cpu_has_vpid(vmcs12))
6235
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
6236
}
6237
6238
return 0;
6239
}
6240
6241
static int handle_vmfunc(struct kvm_vcpu *vcpu)
6242
{
6243
struct vcpu_vmx *vmx = to_vmx(vcpu);
6244
struct vmcs12 *vmcs12;
6245
u32 function = kvm_rax_read(vcpu);
6246
6247
/*
6248
* VMFUNC should never execute cleanly while L1 is active; KVM supports
6249
* VMFUNC for nested VMs, but not for L1.
6250
*/
6251
if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
6252
kvm_queue_exception(vcpu, UD_VECTOR);
6253
return 1;
6254
}
6255
6256
vmcs12 = get_vmcs12(vcpu);
6257
6258
/*
6259
* #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
6260
* is enabled in vmcs02 if and only if it's enabled in vmcs12.
6261
*/
6262
if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
6263
kvm_queue_exception(vcpu, UD_VECTOR);
6264
return 1;
6265
}
6266
6267
if (!(vmcs12->vm_function_control & BIT_ULL(function)))
6268
goto fail;
6269
6270
switch (function) {
6271
case 0:
6272
if (nested_vmx_eptp_switching(vcpu, vmcs12))
6273
goto fail;
6274
break;
6275
default:
6276
goto fail;
6277
}
6278
return kvm_skip_emulated_instruction(vcpu);
6279
6280
fail:
6281
/*
6282
* This is effectively a reflected VM-Exit, as opposed to a synthesized
6283
* nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
6284
* EXIT_REASON_VMFUNC as the exit reason.
6285
*/
6286
nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full,
6287
vmx_get_intr_info(vcpu),
6288
vmx_get_exit_qual(vcpu));
6289
return 1;
6290
}
6291
6292
/*
6293
* Return true if an IO instruction with the specified port and size should cause
6294
* a VM-exit into L1.
6295
*/
6296
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
6297
int size)
6298
{
6299
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6300
gpa_t bitmap, last_bitmap;
6301
u8 b;
6302
6303
last_bitmap = INVALID_GPA;
6304
b = -1;
6305
6306
while (size > 0) {
6307
if (port < 0x8000)
6308
bitmap = vmcs12->io_bitmap_a;
6309
else if (port < 0x10000)
6310
bitmap = vmcs12->io_bitmap_b;
6311
else
6312
return true;
6313
bitmap += (port & 0x7fff) / 8;
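/*
 * Worked example (illustrative): port 0x3f8 selects io_bitmap_a and lands
 * at byte (0x3f8 & 0x7fff) / 8 = 0x7f, bit 0x3f8 & 7 = 0, while port
 * 0x8080 selects io_bitmap_b and lands at byte (0x8080 & 0x7fff) / 8 =
 * 0x10, bit 0.
 */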
6314
6315
if (last_bitmap != bitmap)
6316
if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
6317
return true;
6318
if (b & (1 << (port & 7)))
6319
return true;
6320
6321
port++;
6322
size--;
6323
last_bitmap = bitmap;
6324
}
6325
6326
return false;
6327
}
6328
6329
static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
6330
struct vmcs12 *vmcs12)
6331
{
6332
unsigned long exit_qualification;
6333
unsigned short port;
6334
int size;
6335
6336
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
6337
return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
6338
6339
exit_qualification = vmx_get_exit_qual(vcpu);
6340
6341
port = exit_qualification >> 16;
6342
size = (exit_qualification & 7) + 1;
6343
6344
return nested_vmx_check_io_bitmaps(vcpu, port, size);
6345
}
6346
6347
/*
6348
* Return 1 if we should exit from L2 to L1 to handle an MSR access,
6349
* rather than handle it ourselves in L0. I.e., check whether L1 asked to
6350
* intercept the current event (a read or write of a specific MSR) via its
6351
* MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
6352
*/
6353
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
6354
struct vmcs12 *vmcs12,
6355
union vmx_exit_reason exit_reason)
6356
{
6357
u32 msr_index;
6358
gpa_t bitmap;
6359
6360
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
6361
return true;
6362
6363
if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM ||
6364
exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
6365
msr_index = vmx_get_exit_qual(vcpu);
6366
else
6367
msr_index = kvm_rcx_read(vcpu);
6368
6369
/*
6370
* The MSR_BITMAP page is divided into four 1024-byte bitmaps,
6371
* for the four combinations of read/write and low/high MSR numbers.
6372
* First we need to figure out which of the four to use:
6373
*/
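/*
 * Worked example (illustrative): for a hypothetical WRMSR of MSR
 * 0xc0000082 (MSR_LSTAR), msr_index becomes 0xc0000082 - 0xc0000000 =
 * 0x82, the write half adds 2048 and the high half adds 1024, so the byte
 * consulted is at offset 2048 + 1024 + 0x82/8 = 3088 of the bitmap page,
 * bit 0x82 & 7 = 2.
 */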
6374
bitmap = vmcs12->msr_bitmap;
6375
if (exit_reason.basic == EXIT_REASON_MSR_WRITE ||
6376
exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
6377
bitmap += 2048;
6378
if (msr_index >= 0xc0000000) {
6379
msr_index -= 0xc0000000;
6380
bitmap += 1024;
6381
}
6382
6383
/* Then read the msr_index'th bit from this bitmap: */
6384
if (msr_index < 1024*8) {
6385
unsigned char b;
6386
if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
6387
return true;
6388
return 1 & (b >> (msr_index & 7));
6389
} else
6390
return true; /* let L1 handle the wrong parameter */
6391
}
6392
6393
/*
6394
* Return 1 if we should exit from L2 to L1 to handle a CR access exit,
6395
* rather than handle it ourselves in L0. I.e., check if L1 wanted to
6396
* intercept (via guest_host_mask etc.) the current event.
6397
*/
6398
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
6399
struct vmcs12 *vmcs12)
6400
{
6401
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
6402
int cr = exit_qualification & 15;
6403
int reg;
6404
unsigned long val;
6405
6406
switch ((exit_qualification >> 4) & 3) {
6407
case 0: /* mov to cr */
6408
reg = (exit_qualification >> 8) & 15;
6409
val = kvm_register_read(vcpu, reg);
6410
switch (cr) {
6411
case 0:
6412
if (vmcs12->cr0_guest_host_mask &
6413
(val ^ vmcs12->cr0_read_shadow))
6414
return true;
6415
break;
6416
case 3:
6417
if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
6418
return true;
6419
break;
6420
case 4:
6421
if (vmcs12->cr4_guest_host_mask &
6422
(vmcs12->cr4_read_shadow ^ val))
6423
return true;
6424
break;
6425
case 8:
6426
if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
6427
return true;
6428
break;
6429
}
6430
break;
6431
case 2: /* clts */
6432
if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
6433
(vmcs12->cr0_read_shadow & X86_CR0_TS))
6434
return true;
6435
break;
6436
case 1: /* mov from cr */
6437
switch (cr) {
6438
case 3:
6439
if (vmcs12->cpu_based_vm_exec_control &
6440
CPU_BASED_CR3_STORE_EXITING)
6441
return true;
6442
break;
6443
case 8:
6444
if (vmcs12->cpu_based_vm_exec_control &
6445
CPU_BASED_CR8_STORE_EXITING)
6446
return true;
6447
break;
6448
}
6449
break;
6450
case 3: /* lmsw */
6451
/*
6452
* lmsw can change bits 1..3 of cr0, and only set bit 0 of
6453
* cr0. Other attempted changes are ignored, with no exit.
6454
*/
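/*
 * Worked example (illustrative): if L1 owns CR0.MP (bit 1 set in
 * cr0_guest_host_mask) with a read shadow of 0 and L2 executes lmsw with
 * source bit 1 set, then (mask & 0xe & (val ^ shadow)) is non-zero and the
 * access is reflected to L1; an lmsw that only tries to clear CR0.PE is
 * ignored and does not trigger the bit-0 check below.
 */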
6455
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
6456
if (vmcs12->cr0_guest_host_mask & 0xe &
6457
(val ^ vmcs12->cr0_read_shadow))
6458
return true;
6459
if ((vmcs12->cr0_guest_host_mask & 0x1) &&
6460
!(vmcs12->cr0_read_shadow & 0x1) &&
6461
(val & 0x1))
6462
return true;
6463
break;
6464
}
6465
return false;
6466
}
6467
6468
static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
6469
struct vmcs12 *vmcs12)
6470
{
6471
u32 encls_leaf;
6472
6473
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
6474
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
6475
return false;
6476
6477
encls_leaf = kvm_rax_read(vcpu);
6478
if (encls_leaf > 62)
6479
encls_leaf = 63;
6480
return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
6481
}
6482
6483
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
6484
struct vmcs12 *vmcs12, gpa_t bitmap)
6485
{
6486
u32 vmx_instruction_info;
6487
unsigned long field;
6488
u8 b;
6489
6490
if (!nested_cpu_has_shadow_vmcs(vmcs12))
6491
return true;
6492
6493
/* Decode instruction info and find the field to access */
6494
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6495
field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
6496
6497
/* Out-of-range fields always cause a VM exit from L2 to L1 */
6498
if (field >> 15)
6499
return true;
6500
6501
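/*
 * Worked example (illustrative): for field encoding 0x4402 (VM_EXIT_REASON)
 * the bit consulted lives at byte 0x4402 / 8 = 0x880 of the VMREAD/VMWRITE
 * bitmap, bit 0x4402 & 7 = 2; a set bit means the access exits to L1.
 */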
if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
6502
return true;
6503
6504
return 1 & (b >> (field & 7));
6505
}
6506
6507
static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
6508
{
6509
u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
6510
6511
if (nested_cpu_has_mtf(vmcs12))
6512
return true;
6513
6514
/*
6515
* An MTF VM-exit may be injected into the guest by setting the
6516
* interruption-type to 7 (other event) and the vector field to 0. Such
6517
* is the case regardless of the 'monitor trap flag' VM-execution
6518
* control.
6519
*/
6520
return entry_intr_info == (INTR_INFO_VALID_MASK
6521
| INTR_TYPE_OTHER_EVENT);
6522
}
6523
6524
/*
 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
 * L1 wants the exit. Only call this when in is_guest_mode (L2).
 */
static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return vcpu->arch.apf.host_apf_flags ||
			       vmx_need_pf_intercept(vcpu);
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return true;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return true;
		else if (is_alignment_check(intr_info) &&
			 !vmx_guest_inject_ac(vcpu))
			return true;
		else if (is_ve_fault(intr_info))
			return true;
		return false;
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return true;
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault().
		 */
		return true;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never directly uses L1's EPT, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table are L0's fault.
		 */
		return true;
	case EXIT_REASON_PREEMPTION_TIMER:
		return true;
	case EXIT_REASON_PML_FULL:
		/*
		 * PML is emulated for an L1 VMM and should never be enabled in
		 * vmcs02; always "handle" PML_FULL by exiting to userspace.
		 */
		return true;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return true;
	case EXIT_REASON_BUS_LOCK:
		/*
		 * At present, bus lock VM exit is never exposed to L1.
		 * Handle L2's bus locks in L0 directly.
		 */
		return true;
#ifdef CONFIG_KVM_HYPERV
	case EXIT_REASON_VMCALL:
		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
		return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
		       nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
		       kvm_hv_is_tlb_flush_hcall(vcpu);
#endif
	default:
		break;
	}
	return false;
}

/*
 * Return true if L1 wants to intercept an exit from L2. Only call this when in
 * is_guest_mode (L2).
 */
static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return true;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return nested_exit_on_intr(vcpu);
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_INTERRUPT_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_MSR_READ_IMM:
	case EXIT_REASON_MSR_WRITE_IMM:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_vmx_exit_handled_mtf(vmcs12);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES:
	case EXIT_REASON_XRSTORS:
		/*
		 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize
		 * the XSS-bitmap, and always loads vmcs02 with vmcs12's
		 * XSS-bitmap verbatim, i.e. any exit is due to L1's bitmap.
		 * WARN if XSAVES isn't enabled, as the CPU is supposed to
		 * inject #UD in that case, before consulting the XSS-bitmap.
		 */
		WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES));
		return true;
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	case EXIT_REASON_ENCLS:
		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
	case EXIT_REASON_NOTIFY:
		/* Notify VM exit is not exposed to L1 */
		return false;
	default:
		return true;
	}
}

/*
 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
	 */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		exit_intr_info = 0;
		exit_qual = 0;
		goto reflect_vmexit;
	}

	trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;

	/* If L1 doesn't want the exit, handle it in L0. */
	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;

	/*
	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
	exit_intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
	return true;
}

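/*
 * Save the vCPU's nested VMX state for userspace (KVM_GET_NESTED_STATE).
 * A NULL @vcpu is used to query the worst-case buffer size. Returns the
 * number of bytes needed/written, or -EFAULT on a failed copy to userspace.
 */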
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = INVALID_GPA,
		.hdr.vmx.vmcs12_pa = INVALID_GPA,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (nested_vmx_is_evmptr12_set(vmx))
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != INVALID_GPA)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (nested_vmx_is_evmptr12_valid(vmx))
				/*
				 * The L1 hypervisor is not obliged to keep the
				 * eVMCS clean fields up-to-date while not in
				 * guest mode; 'hv_clean_fields' is only
				 * guaranteed to be valid at vmentry, so ignore
				 * it here and do a full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}

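/*
 * Forcibly leave nested mode: synthesize a VM-exit from L2 if necessary and
 * free all nested state.
 */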
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

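/*
 * Restore nested VMX state provided by userspace (KVM_SET_NESTED_STATE):
 * validate the header and flags, re-enter VMX operation, reload vmcs12 (and
 * the shadow vmcs12 when in use) and, if L2 was running, re-enter non-root
 * mode.
 */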
static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable the eVMCS capability on the vCPU. However, the code
		 * has since been changed so that the flag signals that vmcs12
		 * should be copied into the eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow userspace to set
		 * this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet. EVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

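/* Point the current VMCS at the global VMREAD/VMWRITE shadowing bitmaps. */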
void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so-called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1. Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

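/*
 * The nested_vmx_setup_*() helpers below compute the VMX capability MSR
 * values advertised to L1: the intersection of what the hardware reports
 * (via vmcs_conf) and what KVM is willing or able to emulate.
 */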
static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
		VM_ENTRY_LOAD_CET_STATE;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE;

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support;
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context. The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
						 X86_MEMTYPE_WB);

	msrs->basic |= VMX_BASIC_TRUE_CTLS;
	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
	if (cpu_has_vmx_basic_no_hw_errcode_cc())
		msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control MSRs has a low and high 32-bit half: a low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

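/* Free the shadow VMCS bitmaps allocated by nested_vmx_hardware_setup(). */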
void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

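/*
 * One-time setup for nested VMX: allocate the VMREAD/VMWRITE shadowing
 * bitmaps (when shadow VMCS is usable) and install the VMX instruction
 * exit handlers.
 */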
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	return 0;
}

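/* Nested virtualization callbacks invoked by the common x86 KVM code. */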
struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};
