Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/entry/vsyscall/vsyscall_64.c
29524 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
 * Copyright (c) 2012-2014 Andy Lutomirski <[email protected]>
 *
 * Based on the original implementation which is:
 *  Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Parts of the original code have been moved to arch/x86/vdso/vma.c
 *
 * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
 * Userspace can request certain kernel services by calling fixed
 * addresses.  This concept is problematic:
 *
 * - It interferes with ASLR.
 * - It's awkward to write code that lives in kernel addresses but is
 *   callable by userspace at fixed addresses.
 * - The whole concept is impossible for 32-bit compat userspace.
 * - UML cannot easily virtualize a vsyscall.
 *
 * As of mid-2014, I believe that there is no new userspace code that
 * will use a vsyscall if the vDSO is present.  I hope that there will
 * soon be no new userspace code that will ever use a vsyscall.
 *
 * The code in this file emulates vsyscalls when notified of a page
 * fault to a vsyscall address.
 */
28
29
#include <linux/kernel.h>
30
#include <linux/timer.h>
31
#include <linux/sched/signal.h>
32
#include <linux/mm_types.h>
33
#include <linux/syscalls.h>
34
#include <linux/ratelimit.h>
35
36
#include <asm/vsyscall.h>
37
#include <asm/unistd.h>
38
#include <asm/fixmap.h>
39
#include <asm/traps.h>
40
#include <asm/paravirt.h>
41
42
#define CREATE_TRACE_POINTS
43
#include "vsyscall_trace.h"
44
45
/* Exactly one of the legacy-vsyscall Kconfig defaults must be selected. */
#if !defined(CONFIG_LEGACY_VSYSCALL_NONE) && \
    !defined(CONFIG_LEGACY_VSYSCALL_XONLY)
#error VSYSCALL config is broken
#endif

/*
 * Active vsyscall mode.  The build-time default comes from Kconfig and
 * may be overridden by the "vsyscall=" early parameter.
 */
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#if defined(CONFIG_LEGACY_VSYSCALL_NONE)
	NONE;
#else
	XONLY;
#endif
53
54
static int __init vsyscall_setup(char *str)
55
{
56
if (str) {
57
if (!strcmp("emulate", str))
58
vsyscall_mode = EMULATE;
59
else if (!strcmp("xonly", str))
60
vsyscall_mode = XONLY;
61
else if (!strcmp("none", str))
62
vsyscall_mode = NONE;
63
else
64
return -EINVAL;
65
66
return 0;
67
}
68
69
return -EINVAL;
70
}
71
early_param("vsyscall", vsyscall_setup);
72
73
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
74
const char *message)
75
{
76
if (!show_unhandled_signals)
77
return;
78
79
printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
80
level, current->comm, task_pid_nr(current),
81
message, regs->ip, regs->cs,
82
regs->sp, regs->ax, regs->si, regs->di);
83
}
84
85
static int addr_to_vsyscall_nr(unsigned long addr)
86
{
87
int nr;
88
89
if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
90
return -EINVAL;
91
92
nr = (addr & 0xC00UL) >> 10;
93
if (nr >= 3)
94
return -EINVAL;
95
96
return nr;
97
}
98
99
static bool write_ok_or_segv(unsigned long ptr, size_t size)
100
{
101
if (!access_ok((void __user *)ptr, size)) {
102
struct thread_struct *thread = &current->thread;
103
104
thread->error_code = X86_PF_USER | X86_PF_WRITE;
105
thread->cr2 = ptr;
106
thread->trap_nr = X86_TRAP_PF;
107
108
force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
109
return false;
110
} else {
111
return true;
112
}
113
}
114
115
bool emulate_vsyscall(unsigned long error_code,
116
struct pt_regs *regs, unsigned long address)
117
{
118
unsigned long caller;
119
int vsyscall_nr, syscall_nr, tmp;
120
long ret;
121
unsigned long orig_dx;
122
123
/* Write faults or kernel-privilege faults never get fixed up. */
124
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
125
return false;
126
127
/*
128
* Assume that faults at regs->ip are because of an
129
* instruction fetch. Return early and avoid
130
* emulation for faults during data accesses:
131
*/
132
if (address != regs->ip) {
133
/* Failed vsyscall read */
134
if (vsyscall_mode == EMULATE)
135
return false;
136
137
/*
138
* User code tried and failed to read the vsyscall page.
139
*/
140
warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
141
return false;
142
}
143
144
/*
145
* X86_PF_INSTR is only set when NX is supported. When
146
* available, use it to double-check that the emulation code
147
* is only being used for instruction fetches:
148
*/
149
if (cpu_feature_enabled(X86_FEATURE_NX))
150
WARN_ON_ONCE(!(error_code & X86_PF_INSTR));
151
152
/*
153
* No point in checking CS -- the only way to get here is a user mode
154
* trap to a high address, which means that we're in 64-bit user code.
155
*/
156
157
if (vsyscall_mode == NONE) {
158
warn_bad_vsyscall(KERN_INFO, regs,
159
"vsyscall attempted with vsyscall=none");
160
return false;
161
}
162
163
vsyscall_nr = addr_to_vsyscall_nr(address);
164
165
trace_emulate_vsyscall(vsyscall_nr);
166
167
if (vsyscall_nr < 0) {
168
warn_bad_vsyscall(KERN_WARNING, regs,
169
"misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
170
goto sigsegv;
171
}
172
173
if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
174
warn_bad_vsyscall(KERN_WARNING, regs,
175
"vsyscall with bad stack (exploit attempt?)");
176
goto sigsegv;
177
}
178
179
/*
180
* Check for access_ok violations and find the syscall nr.
181
*
182
* NULL is a valid user pointer (in the access_ok sense) on 32-bit and
183
* 64-bit, so we don't need to special-case it here. For all the
184
* vsyscalls, NULL means "don't write anything" not "write it at
185
* address 0".
186
*/
187
switch (vsyscall_nr) {
188
case 0:
189
if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
190
!write_ok_or_segv(regs->si, sizeof(struct timezone))) {
191
ret = -EFAULT;
192
goto check_fault;
193
}
194
195
syscall_nr = __NR_gettimeofday;
196
break;
197
198
case 1:
199
if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
200
ret = -EFAULT;
201
goto check_fault;
202
}
203
204
syscall_nr = __NR_time;
205
break;
206
207
case 2:
208
if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
209
!write_ok_or_segv(regs->si, sizeof(unsigned))) {
210
ret = -EFAULT;
211
goto check_fault;
212
}
213
214
syscall_nr = __NR_getcpu;
215
break;
216
}
217
218
/*
219
* Handle seccomp. regs->ip must be the original value.
220
* See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
221
*
222
* We could optimize the seccomp disabled case, but performance
223
* here doesn't matter.
224
*/
225
regs->orig_ax = syscall_nr;
226
regs->ax = -ENOSYS;
227
tmp = secure_computing();
228
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
229
warn_bad_vsyscall(KERN_DEBUG, regs,
230
"seccomp tried to change syscall nr or ip");
231
force_exit_sig(SIGSYS);
232
return true;
233
}
234
regs->orig_ax = -1;
235
if (tmp)
236
goto do_ret; /* skip requested */
237
238
/*
239
* With a real vsyscall, page faults cause SIGSEGV.
240
*/
241
ret = -EFAULT;
242
switch (vsyscall_nr) {
243
case 0:
244
/* this decodes regs->di and regs->si on its own */
245
ret = __x64_sys_gettimeofday(regs);
246
break;
247
248
case 1:
249
/* this decodes regs->di on its own */
250
ret = __x64_sys_time(regs);
251
break;
252
253
case 2:
254
/* while we could clobber regs->dx, we didn't in the past... */
255
orig_dx = regs->dx;
256
regs->dx = 0;
257
/* this decodes regs->di, regs->si and regs->dx on its own */
258
ret = __x64_sys_getcpu(regs);
259
regs->dx = orig_dx;
260
break;
261
}
262
263
check_fault:
264
if (ret == -EFAULT) {
265
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
266
warn_bad_vsyscall(KERN_INFO, regs,
267
"vsyscall fault (exploit attempt?)");
268
goto sigsegv;
269
}
270
271
regs->ax = ret;
272
273
do_ret:
274
/* Emulate a ret instruction. */
275
regs->ip = caller;
276
regs->sp += 8;
277
return true;
278
279
sigsegv:
280
force_sig(SIGSEGV);
281
return true;
282
}
283
284
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  Only the
 * 64-bit vsyscall page is covered; 32-bit has a real VMA now and needs
 * no special handling anymore.
 */

/* vm_ops->name callback: every gate VMA is reported as "[vsyscall]". */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
	static const char name[] = "[vsyscall]";

	return name;
}
293
static const struct vm_operations_struct gate_vma_ops = {
294
.name = gate_vma_name,
295
};
296
static struct vm_area_struct gate_vma __ro_after_init = {
297
.vm_start = VSYSCALL_ADDR,
298
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
299
.vm_page_prot = PAGE_READONLY_EXEC,
300
.vm_flags = VM_READ | VM_EXEC,
301
.vm_ops = &gate_vma_ops,
302
};
303
304
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
305
{
306
#ifdef CONFIG_COMPAT
307
if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
308
return NULL;
309
#endif
310
if (vsyscall_mode == NONE)
311
return NULL;
312
return &gate_vma;
313
}
314
315
int in_gate_area(struct mm_struct *mm, unsigned long addr)
316
{
317
struct vm_area_struct *vma = get_gate_vma(mm);
318
319
if (!vma)
320
return 0;
321
322
return (addr >= vma->vm_start) && (addr < vma->vm_end);
323
}
324
325
/*
326
* Use this when you have no reliable mm, typically from interrupt
327
* context. It is less reliable than using a task's mm and may give
328
* false positives.
329
*/
330
int in_gate_area_no_mm(unsigned long addr)
331
{
332
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
333
}
334
335
/*
336
* The VSYSCALL page is the only user-accessible page in the kernel address
337
* range. Normally, the kernel page tables can have _PAGE_USER clear, but
338
* the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
339
* are enabled.
340
*
341
* Some day we may create a "minimal" vsyscall mode in which we emulate
342
* vsyscalls but leave the page not present. If so, we skip calling
343
* this.
344
*/
345
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
346
{
347
pgd_t *pgd;
348
p4d_t *p4d;
349
pud_t *pud;
350
pmd_t *pmd;
351
352
pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
353
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
354
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
355
set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
356
pud = pud_offset(p4d, VSYSCALL_ADDR);
357
set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
358
pmd = pmd_offset(pud, VSYSCALL_ADDR);
359
set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
360
}
361
362
/*
 * Boot-time setup of the vsyscall page according to vsyscall_mode.
 *
 * EMULATE: install the page in the fixmap and set the user bits on the
 *          page-table entries above it.
 * XONLY:   no PTE backs the page at all; the gate VMA is reduced to
 *          VM_EXEC.
 * NONE:    nothing to do.
 */
void __init map_vsyscall(void)
{
	extern char __vsyscall_page;
	unsigned long page_pa = __pa_symbol(&__vsyscall_page);

	/* The fixmap slot must sit exactly at the ABI-fixed address. */
	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
		     (unsigned long)VSYSCALL_ADDR);

	switch (vsyscall_mode) {
	case EMULATE:
		/* For full emulation, the page needs to exist for real. */
		__set_fixmap(VSYSCALL_PAGE, page_pa, PAGE_KERNEL_VVAR);
		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
		break;

	case XONLY:
		vm_flags_init(&gate_vma, VM_EXEC);
		break;

	default:
		break;
	}
}
384
385