/* arch/x86/entry/vsyscall/vsyscall_64.c */
// SPDX-License-Identifier: GPL-2.01/*2* Copyright (c) 2012-2014 Andy Lutomirski <[email protected]>3*4* Based on the original implementation which is:5* Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE6* Copyright 2003 Andi Kleen, SuSE Labs.7*8* Parts of the original code have been moved to arch/x86/vdso/vma.c9*10* This file implements vsyscall emulation. vsyscalls are a legacy ABI:11* Userspace can request certain kernel services by calling fixed12* addresses. This concept is problematic:13*14* - It interferes with ASLR.15* - It's awkward to write code that lives in kernel addresses but is16* callable by userspace at fixed addresses.17* - The whole concept is impossible for 32-bit compat userspace.18* - UML cannot easily virtualize a vsyscall.19*20* As of mid-2014, I believe that there is no new userspace code that21* will use a vsyscall if the vDSO is present. I hope that there will22* soon be no new userspace code that will ever use a vsyscall.23*24* The code in this file emulates vsyscalls when notified of a page25* fault to a vsyscall address.26*/2728#include <linux/kernel.h>29#include <linux/timer.h>30#include <linux/sched/signal.h>31#include <linux/mm_types.h>32#include <linux/syscalls.h>33#include <linux/ratelimit.h>3435#include <asm/vsyscall.h>36#include <asm/unistd.h>37#include <asm/fixmap.h>38#include <asm/traps.h>39#include <asm/paravirt.h>4041#define CREATE_TRACE_POINTS42#include "vsyscall_trace.h"4344static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =45#ifdef CONFIG_LEGACY_VSYSCALL_NONE46NONE;47#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)48XONLY;49#else50#error VSYSCALL config is broken51#endif5253static int __init vsyscall_setup(char *str)54{55if (str) {56if (!strcmp("emulate", str))57vsyscall_mode = EMULATE;58else if (!strcmp("xonly", str))59vsyscall_mode = XONLY;60else if (!strcmp("none", str))61vsyscall_mode = NONE;62else63return -EINVAL;6465return 0;66}6768return -EINVAL;69}70early_param("vsyscall", 
vsyscall_setup);7172static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,73const char *message)74{75if (!show_unhandled_signals)76return;7778printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",79level, current->comm, task_pid_nr(current),80message, regs->ip, regs->cs,81regs->sp, regs->ax, regs->si, regs->di);82}8384static int addr_to_vsyscall_nr(unsigned long addr)85{86int nr;8788if ((addr & ~0xC00UL) != VSYSCALL_ADDR)89return -EINVAL;9091nr = (addr & 0xC00UL) >> 10;92if (nr >= 3)93return -EINVAL;9495return nr;96}9798static bool write_ok_or_segv(unsigned long ptr, size_t size)99{100if (!access_ok((void __user *)ptr, size)) {101struct thread_struct *thread = ¤t->thread;102103thread->error_code = X86_PF_USER | X86_PF_WRITE;104thread->cr2 = ptr;105thread->trap_nr = X86_TRAP_PF;106107force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);108return false;109} else {110return true;111}112}113114bool emulate_vsyscall(unsigned long error_code,115struct pt_regs *regs, unsigned long address)116{117unsigned long caller;118int vsyscall_nr, syscall_nr, tmp;119long ret;120unsigned long orig_dx;121122/* Write faults or kernel-privilege faults never get fixed up. */123if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)124return false;125126/*127* Assume that faults at regs->ip are because of an128* instruction fetch. Return early and avoid129* emulation for faults during data accesses:130*/131if (address != regs->ip) {132/* Failed vsyscall read */133if (vsyscall_mode == EMULATE)134return false;135136/*137* User code tried and failed to read the vsyscall page.138*/139warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");140return false;141}142143/*144* X86_PF_INSTR is only set when NX is supported. 
When145* available, use it to double-check that the emulation code146* is only being used for instruction fetches:147*/148if (cpu_feature_enabled(X86_FEATURE_NX))149WARN_ON_ONCE(!(error_code & X86_PF_INSTR));150151/*152* No point in checking CS -- the only way to get here is a user mode153* trap to a high address, which means that we're in 64-bit user code.154*/155156if (vsyscall_mode == NONE) {157warn_bad_vsyscall(KERN_INFO, regs,158"vsyscall attempted with vsyscall=none");159return false;160}161162vsyscall_nr = addr_to_vsyscall_nr(address);163164trace_emulate_vsyscall(vsyscall_nr);165166if (vsyscall_nr < 0) {167warn_bad_vsyscall(KERN_WARNING, regs,168"misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");169goto sigsegv;170}171172if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {173warn_bad_vsyscall(KERN_WARNING, regs,174"vsyscall with bad stack (exploit attempt?)");175goto sigsegv;176}177178/*179* Check for access_ok violations and find the syscall nr.180*181* NULL is a valid user pointer (in the access_ok sense) on 32-bit and182* 64-bit, so we don't need to special-case it here. For all the183* vsyscalls, NULL means "don't write anything" not "write it at184* address 0".185*/186switch (vsyscall_nr) {187case 0:188if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||189!write_ok_or_segv(regs->si, sizeof(struct timezone))) {190ret = -EFAULT;191goto check_fault;192}193194syscall_nr = __NR_gettimeofday;195break;196197case 1:198if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {199ret = -EFAULT;200goto check_fault;201}202203syscall_nr = __NR_time;204break;205206case 2:207if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||208!write_ok_or_segv(regs->si, sizeof(unsigned))) {209ret = -EFAULT;210goto check_fault;211}212213syscall_nr = __NR_getcpu;214break;215}216217/*218* Handle seccomp. 
regs->ip must be the original value.219* See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.220*221* We could optimize the seccomp disabled case, but performance222* here doesn't matter.223*/224regs->orig_ax = syscall_nr;225regs->ax = -ENOSYS;226tmp = secure_computing();227if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {228warn_bad_vsyscall(KERN_DEBUG, regs,229"seccomp tried to change syscall nr or ip");230force_exit_sig(SIGSYS);231return true;232}233regs->orig_ax = -1;234if (tmp)235goto do_ret; /* skip requested */236237/*238* With a real vsyscall, page faults cause SIGSEGV.239*/240ret = -EFAULT;241switch (vsyscall_nr) {242case 0:243/* this decodes regs->di and regs->si on its own */244ret = __x64_sys_gettimeofday(regs);245break;246247case 1:248/* this decodes regs->di on its own */249ret = __x64_sys_time(regs);250break;251252case 2:253/* while we could clobber regs->dx, we didn't in the past... */254orig_dx = regs->dx;255regs->dx = 0;256/* this decodes regs->di, regs->si and regs->dx on its own */257ret = __x64_sys_getcpu(regs);258regs->dx = orig_dx;259break;260}261262check_fault:263if (ret == -EFAULT) {264/* Bad news -- userspace fed a bad pointer to a vsyscall. */265warn_bad_vsyscall(KERN_INFO, regs,266"vsyscall fault (exploit attempt?)");267goto sigsegv;268}269270regs->ax = ret;271272do_ret:273/* Emulate a ret instruction. */274regs->ip = caller;275regs->sp += 8;276return true;277278sigsegv:279force_sig(SIGSEGV);280return true;281}282283/*284* A pseudo VMA to allow ptrace access for the vsyscall page. This only285* covers the 64bit vsyscall page now. 
32bit has a real VMA now and does286* not need special handling anymore:287*/288static const char *gate_vma_name(struct vm_area_struct *vma)289{290return "[vsyscall]";291}292static const struct vm_operations_struct gate_vma_ops = {293.name = gate_vma_name,294};295static struct vm_area_struct gate_vma __ro_after_init = {296.vm_start = VSYSCALL_ADDR,297.vm_end = VSYSCALL_ADDR + PAGE_SIZE,298.vm_page_prot = PAGE_READONLY_EXEC,299.vm_flags = VM_READ | VM_EXEC,300.vm_ops = &gate_vma_ops,301};302303struct vm_area_struct *get_gate_vma(struct mm_struct *mm)304{305#ifdef CONFIG_COMPAT306if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))307return NULL;308#endif309if (vsyscall_mode == NONE)310return NULL;311return &gate_vma;312}313314int in_gate_area(struct mm_struct *mm, unsigned long addr)315{316struct vm_area_struct *vma = get_gate_vma(mm);317318if (!vma)319return 0;320321return (addr >= vma->vm_start) && (addr < vma->vm_end);322}323324/*325* Use this when you have no reliable mm, typically from interrupt326* context. It is less reliable than using a task's mm and may give327* false positives.328*/329int in_gate_area_no_mm(unsigned long addr)330{331return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;332}333334/*335* The VSYSCALL page is the only user-accessible page in the kernel address336* range. Normally, the kernel page tables can have _PAGE_USER clear, but337* the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls338* are enabled.339*340* Some day we may create a "minimal" vsyscall mode in which we emulate341* vsyscalls but leave the page not present. 
If so, we skip calling342* this.343*/344void __init set_vsyscall_pgtable_user_bits(pgd_t *root)345{346pgd_t *pgd;347p4d_t *p4d;348pud_t *pud;349pmd_t *pmd;350351pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);352set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));353p4d = p4d_offset(pgd, VSYSCALL_ADDR);354set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));355pud = pud_offset(p4d, VSYSCALL_ADDR);356set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));357pmd = pmd_offset(pud, VSYSCALL_ADDR);358set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));359}360361void __init map_vsyscall(void)362{363extern char __vsyscall_page;364unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);365366/*367* For full emulation, the page needs to exist for real. In368* execute-only mode, there is no PTE at all backing the vsyscall369* page.370*/371if (vsyscall_mode == EMULATE) {372__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,373PAGE_KERNEL_VVAR);374set_vsyscall_pgtable_user_bits(swapper_pg_dir);375}376377if (vsyscall_mode == XONLY)378vm_flags_init(&gate_vma, VM_EXEC);379380BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=381(unsigned long)VSYSCALL_ADDR);382}383384385