untrusted comment: signature from openbsd 6.1 base secret key RWQEQa33SgQSEldy+UzqNGAlYgLi6bG6oWxjooOKpm9ZU4/VaZhLdkHV/uAYeQEemjOai0Gk1MapXITat+qKayx1B8zG4cOJoAw= OpenBSD 6.1 errata 037, March 1st, 2018: Intel CPUs contain a speculative execution flaw called Meltdown which allows userspace programs to access kernel memory. A complex workaround solves the problem. Apply by doing: signify -Vep /etc/signify/openbsd-61-base.pub -x 037_meltdown.patch.sig \ -m - | (cd /usr/src && patch -p0) And then rebuild and install a new kernel: KK=`sysctl -n kern.osversion | cut -d# -f1` cd /usr/src/sys/arch/`machine`/compile/$KK make obj make config make make install Index: sys/arch/amd64/amd64/cpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v retrieving revision 1.102.4.1 diff -u -p -r1.102.4.1 cpu.c --- sys/arch/amd64/amd64/cpu.c 26 Aug 2017 00:14:20 -0000 1.102.4.1 +++ sys/arch/amd64/amd64/cpu.c 21 Feb 2018 22:42:15 -0000 @@ -80,7 +80,7 @@ #include #include -#include +#include #include #include #include @@ -112,6 +112,14 @@ #include #endif /* HIBERNATE */ +/* #define CPU_DEBUG */ + +#ifdef CPU_DEBUG +#define DPRINTF(x...) do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* CPU_DEBUG */ + int cpu_match(struct device *, void *, void *); void cpu_attach(struct device *, struct device *, void *); int cpu_activate(struct device *, int); @@ -170,7 +178,7 @@ struct cfdriver cpu_cd = { * CPU, on uniprocessors). The CPU info list is initialized to * point at it. */ -struct cpu_info cpu_info_primary = { 0, &cpu_info_primary }; +struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } }; struct cpu_info *cpu_info_list = &cpu_info_primary; @@ -336,8 +344,15 @@ cpu_attach(struct device *parent, struct * structure, otherwise use the primary's. */ if (caa->cpu_role == CPU_ROLE_AP) { - ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK|M_ZERO); + struct cpu_info_full *cif; + + cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok); + ci = &cif->cif_cpu; #if defined(MULTIPROCESSOR) + ci->ci_tss = &cif->cif_tss; + ci->ci_gdt = (void *)(ci->ci_tss + 1); + memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE); + cpu_enter_pages(cif); if (cpu_info[cpunum] != NULL) panic("cpu at apic id %d already attached?", cpunum); cpu_info[cpunum] = ci; @@ -443,7 +458,6 @@ cpu_attach(struct device *parent, struct #if defined(MULTIPROCESSOR) cpu_intr_init(ci); - gdt_alloc_cpu(ci); sched_init_cpu(ci); cpu_start_secondary(ci); ncpus++; @@ -920,4 +934,63 @@ cpu_activate(struct device *self, int ac } return (0); +} + +/* + * cpu_enter_pages + * + * Requests mapping of various special pages required in the Intel Meltdown + * case (to be entered into the U-K page table): + * + * 1 tss+gdt page for each CPU + * 1 trampoline stack page for each CPU + * + * The cpu_info_full struct for each CPU straddles these pages. The offset into + * 'cif' is calculated below, for each page. For more information, consult + * the definition of struct cpu_info_full in cpu_full.h + * + * On CPUs unaffected by Meltdown, this function still configures 'cif' but + * the calls to pmap_enter_special become no-ops. + * + * Parameters: + * cif : the cpu_info_full structure describing a CPU whose pages are to be + * entered into the special meltdown U-K page table. 
+ */ +void +cpu_enter_pages(struct cpu_info_full *cif) +{ + vaddr_t va; + paddr_t pa; + + /* The TSS+GDT need to be readable */ + va = (vaddr_t)cif; + pmap_extract(pmap_kernel(), va, &pa); + pmap_enter_special(va, pa, PROT_READ); + DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)pa); + + /* The trampoline stack page needs to be read/write */ + va = (vaddr_t)&cif->cif_tramp_stack; + pmap_extract(pmap_kernel(), va, &pa); + pmap_enter_special(va, pa, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)pa); + + cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16; + DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n" ,__func__, + (uint64_t)cif->cif_tss.tss_rsp0); + cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 - + sizeof(struct iretq_frame); + +#define SETUP_IST_SPECIAL_STACK(ist, cif, member) do { \ + (cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member + \ + sizeof((cif)->member) - 16; \ + (cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \ +} while (0) + + SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack); + SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack); + + /* an empty iomap, by setting its offset to the TSS limit */ + cif->cif_tss.tss_iobase = sizeof(cif->cif_tss); } Index: sys/arch/amd64/amd64/gdt.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/gdt.c,v retrieving revision 1.24 diff -u -p -r1.24 gdt.c --- sys/arch/amd64/amd64/gdt.c 24 May 2015 01:01:49 -0000 1.24 +++ sys/arch/amd64/amd64/gdt.c 21 Feb 2018 22:42:15 -0000 @@ -45,33 +45,6 @@ #include /* - * Allocate shadow GDT for a slave cpu. - */ -void -gdt_alloc_cpu(struct cpu_info *ci) -{ - struct vm_page *pg; - vaddr_t va; - - ci->ci_gdt = (char *)uvm_km_valloc(kernel_map, - GDT_SIZE + sizeof(*ci->ci_tss)); - ci->ci_tss = (void *)(ci->ci_gdt + GDT_SIZE); - uvm_map_pageable(kernel_map, (vaddr_t)ci->ci_gdt, - (vaddr_t)ci->ci_gdt + GDT_SIZE, FALSE, FALSE); - for (va = (vaddr_t)ci->ci_gdt; - va < (vaddr_t)ci->ci_gdt + GDT_SIZE + sizeof(*ci->ci_tss); - va += PAGE_SIZE) { - pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); - if (pg == NULL) - panic("gdt_init: no pages"); - pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), PROT_READ | PROT_WRITE); - } - memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE); - bzero(ci->ci_tss, sizeof(*ci->ci_tss)); -} - - -/* * Load appropriate gdt descriptor; we better be running on *ci * (for the most part, this is how a cpu knows who it is). 
*/ Index: sys/arch/amd64/amd64/genassym.cf =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/genassym.cf,v retrieving revision 1.31 diff -u -p -r1.31 genassym.cf --- sys/arch/amd64/amd64/genassym.cf 18 May 2015 19:59:27 -0000 1.31 +++ sys/arch/amd64/amd64/genassym.cf 21 Feb 2018 22:42:15 -0000 @@ -78,6 +78,15 @@ member tf_ss define FRAMESIZE sizeof(struct trapframe) +struct iretq_frame +member IRETQ_CS iretq_cs +member IRETQ_RIP iretq_rip +member IRETQ_RFLAGS iretq_rflags +member IRETQ_RSP iretq_rsp +member IRETQ_SS iretq_ss + +define IRETQ_SIZE sizeof(struct iretq_frame) + struct pcb member pcb_cr3 member pcb_rsp @@ -91,6 +100,8 @@ member pcb_cr0 struct pmap member pm_cpus +member pm_pdirpa +member pm_pdirpa_intel struct x86_64_tss member tss_rsp0 @@ -115,6 +126,10 @@ endif member CPU_INFO_GDT ci_gdt member CPU_INFO_TSS ci_tss member CPU_INFO_FLAGS ci_flags +member CPU_INFO_KERN_CR3 ci_kern_cr3 +member CPU_INFO_USER_CR3 ci_user_cr3 +member CPU_INFO_KERN_RSP ci_kern_rsp +member CPU_INFO_INTR_RSP ci_intr_rsp export CPUF_USERSEGS_BIT Index: sys/arch/amd64/amd64/identcpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v retrieving revision 1.82 diff -u -p -r1.82 identcpu.c --- sys/arch/amd64/amd64/identcpu.c 28 Mar 2017 21:36:27 -0000 1.82 +++ sys/arch/amd64/amd64/identcpu.c 21 Feb 2018 22:42:15 -0000 @@ -204,6 +204,10 @@ const struct { { SEFF0ECX_AVX512VBMI, "AVX512VBMI" }, { SEFF0ECX_UMIP, "UMIP" }, { SEFF0ECX_PKU, "PKU" }, +}, cpu_seff0_edxfeatures[] = { + { SEFF0EDX_IBRS, "IBRS,IBPB" }, + { SEFF0EDX_STIBP, "STIBP" }, + /* SEFF0EDX_ARCH_CAP (not printed) */ }, cpu_tpm_eaxfeatures[] = { { TPM_SENSOR, "SENSOR" }, { TPM_ARAT, "ARAT" }, @@ -211,6 +215,8 @@ const struct { { CPUIDEAX_VERID, "PERF" }, }, cpu_cpuid_apmi_edx[] = { { CPUIDEDX_ITSC, "ITSC" }, +}, cpu_amdspec_ebxfeatures[] = { + { CPUIDEBX_IBPB, "IBPB" }, }; int @@ -489,6 +495,7 @@ identifycpu(struct cpu_info *ci) int i; char *brandstr_from, *brandstr_to; int skipspace; + extern uint32_t cpu_meltdown; CPUID(1, ci->ci_signature, val, dummy, ci->ci_feature_flags); CPUID(0x80000000, ci->ci_pnfeatset, dummy, dummy, dummy); @@ -607,7 +614,7 @@ identifycpu(struct cpu_info *ci) if (cpuid_level >= 0x07) { /* "Structured Extended Feature Flags" */ CPUID_LEAF(0x7, 0, dummy, ci->ci_feature_sefflags_ebx, - ci->ci_feature_sefflags_ecx, dummy); + ci->ci_feature_sefflags_ecx, ci->ci_feature_sefflags_edx); for (i = 0; i < nitems(cpu_seff0_ebxfeatures); i++) if (ci->ci_feature_sefflags_ebx & cpu_seff0_ebxfeatures[i].bit) @@ -616,6 +623,10 @@ identifycpu(struct cpu_info *ci) if (ci->ci_feature_sefflags_ecx & cpu_seff0_ecxfeatures[i].bit) printf(",%s", cpu_seff0_ecxfeatures[i].str); + for (i = 0; i < nitems(cpu_seff0_edxfeatures); i++) + if (ci->ci_feature_sefflags_edx & + cpu_seff0_edxfeatures[i].bit) + printf(",%s", cpu_seff0_edxfeatures[i].str); } if (!strcmp(cpu_vendor, "GenuineIntel") && cpuid_level >= 0x06) { @@ -628,6 +639,22 @@ identifycpu(struct cpu_info *ci) if (ci->ci_family >= 0x12) ci->ci_feature_tpmflags |= TPM_ARAT; } + + /* AMD speculation control features */ + if (!strcmp(cpu_vendor, "AuthenticAMD")) { + if (ci->ci_pnfeatset >= 0x80000008) { + CPUID(0x80000008, dummy, ci->ci_feature_amdspec_ebx, + dummy, dummy); + for (i = 0; i < nitems(cpu_amdspec_ebxfeatures); i++) + if (ci->ci_feature_amdspec_ebx & + cpu_amdspec_ebxfeatures[i].bit) + printf(",%s", + cpu_amdspec_ebxfeatures[i].str); + } + } + + if 
(cpu_meltdown) + printf(",MELTDOWN"); printf("\n"); Index: sys/arch/amd64/amd64/lapic.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v retrieving revision 1.45 diff -u -p -r1.45 lapic.c --- sys/arch/amd64/amd64/lapic.c 1 Nov 2016 01:13:19 -0000 1.45 +++ sys/arch/amd64/amd64/lapic.c 21 Feb 2018 22:42:15 -0000 @@ -62,6 +62,14 @@ #include #endif +/* #define LAPIC_DEBUG */ + +#ifdef LAPIC_DEBUG +#define DPRINTF(x...) do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* LAPIC_DEBUG */ + struct evcount clk_count; #ifdef MULTIPROCESSOR struct evcount ipi_count; @@ -202,6 +210,7 @@ lapic_map(paddr_t lapic_base) codepatch_call(CPTAG_EOI, &x2apic_eoi); lapic_writereg(LAPIC_TPRI, s); + va = (vaddr_t)&local_apic; } else { /* * Map local apic. If we have a local apic, it's safe to @@ -220,6 +229,17 @@ lapic_map(paddr_t lapic_base) lapic_tpr = s; } + + /* + * Enter the LAPIC MMIO page in the U-K page table for handling + * Meltdown (needed in the interrupt stub to acknowledge the + * incoming interrupt). On CPUs unaffected by Meltdown, + * pmap_enter_special is a no-op. + * XXX - need to map this PG_N + */ + pmap_enter_special(va, lapic_base, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered lapic page va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)va, (uint64_t)lapic_base); enable_intr(); } Index: sys/arch/amd64/amd64/locore.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/locore.S,v retrieving revision 1.84.4.2 diff -u -p -r1.84.4.2 locore.S --- sys/arch/amd64/amd64/locore.S 4 Oct 2017 19:38:03 -0000 1.84.4.2 +++ sys/arch/amd64/amd64/locore.S 21 Feb 2018 22:42:22 -0000 @@ -113,6 +113,7 @@ #include #include +#include #include #include #include @@ -181,6 +182,8 @@ _C_LABEL(lapic_isr): .globl _C_LABEL(biosbasemem),_C_LABEL(biosextmem) .globl _C_LABEL(bootapiver) .globl _C_LABEL(pg_nx) + .globl _C_LABEL(pg_g_kern) + .globl _C_LABEL(cpu_meltdown) _C_LABEL(cpu_id): .long 0 # saved from `cpuid' instruction _C_LABEL(cpu_feature): .long 0 # feature flags from 'cpuid' # instruction @@ -213,6 +216,10 @@ _C_LABEL(biosextmem): .long 0 # extended _C_LABEL(biosextmem): .long REALEXTMEM #endif _C_LABEL(pg_nx): .quad 0 # NX PTE bit (if CPU supports) +_C_LABEL(pg_g_kern): .quad 0 # 0x100 if global pages should be used + # in kernel mappings, 0 otherwise (for + # insecure CPUs) +_C_LABEL(cpu_meltdown): .long 0 # 1 if this CPU has Meltdown #define _RELOC(x) ((x) - KERNBASE) #define RELOC(x) _RELOC(_C_LABEL(x)) @@ -318,6 +325,48 @@ bi_size_ok: movl %ecx,8(%ebp) movl $0, 12(%ebp) + /* + * Determine if CPU has meltdown. Certain Intel CPUs do not properly + * respect page permissions when speculatively loading data into + * the cache ("Meltdown" CVE). These CPUs must utilize a secondary + * sanitized page table lacking kernel mappings when executing user + * processes, and may not use PG_G global PTEs for kernel VAs. + */ + movl $0x1, RELOC(cpu_meltdown) /* assume insecure at first */ + movl $0x0, RELOC(pg_g_kern) + + cmpl $0x756e6547, %ebx # "Genu" + jne .Lcpu_secure + cmpl $0x6c65746e, %ecx # "ntel" + jne .Lcpu_secure + cmpl $0x49656e69, %edx # "ineI" + jne .Lcpu_secure + + /* + * Intel CPU, now check if IA32_ARCH_CAPABILITIES is supported and + * if it says this CPU is safe. 
+ */ + movl $0x0, %eax + cpuid + cmpl $0x7, %eax + jl .Lcpu_check_finished + + movl $0x7, %eax + cpuid + testl $SEFF0EDX_ARCH_CAP, %edx + jz .Lcpu_check_finished + + /* IA32_ARCH_CAPABILITIES MSR avaialble, use it to check CPU security */ + movl $MSR_ARCH_CAPABILITIES, %ecx + rdmsr + testl $ARCH_CAPABILITIES_RDCL_NO, %eax + jz .Lcpu_check_finished + +.Lcpu_secure: + movl $0x0, RELOC(cpu_meltdown) + movl $PG_G, RELOC(pg_g_kern) + +.Lcpu_check_finished: movl $1,%eax cpuid movl %eax,RELOC(cpu_id) @@ -584,7 +633,8 @@ map_tables: leal (PROC0_DMP2_OFF)(%esi), %ebx xorl %eax, %eax movl $(NDML2_ENTRIES * NPDPG), %ecx -1: orl $(PG_V|PG_KW|PG_PS|PG_G), %eax +1: orl $(PG_V|PG_KW|PG_PS), %eax + orl RELOC(pg_g_kern), %eax cmpl $__kernel_base_phys, %eax jl store_pte cmpl $__kernel_end_phys, %eax @@ -751,7 +801,7 @@ longmode_hi: /*****************************************************************************/ /* - * Signal trampoline; copied to top of user stack. + * Signal trampoline; copied to a page mapped into userspace. * gdb's backtrace logic matches against the instructions in this. */ .section .rodata @@ -888,11 +938,15 @@ switch_exited: btrl $CPUF_USERSEGS_BIT, CPUVAR(FLAGS) jnc restore_saved - /* set %ds, %es, and %fs to expected value to prevent info leak */ + /* set %ds, %es, %fs, and %gs to expected value to prevent info leak */ movw $(GSEL(GUDATA_SEL, SEL_UPL)),%ax movw %ax,%ds movw %ax,%es movw %ax,%fs + cli /* block interrupts when on user GS.base */ + swapgs /* switch from kernel to user GS.base */ + movw %ax,%gs /* set %gs to UDATA and GS.base to 0 */ + swapgs /* back to kernel GS.base */ restore_saved: /* @@ -912,20 +966,34 @@ restore_saved: movq PCB_RSP(%r13),%rsp movq PCB_RBP(%r13),%rbp - movq CPUVAR(TSS),%rcx - movq PCB_KSTACK(%r13),%rdx - movq %rdx,TSS_RSP0(%rcx) - movq PCB_CR3(%r13),%rax - movq %rax,%cr3 + movq %rax,%cr3 /* %rax used below too */ /* Don't bother with the rest if switching to a system process. */ testl $P_SYSTEM,P_FLAG(%r12) jnz switch_restored + /* record the bits needed for future U-->K transition */ + movq PCB_KSTACK(%r13),%rdx + subq $FRAMESIZE,%rdx + movq %rdx,CPUVAR(KERN_RSP) + movq PCB_PMAP(%r13),%rcx + + /* + * Meltdown: iff we're doing separate U+K and U-K page tables, + * then record them in cpu_info for easy access in syscall and + * interrupt trampolines. XXX code patch this + */ + + movq PM_PDIRPA_INTEL(%rcx),%rdx + testq %rdx,%rdx + jz 0f /* yay, no intel suckiness */ + movq %rax,CPUVAR(KERN_CR3) + movq %rdx,CPUVAR(USER_CR3) +0: + /* set the new pmap's bit for the cpu */ movl CPUVAR(CPUID),%edi - movq PCB_PMAP(%r13),%rcx lock btsq %rdi,PM_CPUS(%rcx) #ifdef DIAGNOSTIC @@ -1011,8 +1079,7 @@ IDTVEC(syscall32) sysret /* go away please */ /* - * syscall insn entry. This currently isn't much faster, but - * it can be made faster in the future. + * syscall insn entry. */ IDTVEC(syscall) /* @@ -1022,13 +1089,20 @@ IDTVEC(syscall) * the user-space value. * First order of business is to swap to the kernel gs.base so that * we can access our struct cpu_info and use the scratch space there - * to switch to our kernel stack. Once that's in place we can + * to switch to the kernel page tables (thank you, Intel), then + * switch to our kernel stack. Once that's in place we can * unblock interrupts and save the rest of the syscall frame. 
*/ swapgs movq %r15,CPUVAR(SCRATCH) - movq CPUVAR(CURPCB),%r15 - movq PCB_KSTACK(%r15),%r15 + movq CPUVAR(KERN_CR3),%r15 + testq %r15,%r15 + jz Xsyscall_untramp + movq %r15,%cr3 + jmp Xsyscall_untramp + +NENTRY(Xsyscall_untramp) + movq CPUVAR(KERN_RSP),%r15 xchgq %r15,%rsp sti @@ -1039,12 +1113,11 @@ IDTVEC(syscall) * ss:rsp, etc, so that all GP registers can be * saved. Then, fill in the rest. */ - pushq $(GSEL(GUDATA_SEL, SEL_UPL)) - pushq %r15 - subq $(TF_RSP-TF_TRAPNO),%rsp + movq $(GSEL(GUDATA_SEL, SEL_UPL)),TF_SS(%rsp) + movq %r15,TF_RSP(%rsp) movq CPUVAR(SCRATCH),%r15 - subq $32,%rsp - INTR_SAVE_GPRS + INTR_SAVE_MOST_GPRS_NO_ADJ + movq %rcx,TF_RCX(%rsp) movq %r11, TF_RFLAGS(%rsp) /* old rflags from syscall insn */ movq $(GSEL(GUCODE_SEL, SEL_UPL)), TF_CS(%rsp) movq %rcx,TF_RIP(%rsp) @@ -1089,16 +1162,45 @@ IDTVEC(syscall) movq TF_RBP(%rsp),%rbp movq TF_RBX(%rsp),%rbx - INTR_RESTORE_SELECTORS + /* Restore FS.base if it's not already in the CPU */ + btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS) + jc 99f + movq CPUVAR(CURPCB),%rdx + movq PCB_FSBASE(%rdx),%rax + movq %rax,%rdx + shrq $32,%rdx + movl $MSR_FSBASE,%ecx + wrmsr +99: + /* + * We need to finish reading from the trapframe, then switch + * to the user page tables, swapgs, and return. We need + * to get the final value for the register that was used + * for the mov to %cr3 from somewhere accessible on the + * user page tables, so save it in CPUVAR(SCRATCH) across + * the switch. + */ movq TF_RDX(%rsp),%rdx movq TF_RAX(%rsp),%rax + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(USER_CR3),%rax movq TF_RIP(%rsp),%rcx movq TF_RFLAGS(%rsp),%r11 movq TF_RSP(%rsp),%rsp + testq %rax,%rax + jz 1f + jmp syscall_trampback + +KUENTRY(syscall_trampback) + movq %rax,%cr3 +1: movq CPUVAR(SCRATCH),%rax + swapgs sysretq + .text + #ifdef DIAGNOSTIC .Lsyscall_spl_not_lowered: movabsq $4f, %rdi @@ -1132,6 +1234,12 @@ NENTRY(proc_trampoline) * Return via iretq, for real interrupts and signal returns */ NENTRY(intr_fast_exit) +#ifdef DIAGNOSTIC + pushfq + popq %rdx + testq $PSL_I,%rdx + jnz .Lintr_exit_not_blocked +#endif /* DIAGNOSTIC */ movq TF_RDI(%rsp),%rdi movq TF_RSI(%rsp),%rsi movq TF_R8(%rsp),%r8 @@ -1145,11 +1253,68 @@ NENTRY(intr_fast_exit) movq TF_RBX(%rsp),%rbx testq $SEL_RPL,TF_CS(%rsp) - je 5f + je intr_exit_recurse /* returning back to kernel? */ + + /* returning to userspace. XXX fix up iret frame here */ + + /* restore FS.base if it's not already in the CPU */ + btsl $CPUF_USERSEGS_BIT,CPUVAR(FLAGS) + jc 99f + movq CPUVAR(CURPCB),%rdx /* for below */ + movq PCB_FSBASE(%rdx),%rax + movq %rax,%rdx + shrq $32,%rdx + movl $MSR_FSBASE,%ecx + wrmsr +99: + /* + * Returning to userspace. We need to go things in this order: + * - update the iret frame from the trapframe + * - finish reading from the trapframe + * - switch to the trampoline stack + * - jump to the .kutext segment + * - switch to the user page tables + * - swapgs + * - iretq + * To get the final value for the register that was used + * for the mov to %cr3, we need access to somewhere accessible + * on the user page tables, so we save it in CPUVAR(SCRATCH) + * across the switch. 
+ */ + /* update iret frame */ + movq CPUVAR(INTR_RSP),%rdx + movq $(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx) + movq TF_RIP(%rsp),%rax + movq %rax,IRETQ_RIP(%rdx) + movq TF_RFLAGS(%rsp),%rax + movq %rax,IRETQ_RFLAGS(%rdx) + movq TF_RSP(%rsp),%rax + movq %rax,IRETQ_RSP(%rdx) + movq $(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx) + /* finish with the trap frame */ + movq TF_RAX(%rsp),%rax + movq %rax,CPUVAR(SCRATCH) + movq TF_RCX(%rsp),%rcx + movq TF_R11(%rsp),%r11 + /* switch to the trampoline stack */ + xchgq %rdx,%rsp + movq TF_RDX(%rdx),%rdx + movq CPUVAR(USER_CR3),%rax + testq %rax,%rax + jz 1f + jmp iretq_tramp + +KUENTRY(iretq_tramp) + movq %rax,%cr3 +1: movq CPUVAR(SCRATCH),%rax + swapgs - INTR_RESTORE_SELECTORS + .globl _C_LABEL(doreti_iret) +_C_LABEL(doreti_iret): + iretq -5: movq TF_RDX(%rsp),%rdx +NENTRY(intr_exit_recurse) + movq TF_RDX(%rsp),%rdx movq TF_RCX(%rsp),%rcx movq TF_R11(%rsp),%r11 movq TF_RAX(%rsp),%rax @@ -1167,9 +1332,6 @@ NENTRY(intr_fast_exit) #endif /* !defined(GPROF) && defined(DDBPROF) */ addq $TF_RIP,%rsp - - .globl _C_LABEL(doreti_iret) -_C_LABEL(doreti_iret): iretq @@ -1202,6 +1364,33 @@ _C_LABEL(doreti_iret): addq $TF_RIP,%rsp iretq #endif /* !defined(GPROF) && defined(DDBPROF) */ + .text + +#ifdef DIAGNOSTIC +.Lintr_exit_not_blocked: + xchgw %bx, %bx + movl warn_once(%rip),%edi + testl %edi,%edi + jnz 1f + incl %edi + movl %edi,warn_once(%rip) + leaq .Lnot_blocked(%rip),%rdi + call _C_LABEL(printf) +#ifdef DDB + int $3 +#endif /* DDB */ +1: cli + jmp intr_fast_exit + + .data +.global warn_once +warn_once: + .long 0 + .section .rodata +.Lnot_blocked: + .asciz "WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n" + .text +#endif ENTRY(xrstor_user) movq %rsi, %rdx Index: sys/arch/amd64/amd64/machdep.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/machdep.c,v retrieving revision 1.226 diff -u -p -r1.226 machdep.c --- sys/arch/amd64/amd64/machdep.c 11 Mar 2017 11:55:03 -0000 1.226 +++ sys/arch/amd64/amd64/machdep.c 21 Feb 2018 22:42:22 -0000 @@ -99,7 +99,7 @@ #include -#include +#include #include #include #include @@ -152,6 +152,14 @@ extern int db_console; #include #endif +/* #define MACHDEP_DEBUG */ + +#ifdef MACHDEP_DEBUG +#define DPRINTF(x...) do { printf(x); } while(0) +#else +#define DPRINTF(x...) +#endif /* MACHDEP_DEBUG */ + /* the following is used externally (sysctl_hw) */ char machine[] = MACHINE; @@ -267,6 +275,7 @@ void cpu_init_extents(void); void map_tramps(void); void init_x86_64(paddr_t); void (*cpuresetfn)(void); +void enter_shared_special_pages(void); #ifdef KGDB #ifndef KGDB_DEVNAME @@ -341,6 +350,66 @@ cpu_startup(void) /* Safe for i/o port / memory space allocation to use malloc now. */ x86_bus_space_mallocok(); + + /* enter the IDT and trampoline code in the u-k maps */ + enter_shared_special_pages(); + + /* initialize CPU0's TSS and GDT and put them in the u-k maps */ + cpu_enter_pages(&cpu_info_full_primary); +} + +/* + * enter_shared_special_pages + * + * Requests mapping of various special pages required in the Intel Meltdown + * case (to be entered into the U-K page table): + * + * 1 IDT page + * Various number of pages covering the U-K ".kutext" section. This section + * contains code needed during trampoline operation + * Various number of pages covering the U-K ".kudata" section. 
This section + * contains data accessed by the trampoline, before switching to U+K + * (for example, various shared global variables used by IPIs, etc) + * + * The linker script places the required symbols in the sections above. + * + * On CPUs not affected by Meltdown, the calls to pmap_enter_special below + * become no-ops. + */ +void +enter_shared_special_pages(void) +{ + extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[]; + extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[]; + vaddr_t va; + paddr_t pa; + + /* idt */ + pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ); + DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__, + (uint64_t)idt_vaddr, (uint64_t)idt_paddr); + + /* .kutext section */ + va = (vaddr_t)__kutext_start; + pa = (paddr_t)__kernel_kutext_phys; + while (va < (vaddr_t)__kutext_end) { + pmap_enter_special(va, pa, PROT_READ | PROT_EXEC); + DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n", + __func__, (uint64_t)va, (uint64_t)pa); + va += PAGE_SIZE; + pa += PAGE_SIZE; + } + + /* .kudata section */ + va = (vaddr_t)__kudata_start; + pa = (paddr_t)__kernel_kudata_phys; + while (va < (vaddr_t)__kudata_end) { + pmap_enter_special(va, pa, PROT_READ | PROT_WRITE); + DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n", + __func__, (uint64_t)va, (uint64_t)pa); + va += PAGE_SIZE; + pa += PAGE_SIZE; + } } /* @@ -357,12 +426,6 @@ x86_64_proc0_tss_ldt_init(void) pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16; proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1; - /* an empty iomap, by setting its offset to the TSS limit */ - cpu_info_primary.ci_tss->tss_iobase = sizeof(struct x86_64_tss); - cpu_info_primary.ci_tss->tss_rsp0 = pcb->pcb_kstack; - cpu_info_primary.ci_tss->tss_ist[0] = - (u_int64_t)proc0.p_addr + PAGE_SIZE - 16; - ltr(GSYSSEL(GPROC0_SEL, SEL_KPL)); lldt(0); } @@ -374,15 +437,11 @@ x86_64_proc0_tss_ldt_init(void) #ifdef MULTIPROCESSOR void x86_64_init_pcb_tss_ldt(struct cpu_info *ci) -{ +{ struct pcb *pcb = ci->ci_idle_pcb; - ci->ci_tss->tss_iobase = sizeof(*ci->ci_tss); - ci->ci_tss->tss_rsp0 = pcb->pcb_kstack; - ci->ci_tss->tss_ist[0] = pcb->pcb_kstack - USPACE + PAGE_SIZE; - pcb->pcb_cr0 = rcr0(); -} +} #endif /* MULTIPROCESSOR */ bios_diskinfo_t * @@ -1027,25 +1086,27 @@ dumpsys(void) /* * Force the userspace FS.base to be reloaded from the PCB on return from - * the kernel, and reset most the segment registers (%ds, %es, and %fs) + * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs) * to their expected userspace value. */ void reset_segs(void) { /* - * Segment registers (%ds, %es, %fs, %gs) aren't in the trapframe. - * %gs is reset on return to userspace to avoid having to deal with - * swapgs; others are reset on context switch and here. This - * operates like the cpu_switchto() sequence: if we haven't reset - * %[def]s already, do so now. - */ + * This operates like the cpu_switchto() sequence: if we + * haven't reset %[defg]s already, do so now. 
+ */ if (curcpu()->ci_flags & CPUF_USERSEGS) { curcpu()->ci_flags &= ~CPUF_USERSEGS; __asm volatile( "movw %%ax,%%ds\n\t" "movw %%ax,%%es\n\t" - "movw %%ax,%%fs" : : "a"(GSEL(GUDATA_SEL, SEL_UPL))); + "movw %%ax,%%fs\n\t" + "cli\n\t" /* block intr when on user GS.base */ + "swapgs\n\t" /* swap from kernel to user GS.base */ + "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */ + "swapgs\n\t" /* back to kernel GS.base */ + "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL))); } } @@ -1571,8 +1632,6 @@ init_x86_64(paddr_t first_avail) pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE); - pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE, - PROT_READ | PROT_WRITE); #if defined(MULTIPROCESSOR) || \ (NACPI > 0 && !defined(SMALL_KERNEL)) @@ -1580,7 +1639,7 @@ init_x86_64(paddr_t first_avail) #endif idt = (struct gate_descriptor *)idt_vaddr; - cpu_info_primary.ci_tss = (void *)(idt + NIDT); + cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss; cpu_info_primary.ci_gdt = (void *)(cpu_info_primary.ci_tss + 1); /* make gdt gates and memory segments */ @@ -1605,9 +1664,10 @@ init_x86_64(paddr_t first_avail) /* exceptions */ for (x = 0; x < 32; x++) { - ist = (x == 8) ? 1 : 0; + /* trap2 == NMI, trap8 == double fault */ + ist = (x == 2) ? 2 : (x == 8) ? 1 : 0; setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT, - (x == 3 || x == 4) ? SEL_UPL : SEL_KPL, + (x == 3) ? SEL_UPL : SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); idt_allocmap[x] = 1; } Index: sys/arch/amd64/amd64/pmap.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v retrieving revision 1.103 diff -u -p -r1.103 pmap.c --- sys/arch/amd64/amd64/pmap.c 2 Jan 2017 07:41:18 -0000 1.103 +++ sys/arch/amd64/amd64/pmap.c 22 Feb 2018 20:55:47 -0000 @@ -127,6 +127,15 @@ #include "acpi.h" +/* #define PMAP_DEBUG */ + +#ifdef PMAP_DEBUG +#define DPRINTF(x...) do { printf(x); } while(0) +#else +#define DPRINTF(x...) 
+#endif /* PMAP_DEBUG */ + + /* * general info: * @@ -263,6 +272,7 @@ TAILQ_HEAD(pg_to_free, vm_page); struct pool pmap_pdp_pool; void pmap_pdp_ctor(pd_entry_t *); +void pmap_pdp_ctor_intel(pd_entry_t *); extern vaddr_t msgbuf_vaddr; extern paddr_t msgbuf_paddr; @@ -276,6 +286,8 @@ extern vaddr_t lo32_paddr; vaddr_t virtual_avail; extern int end; +extern uint32_t cpu_meltdown; + /* * local prototypes */ @@ -315,7 +327,6 @@ void pmap_tlb_shootwait(void); #define pmap_tlb_shootwait() #endif - /* * p m a p i n l i n e h e l p e r f u n c t i o n s */ @@ -329,7 +340,8 @@ static __inline boolean_t pmap_is_curpmap(struct pmap *pmap) { return((pmap == pmap_kernel()) || - (pmap->pm_pdirpa == (paddr_t) rcr3())); + (pmap->pm_pdirpa == (paddr_t) rcr3()) || + (pmap->pm_pdirpa_intel == (paddr_t) rcr3())); } /* @@ -488,7 +500,6 @@ pmap_find_pte_direct(struct pmap *pm, va return (0); } - /* * p m a p k e n t e r f u n c t i o n s * @@ -517,7 +528,7 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, v /* special 1:1 mappings in the first 2MB must not be global */ if (va >= (vaddr_t)NBPD_L2) - npte |= PG_G; + npte |= pg_g_kern; if (!(prot & PROT_EXEC)) npte |= pg_nx; @@ -590,12 +601,12 @@ pmap_kremove(vaddr_t sva, vsize_t len) paddr_t pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) { - vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS; + vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS; struct pmap *kpm; int i; - unsigned long p1i; long ndmpdp; paddr_t dmpd, dmpdp; + vaddr_t kva, kva_end; /* * define the boundaries of the managed kernel virtual address @@ -651,9 +662,14 @@ pmap_bootstrap(paddr_t first_avail, padd curpcb->pcb_pmap = kpm; /* proc0's pcb */ /* - * enable global TLB entries. + * Add PG_G attribute to already mapped kernel pages. pg_g_kern + * is calculated in locore0.S and may be set to: + * + * 0 if this CPU does not safely support global pages in the kernel + * (Intel/Meltdown) + * PG_G if this CPU does safely support global pages in the kernel + * (AMD) */ - /* add PG_G attribute to already mapped kernel pages */ #if KERNBASE == VM_MIN_KERNEL_ADDRESS for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ; #else @@ -661,9 +677,9 @@ pmap_bootstrap(paddr_t first_avail, padd for (kva = KERNBASE; kva < kva_end ; #endif kva += PAGE_SIZE) { - p1i = pl1_i(kva); + unsigned long p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) - PTE_BASE[p1i] |= PG_G; + PTE_BASE[p1i] |= pg_g_kern; } /* @@ -688,7 +704,7 @@ pmap_bootstrap(paddr_t first_avail, padd va = PMAP_DIRECT_MAP(pdp); *((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT); - *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | PG_G | PG_U | + *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U | PG_M | pg_nx; } @@ -734,7 +750,7 @@ pmap_bootstrap(paddr_t first_avail, padd LIST_INIT(&pmaps); /* - * initialize the pmap pool. + * initialize the pmap pools. */ pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_NONE, 0, @@ -750,6 +766,9 @@ pmap_bootstrap(paddr_t first_avail, padd pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_NONE, PR_WAITOK, "pdppl", NULL); + kpm->pm_pdir_intel = 0; + kpm->pm_pdirpa_intel = 0; + /* * ensure the TLB is sync'd with reality by flushing it... 
*/ @@ -902,13 +921,21 @@ pmap_free_ptp(struct pmap *pmap, struct unsigned long index; int level; vaddr_t invaladdr; - pd_entry_t opde; + pd_entry_t opde, *mdpml4es; level = 1; do { pmap_freepage(pmap, ptp, level, pagelist); index = pl_i(va, level + 1); opde = pmap_pte_set(&pdes[level - 1][index], 0); + if (level == 3 && pmap->pm_pdir_intel) { + /* Zap special meltdown PML4e */ + mdpml4es = (pd_entry_t *)pmap->pm_pdir_intel; + opde = pmap_pte_set(&mdpml4es[index], 0); + DPRINTF("%s: cleared meltdown PML4e @ index %lu " + "(va range start 0x%llx)\n", __func__, index, + (uint64_t)(index << L4_SHIFT)); + } invaladdr = level == 1 ? (vaddr_t)ptes : (vaddr_t)pdes[level - 2]; pmap_tlb_shootpage(curpcb->pcb_pmap, @@ -942,7 +969,7 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t struct vm_page *ptp, *pptp; int i; unsigned long index; - pd_entry_t *pva; + pd_entry_t *pva, *pva_intel; paddr_t ppa, pa; struct uvm_object *obj; @@ -981,6 +1008,20 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t pmap->pm_ptphint[i - 2] = ptp; pa = VM_PAGE_TO_PHYS(ptp); pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V); + + /* + * Meltdown Special case - if we are adding a new PML4e for + * usermode addresses, just copy the PML4e to the U-K page + * table. + */ + if (pmap->pm_pdir_intel && i == 4 && va < VM_MAXUSER_ADDRESS) { + pva_intel = (pd_entry_t *)pmap->pm_pdir_intel; + pva_intel[index] = pva[index]; + DPRINTF("%s: copying usermode PML4e (content=0x%llx) " + "from 0x%llx -> 0x%llx\n", __func__, pva[index], + (uint64_t)&pva[index], (uint64_t)&pva_intel[index]); + } + pmap->pm_stats.resident_count++; /* * If we're not in the top level, increase the @@ -1056,6 +1097,15 @@ pmap_pdp_ctor(pd_entry_t *pdir) #endif } +void +pmap_pdp_ctor_intel(pd_entry_t *pdir) +{ + struct pmap *kpm = pmap_kernel(); + + /* Copy PML4es from pmap_kernel's U-K view */ + memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE); +} + /* * pmap_create: create a pmap * @@ -1096,6 +1146,22 @@ pmap_create(void) pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME; + /* + * Intel CPUs need a special page table to be used during usermode + * execution, one that lacks all kernel mappings. + */ + if (cpu_meltdown) { + pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK); + pmap_pdp_ctor_intel(pmap->pm_pdir_intel); + if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel, + &pmap->pm_pdirpa_intel)) + panic("%s: unknown PA mapping for meltdown PML4\n", + __func__); + } else { + pmap->pm_pdir_intel = 0; + pmap->pm_pdirpa_intel = 0; + } + LIST_INSERT_HEAD(&pmaps, pmap, pm_list); return (pmap); } @@ -1153,6 +1219,9 @@ pmap_destroy(struct pmap *pmap) /* XXX: need to flush it out of other processor's space? 
*/ pool_put(&pmap_pdp_pool, pmap->pm_pdir); + if (pmap->pm_pdir_intel) + pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel); + pool_put(&pmap_pmap_pool, pmap); } @@ -1967,6 +2036,132 @@ pmap_collect(struct pmap *pmap) * defined as macro in pmap.h */ +void +pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot) +{ + uint64_t l4idx, l3idx, l2idx, l1idx; + pd_entry_t *pd, *ptp; + paddr_t npa; + struct pmap *pmap = pmap_kernel(); + pt_entry_t *ptes; + int level, offs; + + /* If CPU is secure, no need to do anything */ + if (!cpu_meltdown) + return; + + /* Must be kernel VA */ + if (va < VM_MIN_KERNEL_ADDRESS) + panic("%s: invalid special mapping va 0x%lx requested", + __func__, va); + + if (!pmap->pm_pdir_intel) + pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, + PR_WAITOK | PR_ZERO); + + l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */ + l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ + l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */ + l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */ + + DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld " + "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va, + (uint64_t)pa, l4idx, l3idx, l2idx, l1idx); + + /* Start at PML4 / top level */ + pd = (pd_entry_t *)pmap->pm_pdir_intel; + + if (!pd) + panic("%s: PML4 not initialized for pmap @ %p\n", __func__, + pmap); + + /* npa = physaddr of PDPT */ + npa = pd[l4idx] & PMAP_PA_MASK; + + /* Valid PML4e for the 512GB region containing va? */ + if (!npa) { + /* No valid PML4E - allocate PDPT page and set PML4E */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PDPT page\n", __func__); + + pd[l4idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PDPT page at phys 0x%llx, " + "setting PML4e[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l4idx, pd[l4idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PDPT @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + /* npa = physaddr of PD page */ + npa = pd[l3idx] & PMAP_PA_MASK; + + /* Valid PDPTe for the 1GB region containing va? */ + if (!npa) { + /* No valid PDPTe - allocate PD page and set PDPTe */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PD page\n", __func__); + + pd[l3idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PD page at phys 0x%llx, " + "setting PDPTe[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l3idx, pd[l3idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PD page @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + /* npa = physaddr of PT page */ + npa = pd[l2idx] & PMAP_PA_MASK; + + /* Valid PDE for the 2MB region containing va? 
*/ + if (!npa) { + /* No valid PDE - allocate PT page and set PDE */ + + ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); + + if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) + panic("%s: can't locate PT page\n", __func__); + + pd[l2idx] = (npa | PG_u | PG_RW | PG_V); + + DPRINTF("%s: allocated new PT page at phys 0x%llx, " + "setting PDE[%lld] = 0x%llx\n", __func__, + (uint64_t)npa, l2idx, pd[l2idx]); + } + + pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); + if (!pd) + panic("%s: can't locate PT page @ pa=0x%llx\n", __func__, + (uint64_t)npa); + + DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot " + "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd, + (uint64_t)prot, (uint64_t)pd[l1idx]); + + pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_G | PG_W; + DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]); + + /* now set the PG_G flag on the corresponding U+K entry */ + level = pmap_find_pte_direct(pmap, va, &ptes, &offs); + if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) + ptes[offs] |= PG_G; + else + DPRINTF("%s: no U+K mapping for special mapping?\n", __func__); +} + /* * pmap_enter: enter a mapping into a pmap * @@ -2163,7 +2358,7 @@ enter_now: else if (va < VM_MAX_ADDRESS) npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ if (pmap == pmap_kernel()) - npte |= PG_G; + npte |= pg_g_kern; ptes[pl1_i(va)] = npte; /* zap! */ @@ -2447,10 +2642,10 @@ pmap_convert(struct pmap *pmap, int mode * release the lock if we get an interrupt in a bad moment. */ -volatile long tlb_shoot_wait; +volatile long tlb_shoot_wait __attribute__((section(".kudata"))); -volatile vaddr_t tlb_shoot_addr1; -volatile vaddr_t tlb_shoot_addr2; +volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata"))); +volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata"))); void pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself) Index: sys/arch/amd64/amd64/spl.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/spl.S,v retrieving revision 1.11 diff -u -p -r1.11 spl.S --- sys/arch/amd64/amd64/spl.S 20 May 2016 14:37:53 -0000 1.11 +++ sys/arch/amd64/amd64/spl.S 21 Feb 2018 22:42:22 -0000 @@ -114,7 +114,7 @@ _C_LABEL(splx): * a lower-prio one first, which needs to take the kernel lock --> * the sending CPU will never see the that CPU accept the IPI */ -IDTVEC(spllower) +KIDTVEC(spllower) _PROF_PROLOGUE pushq %rbx pushq %r13 @@ -143,7 +143,7 @@ IDTVEC(spllower) * ebx - cpl to restore * r13 - address to resume loop at */ -IDTVEC(doreti) +KIDTVEC(doreti) popq %rbx # get previous priority decl CPUVAR(IDEPTH) leaq 1f(%rip),%r13 @@ -168,4 +168,8 @@ IDTVEC(doreti) call _C_LABEL(ast) cli jmp 5b -3: INTRFASTEXIT +3: +#ifdef DIAGNOSTIC + movl $254,%esi +#endif /* DIAGNOSTIC */ + INTRFASTEXIT Index: sys/arch/amd64/amd64/trap.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/trap.c,v retrieving revision 1.53.4.3 diff -u -p -r1.53.4.3 trap.c --- sys/arch/amd64/amd64/trap.c 4 Oct 2017 19:38:03 -0000 1.53.4.3 +++ sys/arch/amd64/amd64/trap.c 21 Feb 2018 22:42:22 -0000 @@ -232,6 +232,18 @@ trap(struct trapframe *frame) frame->tf_rip = (u_int64_t)xrstor_resume; return; } + + /* + * Check for failure during return to user mode. + * We do this by looking at the address of the + * instruction that faulted. 
+ */ + if (frame->tf_rip == (u_int64_t)doreti_iret) { + frame->tf_rip = (u_int64_t)resume_iret; + return; + } + /* FALLTHROUGH */ + case T_SEGNPFLT: case T_ALIGNFLT: case T_TSSFLT: @@ -245,16 +257,6 @@ copyfault: frame->tf_rax = error; return; } - - /* - * Check for failure during return to user mode. - * We do this by looking at the address of the - * instruction that faulted. - */ - if (frame->tf_rip == (u_int64_t)doreti_iret) { - frame->tf_rip = (u_int64_t)resume_iret; - return; - } goto we_re_toast; case T_PROTFLT|T_USER: /* protection fault */ @@ -478,8 +480,12 @@ out: static void frame_dump(struct trapframe *tf) { - printf("rip %p rsp %p rfl %p\n", - (void *)tf->tf_rip, (void *)tf->tf_rsp, (void *)tf->tf_rflags); + printf("rip %p cs 0x%x rfl %p rsp %p ss 0x%x\n", + (void *)tf->tf_rip, (unsigned)tf->tf_cs & 0xffff, + (void *)tf->tf_rflags, + (void *)tf->tf_rsp, (unsigned)tf->tf_ss & 0xffff); + printf("err 0x%llx trapno 0x%llx\n", + tf->tf_err, tf->tf_trapno); printf("rdi %p rsi %p rdx %p\n", (void *)tf->tf_rdi, (void *)tf->tf_rsi, (void *)tf->tf_rdx); printf("rcx %p r8 %p r9 %p\n", Index: sys/arch/amd64/amd64/vector.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/vector.S,v retrieving revision 1.47.4.2 diff -u -p -r1.47.4.2 vector.S --- sys/arch/amd64/amd64/vector.S 4 Oct 2017 19:38:03 -0000 1.47.4.2 +++ sys/arch/amd64/amd64/vector.S 28 Feb 2018 17:03:00 -0000 @@ -104,36 +104,97 @@ #define TRAP(a) pushq $(a) ; jmp _C_LABEL(alltraps) #define ZTRAP(a) pushq $0 ; TRAP(a) - .text IDTVEC(trap00) ZTRAP(T_DIVIDE) IDTVEC(trap01) ZTRAP(T_TRCTRAP) + +/* + * NMIs can happen at any time, so there's no simple way to tell + * which GS.base is in place at the time of the interrupt. Instead, + * borrow a couple ideas from FreeBSD and put the CPU's kernel + * GS.base in the memory right above the stack, storing the current + * one in a pair of callee-saved registers (%r12/13). We save the + * current %cr3 in a callee-saved register too (%r15). + * Note: we don't unblock interrupts because a nested normal interrupt + * would also reenable NMIs. + */ IDTVEC(trap02) - ZTRAP(T_NMI) + pushq $0 + pushq $T_NMI +calltrap_specstk: # special stack path + INTR_REENTRY + movl $MSR_GSBASE,%ecx # save current GS.base... 
+ rdmsr + movq %rax,%r12 # ...in %r12 and %r13 + movq %rdx,%r13 + movq FRAMESIZE(%rsp),%rax # get kernel GS.base + movq %rax,%rdx + shrq $32,%rdx + wrmsr # switch to it + movq %cr3,%r15 # save current %cr3 in %r15 + movq CPUVAR(KERN_CR3),%rax # switch to kernel page tables + testq %rax,%rax + jz INTRENTRY_LABEL(calltrap_specstk) + movq %rax,%cr3 + jmp INTRENTRY_LABEL(calltrap_specstk) + .text + .globl INTRENTRY_LABEL(calltrap_specstk) +INTRENTRY_LABEL(calltrap_specstk): + cld + SMAP_CLAC + movq %rsp,%rdi + call trap + movl $MSR_GSBASE,%ecx # restore GS.base + movq %r12,%rax + movq %r13,%rdx + wrmsr + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %r8 + popq %r9 + popq %r10 + popq %r11 + popq %r12 + popq %r13 + popq %r14 + jmp calltrap_specstk_tramp +KUENTRY(calltrap_specstk_tramp) + movq %r15,%cr3 # restore %cr3 + popq %r15 + popq %rbp + popq %rbx + popq %rax + addq $48,%rsp # ignored TF_[DEFG]S + iretq + IDTVEC(trap03) ZTRAP(T_BPTFLT) IDTVEC(trap04) - ZTRAP(T_OFLOW) + ZTRAP(T_OFLOW) # impossible: INTO instruction invalid in amd64 IDTVEC(trap05) - ZTRAP(T_BOUND) + ZTRAP(T_BOUND) # impossible: BOUND instruction invalid in amd64 IDTVEC(trap06) ZTRAP(T_PRIVINFLT) IDTVEC(trap07) pushq $0 # dummy error code pushq $T_DNA - INTRENTRY + INTRENTRY(trap07) sti cld SMAP_CLAC movq CPUVAR(SELF),%rdi movq %rsp, %rsi call _C_LABEL(fpudna) + cli INTRFASTEXIT IDTVEC(trap08) - TRAP(T_DOUBLEFLT) + pushq $T_DOUBLEFLT + jmp calltrap_specstk IDTVEC(trap09) - ZTRAP(T_FPOPFLT) + ZTRAP(T_FPOPFLT) # impossible: not generated on amd64 IDTVEC(trap0a) TRAP(T_TSSFLT) IDTVEC(trap0b) @@ -149,30 +210,49 @@ IDTVEC(trap0c) * so that we can do the necessary swapgs in that case. */ IDTVEC(trap0d) - subq $TF_ERR,%rsp - movl $T_PROTFLT,TF_TRAPNO(%rsp) - movq %rdi,TF_RDI(%rsp) - leaq _C_LABEL(doreti_iret)(%rip),%rdi - cmpq %rdi,TF_RIP(%rsp) + pushq %rcx + leaq _C_LABEL(doreti_iret)(%rip),%rcx + cmpq %rcx,16(%rsp) /* over %rcx and err to %rip */ + popq %rcx je 1f - testq $SEL_RPL,TF_CS(%rsp) - jz 2f + testq $SEL_RPL,16(%rsp) /* over err and %rip to %cs */ + je INTRENTRY_LABEL(trap0d) 1: swapgs -2: movq %r15,TF_R15(%rsp) - movq %r14,TF_R14(%rsp) - movq %r13,TF_R13(%rsp) - movq %r12,TF_R12(%rsp) - movq %r11,TF_R11(%rsp) - movq %r10,TF_R10(%rsp) - movq %r9,TF_R9(%rsp) - movq %r8,TF_R8(%rsp) - /*movq %rdi,TF_RDI(%rsp) done above */ - movq %rsi,TF_RSI(%rsp) - movq %rbp,TF_RBP(%rsp) - movq %rbx,TF_RBX(%rsp) - movq %rdx,TF_RDX(%rsp) + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(KERN_CR3),%rax + testq %rax,%rax + jz 98f + movq %rax,%cr3 + jmp 98f + .text + .globl INTRENTRY_LABEL(trap0d) +INTRENTRY_LABEL(trap0d): /* from kernel */ + pushq $T_PROTFLT + subq $152,%rsp movq %rcx,TF_RCX(%rsp) - movq %rax,TF_RAX(%rsp) + jmp 99f +98: /* from userspace */ + movq CPUVAR(KERN_RSP),%rax + xchgq %rax,%rsp + movq %rcx,TF_RCX(%rsp) + /* set trapno in the trap frame */ + movq $T_PROTFLT,TF_TRAPNO(%rsp) + /* copy err and iretq frame to the trap frame */ + movq 0(%rax),%rcx + movq %rcx,TF_ERR(%rsp) + add $8,%rax + movq IRETQ_RIP(%rax),%rcx + movq %rcx,TF_RIP(%rsp) + movq IRETQ_CS(%rax),%rcx + movq %rcx,TF_CS(%rsp) + movq IRETQ_RFLAGS(%rax),%rcx + movq %rcx,TF_RFLAGS(%rsp) + movq IRETQ_RSP(%rax),%rcx + movq %rcx,TF_RSP(%rsp) + movq IRETQ_SS(%rax),%rcx + movq %rcx,TF_SS(%rsp) + movq CPUVAR(SCRATCH),%rax +99: INTR_SAVE_MOST_GPRS_NO_ADJ sti jmp calltrap @@ -204,7 +284,9 @@ IDTVEC(trap1f) /* 20 - 31 reserved for future exp */ ZTRAP(T_RESERVED) -IDTVEC(exceptions) + .section .rodata + .globl Xexceptions +Xexceptions: .quad _C_LABEL(Xtrap00), 
_C_LABEL(Xtrap01) .quad _C_LABEL(Xtrap02), _C_LABEL(Xtrap03) .quad _C_LABEL(Xtrap04), _C_LABEL(Xtrap05) @@ -232,19 +314,44 @@ IDTVEC(exceptions) * protection fault. This will cause the process to get a SIGBUS. */ NENTRY(resume_iret) - pushq $0 - pushq $T_PROTFLT - subq $32,%rsp - INTR_SAVE_GPRS + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(KERN_CR3),%rax + testq %rax,%rax + jz INTRENTRY_LABEL(iret) + movq %rax,%cr3 + jmp INTRENTRY_LABEL(iret) + .text + .globl INTRENTRY_LABEL(iret) +INTRENTRY_LABEL(iret): /* from kernel */ + movq CPUVAR(KERN_RSP),%rax + xchgq %rax,%rsp + movq %rcx,TF_RCX(%rsp) + /* set trapno+err in the trap frame */ + movq $T_PROTFLT,TF_TRAPNO(%rsp) + movq $0,TF_ERR(%rsp) + /* copy iretq frame to the trap frame */ + movq IRETQ_RIP(%rax),%rcx + movq %rcx,TF_RIP(%rsp) + movq IRETQ_CS(%rax),%rcx + movq %rcx,TF_CS(%rsp) + movq IRETQ_RFLAGS(%rax),%rcx + movq %rcx,TF_RFLAGS(%rsp) + movq IRETQ_RSP(%rax),%rcx + movq %rcx,TF_RSP(%rsp) + movq IRETQ_SS(%rax),%rcx + movq %rcx,TF_SS(%rsp) + movq CPUVAR(SCRATCH),%rax + INTR_SAVE_MOST_GPRS_NO_ADJ sti jmp calltrap + /* * All traps go through here. Call the generic trap handler, and * check for ASTs afterwards. */ -NENTRY(alltraps) - INTRENTRY +KUENTRY(alltraps) + INTRENTRY(alltraps) sti calltrap: cld @@ -325,6 +432,7 @@ calltrap: /* XXX See comment in locore.s */ #define XINTR(name,num) Xintr_##name##num + KUTEXT .globl _C_LABEL(x2apic_eoi) _C_LABEL(x2apic_eoi): pushq %rax @@ -341,23 +449,23 @@ _C_LABEL(x2apic_eoi): #if NLAPIC > 0 #ifdef MULTIPROCESSOR -IDTVEC(recurse_lapic_ipi) +KIDTVEC(recurse_lapic_ipi) INTR_RECURSE_HWFRAME - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_lapic_ipi) - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_lapic_ipi) CODEPATCH_START movl $0,_C_LABEL(local_apic)+LAPIC_EOI CODEPATCH_END(CPTAG_EOI) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_IPI,%ebx jae 2f -IDTVEC(resume_lapic_ipi) +KIDTVEC(resume_lapic_ipi) 1: incl CPUVAR(IDEPTH) movl $IPL_IPI,CPUVAR(ILEVEL) @@ -421,27 +529,27 @@ IDTVEC(ipi_invlrange) iretq #endif /* MULTIPROCESSOR */ - + /* * Interrupt from the local APIC timer. */ -IDTVEC(recurse_lapic_ltimer) +KIDTVEC(recurse_lapic_ltimer) INTR_RECURSE_HWFRAME - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_lapic_ltimer) - pushq $0 + pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_lapic_ltimer) CODEPATCH_START movl $0,_C_LABEL(local_apic)+LAPIC_EOI CODEPATCH_END(CPTAG_EOI) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_CLOCK,%ebx jae 2f -IDTVEC(resume_lapic_ltimer) +KIDTVEC(resume_lapic_ltimer) 1: incl CPUVAR(IDEPTH) movl $IPL_CLOCK,CPUVAR(ILEVEL) @@ -462,21 +570,21 @@ IDTVEC(resume_lapic_ltimer) * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. */ -IDTVEC(recurse_xen_upcall) +KIDTVEC(recurse_xen_upcall) INTR_RECURSE_HWFRAME pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_xen_upcall) pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_xen_upcall) call _C_LABEL(xen_intr_ack) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_NET,%ebx jae 2f -IDTVEC(resume_xen_upcall) +KIDTVEC(resume_xen_upcall) 1: incl CPUVAR(IDEPTH) movl $IPL_NET,CPUVAR(ILEVEL) @@ -498,20 +606,20 @@ IDTVEC(resume_xen_upcall) * Hyperv event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. 
*/ -IDTVEC(recurse_hyperv_upcall) +KIDTVEC(recurse_hyperv_upcall) INTR_RECURSE_HWFRAME pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTR_REENTRY jmp 1f IDTVEC(intr_hyperv_upcall) pushq $0 subq $8,%rsp /* unused __if_trapno */ - INTRENTRY + INTRENTRY(intr_hyperv_upcall) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_NET,%ebx jae 2f -IDTVEC(resume_hyperv_upcall) +KIDTVEC(resume_hyperv_upcall) 1: incl CPUVAR(IDEPTH) movl $IPL_NET,CPUVAR(ILEVEL) @@ -538,11 +646,11 @@ IDTVEC(resume_hyperv_upcall) */ #define INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \ -IDTVEC(recurse_##name##num) ;\ +KIDTVEC(recurse_##name##num) ;\ INTR_RECURSE_HWFRAME ;\ subq $16,%rsp /* space for __if_{trapno,err} */;\ - INTRENTRY ;\ -IDTVEC(resume_##name##num) \ + INTR_REENTRY ;\ +KIDTVEC(resume_##name##num) \ movq $IREENT_MAGIC,TF_ERR(%rsp) ;\ movl %ebx,%r13d ;\ movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\ @@ -551,7 +659,7 @@ IDTVEC(resume_##name##num) \ IDTVEC(intr_##name##num) ;\ pushq $0 /* dummy error code */ ;\ subq $8,%rsp /* unused __if_trapno */ ;\ - INTRENTRY ;\ + INTRENTRY(intr_##name##num) ;\ movq CPUVAR(ISOURCES) + (num) * 8, %r14 ;\ mask(num) /* mask it in hardware */ ;\ early_ack(num) /* and allow other intrs */ ;\ @@ -1089,8 +1197,7 @@ _C_LABEL(ioapic_level_stubs): /* * Soft interrupt handlers */ - .text -IDTVEC(softtty) +KIDTVEC(softtty) movl $IPL_SOFTTTY, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) @@ -1099,7 +1206,7 @@ IDTVEC(softtty) decl CPUVAR(IDEPTH) jmp *%r13 -IDTVEC(softnet) +KIDTVEC(softnet) movl $IPL_SOFTNET, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) @@ -1108,7 +1215,7 @@ IDTVEC(softnet) decl CPUVAR(IDEPTH) jmp *%r13 -IDTVEC(softclock) +KIDTVEC(softclock) movl $IPL_SOFTCLOCK, CPUVAR(ILEVEL) sti incl CPUVAR(IDEPTH) Index: sys/arch/amd64/conf/ld.script =================================================================== RCS file: /cvs/src/sys/arch/amd64/conf/ld.script,v retrieving revision 1.4 diff -u -p -r1.4 ld.script --- sys/arch/amd64/conf/ld.script 3 Sep 2016 13:13:07 -0000 1.4 +++ sys/arch/amd64/conf/ld.script 21 Feb 2018 23:05:49 -0000 @@ -55,6 +55,16 @@ SECTIONS locore.o(.text) *(.text .text.*) } :text + + . = ALIGN(__ALIGN_SIZE); + __kernel_kutext_phys = . & 0x7fffffff; + .kutext : AT (__kernel_kutext_phys) + { + __kutext_start = ABSOLUTE(.); + *(.kutext) + __kutext_end = ABSOLUTE(.); + } :text =0xcccccccc + PROVIDE (__etext = .); PROVIDE (etext = .); _etext = .; @@ -93,6 +103,17 @@ SECTIONS __data_load = LOADADDR(.data); *(.data .data.*) } :data + . = ALIGN(0x1000); + + . = ALIGN(__ALIGN_SIZE); + __kernel_kudata_phys = . & 0x7fffffff; + .kudata : AT (__kernel_kudata_phys) + { + __kudata_start = ABSOLUTE(.); + *(.kudata) + __kudata_end = ABSOLUTE(.); + } :data =0xcccccccc + . = ALIGN(0x1000); PROVIDE (edata = .); _edata = .; Index: sys/arch/amd64/include/asm.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/asm.h,v retrieving revision 1.6 diff -u -p -r1.6 asm.h --- sys/arch/amd64/include/asm.h 13 May 2015 05:29:57 -0000 1.6 +++ sys/arch/amd64/include/asm.h 21 Feb 2018 22:42:22 -0000 @@ -68,14 +68,19 @@ .text; _ALIGN_TEXT; .globl x; .type x,@function; x: #ifdef _KERNEL +#define KUTEXT .section .kutext, "ax" +/*#define KUTEXT .text */ + /* XXX Can't use __CONCAT() here, as it would be evaluated incorrectly. 
*/ -#ifdef __STDC__ #define IDTVEC(name) \ - .text; ALIGN_TEXT; .globl X ## name; .type X ## name,@function; X ## name: -#else -#define IDTVEC(name) \ - .text; ALIGN_TEXT; .globl X/**/name; .type X/**/name,@function; X/**/name: -#endif /* __STDC__ */ + KUTEXT; ALIGN_TEXT; \ + .globl X ## name; .type X ## name,@function; X ## name: +#define KIDTVEC(name) \ + .text; ALIGN_TEXT; \ + .globl X ## name; .type X ## name,@function; X ## name: +#define KUENTRY(x) \ + KUTEXT; _ALIGN_TEXT; .globl x; .type x,@function; x: + #endif /* _KERNEL */ #ifdef __STDC__ Index: sys/arch/amd64/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v retrieving revision 1.110 diff -u -p -r1.110 cpu.h --- sys/arch/amd64/include/cpu.h 16 Mar 2017 10:02:03 -0000 1.110 +++ sys/arch/amd64/include/cpu.h 21 Feb 2018 22:42:22 -0000 @@ -43,7 +43,7 @@ */ #ifdef _KERNEL #include -#include +#include /* USERMODE */ #include #include #endif /* _KERNEL */ @@ -86,6 +86,17 @@ union vmm_cpu_cap { struct x86_64_tss; struct cpu_info { + /* + * The beginning of this structure in mapped in the userspace "u-k" + * page tables, so that these first couple members can be accessed + * from the trampoline code. The ci_PAGEALIGN member defines where + * the part that is *not* visible begins, so don't put anything + * above it that must be kept hidden from userspace! + */ + u_int64_t ci_kern_cr3; /* U+K page table */ + u_int64_t ci_scratch; /* for U<-->K transition */ + +#define ci_PAGEALIGN ci_dev struct device *ci_dev; struct cpu_info *ci_self; struct schedstate_percpu ci_schedstate; /* scheduler state */ @@ -97,7 +108,9 @@ struct cpu_info { u_int ci_acpi_proc_id; u_int32_t ci_randseed; - u_int64_t ci_scratch; + u_int64_t ci_kern_rsp; /* kernel-only stack */ + u_int64_t ci_intr_rsp; /* U<-->K trampoline stack */ + u_int64_t ci_user_cr3; /* U-K page table */ struct proc *ci_fpcurproc; struct proc *ci_fpsaveproc; @@ -124,6 +137,8 @@ struct cpu_info { u_int32_t ci_feature_eflags; u_int32_t ci_feature_sefflags_ebx; u_int32_t ci_feature_sefflags_ecx; + u_int32_t ci_feature_sefflags_edx; + u_int32_t ci_feature_amdspec_ebx; u_int32_t ci_feature_tpmflags; u_int32_t ci_pnfeatset; u_int32_t ci_efeature_eax; @@ -212,7 +227,10 @@ struct cpu_info { #define PROC_PC(p) ((p)->p_md.md_regs->tf_rip) #define PROC_STACK(p) ((p)->p_md.md_regs->tf_rsp) -extern struct cpu_info cpu_info_primary; +struct cpu_info_full; +extern struct cpu_info_full cpu_info_full_primary; +#define cpu_info_primary (*(struct cpu_info *)((char *)&cpu_info_full_primary + 4096*2 - offsetof(struct cpu_info, ci_PAGEALIGN))) + extern struct cpu_info *cpu_info_list; #define CPU_INFO_ITERATOR int @@ -237,7 +255,8 @@ extern void need_resched(struct cpu_info #define CPU_START_CLEANUP(_ci) ((_ci)->ci_func->cleanup(_ci)) #define curcpu() ({struct cpu_info *__ci; \ - asm volatile("movq %%gs:8,%0" : "=r" (__ci)); \ + asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) \ + :"n" (offsetof(struct cpu_info, ci_self))); \ __ci;}) #define cpu_number() (curcpu()->ci_cpuid) @@ -258,8 +277,6 @@ void cpu_unidle(struct cpu_info *); #define MAXCPUS 1 #ifdef _KERNEL -extern struct cpu_info cpu_info_primary; - #define curcpu() (&cpu_info_primary) #define cpu_kick(ci) Index: sys/arch/amd64/include/cpu_full.h =================================================================== RCS file: sys/arch/amd64/include/cpu_full.h diff -N sys/arch/amd64/include/cpu_full.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/arch/amd64/include/cpu_full.h 22 Feb 
2018 19:13:32 -0000 @@ -0,0 +1,66 @@ +/* $OpenBSD$ */ +/* + * Copyright (c) Philip Guenther + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _MACHINE_CPU_FULL_H_ +#define _MACHINE_CPU_FULL_H_ + +#include /* offsetof, PAGE_SIZE */ +#include +#include + +/* + * The layout of the full per-CPU information, including TSS, GDT, + * trampoline stacks, and cpu_info described in + */ +struct cpu_info_full { + /* page mapped kRO in u-k */ + union { + struct x86_64_tss u_tss; /* followed by gdt */ + char u_align[PAGE_SIZE]; + } cif_RO; +#define cif_tss cif_RO.u_tss + + /* start of page mapped kRW in u-k */ + uint64_t cif_tramp_stack[(PAGE_SIZE / 4 + - offsetof(struct cpu_info, ci_PAGEALIGN)) / sizeof(uint64_t)]; + uint64_t cif_dblflt_stack[(PAGE_SIZE / 4) / sizeof(uint64_t)]; + uint64_t cif_nmi_stack[(2 * PAGE_SIZE / 4) / sizeof(uint64_t)]; + + /* + * Beginning of this hangs over into the kRW page; rest is + * unmapped in u-k + */ + struct cpu_info cif_cpu; +} __aligned(PAGE_SIZE); + +/* tss, align shim, and gdt must fit in a page */ +CTASSERT(_ALIGN(sizeof(struct x86_64_tss)) + + sizeof(struct mem_segment_descriptor) * (NGDT_MEM + 2*NGDT_SYS) + < PAGE_SIZE); + +/* verify expected alignment */ +CTASSERT(offsetof(struct cpu_info_full, cif_cpu.ci_PAGEALIGN) % PAGE_SIZE == 0); + +/* verify total size is multiple of page size */ +CTASSERT(sizeof(struct cpu_info_full) % PAGE_SIZE == 0); + +extern struct cpu_info_full cpu_info_full_primary; + +/* Now make sure the cpu_info_primary macro is correct */ +CTASSERT(&cpu_info_primary - &cpu_info_full_primary.cif_cpu == 0); + +#endif /* _MACHINE_CPU_FULL_H_ */ Index: sys/arch/amd64/include/cpufunc.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/cpufunc.h,v retrieving revision 1.13.4.1 diff -u -p -r1.13.4.1 cpufunc.h --- sys/arch/amd64/include/cpufunc.h 3 May 2017 02:29:16 -0000 1.13.4.1 +++ sys/arch/amd64/include/cpufunc.h 21 Feb 2018 22:42:22 -0000 @@ -357,6 +357,9 @@ breakpoint(void) void amd64_errata(struct cpu_info *); +struct cpu_info_full; +void cpu_enter_pages(struct cpu_info_full *); + #endif /* _KERNEL */ #endif /* !_MACHINE_CPUFUNC_H_ */ Index: sys/arch/amd64/include/frame.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/frame.h,v retrieving revision 1.6 diff -u -p -r1.6 frame.h --- sys/arch/amd64/include/frame.h 26 Feb 2016 09:29:20 -0000 1.6 +++ sys/arch/amd64/include/frame.h 21 Feb 2018 22:42:22 -0000 @@ -147,6 +147,20 @@ struct intrframe { int64_t if_ss; }; + +/* + * The trampoline frame used on the kernel stack page which is present + * but kernel-only, in the page tables used when in userspace. This is + * the minimum for iretq operation. 
+ */ +struct iretq_frame { + int64_t iretq_rip; + int64_t iretq_cs; + int64_t iretq_rflags; + int64_t iretq_rsp; + int64_t iretq_ss; +}; + /* * Stack frame inside cpu_switch() */ Index: sys/arch/amd64/include/frameasm.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/frameasm.h,v retrieving revision 1.10 diff -u -p -r1.10 frameasm.h --- sys/arch/amd64/include/frameasm.h 4 Sep 2016 09:22:28 -0000 1.10 +++ sys/arch/amd64/include/frameasm.h 21 Feb 2018 22:42:22 -0000 @@ -13,7 +13,10 @@ * These are used on interrupt or trap entry or exit. */ #define INTR_SAVE_GPRS \ - subq $120,%rsp ; \ + subq $120,%rsp ; \ + INTR_SAVE_MOST_GPRS_NO_ADJ ; \ + movq %rcx,TF_RCX(%rsp) +#define INTR_SAVE_MOST_GPRS_NO_ADJ \ movq %r15,TF_R15(%rsp) ; \ movq %r14,TF_R14(%rsp) ; \ movq %r13,TF_R13(%rsp) ; \ @@ -27,15 +30,54 @@ movq %rbp,TF_RBP(%rsp) ; \ movq %rbx,TF_RBX(%rsp) ; \ movq %rdx,TF_RDX(%rsp) ; \ - movq %rcx,TF_RCX(%rsp) ; \ movq %rax,TF_RAX(%rsp) -#define INTRENTRY \ - subq $32,%rsp ; \ - testq $SEL_RPL,56(%rsp) ; \ - je 98f ; \ +/* For real interrupt code paths, where we can come from userspace */ +#define INTRENTRY_LABEL(label) X##label##_untramp +#define INTRENTRY(label) \ + testq $SEL_RPL,24(%rsp) ; \ + je INTRENTRY_LABEL(label) ; \ swapgs ; \ -98: INTR_SAVE_GPRS + movq %rax,CPUVAR(SCRATCH) ; \ + movq CPUVAR(KERN_CR3),%rax ; \ + testq %rax,%rax ; \ + jz 98f ; \ + movq %rax,%cr3 ; \ + jmp 98f ; \ + .text ; \ + .global INTRENTRY_LABEL(label) ; \ +INTRENTRY_LABEL(label): /* from kernel */ \ + subq $152,%rsp ; \ + movq %rcx,TF_RCX(%rsp) ; \ + jmp 99f ; \ +98: /* from userspace */ \ + movq CPUVAR(KERN_RSP),%rax ; \ + xchgq %rax,%rsp ; \ + movq %rcx,TF_RCX(%rsp) ; \ + /* copy trapno+err to the trap frame */ \ + movq 0(%rax),%rcx ; \ + movq %rcx,TF_TRAPNO(%rsp) ; \ + movq 8(%rax),%rcx ; \ + movq %rcx,TF_ERR(%rsp) ; \ + addq $16,%rax ; \ + /* copy iretq frame to the trap frame */ \ + movq IRETQ_RIP(%rax),%rcx ; \ + movq %rcx,TF_RIP(%rsp) ; \ + movq IRETQ_CS(%rax),%rcx ; \ + movq %rcx,TF_CS(%rsp) ; \ + movq IRETQ_RFLAGS(%rax),%rcx ; \ + movq %rcx,TF_RFLAGS(%rsp) ; \ + movq IRETQ_RSP(%rax),%rcx ; \ + movq %rcx,TF_RSP(%rsp) ; \ + movq IRETQ_SS(%rax),%rcx ; \ + movq %rcx,TF_SS(%rsp) ; \ + movq CPUVAR(SCRATCH),%rax ; \ +99: INTR_SAVE_MOST_GPRS_NO_ADJ + +/* For faking up an interrupt frame when we're already in the kernel */ +#define INTR_REENTRY \ + subq $32,%rsp ; \ + INTR_SAVE_GPRS #define INTRFASTEXIT \ jmp intr_fast_exit @@ -49,26 +91,6 @@ movl %cs,%r11d ; \ pushq %r11 ; \ pushq %r13 ; - -/* - * Restore FS.base if it's not already in the CPU, and do the cli/swapgs. 
- * Uses %rax, %rcx, and %rdx - */ -#define INTR_RESTORE_SELECTORS \ - btsl $CPUF_USERSEGS_BIT, CPUVAR(FLAGS) ; \ - jc 99f ; \ - movq CPUVAR(CURPCB),%rdx /* for below */ ; \ - movq PCB_FSBASE(%rdx),%rax ; \ - cmpq $0,%rax ; \ - je 99f /* setting %fs has zeroed FS.base */ ; \ - movq %rax,%rdx ; \ - shrq $32,%rdx ; \ - movl $MSR_FSBASE,%ecx ; \ - wrmsr ; \ -99: movw $(GSEL(GUDATA_SEL, SEL_UPL)),%ax ; \ - cli ; \ - swapgs ; \ - movw %ax,%gs #define INTR_FAKE_TRAP 0xbadabada Index: sys/arch/amd64/include/gdt.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/gdt.h,v retrieving revision 1.5 diff -u -p -r1.5 gdt.h --- sys/arch/amd64/include/gdt.h 13 Nov 2010 04:16:42 -0000 1.5 +++ sys/arch/amd64/include/gdt.h 21 Feb 2018 22:42:22 -0000 @@ -31,4 +31,3 @@ */ void gdt_init_cpu(struct cpu_info *); -void gdt_alloc_cpu(struct cpu_info *); Index: sys/arch/amd64/include/pmap.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v retrieving revision 1.62 diff -u -p -r1.62 pmap.h --- sys/arch/amd64/include/pmap.h 8 Feb 2016 18:23:04 -0000 1.62 +++ sys/arch/amd64/include/pmap.h 21 Feb 2018 22:42:22 -0000 @@ -283,8 +283,19 @@ struct pmap { struct mutex pm_mtx; struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */ LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */ - pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */ - paddr_t pm_pdirpa; /* PA of PD (read-only after create) */ + /* + * pm_pdir : VA of page table to be used when executing in + * privileged mode + * pm_pdirpa : PA of page table to be used when executing in + * privileged mode + * pm_pdir_intel : VA of special page table to be used when executing + * on an Intel CPU in usermode (no kernel mappings) + * pm_pdirpa_intel : PA of special page table to be used when executing + * on an Intel CPU in usermode (no kernel mappings) + */ + pd_entry_t *pm_pdir, *pm_pdir_intel; + paddr_t pm_pdirpa, pm_pdirpa_intel; + struct vm_page *pm_ptphint[PTP_LEVELS-1]; /* pointer to a PTP in our pmap */ struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */ @@ -378,6 +389,7 @@ paddr_t pmap_prealloc_lowmem_ptps(paddr_ void pagezero(vaddr_t); int pmap_convert(struct pmap *, int); +void pmap_enter_special(vaddr_t, paddr_t, vm_prot_t); /* * functions for flushing the cache for vaddrs and pages. 
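
Note on the pmap.h change above: the pm_pdir/pm_pdir_intel split is the heart of the workaround. Each pmap now carries two page tables — the full U+K table used while executing in the kernel, and a stripped U-K table (no kernel mappings) used while an affected Intel CPU runs in userspace. The ci_kern_cr3 and ci_user_cr3 fields added to struct cpu_info cache the physical addresses of those tables so the entry/exit trampolines can switch with a single write to %cr3; a zero ci_kern_cr3 marks a CPU that does not need the switch, which is what the "testq %rax,%rax; jz" sequence in the new INTRENTRY macro checks. The following self-contained C sketch illustrates only that selection logic; the structures, the cpu_is_vulnerable flag, and the example_* names are invented for illustration and are not the patch's actual code:

	#include <stdint.h>

	struct example_pmap {
		uint64_t pm_pdirpa;		/* full table: kernel + user (U+K) */
		uint64_t pm_pdirpa_intel;	/* stripped table: user only (U-K) */
	};

	struct example_cpu {
		uint64_t ci_kern_cr3;		/* 0 => no Meltdown switch needed */
		uint64_t ci_user_cr3;
	};

	/* Sketch: pointing a CPU at the page tables of the process it will run. */
	static void
	example_activate(struct example_cpu *ci, struct example_pmap *pm,
	    int cpu_is_vulnerable)
	{
		if (cpu_is_vulnerable) {
			ci->ci_kern_cr3 = pm->pm_pdirpa;	/* loaded on kernel entry */
			ci->ci_user_cr3 = pm->pm_pdirpa_intel;	/* loaded on return to user */
		} else {
			ci->ci_kern_cr3 = 0;	/* trampoline skips the %cr3 write */
			ci->ci_user_cr3 = 0;
		}
	}

	/* What the entry trampoline effectively decides, expressed in C. */
	static uint64_t
	example_cr3_on_kernel_entry(const struct example_cpu *ci, uint64_t cur_cr3)
	{
		return ci->ci_kern_cr3 != 0 ? ci->ci_kern_cr3 : cur_cr3;
	}

Keeping the decision down to "is ci_kern_cr3 nonzero" is what lets the same trampoline code run unchanged on CPUs that do not need the workaround.
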
Index: sys/arch/amd64/include/pte.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/pte.h,v retrieving revision 1.13 diff -u -p -r1.13 pte.h --- sys/arch/amd64/include/pte.h 9 Nov 2015 00:49:33 -0000 1.13 +++ sys/arch/amd64/include/pte.h 21 Feb 2018 22:42:22 -0000 @@ -158,6 +158,7 @@ typedef u_int64_t pt_entry_t; /* PTE */ #ifdef _KERNEL extern pt_entry_t pg_nx; /* NX pte bit */ +extern pt_entry_t pg_g_kern; /* PG_G if glbl mappings can be used in kern */ #endif /* _KERNEL */ #endif /* _MACHINE_PTE_H_ */ Index: sys/arch/amd64/include/specialreg.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v retrieving revision 1.55 diff -u -p -r1.55 specialreg.h --- sys/arch/amd64/include/specialreg.h 28 Mar 2017 21:36:27 -0000 1.55 +++ sys/arch/amd64/include/specialreg.h 22 Feb 2018 19:13:32 -0000 @@ -214,6 +214,10 @@ #define SEFF0ECX_AVX512VBMI 0x00000002 /* AVX-512 vector bit inst */ #define SEFF0ECX_UMIP 0x00000004 /* UMIP support */ #define SEFF0ECX_PKU 0x00000008 /* Page prot keys for user mode */ +/* SEFF EDX bits */ +#define SEFF0EDX_IBRS 0x04000000 /* IBRS / IBPB Speculation Control */ +#define SEFF0EDX_STIBP 0x08000000 /* STIBP Speculation Control */ +#define SEFF0EDX_ARCH_CAP 0x20000000 /* Has IA32_ARCH_CAPABILITIES MSR */ /* * Thermal and Power Management (CPUID function 0x6) EAX bits @@ -285,9 +289,13 @@ * "Advanced Power Management Information" bits (CPUID function 0x80000007): * EDX bits. */ - #define CPUIDEDX_ITSC (1 << 8) /* Invariant TSC */ +/* + * AMD CPUID function 0x80000008 EBX bits + */ +#define CPUIDEBX_IBPB (1ULL << 12) /* Speculation Control IBPB */ + #define CPUID2FAMILY(cpuid) (((cpuid) >> 8) & 15) #define CPUID2MODEL(cpuid) (((cpuid) >> 4) & 15) #define CPUID2STEPPING(cpuid) ((cpuid) & 15) @@ -319,6 +327,11 @@ #define MSR_EBC_FREQUENCY_ID 0x02c /* Pentium 4 only */ #define MSR_TEST_CTL 0x033 #define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_SPEC_CTRL 0x048 /* Speculation Control IBRS / STIBP */ +#define SPEC_CTRL_IBRS (1ULL << 0) +#define SPEC_CTRL_STIBP (1ULL << 1) +#define MSR_PRED_CMD 0x049 /* Speculation Control IBPB */ +#define PRED_CMD_IBPB (1ULL << 0) #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 /* PII+ only */ #define MSR_BBL_CR_D1 0x089 /* PII+ only */ @@ -344,6 +357,8 @@ #define MTRRcap_FIXED 0x100 /* bit 8 - fixed MTRRs supported */ #define MTRRcap_WC 0x400 /* bit 10 - WC type supported */ #define MTRRcap_SMRR 0x800 /* bit 11 - SMM range reg supported */ +#define MSR_ARCH_CAPABILITIES 0x10a +#define ARCH_CAPABILITIES_RDCL_NO (1 << 0) /* Meltdown safe */ #define MSR_BBL_CR_ADDR 0x116 /* PII+ only */ #define MSR_BBL_CR_DECC 0x118 /* PII+ only */ #define MSR_BBL_CR_CTL 0x119 /* PII+ only */ Index: distrib/sets/lists/comp/md.amd64 =================================================================== RCS file: /cvs/src/distrib/sets/lists/comp/md.amd64,v retrieving revision 1.100 diff -u -p -r1.100 md.amd64 --- distrib/sets/lists/comp/md.amd64 4 Mar 2017 16:52:59 -0000 1.100 +++ distrib/sets/lists/comp/md.amd64 26 Feb 2018 13:13:42 -0000 @@ -15,6 +15,7 @@ ./usr/include/amd64/codepatch.h ./usr/include/amd64/conf.h ./usr/include/amd64/cpu.h +./usr/include/amd64/cpu_full.h ./usr/include/amd64/cpufunc.h ./usr/include/amd64/cpuvar.h ./usr/include/amd64/db_machdep.h
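
The specialreg.h additions above give the kernel two independent pieces of information: the SEFF0EDX_ARCH_CAP CPUID bit advertises the IA32_ARCH_CAPABILITIES MSR, whose RDCL_NO bit means the CPU is not vulnerable to Meltdown (so the U-K page-table split can be skipped), while the IBRS/STIBP/IBPB definitions cover the related Spectre speculation controls. A hedged sketch of the detection logic follows, using the constants defined in the hunk above but with stand-in example_cpuid/example_rdmsr helpers rather than the kernel's own primitives:

	#include <stdint.h>

	#define SEFF0EDX_ARCH_CAP		0x20000000	/* CPUID leaf 7 EDX: MSR present */
	#define MSR_ARCH_CAPABILITIES		0x10a
	#define ARCH_CAPABILITIES_RDCL_NO	(1 << 0)	/* Meltdown safe */

	/* Stand-ins for the kernel's CPUID and rdmsr primitives. */
	extern void example_cpuid(uint32_t leaf, uint32_t subleaf,
	    uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
	extern uint64_t example_rdmsr(uint32_t msr);

	/*
	 * Returns nonzero if the CPU reports itself immune to Meltdown
	 * (RDCL_NO set), in which case the page-table switch can be left
	 * disabled (ci_kern_cr3 kept at zero).
	 */
	static int
	example_cpu_meltdown_safe(void)
	{
		uint32_t eax, ebx, ecx, edx;

		example_cpuid(7, 0, &eax, &ebx, &ecx, &edx);
		if ((edx & SEFF0EDX_ARCH_CAP) == 0)
			return 0;	/* MSR absent: assume vulnerable */
		return (example_rdmsr(MSR_ARCH_CAPABILITIES) &
		    ARCH_CAPABILITIES_RDCL_NO) != 0;
	}
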